January 07, 2025

CSV to SQL Table (pandas Code)

Below is a sample code snippet that reads a CSV file with pandas, cleans the data, and loads it into a SQL Server table with pyodbc.

Note: The source CSV files can be downloaded from the Traffic census data page.

The Destination SQL Table Schema:
Column Name Data Type
Col_1 INTEGER
Col_2 INTEGER
Col_3 INTEGER
Col_4 INTEGER
Col_5 INTEGER
Col_6 INTEGER
Col_7 INTEGER
Col_8 INTEGER
Col_9 INTEGER
Col_10 INTEGER
device_id STRING
lane_number STRING
cross_datetime DATETIME
speed_kmph_39 INTEGER
speed_kmph_40_49 INTEGER
speed_kmph_50_59 INTEGER
speed_kmph_60_69 INTEGER
speed_kmph_70_79 INTEGER
speed_kmph_80_89 INTEGER
speed_kmph_90_99 INTEGER
speed_kmph_100_109 INTEGER
speed_kmph_110_119 INTEGER
speed_kmph_120_129 INTEGER
speed_kmph_130_more INTEGER
    
CREATE TABLE dbo.tbl_traffic_census(
Col_1 int NULL, Col_2 int NULL, Col_3 int NULL,
Col_4 int NULL, Col_5 int NULL, Col_6 int NULL,
Col_7 int NULL, Col_8 int NULL, Col_9 int NULL,
Col_10 int NULL, device_id varchar(255) NOT NULL,
lane_number varchar(255) NOT NULL,
cross_datetime datetime NOT NULL,
speed_kmph_39 int NULL, speed_kmph_40_49 int NULL,
speed_kmph_50_59 int NULL, speed_kmph_60_69 int NULL,
speed_kmph_70_79 int NULL, speed_kmph_80_89 int NULL,
speed_kmph_90_99 int NULL, speed_kmph_100_109 int NULL,
speed_kmph_110_119 int NULL, speed_kmph_120_129 int NULL,
speed_kmph_130_more int NULL,
PRIMARY KEY CLUSTERED 
(device_id ASC, lane_number ASC, cross_datetime ASC)
WITH (PAD_INDEX = OFF, STATISTICS_NORECOMPUTE = OFF, IGNORE_DUP_KEY = OFF, 
ALLOW_ROW_LOCKS = ON, ALLOW_PAGE_LOCKS = ON, 
OPTIMIZE_FOR_SEQUENTIAL_KEY = OFF) ON [PRIMARY]
);
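
Once the table has been created, a quick way to confirm it exists before loading is to query OBJECT_ID from Python. This is a minimal sketch only; it assumes the same placeholder connection settings used in the loader further down.

import pyodbc

# Connect with the same placeholder credentials as the loader below
conn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};'
                      'SERVER=localhost;DATABASE=<>;UID=<>;PWD=<>')
cursor = conn.cursor()

# OBJECT_ID returns NULL when the table does not exist
cursor.execute("SELECT OBJECT_ID('dbo.tbl_traffic_census', 'U')")
exists = cursor.fetchone()[0] is not None
print('dbo.tbl_traffic_census exists:', exists)

cursor.close()
conn.close()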
    

1. Read the CSV file into a pandas DataFrame.
2. Clean the data by removing duplicates and converting data types.
3. Insert the cleaned rows into the SQL Server table with pyodbc.
Alternatively, we can use tools like the BCP command, the import utility, or SSIS packages to load data from files into SQL tables; a sample bcp invocation is sketched right after this list.
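For example, a bcp invocation for this table might look like the following. This is a sketch only: the CSV path, database name, and credentials are placeholders, and bcp performs none of the de-duplication or type coercion done by the pandas script.

bcp dbo.tbl_traffic_census in "C:\path\filename.csv" -S localhost -d <database> -U <user> -P <password> -c -t, -F 2

Here -c loads the file as character data, -t, sets the comma field terminator, and -F 2 starts at row 2 to skip the header row. The pandas and pyodbc version follows.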
    
import pandas as pd
import pyodbc

# Read the CSV file into a DataFrame
df = pd.read_csv('.../path/filename.csv')

# Clean the data: Remove duplicates
df_cleaned = df.drop_duplicates(subset=['id', 'kanal', 'aeg'])

# Convert 'aeg' column to datetime
df_cleaned['aeg'] = pd.to_datetime(df_cleaned['aeg'])

# Convert 'id' and 'kanal' to strings (varchar equivalent)
df_cleaned['id'] = df_cleaned['id'].astype(str)
df_cleaned['kanal'] = df_cleaned['kanal'].astype(str)

# Convert other columns to numeric, replacing non-numeric values with NaN and filling with 0
for col in df_cleaned.columns:
    if col not in ['aeg', 'id', 'kanal']:  # Exclude 'aeg', 'id', and 'kanal' columns
        df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce').fillna(0).astype(int)

# Strip leading/trailing spaces in column names if any
df_cleaned.columns = df_cleaned.columns.str.strip()

# Step 2: Set up the SQL Server connection
conn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};'
                      'SERVER=localhost;'
                      'DATABASE=<>;'
                      'UID=<>;'
                      'PWD=<>')

cursor = conn.cursor()

# Step 3: Build the parameterized SQL insert query
insert_query = """
INSERT INTO [dbo].[tbl_traffic_census] (
    [Col_1], [Col_2], [Col_3], [Col_4], [Col_5], [Col_6], [Col_7], [Col_8], 
    [Col_9], [Col_10], [device_id], [lane_number], [cross_datetime], 
    [speed_kmph_39], [speed_kmph_40_49], [speed_kmph_50_59], [speed_kmph_60_69], 
    [speed_kmph_70_79], [speed_kmph_80_89], [speed_kmph_90_99], [speed_kmph_100_109], 
    [speed_kmph_110_119], [speed_kmph_120_129], [speed_kmph_130_more]
) 
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
"""

# Step 4: Insert data row by row from the DataFrame into the SQL Server table
for index, row in df_cleaned.iterrows():
    cursor.execute(insert_query, 
                   row['1'], row['2'], row['3'], row['4'], row['5'], 
                   row['6'], row['7'], row['8'], row['9'], row['10'],
                   row['id'], row['kanal'], row['aeg'], 
                   row['<40Kph'], row['40-<50'], row['50-<60'], row['60-<70'], 
                   row['70-<80'], row['80-<90'], row['90-<100'], row['100-<110'], 
                   row['110-<120'], row['120-<130'], row['=>130'])

# Step 5: Commit the changes to the database and close the connection
conn.commit()
cursor.close()
conn.close()

print("Data has been successfully inserted into the SQL Server table.")
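
Inserting rows one at a time works, but it can be slow for large census extracts. The sketch below is an alternative to Steps 4 and 5 that batches the inserts with pyodbc's executemany and fast_executemany. It assumes the conn, insert_query, and df_cleaned objects from the code above, and the column list must stay in the same order as the INSERT column list.

import numpy as np  # used only to detect numpy integer scalars

# CSV column names in the same order as the INSERT column list above
cols = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '10',
        'id', 'kanal', 'aeg',
        '<40Kph', '40-<50', '50-<60', '60-<70', '70-<80', '80-<90',
        '90-<100', '100-<110', '110-<120', '120-<130', '=>130']

def to_py(value):
    # Convert numpy/pandas scalars to plain Python types that pyodbc binds cleanly
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, pd.Timestamp):
        return value.to_pydatetime()
    return value

rows = [tuple(to_py(v) for v in rec)
        for rec in df_cleaned[cols].itertuples(index=False, name=None)]

cursor = conn.cursor()
cursor.fast_executemany = True   # send parameters in batches instead of one round trip per row
cursor.executemany(insert_query, rows)

conn.commit()
cursor.close()
conn.close()

fast_executemany is available in recent versions of pyodbc; for very large files, pandas' to_sql with a SQLAlchemy engine is another common option.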