men_d / bronze.py
JAYASREESS's picture
Upload 8 files
d77c194 verified
import duckdb
import os
def setup_bronze_layer():
"""
Connects to DuckDB, creates the bronze_transactions table,
and ingests data from the CSV files.
"""
db_path = os.path.join('..', 'data', 'fraud_detection.duckdb')
con = duckdb.connect(database=db_path, read_only=False)
# Create schema for raw data
con.execute("CREATE SCHEMA IF NOT EXISTS bronze;")
# Create bronze table from CSV files
# The read_csv_auto function will infer schemas and combine files.
# Using glob to read both train and test data.
train_file = os.path.join('..', 'data', 'fraudTrain.csv')
test_file = os.path.join('..', 'data', 'fraudTest.csv')
# It's better to load them separately and then combine if needed,
# but for simplicity in bronze, we can create two tables or load into one.
# Let's load them into one table with an indicator of the source if needed.
# For now, we just load the training data. We can add test data later.
print("Ingesting data into bronze_transactions table...")
con.execute(f"""
CREATE OR REPLACE TABLE bronze.bronze_transactions AS
SELECT * FROM read_csv_auto('{train_file}');
""")
# To add the test data, you could use an INSERT statement:
con.execute(f"""
INSERT INTO bronze.bronze_transactions
SELECT * FROM read_csv_auto('{test_file}');
""")
print("Data ingestion complete.")
# Verify the data is loaded
record_count = con.execute("SELECT COUNT(*) FROM bronze.bronze_transactions;").fetchone()[0]
print(f"Total records in bronze_transactions: {record_count}")
con.close()
if __name__ == "__main__":
setup_bronze_layer()