File size: 1,688 Bytes
d1fb1ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
import duckdb
import os

def setup_bronze_layer():
    """
    Connects to DuckDB, creates the bronze_transactions table,
    and ingests data from the CSV files.
    """
    db_path = os.path.join('..', 'data', 'fraud_detection.duckdb')
    con = duckdb.connect(database=db_path, read_only=False)

    # Create schema for raw data
    con.execute("CREATE SCHEMA IF NOT EXISTS bronze;")

    # Create bronze table from CSV files
    # The read_csv_auto function will infer schemas and combine files.
    # Using glob to read both train and test data.
    train_file = os.path.join('..', 'data', 'fraudTrain.csv')
    test_file = os.path.join('..', 'data', 'fraudTest.csv')

    # It's better to load them separately and then combine if needed,
    # but for simplicity in bronze, we can create two tables or load into one.
    # Let's load them into one table with an indicator of the source if needed.
    # For now, we just load the training data. We can add test data later.

    print("Ingesting data into bronze_transactions table...")
    con.execute(f"""
        CREATE OR REPLACE TABLE bronze.bronze_transactions AS
        SELECT * FROM read_csv_auto('{train_file}');
    """)

    # To add the test data, you could use an INSERT statement:
    con.execute(f"""
        INSERT INTO bronze.bronze_transactions
        SELECT * FROM read_csv_auto('{test_file}');
    """)

    print("Data ingestion complete.")
    
    # Verify the data is loaded
    record_count = con.execute("SELECT COUNT(*) FROM bronze.bronze_transactions;").fetchone()[0]
    print(f"Total records in bronze_transactions: {record_count}")

    con.close()

if __name__ == "__main__":
    setup_bronze_layer()