JAYASREESS committed on
Commit d1fb1ab · verified · 1 Parent(s): d6c3923
Files changed (7)
  1. Dockerfile +22 -0
  2. README.md +33 -10
  3. bronze.py +47 -0
  4. gold.py +50 -0
  5. requirements.txt +4 -0
  6. silver.py +51 -0
  7. train.py +74 -0
Dockerfile ADDED
@@ -0,0 +1,22 @@
+ # Use an official Python slim image as a parent image for a smaller size
+ FROM python:3.12-slim
+
+ # Set the working directory inside the container
+ WORKDIR /app
+
+ # Copy the requirements file first to leverage Docker's layer caching
+ COPY requirements.txt .
+
+ # Install the Python dependencies
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy the rest of your application's code, including the trained models
+ # The .dockerignore file will ensure unnecessary files are excluded
+ COPY . .
+
+ # Expose the port that Gradio runs on
+ EXPOSE 7860
+
+ # The command to run when the container starts
+ # This will launch your Gradio application
+ CMD ["python", "app.py"]
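The Dockerfile comment above references a `.dockerignore` file that is not part of this commit. A minimal sketch of what it might contain, given the project layout — every entry here is an assumption, not part of the commit:

```
# Hypothetical .dockerignore — keep the image small by excluding
# version control, raw datasets, and Python build artifacts.
.git
data/*.csv
__pycache__/
*.pyc
```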
README.md CHANGED
@@ -1,10 +1,33 @@
- ---
- title: Duckdb
- emoji: 🦀
- colorFrom: pink
- colorTo: purple
- sdk: docker
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Credit Card Fraud Detection with DuckDB and Medallion Architecture
+
+ This project demonstrates an end-to-end pipeline for credit card fraud detection. It uses DuckDB to process data in a Medallion Architecture (Bronze, Silver, Gold) and trains a Random Forest model to identify fraudulent transactions.
+
+ ## Project Structure
+
+ - `data/`: Contains the raw CSV datasets (`fraudTrain.csv`, `fraudTest.csv`).
+ - `src/`: Contains the Python scripts for the data pipeline and model training.
+   - `bronze.py`: Ingests raw data into the bronze layer.
+   - `silver.py`: Cleans and transforms data for the silver layer.
+   - `gold.py`: Creates aggregated features for the gold (analytics) layer.
+   - `train.py`: Trains a `RandomForestClassifier` on the gold data and saves the model.
+ - `models/`: Directory where the trained model is saved.
+ - `requirements.txt`: Lists the required Python packages.
+
+ ## How to Run
+
+ 1. **Install dependencies:**
+    ```bash
+    pip install -r requirements.txt
+    ```
+
+ 2. **Run the training pipeline:**
+    This command executes the entire data pipeline (Bronze, Silver, Gold) and trains the model.
+    ```bash
+    python src/train.py
+    ```
+
+ ## Medallion Architecture
+
+ - **Bronze Layer**: Raw, unfiltered data ingested directly from the source CSVs.
+ - **Silver Layer**: Cleaned and transformed data. Timestamps are corrected, and new features like cardholder `age` are derived.
+ - **Gold Layer**: Analytics-ready data with aggregated features (e.g., `avg_merch_spend`) suitable for machine learning.
bronze.py ADDED
@@ -0,0 +1,47 @@
+ import duckdb
+ import os
+
+ def setup_bronze_layer():
+     """
+     Connects to DuckDB, creates the bronze_transactions table,
+     and ingests data from the CSV files.
+     """
+     db_path = os.path.join('..', 'data', 'fraud_detection.duckdb')
+     con = duckdb.connect(database=db_path, read_only=False)
+
+     # Create schema for raw data
+     con.execute("CREATE SCHEMA IF NOT EXISTS bronze;")
+
+     # read_csv_auto infers the schema from each CSV file.
+     train_file = os.path.join('..', 'data', 'fraudTrain.csv')
+     test_file = os.path.join('..', 'data', 'fraudTest.csv')
+
+     # Load both files into a single bronze table: create the table from the
+     # training data, then append the test data with an INSERT.
+     print("Ingesting data into bronze_transactions table...")
+     con.execute(f"""
+         CREATE OR REPLACE TABLE bronze.bronze_transactions AS
+         SELECT * FROM read_csv_auto('{train_file}');
+     """)
+
+     con.execute(f"""
+         INSERT INTO bronze.bronze_transactions
+         SELECT * FROM read_csv_auto('{test_file}');
+     """)
+
+     print("Data ingestion complete.")
+
+     # Verify the data is loaded
+     record_count = con.execute("SELECT COUNT(*) FROM bronze.bronze_transactions;").fetchone()[0]
+     print(f"Total records in bronze_transactions: {record_count}")
+
+     con.close()
+
+ if __name__ == "__main__":
+     setup_bronze_layer()
gold.py ADDED
@@ -0,0 +1,50 @@
+ import duckdb
+ import os
+ from silver import setup_silver_layer
+
+ def setup_gold_layer():
+     """
+     Connects to DuckDB, reads from the silver layer,
+     and creates aggregated features for the gold table.
+     """
+     # Ensure the silver layer exists before proceeding
+     setup_silver_layer()
+
+     db_path = os.path.join('..', 'data', 'fraud_detection.duckdb')
+     con = duckdb.connect(database=db_path, read_only=False)
+
+     # Create schema for gold data
+     con.execute("CREATE SCHEMA IF NOT EXISTS gold;")
+
+     print("Creating aggregated features for the gold layer...")
+
+     # Create the gold table with aggregated features
+     con.execute("""
+         CREATE OR REPLACE TABLE gold.gold_transactions AS
+         SELECT
+             *,
+             -- Average transaction amount for the merchant
+             AVG(amt) OVER (PARTITION BY merchant) AS avg_merch_spend,
+             -- Lag feature: amount of the previous transaction for the card
+             LAG(amt, 1, 0) OVER (PARTITION BY cc_num ORDER BY trans_date_time) AS prev_trans_amt,
+             -- Lead feature: amount of the next transaction for the card
+             LEAD(amt, 1, 0) OVER (PARTITION BY cc_num ORDER BY trans_date_time) AS next_trans_amt
+         FROM silver.silver_transactions;
+     """)
+
+     print("Gold layer setup complete.")
+
+     # Verify the new columns in the gold table
+     print("Columns in gold.gold_transactions:")
+     print(con.execute("DESCRIBE gold.gold_transactions;").fetchall())
+
+     record_count = con.execute("SELECT COUNT(*) FROM gold.gold_transactions;").fetchone()[0]
+     print(f"Total records in gold_transactions: {record_count}")
+
+     con.close()
+
+ if __name__ == "__main__":
+     # For direct execution, this will now run the full pipeline up to gold
+     print("Setting up gold layer (which includes bronze and silver)...")
+     setup_gold_layer()
+     print("Gold layer setup finished.")
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ duckdb
+ pandas
+ scikit-learn
+ joblib
silver.py ADDED
@@ -0,0 +1,51 @@
+ import duckdb
+ import os
+ from bronze import setup_bronze_layer
+
+ def setup_silver_layer():
+     """
+     Connects to DuckDB, reads from the bronze layer,
+     and applies transformations to create the silver table.
+     """
+     # Ensure the bronze layer exists before proceeding
+     setup_bronze_layer()
+
+     db_path = os.path.join('..', 'data', 'fraud_detection.duckdb')
+     con = duckdb.connect(database=db_path, read_only=False)
+
+     # Create schema for silver data
+     con.execute("CREATE SCHEMA IF NOT EXISTS silver;")
+
+     print("Transforming data for the silver layer...")
+
+     # Perform transformations and create the silver table
+     con.execute("""
+         CREATE OR REPLACE TABLE silver.silver_transactions AS
+         SELECT
+             *,
+             -- The column is already a timestamp, so just alias it
+             trans_date_trans_time AS trans_date_time,
+             -- Approximate cardholder age at transaction time (year difference only)
+             date_part('year', trans_date_trans_time) - date_part('year', dob) AS age,
+             -- Extract hour of day from transaction time
+             date_part('hour', trans_date_trans_time) AS trans_hour
+         FROM bronze.bronze_transactions;
+     """)
+
+     print("Silver layer setup complete.")
+
+     # Verify the new columns in the silver table
+     print("Columns in silver.silver_transactions:")
+     print(con.execute("DESCRIBE silver.silver_transactions;").fetchall())
+
+     record_count = con.execute("SELECT COUNT(*) FROM silver.silver_transactions;").fetchone()[0]
+     print(f"Total records in silver_transactions: {record_count}")
+
+     con.close()
+
+ if __name__ == "__main__":
+     # For direct execution, this will now run the full pipeline up to silver
+     print("Setting up silver layer (which includes bronze)...")
+     setup_silver_layer()
+     print("Silver layer setup finished.")
train.py ADDED
@@ -0,0 +1,74 @@
+ import duckdb
+ import os
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.metrics import classification_report, confusion_matrix
+ import joblib
+ from gold import setup_gold_layer
+
+ def train_model():
+     """
+     Trains a RandomForestClassifier on the gold layer data.
+     """
+     # Ensure the full data pipeline has been run
+     setup_gold_layer()
+
+     db_path = os.path.join('..', 'data', 'fraud_detection.duckdb')
+     con = duckdb.connect(database=db_path, read_only=False)
+
+     print("Loading data from gold.gold_transactions...")
+     # Load the entire table into a pandas DataFrame
+     df = con.execute("SELECT * FROM gold.gold_transactions").fetchdf()
+     con.close()
+
+     print("Preparing data for training...")
+
+     # Define features (X) and target (y)
+     # Exclude identifiers, raw timestamps, and the target variable itself
+     features = [col for col in df.columns if col not in [
+         'cc_num', 'first', 'last', 'street', 'city', 'state', 'zip', 'dob',
+         'trans_num', 'trans_date_trans_time', 'trans_date_time', 'is_fraud'
+     ]]
+
+     X = df[features]
+     y = df['is_fraud']
+
+     # One-hot encode categorical features
+     categorical_features = ['merchant', 'category', 'gender', 'job']
+     X = pd.get_dummies(X, columns=categorical_features, drop_first=True)
+
+     # Keep the training column order so inference data can be aligned later
+     train_cols = X.columns
+
+     # Split data into training and testing sets
+     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
+
+     print("Training RandomForestClassifier model...")
+     # Initialize and train the model
+     model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
+     model.fit(X_train, y_train)
+
+     print("Evaluating model performance...")
+     # Make predictions and evaluate
+     y_pred = model.predict(X_test)
+
+     print("Classification Report:")
+     print(classification_report(y_test, y_pred))
+
+     print("Confusion Matrix:")
+     print(confusion_matrix(y_test, y_pred))
+
+     # Save the trained model and the column list
+     model_path = os.path.join('..', 'models')
+     os.makedirs(model_path, exist_ok=True)
+
+     joblib.dump(model, os.path.join(model_path, 'fraud_detection_model.joblib'))
+     joblib.dump(train_cols, os.path.join(model_path, 'model_columns.joblib'))
+
+     print(f"Model saved to {model_path}")
+
+ if __name__ == "__main__":
+     # The train_model function now handles the full pipeline run and training
+     train_model()
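train.py saves `model_columns.joblib` alongside the model so that inference-time data can be reindexed to the exact training columns. A sketch of how a serving script (e.g. the `app.py` the Dockerfile expects, which is not part of this commit) might use the two artifacts — the file names match train.py, but the `predict_fraud` helper and its `new_df` input are hypothetical:

```python
import joblib
import pandas as pd

def predict_fraud(new_df: pd.DataFrame, model_dir: str = "models"):
    """Score new transactions with the saved model, aligning columns first."""
    model = joblib.load(f"{model_dir}/fraud_detection_model.joblib")
    train_cols = joblib.load(f"{model_dir}/model_columns.joblib")

    # One-hot encode with the same scheme as training, then reindex so the
    # columns match exactly; dummy columns unseen at inference time get 0.
    X = pd.get_dummies(new_df, columns=["merchant", "category", "gender", "job"],
                       drop_first=True)
    X = X.reindex(columns=train_cols, fill_value=0)
    return model.predict(X)
```

The `reindex(..., fill_value=0)` step is what makes the saved column list worth persisting: without it, a batch of new transactions with a previously unseen merchant or category would produce a column mismatch and fail at `predict`.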