import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from transformers import pipeline
import gradio as gr
from google.colab import files

# --- Data loading (Colab) ---
# files.upload() opens an interactive upload prompt; the user is expected to
# supply creditcard.csv, which is then read from the local Colab filesystem.
uploaded = files.upload()
df = pd.read_csv('creditcard.csv')

# Quick sanity check of the raw data.
print("Columns in the dataset:", df.columns)
print(df.head())

# Column roles: 'Class' is the binary fraud label (1 = fraud), 'Time' is
# excluded from modeling; everything else (V1..V28, 'Amount') is a feature.
time_col = 'Time'
amount_col = 'Amount'
class_col = 'Class'
feature_cols = [col for col in df.columns if col not in [class_col, time_col]]

# Impute missing values with per-column means (dataset is fully numeric).
df = df.fillna(df.mean())

# --- Class balancing: downsample the non-fraud majority to the fraud count.
# random_state makes the sampled subset reproducible across runs.
df_majority = df[df[class_col] == 0]
df_minority = df[df[class_col] == 1]
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=42,
)
df_balanced = pd.concat([df_majority_downsampled, df_minority])

X = df_balanced[feature_cols]
y = df_balanced[class_col]

# Split BEFORE scaling so the scaler never sees test rows (prevents the
# train/test leakage in the original, which fit the scaler on all data).
# stratify=y keeps the class ratio identical in both folds.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training fold only; the same fitted scaler is reused
# for the test fold here and for live inputs in the Gradio predictor below.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# NOTE(review): after the 1:1 downsampling above, SMOTE is essentially a
# no-op (classes are already balanced); it is kept to absorb any small
# residual imbalance introduced by the split. random_state for determinism.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# --- Baseline classifier: logistic regression.
# max_iter raised so the lbfgs solver converges on the scaled features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# --- Evaluation on the held-out test fold.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
# Initialize the retrieval pipeline with a lightweight model.
# NOTE(review): this DistilBERT feature-extraction pipeline is downloaded and
# loaded eagerly but is never used by retrieve_explanation (which returns
# canned text) — consider removing it or actually wiring it into the
# explanation logic; kept here so the module-level name remains available.
retrieval_pipeline = pipeline("feature-extraction", model="distilbert-base-uncased")


def retrieve_explanation(prediction):
    """Return a canned English explanation for a 0/1 fraud prediction.

    Args:
        prediction: integer class label from the model (1 = fraudulent).

    Returns:
        A human-readable explanation string.
    """
    if prediction == 1:
        explanation = "The transaction is classified as fraudulent based on the provided features."
    else:
        explanation = "The transaction is classified as non-fraudulent based on the provided features."
    return explanation


def fraud_detection_predictor(V1, V2, V3, Amount):
    """Predict fraud status from four user-supplied transaction features.

    All model features not exposed in the UI are zero-filled, so predictions
    are only indicative. Relies on the module-level `feature_cols`, `scaler`
    and `model` created during training.

    Args:
        V1, V2, V3: values for the corresponding PCA features.
        Amount: transaction amount.

    Returns:
        Tuple of (fraud status label, explanation string).
    """
    # Zero-fill every model feature, then overwrite the four user inputs.
    input_features = [0] * len(feature_cols)

    # Map the provided features to their positions in feature_cols.
    v1_index = feature_cols.index('V1')
    v2_index = feature_cols.index('V2')
    v3_index = feature_cols.index('V3')
    amount_index = feature_cols.index('Amount')

    input_features[v1_index] = V1
    input_features[v2_index] = V2
    input_features[v3_index] = V3
    input_features[amount_index] = Amount

    # Wrap the row in a DataFrame with the training column names: the scaler
    # was fit on a named-column DataFrame, and passing a bare list here would
    # trigger sklearn's feature-name mismatch warning.
    input_row = pd.DataFrame([input_features], columns=feature_cols)
    input_data = scaler.transform(input_row)

    prediction = model.predict(input_data)[0]
    fraud_status = "Fraudulent" if prediction == 1 else "Non-Fraudulent"
    explanation = retrieve_explanation(prediction)
    return fraud_status, explanation


# --- Gradio UI: four numeric inputs, two text outputs. ---
interface = gr.Interface(
    fn=fraud_detection_predictor,
    inputs=[
        gr.Number(label="V1"),
        gr.Number(label="V2"),
        gr.Number(label="V3"),
        gr.Number(label="Amount"),
    ],
    outputs=[
        gr.Textbox(label="Fraud Status"),
        gr.Textbox(label="Explanation"),
    ],
    title="Simplified Credit Card Fraud Detection",
    description="Enter a few transaction features (V1, V2, V3, Amount) to predict fraud status.",
)

# Launch the web UI (blocks in a script; renders inline in Colab).
interface.launch()