import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from transformers import pipeline
import gradio as gr
from google.colab import files

# --- Data loading (Colab) ---
# files.upload() opens an interactive upload prompt; the user is expected to
# supply creditcard.csv, which is then read from the local Colab filesystem.
uploaded = files.upload()
df = pd.read_csv('creditcard.csv')

# Quick sanity check of the raw data.
print("Columns in the dataset:", df.columns)
print(df.head())

# Column roles: 'Class' is the binary fraud label (1 = fraud), 'Time' is
# excluded from modeling; everything else (V1..V28, 'Amount') is a feature.
time_col = 'Time'
amount_col = 'Amount'
class_col = 'Class'
feature_cols = [col for col in df.columns if col not in [class_col, time_col]]

# Impute missing values with per-column means (dataset is fully numeric).
df = df.fillna(df.mean())

# --- Class balancing: downsample the non-fraud majority to the fraud count.
# random_state makes the sampled subset reproducible across runs.
df_majority = df[df[class_col] == 0]
df_minority = df[df[class_col] == 1]
df_majority_downsampled = resample(
    df_majority,
    replace=False,
    n_samples=len(df_minority),
    random_state=42,
)
df_balanced = pd.concat([df_majority_downsampled, df_minority])

X = df_balanced[feature_cols]
y = df_balanced[class_col]

# Split BEFORE scaling so the scaler never sees test rows (prevents the
# train/test leakage in the original, which fit the scaler on all data).
# stratify=y keeps the class ratio identical in both folds.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training fold only; the same fitted scaler is reused
# for the test fold here and for live inputs in the Gradio predictor below.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# NOTE(review): after the 1:1 downsampling above, SMOTE is essentially a
# no-op (classes are already balanced); it is kept to absorb any small
# residual imbalance introduced by the split. random_state for determinism.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# --- Baseline classifier: logistic regression.
# max_iter raised so the lbfgs solver converges on the scaled features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# --- Evaluation on the held-out test fold.
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
# Initialize the retrieval pipeline with a lightweight model.
# NOTE(review): this DistilBERT feature-extraction pipeline is downloaded and
# loaded eagerly but is never used by retrieve_explanation (which returns
# canned text) — consider removing it or actually wiring it into the
# explanation logic; kept here so the module-level name remains available.
retrieval_pipeline = pipeline("feature-extraction", model="distilbert-base-uncased")


def retrieve_explanation(prediction):
    """Return a canned English explanation for a 0/1 fraud prediction.

    Args:
        prediction: integer class label from the model (1 = fraudulent).

    Returns:
        A human-readable explanation string.
    """
    if prediction == 1:
        explanation = "The transaction is classified as fraudulent based on the provided features."
    else:
        explanation = "The transaction is classified as non-fraudulent based on the provided features."
    return explanation


def fraud_detection_predictor(V1, V2, V3, Amount):
    """Predict fraud status from four user-supplied transaction features.

    All model features not exposed in the UI are zero-filled, so predictions
    are only indicative. Relies on the module-level `feature_cols`, `scaler`
    and `model` created during training.

    Args:
        V1, V2, V3: values for the corresponding PCA features.
        Amount: transaction amount.

    Returns:
        Tuple of (fraud status label, explanation string).
    """
    # Zero-fill every model feature, then overwrite the four user inputs.
    input_features = [0] * len(feature_cols)

    # Map the provided features to their positions in feature_cols.
    v1_index = feature_cols.index('V1')
    v2_index = feature_cols.index('V2')
    v3_index = feature_cols.index('V3')
    amount_index = feature_cols.index('Amount')

    input_features[v1_index] = V1
    input_features[v2_index] = V2
    input_features[v3_index] = V3
    input_features[amount_index] = Amount

    # Wrap the row in a DataFrame with the training column names: the scaler
    # was fit on a named-column DataFrame, and passing a bare list here would
    # trigger sklearn's feature-name mismatch warning.
    input_row = pd.DataFrame([input_features], columns=feature_cols)
    input_data = scaler.transform(input_row)

    prediction = model.predict(input_data)[0]
    fraud_status = "Fraudulent" if prediction == 1 else "Non-Fraudulent"
    explanation = retrieve_explanation(prediction)
    return fraud_status, explanation


# --- Gradio UI: four numeric inputs, two text outputs. ---
interface = gr.Interface(
    fn=fraud_detection_predictor,
    inputs=[
        gr.Number(label="V1"),
        gr.Number(label="V2"),
        gr.Number(label="V3"),
        gr.Number(label="Amount"),
    ],
    outputs=[
        gr.Textbox(label="Fraud Status"),
        gr.Textbox(label="Explanation"),
    ],
    title="Simplified Credit Card Fraud Detection",
    description="Enter a few transaction features (V1, V2, V3, Amount) to predict fraud status.",
)

# Launch the web UI (blocks in a script; renders inline in Colab).
interface.launch()