Spaces:

NeerajCodz
/

creditCardFraudDetection

Sleeping

App Files Files Community

Neeraj Sathish Kumar committed on Dec 5, 2025

Commit

b031340

1 Parent(s): 8795976

/get-random and /llm-analyse added

Browse files

Files changed (12) hide show

app.py +217 -14
data/filteredTest.parquet +3 -0
data/filteredTrain.parquet +3 -0
stats/graphs/metrics.png +0 -0
stats/graphs/precision-recall.png +0 -0
stats/graphs/predict.png +0 -0
stats/graphs/request_ram.png +0 -0
stats/graphs/roc.png +0 -0
stats/graphs/speed.png +0 -0
stats/graphs/stats.png +0 -0
stats/graphs/training_summary.png +0 -0
test.py +95 -0

app.py CHANGED Viewed

@@ -3,10 +3,14 @@ import sys
 import joblib
 import pandas as pd
 from typing import Dict, Any, List, Union, Optional
-from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel, Field
 import numpy as np
 import warnings
 # Suppress sklearn version warnings
 warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.base")
@@ -59,6 +63,10 @@ NUMERICAL_FEATURES = [
 # Ensure the order matches the columns fed to the ColumnTransformer during training
 EXPECTED_FEATURES = CATEGORICAL_FEATURES + NUMERICAL_FEATURES
 # --- FASTAPI SETUP ---
 app = FastAPI(
     title="Credit Card Fraud Detection API",
@@ -67,13 +75,24 @@ app = FastAPI(
 )
 class SingleTransactionPayload(BaseModel):
-    model_name: str = Field(..., description="Model alias (e.g., 'calibrated_xgb', 'calibrated_rf', 'calibrated_dt').")
     features: Dict[str, Any] = Field(..., description="Single transaction record for prediction.")
 class MultipleTransactionsPayload(BaseModel):
-    model_name: str = Field(..., description="Model alias (e.g., 'calibrated_xgb', 'calibrated_rf', 'calibrated_dt').")
     features: List[Dict[str, Any]] = Field(..., description="List of transaction records for prediction.")
 # --- LOAD MODELS AT STARTUP ---
 def load_pipelines():
     """Load all ML model pipelines"""
@@ -88,7 +107,7 @@ def load_pipelines():
             if not os.path.exists(filename):
                 abs_path = os.path.abspath(filename)
                 print(f"❌ Model file not found: {filename}")
-                print(f"   Expected at: {abs_path}")
                 continue
             # Get file info
@@ -101,24 +120,51 @@ def load_pipelines():
         except AttributeError as e:
             print(f"❌ Compatibility error loading {filename}")
-            print(f"   Error: {e}")
-            print(f"   💡 This usually means the model was saved with a different sklearn version")
-            print(f"   💡 Try re-training and saving the model with sklearn {sklearn.__version__}")
         except Exception as e:
             print(f"❌ Failed to load {filename}")
-            print(f"   Error type: {type(e).__name__}")
-            print(f"   Error message: {e}")
     if not MODELS:
-        print("⚠️  No models loaded. Predictions will fail.")
-        print("   💡 Ensure .pkl files are in the same directory as app.py (or subdirectories like model_outputs/)")
-        print("   💡 Check that models were saved with compatible sklearn version")
     else:
         print(f"✅ Successfully loaded {len(MODELS)} model(s): {list(MODELS.keys())}")
 # Load models on import
 load_pipelines()
 # --- HELPER FUNCTION: PREPARE FEATURES (WITH FIX) ---
 def prepare_features(features_list: List[Dict[str, Any]]) -> pd.DataFrame:
     """
@@ -144,10 +190,32 @@ def prepare_features(features_list: List[Dict[str, Any]]) -> pd.DataFrame:
     # Convert categorical columns to category dtype (as done during training)
     for col in CATEGORICAL_FEATURES:
         df_features[col] = df_features[col].astype("category")
     return df_features
 # --- FASTAPI ENDPOINTS ---
 @app.get("/")
 async def root():
@@ -162,6 +230,8 @@ async def root():
             "models": "/models",
             "predict": "/predict (POST) - Single transaction",
             "predict_multiple": "/predict_multiple (POST) - Multiple transactions",
             "docs": "/docs"
         },
         "response_format": {
@@ -175,6 +245,10 @@ async def root():
                     "min_fraud_score": "float",
                     "max_fraud_score": "float"
                 }
             }
         }
     }
@@ -183,10 +257,12 @@ async def root():
 async def health_check():
     """Health check endpoint"""
     return {
-        "status": "healthy" if MODELS else "degraded",
         "version": VERSION,
         "models_loaded": list(MODELS.keys()),
-        "model_count": len(MODELS)
     }
 @app.get("/models")
@@ -198,6 +274,59 @@ async def list_models():
         "model_files": MODEL_MAP,
         "version": VERSION
     }
 @app.post("/predict")
 async def predict_single(payload: SingleTransactionPayload):
@@ -308,6 +437,80 @@ async def predict_multiple(payload: MultipleTransactionsPayload):
             detail=f"Prediction execution failed: {type(e).__name__}: {str(e)}"
         )
 # For local development
 if __name__ == "__main__":
     import uvicorn

 import joblib
 import pandas as pd
 from typing import Dict, Any, List, Union, Optional
+from fastapi import FastAPI, HTTPException, Query
 from pydantic import BaseModel, Field
 import numpy as np
 import warnings
+import random
+import google.generativeai as genai
+import json
+import re
 # Suppress sklearn version warnings
 warnings.filterwarnings("ignore", category=UserWarning, module="sklearn.base")
 # Ensure the order matches the columns fed to the ColumnTransformer during training
 EXPECTED_FEATURES = CATEGORICAL_FEATURES + NUMERICAL_FEATURES
+# --- DATA CONSTANTS ---
+DATA_FILE_PATH = "data/filteredTest.parquet"
+DATA_DF: Optional[pd.DataFrame] = None # Global variable to cache the data
 # --- FASTAPI SETUP ---
 app = FastAPI(
     title="Credit Card Fraud Detection API",
 )
 class SingleTransactionPayload(BaseModel):
+    model_name: str = Field(..., description="Model alias (e.g., 'decision_tree', 'random_forest', 'xgboost').")
     features: Dict[str, Any] = Field(..., description="Single transaction record for prediction.")
 class MultipleTransactionsPayload(BaseModel):
+    model_name: str = Field(..., description="Model alias (e.g., 'decision_tree', 'random_forest', 'xgboost').")
     features: List[Dict[str, Any]] = Field(..., description="List of transaction records for prediction.")
+# Request body for POST /llm-analyse: a batch of already-scored transactions
+# (the endpoint rejects an empty list with HTTP 422).
+# NOTE(review): the description below says 22 fields, while the llm_analyse
+# docstring enumerates 23 field names — confirm which count is intended.
+class LLMAnalysePayload(BaseModel):
+    transactions: List[Dict[str, Any]] = Field(..., description="List of transaction records with 22 fields including fraud_score, STATUS, etc.")
+# Configure Gemini API
+# The key is read once at import time. The app still starts when it is unset;
+# /llm-analyse checks GEMINI_API_KEY itself and answers HTTP 500 in that case.
+GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
+if GEMINI_API_KEY:
+    genai.configure(api_key=GEMINI_API_KEY)
+    print("✅ Gemini API configured")
+else:
+    print("⚠️ GEMINI_API_KEY not set in environment variables. LLM endpoint will fail.")
 # --- LOAD MODELS AT STARTUP ---
 def load_pipelines():
     """Load all ML model pipelines"""
             if not os.path.exists(filename):
                 abs_path = os.path.abspath(filename)
                 print(f"❌ Model file not found: {filename}")
+                print(f"   Expected at: {abs_path}")
                 continue
             # Get file info
         except AttributeError as e:
             print(f"❌ Compatibility error loading {filename}")
+            print(f"   Error: {e}")
+            print(f"   💡 This usually means the model was saved with a different sklearn version")
+            print(f"   💡 Try re-training and saving the model with sklearn {sklearn.__version__}")
         except Exception as e:
             print(f"❌ Failed to load {filename}")
+            print(f"   Error type: {type(e).__name__}")
+            print(f"   Error message: {e}")
     if not MODELS:
+        print("⚠️  No models loaded. Predictions will fail.")
+        print("   💡 Ensure .pkl files are in the same directory as app.py (or subdirectories like model_outputs/)")
+        print("   💡 Check that models were saved with compatible sklearn version")
     else:
         print(f"✅ Successfully loaded {len(MODELS)} model(s): {list(MODELS.keys())}")
 # Load models on import
 load_pipelines()
+# --- HELPER FUNCTION: CACHE DATA ---
+def load_data_file() -> Optional[pd.DataFrame]:
+    """Load the Parquet data file into the global DATA_DF variable."""
+    global DATA_DF
+    if DATA_DF is not None:
+        return DATA_DF
+    try:
+        if not os.path.exists(DATA_FILE_PATH):
+            abs_path = os.path.abspath(DATA_FILE_PATH)
+            print(f"❌ Data file not found: {DATA_FILE_PATH}")
+            print(f"   Expected at: {abs_path}")
+            return None
+        print(f"💾 Loading data from {DATA_FILE_PATH}...")
+        # Use pyarrow engine for better performance with parquet
+        DATA_DF = pd.read_parquet(DATA_FILE_PATH, engine='pyarrow')
+        print(f"✅ Successfully loaded data with {len(DATA_DF)} rows.")
+        return DATA_DF
+    except Exception as e:
+        print(f"❌ Failed to load data file: {e}")
+        return None
+# Load data on import for the new endpoint
+load_data_file()
 # --- HELPER FUNCTION: PREPARE FEATURES (WITH FIX) ---
 def prepare_features(features_list: List[Dict[str, Any]]) -> pd.DataFrame:
     """
     # Convert categorical columns to category dtype (as done during training)
     for col in CATEGORICAL_FEATURES:
+        # NOTE: Ensure that all categories present here were also present during training
+        # For a simple API, we rely on the model's pipeline to handle unseen categories
+        # (usually by converting them to NaN or a dummy 'unseen' category).
         df_features[col] = df_features[col].astype("category")
     return df_features
+def extract_json_from_markdown(text: str) -> str:
+    """
+    Extract JSON content from markdown code block.
+    Handles cases where the LLM wraps the output in ```json ... ```
+    """
+    # Look for ```json ... ```
+    match = re.search(r'```(?:json)?\s*\n?(.*?)\n?```', text, re.DOTALL | re.IGNORECASE)
+    if match:
+        json_str = match.group(1).strip()
+    else:
+        # Fallback: strip any leading/trailing whitespace and assume it's raw JSON
+        json_str = text.strip()
+    # Clean up common issues: remove extra newlines, fix quotes if needed
+    json_str = re.sub(r'\n\s*', ' ', json_str)  # Collapse newlines to spaces
+    json_str = re.sub(r'\\n', ' ', json_str)   # Replace escaped newlines
+    return json_str
 # --- FASTAPI ENDPOINTS ---
 @app.get("/")
 async def root():
             "models": "/models",
             "predict": "/predict (POST) - Single transaction",
             "predict_multiple": "/predict_multiple (POST) - Multiple transactions",
+            "random_data": "/get-random-data (GET) - Get sample data for testing", # ADDED
+            "llm_analyse": "/llm-analyse (POST) - LLM analysis of transactions",
             "docs": "/docs"
         },
         "response_format": {
                     "min_fraud_score": "float",
                     "max_fraud_score": "float"
                 }
+            },
+            "llm_analyse": {
+                "fraud_score": "float (0-1, e.g., 0.12 for 12%)",
+                "explanation": "str"
             }
         }
     }
 async def health_check():
     """Health check endpoint"""
     return {
+        "status": "healthy" if MODELS and DATA_DF is not None else "degraded",
         "version": VERSION,
         "models_loaded": list(MODELS.keys()),
+        "model_count": len(MODELS),
+        "data_loaded": DATA_DF is not None,
+        "gemini_configured": GEMINI_API_KEY is not None
     }
 @app.get("/models")
         "model_files": MODEL_MAP,
         "version": VERSION
     }
+@app.get("/get-random-data")
+async def get_random_data(
+    num_rows: int = Query(
+        10,
+        ge=1,
+        le=1000,
+        description="The number of random rows to return (between 1 and 1000)."
+    )
+):
+    """
+    Retrieves a specified number of random transaction records from the dataset,
+    excluding the 'is_fraud' column, suitable for testing the prediction endpoints.
+    """
+    df = load_data_file()
+    if df is None:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Data file not loaded. Check server logs for {DATA_FILE_PATH}"
+        )
+    total_rows = len(df)
+    if num_rows > total_rows:
+        num_rows = total_rows
+    try:
+        # Sample the data randomly
+        random_sample_df = df.sample(n=num_rows).copy()
+        # Drop the 'is_fraud' column as requested
+        if 'is_fraud' in random_sample_df.columns:
+            random_sample_df = random_sample_df.drop(columns=['is_fraud'])
+        # Ensure the output columns match the expected input features for the predict endpoints
+        final_cols = [col for col in EXPECTED_FEATURES if col in random_sample_df.columns]
+        random_sample_df = random_sample_df[final_cols]
+        # Convert to a list of dicts (JSON serializable format)
+        data_records = random_sample_df.to_dict(orient='records')
+        return {
+            "success": True,
+            "message": f"Returned {len(data_records)} random records.",
+            "data": data_records
+        }
+    except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Error processing data request: {str(e)}"
+        )
 @app.post("/predict")
 async def predict_single(payload: SingleTransactionPayload):
             detail=f"Prediction execution failed: {type(e).__name__}: {str(e)}"
         )
+@app.post("/llm-analyse")
+async def llm_analyse(payload: LLMAnalysePayload):
+    """
+    LLM-based analysis of transactions using Gemini.
+    Expects a list of transactions with fields: fraud_score, STATUS, cc_num, merchant, category, amt, gender, state, zip, lat, long, city_pop, job, unix_time, merch_lat, merch_long, is_fraud, age, trans_hour, trans_day, trans_month, trans_weekday, distance
+    Converts to CSV, analyzes with Gemini, returns overall fraud_score (0-1) and explanation.
+    """
+    if not GEMINI_API_KEY:
+        raise HTTPException(
+            status_code=500,
+            detail="Gemini API key not configured. Set GEMINI_API_KEY environment variable."
+        )
+    transactions = payload.transactions
+    if not transactions:
+        raise HTTPException(
+            status_code=422,
+            detail="No transactions provided."
+        )
+    try:
+        # Convert to DataFrame and CSV string
+        df = pd.DataFrame(transactions)
+        csv_string = df.to_csv(index=False)
+        # Craft prompt
+        prompt = f"""
+Analyze the following credit card transaction data (CSV format). Each row includes fraud_score (0-100 from ML model), STATUS, and other transaction details.
+CSV Data:
+{csv_string}
+Instructions:
+- Compute an overall fraud_score (0-1 scale, where 0.12 means 12% fraud probability) based on patterns in fraud_score, amounts (amt), categories (category), locations (lat/long vs merch_lat/merch_long), times (trans_hour, trans_day, etc.), and is_fraud labels.
+- Consider thresholds: <0.5 good (low risk), 0.5-0.6 uncertain, >0.6 suspicious/critical.
+- Provide a concise explanation of the overall assessment, highlighting key patterns (e.g., high average fraud_score, unusual spending).
+- For CRITICAL (>0.6) or UNCERTAIN (0.5-0.6) transactions, specifically explain reasons for suspicion, such as unreasonably high amounts spent on categories like 'gas', 'grocery', etc., unusual distances, or time anomalies.
+- Output ONLY valid JSON in this exact format: {{"fraud_score": <float 0-1>, "explanation": "<string explanation in brief>"}}
+- Ensure fraud_score is a float (e.g., 0.12), rounded to 2 decimals if needed.
+- explanation should be a brief string without line breaks or any formatting. And dont reveal any file structure or CSV data directly in the explanation
+"""
+        # Generate with Gemini
+        model = genai.GenerativeModel('gemini-2.5-flash-lite-preview-09-2025')
+        response = model.generate_content(prompt)
+        # Parse response as JSON with markdown extraction
+        try:
+            raw_response = response.text
+            json_str = extract_json_from_markdown(raw_response)
+            analysis_json = json.loads(json_str)
+            if not isinstance(analysis_json.get('fraud_score'), (int, float)) or not isinstance(analysis_json.get('explanation'), str):
+                raise ValueError("Invalid JSON structure from LLM")
+        except json.JSONDecodeError as je:
+            raise HTTPException(
+                status_code=500,
+                detail=f"Failed to parse LLM response as JSON: {str(je)}. Raw response: {raw_response}"
+            )
+        except ValueError as ve:
+            raise HTTPException(
+                status_code=500,
+                detail=f"Invalid LLM response structure: {str(ve)}. Raw response: {raw_response}"
+            )
+        return analysis_json
+    except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"LLM analysis failed: {type(e).__name__}: {str(e)}"
+        )
 # For local development
 if __name__ == "__main__":
     import uvicorn

data/filteredTest.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3308f6df15a5b76b07a081d2df179e774acfbad01eaf01908bed9c1c2192a4f3
+size 24188561

data/filteredTrain.parquet ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7f64208a7c5b840aea3fe8c5bc4037d53675d7ec940b89ede69daa597e36c76c
+size 55994057

stats/graphs/metrics.png ADDED Viewed

stats/graphs/precision-recall.png ADDED Viewed

stats/graphs/predict.png ADDED Viewed

stats/graphs/request_ram.png ADDED Viewed

stats/graphs/roc.png ADDED Viewed

stats/graphs/speed.png ADDED Viewed

stats/graphs/stats.png ADDED Viewed

stats/graphs/training_summary.png ADDED Viewed

test.py ADDED Viewed

	@@ -0,0 +1,95 @@

+import requests
+import json
+from typing import List, Dict, Any
+# Endpoint URL (assuming the FastAPI server is running locally on port 7860)
+BASE_URL = "http://localhost:7860"
+def create_sample_transactions(num_transactions: int = 3) -> List[Dict[str, Any]]:
+    """
+    Generate sample transaction data for testing the /llm-analyse endpoint.
+    Includes all 22 required fields: fraud_score, STATUS, cc_num, merchant, category,
+    amt, gender, state, zip, lat, long, city_pop, job, unix_time, merch_lat,
+    merch_long, is_fraud, age, trans_hour, trans_day, trans_month, trans_weekday, distance.
+    """
+    samples = []
+    for i in range(num_transactions):
+        transaction = {
+            "fraud_score": round(10 + (i * 20), 2),  # Vary fraud_score: 10, 30, 50 for example
+            "STATUS": "approved" if i < 2 else "declined",  # Mix statuses
+            "cc_num": 4532015112830366 + i,  # Fake CC numbers
+            "merchant": f"merchant_{i+1}",
+            "category": ["gas", "grocery", "entertainment"][i % 3],
+            "amt": round(50 + (i * 100), 2),  # Increasing amounts: 50, 150, 250
+            "gender": "F" if i % 2 == 0 else "M",
+            "state": ["NY", "CA", "TX"][i % 3],
+            "zip": 10001 + i * 100,
+            "lat": 40.7128 + (i * 0.1),
+            "long": -74.0060 + (i * 0.1),
+            "city_pop": 8000000 - (i * 1000000),
+            "job": ["Lawyer", "Doctor", "Engineer"][i % 3],
+            "unix_time": 1640995200 + (i * 3600),  # Sequential hours
+            "merch_lat": 40.7589 + (i * 0.05),
+            "merch_long": -73.9851 + (i * 0.05),
+            "is_fraud": 0 if i < 2 else 1,
+            "age": 30 + i * 5,
+            "trans_hour": (12 + i) % 24,
+            "trans_day": i + 1,
+            "trans_month": 12,
+            "trans_weekday": (i % 7) + 1,
+            "distance": round(5 + (i * 10), 2)  # Increasing distance
+        }
+        samples.append(transaction)
+    return samples
+def test_llm_analyse():
+    """
+    Test the /llm-analyse endpoint by sending sample transactions and printing the response.
+    """
+    endpoint = f"{BASE_URL}/llm-analyse"
+    # Prepare payload
+    payload = {
+        "transactions": create_sample_transactions(3)
+    }
+    print("📤 Sending request to /llm-analyse...")
+    print(json.dumps(payload, indent=2))
+    print("-" * 50)
+    try:
+        response = requests.post(endpoint, json=payload)
+        response.raise_for_status()  # Raise an HTTPError for bad responses
+        result = response.json()
+        print("✅ Response received:")
+        print(json.dumps(result, indent=2))
+        # Additional checks
+        if "fraud_score" in result and "explanation" in result:
+            fraud_score = result["fraud_score"]
+            explanation = result["explanation"]
+            print(f"\n📊 Overall Fraud Score: {fraud_score} ({fraud_score * 100:.1f}%)")
+            print(f"💡 Explanation: {explanation}")
+            # Simple categorization
+            if fraud_score < 0.5:
+                print("🟢 Assessment: Good (Low Risk)")
+            elif 0.5 <= fraud_score <= 0.6:
+                print("🟡 Assessment: Uncertain")
+            else:
+                print("🔴 Assessment: Suspicious/Critical")
+        else:
+            print("⚠️ Unexpected response format.")
+    except requests.exceptions.RequestException as e:
+        print(f"❌ Request failed: {e}")
+        if hasattr(e.response, 'text'):
+            print(f"Server response: {e.response.text}")
+    except json.JSONDecodeError as e:
+        print(f"❌ Failed to parse JSON response: {e}")
+        print(f"Raw response: {response.text}")
+if __name__ == "__main__":
+    # Run the test
+    test_llm_analyse()