Spaces:

NeerajCodz
/

creditCardFraudDetection

Sleeping

App Files Files Community

NeerajCodz committed on Dec 5, 2025

Commit

e09031a

1 Parent(s): 86a23de

pyarrow

Browse files

Files changed (2) hide show

app.py +73 -21
requirements.txt +2 -1

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import joblib
 import pandas as pd
 from typing import Dict, Any, List, Union, Optional
 from fastapi import FastAPI, HTTPException, Query
 from pydantic import BaseModel, Field
 import numpy as np
 import warnings
@@ -67,12 +68,25 @@ EXPECTED_FEATURES = CATEGORICAL_FEATURES + NUMERICAL_FEATURES
 DATA_FILE_PATH = "data/filteredTest.parquet"
 DATA_DF: Optional[pd.DataFrame] = None # Global variable to cache the data
 # --- FASTAPI SETUP ---
 app = FastAPI(
     title="Credit Card Fraud Detection API",
     version=VERSION,
     description="Pure API server for fraud detection using ML models. Returns fraud_score (probability 0-100%)."
 )
 class SingleTransactionPayload(BaseModel):
     model_name: str = Field(..., description="Model alias (e.g., 'decision_tree', 'random_forest', 'xgboost').")
@@ -285,8 +299,9 @@ async def get_random_data(
     )
 ):
     """
-    Retrieves a specified number of random transaction records from the dataset,
-    excluding the 'is_fraud' column, suitable for testing the prediction endpoints.
     """
     df = load_data_file()
@@ -301,23 +316,51 @@ async def get_random_data(
         num_rows = total_rows
     try:
-        # Sample the data randomly
-        random_sample_df = df.sample(n=num_rows).copy()
-        # Drop the 'is_fraud' column as requested
-        if 'is_fraud' in random_sample_df.columns:
-            random_sample_df = random_sample_df.drop(columns=['is_fraud'])
         # Ensure the output columns match the expected input features for the predict endpoints
-        final_cols = [col for col in EXPECTED_FEATURES if col in random_sample_df.columns]
-        random_sample_df = random_sample_df[final_cols]
         # Convert to a list of dicts (JSON serializable format)
         data_records = random_sample_df.to_dict(orient='records')
         return {
             "success": True,
-            "message": f"Returned {len(data_records)} random records.",
             "data": data_records
         }
@@ -327,7 +370,6 @@ async def get_random_data(
             detail=f"Error processing data request: {str(e)}"
         )
 @app.post("/predict")
 async def predict_single(payload: SingleTransactionPayload):
     """
@@ -444,7 +486,7 @@ async def llm_analyse(payload: LLMAnalysePayload):
     Expects a list of transactions with fields: fraud_score, STATUS, cc_num, merchant, category, amt, gender, state, zip, lat, long, city_pop, job, unix_time, merch_lat, merch_long, is_fraud, age, trans_hour, trans_day, trans_month, trans_weekday, distance
-    Converts to CSV, analyzes with Gemini, returns overall fraud_score (0-1) and explanation.
     """
     if not GEMINI_API_KEY:
         raise HTTPException(
@@ -464,7 +506,7 @@ async def llm_analyse(payload: LLMAnalysePayload):
         df = pd.DataFrame(transactions)
         csv_string = df.to_csv(index=False)
-        # Craft prompt
         prompt = f"""
 Analyze the following credit card transaction data (CSV format). Each row includes fraud_score (0-100 from ML model), STATUS, and other transaction details.
@@ -472,13 +514,13 @@ CSV Data:
 {csv_string}
 Instructions:
-- Compute an overall fraud_score (0-1 scale, where 0.12 means 12% fraud probability) based on patterns in fraud_score, amounts (amt), categories (category), locations (lat/long vs merch_lat/merch_long), times (trans_hour, trans_day, etc.), and is_fraud labels.
-- Consider thresholds: <0.5 good (low risk), 0.5-0.6 uncertain, >0.6 suspicious/critical.
-- Provide a concise explanation of the overall assessment, highlighting key patterns (e.g., high average fraud_score, unusual spending).
-- For CRITICAL (>0.6) or UNCERTAIN (0.5-0.6) transactions, specifically explain reasons for suspicion, such as unreasonably high amounts spent on categories like 'gas', 'grocery', etc., unusual distances, or time anomalies.
-- Output ONLY valid JSON in this exact format: {{"fraud_score": <float 0-1>, "explanation": "<string explanation in brief>"}}
 - Ensure fraud_score is a float (e.g., 0.12), rounded to 2 decimals if needed.
-- explanation should be a brief string without line breaks or any formatting. And dont reveal any file structure or CSV data directly in the explanation
 """
         # Generate with Gemini
@@ -490,8 +532,18 @@ Instructions:
             raw_response = response.text
             json_str = extract_json_from_markdown(raw_response)
             analysis_json = json.loads(json_str)
-            if not isinstance(analysis_json.get('fraud_score'), (int, float)) or not isinstance(analysis_json.get('explanation'), str):
-                raise ValueError("Invalid JSON structure from LLM")
         except json.JSONDecodeError as je:
             raise HTTPException(
                 status_code=500,

 import pandas as pd
 from typing import Dict, Any, List, Union, Optional
 from fastapi import FastAPI, HTTPException, Query
+from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, Field
 import numpy as np
 import warnings
 DATA_FILE_PATH = "data/filteredTest.parquet"
 DATA_DF: Optional[pd.DataFrame] = None # Global variable to cache the data
+origins = [
+    "http://localhost:3000",
+    "http://127.0.0.1:3000",
+    "https://your-frontend-domain.com"  # Update with your actual frontend domain
+]
 # --- FASTAPI SETUP ---
 app = FastAPI(
     title="Credit Card Fraud Detection API",
     version=VERSION,
     description="Pure API server for fraud detection using ML models. Returns fraud_score (probability 0-100%)."
 )
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=origins,       # The list of allowed origins defined above
+    allow_credentials=True,       # Allow cookies/authorization headers
+    allow_methods=["*"],          # Allow all HTTP methods (GET, POST, PUT, etc.)
+    allow_headers=["*"],          # Allow all headers
+)
 class SingleTransactionPayload(BaseModel):
     model_name: str = Field(..., description="Model alias (e.g., 'decision_tree', 'random_forest', 'xgboost').")
     )
 ):
     """
+    Retrieves a specified number of random transaction records from the dataset.
+    It ensures that at least one fraudulent (is_fraud=True) record is included,
+    suitable for testing the prediction endpoints.
     """
     df = load_data_file()
         num_rows = total_rows
     try:
+        # 1. Separate fraudulent and non-fraudulent transactions
+        fraud_df = df[df['is_fraud'] == 1].copy()
+        non_fraud_df = df[df['is_fraud'] == 0].copy()
+        final_sample_df = pd.DataFrame()
+        # 2. Ensure at least one fraudulent transaction is included (if available)
+        if not fraud_df.empty:
+            # Take 1 fraudulent transaction
+            fraud_sample = fraud_df.sample(n=1)
+            final_sample_df = pd.concat([final_sample_df, fraud_sample])
+            # Reduce the remaining rows needed
+            rows_needed = num_rows - 1
+        else:
+            # If no fraud data, just take the requested number of rows from non-fraud
+            rows_needed = num_rows
+        # 3. Fill the rest of the sample from the remaining data
+        if rows_needed > 0:
+            # Max rows to sample from non-fraudulent data, limited by available data
+            non_fraud_sample_size = min(rows_needed, len(non_fraud_df))
+            if non_fraud_sample_size > 0:
+                non_fraud_sample = non_fraud_df.sample(n=non_fraud_sample_size)
+                final_sample_df = pd.concat([final_sample_df, non_fraud_sample])
+        # 4. Final processing
+        # Drop the 'is_fraud' column
+        if 'is_fraud' in final_sample_df.columns:
+            final_sample_df = final_sample_df.drop(columns=['is_fraud'])
         # Ensure the output columns match the expected input features for the predict endpoints
+        final_cols = [col for col in EXPECTED_FEATURES if col in final_sample_df.columns]
+        random_sample_df = final_sample_df[final_cols]
         # Convert to a list of dicts (JSON serializable format)
         data_records = random_sample_df.to_dict(orient='records')
+        # Shuffle the final list to avoid placing the guaranteed fraud row always first
+        random.shuffle(data_records)
         return {
             "success": True,
+            "message": f"Returned {len(data_records)} random records (guaranteed at least one fraud if available).",
             "data": data_records
         }
             detail=f"Error processing data request: {str(e)}"
         )
 @app.post("/predict")
 async def predict_single(payload: SingleTransactionPayload):
     """
     Expects a list of transactions with fields: fraud_score, STATUS, cc_num, merchant, category, amt, gender, state, zip, lat, long, city_pop, job, unix_time, merch_lat, merch_long, is_fraud, age, trans_hour, trans_day, trans_month, trans_weekday, distance
+    Converts to CSV, analyzes with Gemini, returns overall fraud_score (0-1), insights, and recommendation.
     """
     if not GEMINI_API_KEY:
         raise HTTPException(
         df = pd.DataFrame(transactions)
         csv_string = df.to_csv(index=False)
+        # Craft prompt (Cleaned up JSON instruction and content requests)
         prompt = f"""
 Analyze the following credit card transaction data (CSV format). Each row includes fraud_score (0-100 from ML model), STATUS, and other transaction details.
 {csv_string}
 Instructions:
+- Compute an overall fraud_score (0-1 scale, where 0.12 means 12% fraud probability) based on patterns in fraud_score, amounts (amt), categories (category), locations, times, and is_fraud labels.
+- Provide detailed **insights** (a brief paragraph) summarizing the overall assessment and highlighting key patterns (e.g., high average fraud_score, unusual spending).
+- Provide a detailed **recommendation** (a brief paragraph) outlining specific actions based on the risk level.
+- Output ONLY valid JSON in this exact format: {{"fraud_score": <float 0-1>, "insights": "<string insights paragraph>", "recommendation": "<string recommendation paragraph>"}}.
 - Ensure fraud_score is a float (e.g., 0.12), rounded to 2 decimals if needed.
+- **insights** and **recommendation** should be brief paragraphs (minimum 100 chars total for each) without line breaks or any formatting. Do not reveal any file structure or CSV data directly in the output strings.
+- No preamble or additional text, ONLY the JSON object.
 """
         # Generate with Gemini
             raw_response = response.text
             json_str = extract_json_from_markdown(raw_response)
             analysis_json = json.loads(json_str)
+            # --- CRITICAL FIX: Update validation to check for 'insights' and 'recommendation' ---
+            if not isinstance(analysis_json.get('fraud_score'), (int, float)) or \
+               not isinstance(analysis_json.get('insights'), str) or \
+               not isinstance(analysis_json.get('recommendation'), str):
+                # Raise a descriptive error if keys are missing or types are wrong
+                missing_keys = [k for k in ['fraud_score', 'insights', 'recommendation'] if k not in analysis_json or not isinstance(analysis_json.get(k), (int, float, str))]
+                raise ValueError(f"Invalid JSON structure from LLM. Missing/Wrong type keys: {missing_keys}")
+            # --- END CRITICAL FIX ---
         except json.JSONDecodeError as je:
             raise HTTPException(
                 status_code=500,

requirements.txt CHANGED Viewed

@@ -5,4 +5,5 @@ joblib
 numpy
 scikit-learn==1.6.1
 xgboost
-google-generativeai

 numpy
 scikit-learn==1.6.1
 xgboost
+google-generativeai
+pyarrow