NeerajCodz commited on
Commit
e09031a
·
1 Parent(s): 86a23de
Files changed (2) hide show
  1. app.py +73 -21
  2. requirements.txt +2 -1
app.py CHANGED
@@ -4,6 +4,7 @@ import joblib
4
  import pandas as pd
5
  from typing import Dict, Any, List, Union, Optional
6
  from fastapi import FastAPI, HTTPException, Query
 
7
  from pydantic import BaseModel, Field
8
  import numpy as np
9
  import warnings
@@ -67,12 +68,25 @@ EXPECTED_FEATURES = CATEGORICAL_FEATURES + NUMERICAL_FEATURES
67
  DATA_FILE_PATH = "data/filteredTest.parquet"
68
  DATA_DF: Optional[pd.DataFrame] = None # Global variable to cache the data
69
 
 
 
 
 
 
 
70
  # --- FASTAPI SETUP ---
71
  app = FastAPI(
72
  title="Credit Card Fraud Detection API",
73
  version=VERSION,
74
  description="Pure API server for fraud detection using ML models. Returns fraud_score (probability 0-100%)."
75
  )
 
 
 
 
 
 
 
76
 
77
  class SingleTransactionPayload(BaseModel):
78
  model_name: str = Field(..., description="Model alias (e.g., 'decision_tree', 'random_forest', 'xgboost').")
@@ -285,8 +299,9 @@ async def get_random_data(
285
  )
286
  ):
287
  """
288
- Retrieves a specified number of random transaction records from the dataset,
289
- excluding the 'is_fraud' column, suitable for testing the prediction endpoints.
 
290
  """
291
  df = load_data_file()
292
 
@@ -301,23 +316,51 @@ async def get_random_data(
301
  num_rows = total_rows
302
 
303
  try:
304
- # Sample the data randomly
305
- random_sample_df = df.sample(n=num_rows).copy()
 
306
 
307
- # Drop the 'is_fraud' column as requested
308
- if 'is_fraud' in random_sample_df.columns:
309
- random_sample_df = random_sample_df.drop(columns=['is_fraud'])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
310
 
311
  # Ensure the output columns match the expected input features for the predict endpoints
312
- final_cols = [col for col in EXPECTED_FEATURES if col in random_sample_df.columns]
313
- random_sample_df = random_sample_df[final_cols]
314
 
315
  # Convert to a list of dicts (JSON serializable format)
316
  data_records = random_sample_df.to_dict(orient='records')
317
 
 
 
 
318
  return {
319
  "success": True,
320
- "message": f"Returned {len(data_records)} random records.",
321
  "data": data_records
322
  }
323
 
@@ -327,7 +370,6 @@ async def get_random_data(
327
  detail=f"Error processing data request: {str(e)}"
328
  )
329
 
330
-
331
  @app.post("/predict")
332
  async def predict_single(payload: SingleTransactionPayload):
333
  """
@@ -444,7 +486,7 @@ async def llm_analyse(payload: LLMAnalysePayload):
444
 
445
  Expects a list of transactions with fields: fraud_score, STATUS, cc_num, merchant, category, amt, gender, state, zip, lat, long, city_pop, job, unix_time, merch_lat, merch_long, is_fraud, age, trans_hour, trans_day, trans_month, trans_weekday, distance
446
 
447
- Converts to CSV, analyzes with Gemini, returns overall fraud_score (0-1) and explanation.
448
  """
449
  if not GEMINI_API_KEY:
450
  raise HTTPException(
@@ -464,7 +506,7 @@ async def llm_analyse(payload: LLMAnalysePayload):
464
  df = pd.DataFrame(transactions)
465
  csv_string = df.to_csv(index=False)
466
 
467
- # Craft prompt
468
  prompt = f"""
469
  Analyze the following credit card transaction data (CSV format). Each row includes fraud_score (0-100 from ML model), STATUS, and other transaction details.
470
 
@@ -472,13 +514,13 @@ CSV Data:
472
  {csv_string}
473
 
474
  Instructions:
475
- - Compute an overall fraud_score (0-1 scale, where 0.12 means 12% fraud probability) based on patterns in fraud_score, amounts (amt), categories (category), locations (lat/long vs merch_lat/merch_long), times (trans_hour, trans_day, etc.), and is_fraud labels.
476
- - Consider thresholds: <0.5 good (low risk), 0.5-0.6 uncertain, >0.6 suspicious/critical.
477
- - Provide a concise explanation of the overall assessment, highlighting key patterns (e.g., high average fraud_score, unusual spending).
478
- - For CRITICAL (>0.6) or UNCERTAIN (0.5-0.6) transactions, specifically explain reasons for suspicion, such as unreasonably high amounts spent on categories like 'gas', 'grocery', etc., unusual distances, or time anomalies.
479
- - Output ONLY valid JSON in this exact format: {{"fraud_score": <float 0-1>, "explanation": "<string explanation in brief>"}}
480
  - Ensure fraud_score is a float (e.g., 0.12), rounded to 2 decimals if needed.
481
- - explanation should be a brief string without line breaks or any formatting. And dont reveal any file structure or CSV data directly in the explanation
 
482
  """
483
 
484
  # Generate with Gemini
@@ -490,8 +532,18 @@ Instructions:
490
  raw_response = response.text
491
  json_str = extract_json_from_markdown(raw_response)
492
  analysis_json = json.loads(json_str)
493
- if not isinstance(analysis_json.get('fraud_score'), (int, float)) or not isinstance(analysis_json.get('explanation'), str):
494
- raise ValueError("Invalid JSON structure from LLM")
 
 
 
 
 
 
 
 
 
 
495
  except json.JSONDecodeError as je:
496
  raise HTTPException(
497
  status_code=500,
 
4
  import pandas as pd
5
  from typing import Dict, Any, List, Union, Optional
6
  from fastapi import FastAPI, HTTPException, Query
7
+ from fastapi.middleware.cors import CORSMiddleware
8
  from pydantic import BaseModel, Field
9
  import numpy as np
10
  import warnings
 
68
  DATA_FILE_PATH = "data/filteredTest.parquet"
69
  DATA_DF: Optional[pd.DataFrame] = None # Global variable to cache the data
70
 
71
+ origins = [
72
+ "http://localhost:3000",
73
+ "http://127.0.0.1:3000",
74
+ "https://your-frontend-domain.com" # Update with your actual frontend domain
75
+ ]
76
+
77
  # --- FASTAPI SETUP ---
78
  app = FastAPI(
79
  title="Credit Card Fraud Detection API",
80
  version=VERSION,
81
  description="Pure API server for fraud detection using ML models. Returns fraud_score (probability 0-100%)."
82
  )
83
+ app.add_middleware(
84
+ CORSMiddleware,
85
+ allow_origins=origins, # The list of allowed origins defined above
86
+ allow_credentials=True, # Allow cookies/authorization headers
87
+ allow_methods=["*"], # Allow all HTTP methods (GET, POST, PUT, etc.)
88
+ allow_headers=["*"], # Allow all headers
89
+ )
90
 
91
  class SingleTransactionPayload(BaseModel):
92
  model_name: str = Field(..., description="Model alias (e.g., 'decision_tree', 'random_forest', 'xgboost').")
 
299
  )
300
  ):
301
  """
302
+ Retrieves a specified number of random transaction records from the dataset.
303
+ It ensures that at least one fraudulent (is_fraud=True) record is included,
304
+ making the sample suitable for testing the prediction endpoints.
305
  """
306
  df = load_data_file()
307
 
 
316
  num_rows = total_rows
317
 
318
  try:
319
+ # 1. Separate fraudulent and non-fraudulent transactions
320
+ fraud_df = df[df['is_fraud'] == 1].copy()
321
+ non_fraud_df = df[df['is_fraud'] == 0].copy()
322
 
323
+ final_sample_df = pd.DataFrame()
324
+
325
+ # 2. Ensure at least one fraudulent transaction is included (if available)
326
+ if not fraud_df.empty:
327
+ # Take 1 fraudulent transaction
328
+ fraud_sample = fraud_df.sample(n=1)
329
+ final_sample_df = pd.concat([final_sample_df, fraud_sample])
330
+
331
+ # Reduce the remaining rows needed
332
+ rows_needed = num_rows - 1
333
+ else:
334
+ # If no fraud data, just take the requested number of rows from non-fraud
335
+ rows_needed = num_rows
336
+
337
+ # 3. Fill the rest of the sample from the remaining data
338
+ if rows_needed > 0:
339
+ # Max rows to sample from non-fraudulent data, limited by available data
340
+ non_fraud_sample_size = min(rows_needed, len(non_fraud_df))
341
+
342
+ if non_fraud_sample_size > 0:
343
+ non_fraud_sample = non_fraud_df.sample(n=non_fraud_sample_size)
344
+ final_sample_df = pd.concat([final_sample_df, non_fraud_sample])
345
+
346
+ # 4. Final processing
347
+ # Drop the 'is_fraud' column
348
+ if 'is_fraud' in final_sample_df.columns:
349
+ final_sample_df = final_sample_df.drop(columns=['is_fraud'])
350
 
351
  # Ensure the output columns match the expected input features for the predict endpoints
352
+ final_cols = [col for col in EXPECTED_FEATURES if col in final_sample_df.columns]
353
+ random_sample_df = final_sample_df[final_cols]
354
 
355
  # Convert to a list of dicts (JSON serializable format)
356
  data_records = random_sample_df.to_dict(orient='records')
357
 
358
+ # Shuffle the final list to avoid placing the guaranteed fraud row always first
359
+ random.shuffle(data_records)
360
+
361
  return {
362
  "success": True,
363
+ "message": f"Returned {len(data_records)} random records (guaranteed at least one fraud if available).",
364
  "data": data_records
365
  }
366
 
 
370
  detail=f"Error processing data request: {str(e)}"
371
  )
372
 
 
373
  @app.post("/predict")
374
  async def predict_single(payload: SingleTransactionPayload):
375
  """
 
486
 
487
  Expects a list of transactions with fields: fraud_score, STATUS, cc_num, merchant, category, amt, gender, state, zip, lat, long, city_pop, job, unix_time, merch_lat, merch_long, is_fraud, age, trans_hour, trans_day, trans_month, trans_weekday, distance
488
 
489
+ Converts to CSV, analyzes with Gemini, returns overall fraud_score (0-1), insights, and recommendation.
490
  """
491
  if not GEMINI_API_KEY:
492
  raise HTTPException(
 
506
  df = pd.DataFrame(transactions)
507
  csv_string = df.to_csv(index=False)
508
 
509
+ # Craft prompt (Cleaned up JSON instruction and content requests)
510
  prompt = f"""
511
  Analyze the following credit card transaction data (CSV format). Each row includes fraud_score (0-100 from ML model), STATUS, and other transaction details.
512
 
 
514
  {csv_string}
515
 
516
  Instructions:
517
+ - Compute an overall fraud_score (0-1 scale, where 0.12 means 12% fraud probability) based on patterns in fraud_score, amounts (amt), categories (category), locations, times, and is_fraud labels.
518
+ - Provide detailed **insights** (a brief paragraph) summarizing the overall assessment and highlighting key patterns (e.g., high average fraud_score, unusual spending).
519
+ - Provide a detailed **recommendation** (a brief paragraph) outlining specific actions based on the risk level.
520
+ - Output ONLY valid JSON in this exact format: {{"fraud_score": <float 0-1>, "insights": "<string insights paragraph>", "recommendation": "<string recommendation paragraph>"}}.
 
521
  - Ensure fraud_score is a float (e.g., 0.12), rounded to 2 decimals if needed.
522
+ - **insights** and **recommendation** should be brief paragraphs (minimum 100 characters each) without line breaks or any formatting. Do not reveal any file structure or CSV data directly in the output strings.
523
+ - No preamble or additional text, ONLY the JSON object.
524
  """
525
 
526
  # Generate with Gemini
 
532
  raw_response = response.text
533
  json_str = extract_json_from_markdown(raw_response)
534
  analysis_json = json.loads(json_str)
535
+
536
+ # --- CRITICAL FIX: Update validation to check for 'insights' and 'recommendation' ---
537
+ if not isinstance(analysis_json.get('fraud_score'), (int, float)) or \
538
+ not isinstance(analysis_json.get('insights'), str) or \
539
+ not isinstance(analysis_json.get('recommendation'), str):
540
+
541
+ # Re-raise with descriptive error if keys are missing or types are wrong
542
+ missing_keys = [k for k in ['fraud_score', 'insights', 'recommendation'] if k not in analysis_json or not isinstance(analysis_json.get(k), (int, float, str))]
543
+
544
+ raise ValueError(f"Invalid JSON structure from LLM. Missing/Wrong type keys: {missing_keys}")
545
+ # --- END CRITICAL FIX ---
546
+
547
  except json.JSONDecodeError as je:
548
  raise HTTPException(
549
  status_code=500,
requirements.txt CHANGED
@@ -5,4 +5,5 @@ joblib
5
  numpy
6
  scikit-learn==1.6.1
7
  xgboost
8
- google-generativeai
 
 
5
  numpy
6
  scikit-learn==1.6.1
7
  xgboost
8
+ google-generativeai
9
+ pyarrow