Spaces:
Sleeping
Sleeping
File size: 3,722 Bytes
fd99b61 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
"""Data processor for fraud detection datasets."""
import logging
from pathlib import Path
from typing import Dict, List, Optional
import pandas as pd
from src.config.config import settings
logger = logging.getLogger(__name__)
class FraudDataProcessor:
    """Processor for fraud detection data.

    Loads the training CSV into a pandas DataFrame, computes summary
    statistics over it, and renders a single transaction as plain text
    suitable for inclusion in an LLM prompt.
    """

    # Prefix that merchant names carry in the dataset; only a *leading*
    # occurrence is removed so names that merely contain it stay intact.
    _MERCHANT_PREFIX = "fraud_"

    def __init__(self) -> None:
        """Initialize data processor."""
        # Cached training data; remains None until load_train_data() succeeds.
        self.train_df: Optional[pd.DataFrame] = None

    def load_train_data(self, path: Optional[Path] = None) -> pd.DataFrame:
        """Load training data.

        Args:
            path: Path to training data CSV. If None, uses default path
                (``settings.train_data_path``). A plain string is tolerated
                and converted to a ``Path``.

        Returns:
            Training dataframe (also cached on ``self.train_df``).

        Raises:
            FileNotFoundError: If the resolved path does not exist.
            Exception: Any error raised while reading or cleaning the CSV is
                logged and re-raised unchanged.
        """
        # Path() is a no-op for Path inputs and makes str inputs (from the
        # caller or from settings) safe to call .exists() on.
        data_path = Path(path or settings.train_data_path)
        if not data_path.exists():
            raise FileNotFoundError(f"Training data not found: {data_path}")

        logger.info("Loading training data from %s", data_path)
        try:
            # Load full dataset for accurate statistics
            frame = pd.read_csv(data_path)

            # Clean merchant names (remove 'fraud_' prefix common in
            # synthetic datasets). Only a leading prefix is dropped; missing
            # values are treated as "no prefix" and left as they are.
            if 'merchant' in frame.columns:
                prefix = self._MERCHANT_PREFIX
                merchants = frame['merchant']
                has_prefix = merchants.str.startswith(prefix, na=False)
                frame['merchant'] = merchants.where(
                    ~has_prefix,
                    merchants.str.slice(start=len(prefix)),
                )
        except Exception as e:
            logger.error("Error loading training data: %s", str(e))
            raise

        self.train_df = frame
        logger.info(
            "Loaded %s rows from training data (merchant names cleaned)",
            len(frame),
        )
        return frame

    def get_transaction_summary(self, transaction_id: Optional[int] = None) -> Dict:
        """Get summary of a transaction or all transactions.

        Training data is loaded from the default path on first use if it has
        not been loaded yet.

        Args:
            transaction_id: Optional transaction ID, matched against the
                dataframe index. If None, returns overall summary.

        Returns:
            Transaction summary dictionary: either the matching row as a
            column -> value mapping, or the aggregate statistics.

        Raises:
            ValueError: If ``transaction_id`` is given but no row has that
                index value.
        """
        if self.train_df is None:
            self.load_train_data()
        df = self.train_df

        # Explicit None check so that an ID of 0 is still treated as a lookup.
        if transaction_id is not None:
            matches = df[df.index == transaction_id]
            if matches.empty:
                raise ValueError(f"Transaction {transaction_id} not found")
            # If the index holds duplicates, the first matching row wins.
            return matches.iloc[0].to_dict()

        # Overall summary
        fraud_flags = df["is_fraud"]
        amounts = df["amt"]
        return {
            "total_transactions": len(df),
            "fraud_count": int(fraud_flags.sum()),
            "fraud_percentage": float(fraud_flags.mean() * 100),
            "total_amount": float(amounts.sum()),
            "average_amount": float(amounts.mean()),
            "categories": df["category"].value_counts().to_dict(),
        }

    def format_transaction_for_llm(self, transaction: Dict) -> str:
        """Format a transaction dictionary for LLM analysis.

        Missing keys are rendered as ``N/A``. A leading ``fraud_`` prefix on
        the merchant name is removed.

        Args:
            transaction: Transaction dictionary.

        Returns:
            Formatted string representation (multi-line, no surrounding
            whitespace).
        """
        def field(key: str):
            return transaction.get(key, 'N/A')

        merchant = str(field('merchant'))
        if merchant.startswith(self._MERCHANT_PREFIX):
            merchant = merchant[len(self._MERCHANT_PREFIX):]

        lines = [
            "Transaction Details:",
            f"- Date/Time: {field('trans_date_trans_time')}",
            f"- Merchant: {merchant}",
            f"- Category: {field('category')}",
            f"- Amount: ${field('amt')}",
            f"- Customer: {field('first')} {field('last')}",
            f"- Gender: {field('gender')}",
            f"- Location: {field('city')}, {field('state')}",
            f"- Job: {field('job')}",
            f"- City Population: {field('city_pop')}",
            # NOTE(review): this line is a fixed placeholder; no distance is
            # actually computed here.
            "- Distance from Merchant: Calculated from coordinates",
        ]
        return "\n".join(lines)
|