Spaces:
Sleeping
Sleeping
| """Data processor for fraud detection datasets.""" | |
| import logging | |
| from pathlib import Path | |
| from typing import Dict, List, Optional | |
| import pandas as pd | |
| from src.config.config import settings | |
| logger = logging.getLogger(__name__) | |
class FraudDataProcessor:
    """Load, summarise and format fraud-detection transaction data.

    The training CSV is read on demand and cached on the instance in
    ``train_df`` so that repeated summary calls do not re-read the file.
    """

    # Synthetic datasets prepend this marker to merchant names. Only a leading
    # occurrence is noise, so removal is anchored to the start of the name
    # instead of deleting the substring wherever it appears.
    _MERCHANT_PREFIX = "fraud_"
    _MERCHANT_PREFIX_PATTERN = r"^fraud_"

    def __init__(self) -> None:
        """Create a processor with no data loaded yet."""
        self.train_df: Optional[pd.DataFrame] = None

    @classmethod
    def _strip_merchant_prefix(cls, name: str) -> str:
        """Return *name* without a leading ``fraud_`` marker, if it has one."""
        prefix = cls._MERCHANT_PREFIX
        return name[len(prefix):] if name.startswith(prefix) else name

    @classmethod
    def _clean_merchant_column(cls, merchants: pd.Series) -> pd.Series:
        """Column-wise counterpart of ``_strip_merchant_prefix``."""
        return merchants.str.replace(cls._MERCHANT_PREFIX_PATTERN, "", regex=True)

    def load_train_data(self, path: Optional[Path] = None) -> pd.DataFrame:
        """Load the training CSV, clean merchant names and cache the result.

        Args:
            path: Location of the training CSV. A plain string is tolerated as
                well as a ``Path``. If omitted (or falsy),
                ``settings.train_data_path`` is used.

        Returns:
            The loaded dataframe, which is also stored in ``self.train_df``.

        Raises:
            FileNotFoundError: If the resolved path does not exist.
            Exception: Any error raised while reading or cleaning the CSV is
                logged and re-raised unchanged.
        """
        # Coerce to Path so that a string argument (or a string-valued
        # setting) does not fail on the ``.exists()`` check below.
        data_path = Path(path or settings.train_data_path)
        if not data_path.exists():
            raise FileNotFoundError(f"Training data not found: {data_path}")

        try:
            logger.info("Loading training data from %s", data_path)
            # The whole file is read (no sampling) so summary statistics
            # reflect every row.
            frame = pd.read_csv(data_path)
            if "merchant" in frame.columns:
                frame["merchant"] = self._clean_merchant_column(frame["merchant"])
            # Cache only after cleaning succeeded, so a failure never leaves
            # an uncleaned frame behind on the instance.
            self.train_df = frame
            logger.info(
                "Loaded %s rows from training data (merchant names cleaned)",
                len(frame),
            )
            return frame
        except Exception as e:
            # logger.exception keeps the traceback alongside the message.
            logger.exception("Error loading training data: %s", e)
            raise

    def get_transaction_summary(self, transaction_id: Optional[int] = None) -> Dict:
        """Summarise one transaction or the whole training set.

        Training data is loaded from the default path first if nothing has
        been loaded yet.

        Args:
            transaction_id: Row label to look up in the dataframe index. If
                None, an aggregate summary of all rows is returned.

        Returns:
            For a single transaction, the first matching row as a
            column-to-value dictionary. Otherwise a dictionary with the keys
            ``total_transactions``, ``fraud_count``, ``fraud_percentage``,
            ``total_amount``, ``average_amount`` and ``categories``.

        Raises:
            ValueError: If ``transaction_id`` is given but not in the index.
        """
        df = self.train_df if self.train_df is not None else self.load_train_data()

        if transaction_id is not None:
            # The id is matched against the index, not against a data column.
            matches = df[df.index == transaction_id]
            if matches.empty:
                raise ValueError(f"Transaction {transaction_id} not found")
            return matches.iloc[0].to_dict()

        fraud_flags = df["is_fraud"]
        amounts = df["amt"]
        # Explicit int/float casts turn pandas/numpy scalars into builtins.
        return {
            "total_transactions": len(df),
            "fraud_count": int(fraud_flags.sum()),
            "fraud_percentage": float(fraud_flags.mean() * 100),
            "total_amount": float(amounts.sum()),
            "average_amount": float(amounts.mean()),
            "categories": df["category"].value_counts().to_dict(),
        }

    def format_transaction_for_llm(self, transaction: Dict) -> str:
        """Render a transaction dictionary as a plain-text bullet list.

        Missing keys are rendered as ``N/A``. A leading ``fraud_`` marker is
        removed from the merchant name.

        Args:
            transaction: Transaction fields keyed by column name.

        Returns:
            A multi-line string starting with ``Transaction Details:``.
        """
        field = transaction.get
        merchant = self._strip_merchant_prefix(str(field("merchant", "N/A")))
        # Lines are joined explicitly so that source-code indentation can
        # never leak into the text handed to the model.
        lines = [
            "Transaction Details:",
            f"- Date/Time: {field('trans_date_trans_time', 'N/A')}",
            f"- Merchant: {merchant}",
            f"- Category: {field('category', 'N/A')}",
            f"- Amount: ${field('amt', 'N/A')}",
            f"- Customer: {field('first', 'N/A')} {field('last', 'N/A')}",
            f"- Gender: {field('gender', 'N/A')}",
            f"- Location: {field('city', 'N/A')}, {field('state', 'N/A')}",
            f"- Job: {field('job', 'N/A')}",
            f"- City Population: {field('city_pop', 'N/A')}",
            # NOTE(review): fixed placeholder text; no distance is computed here.
            "- Distance from Merchant: Calculated from coordinates",
        ]
        return "\n".join(lines)