"""Data processor for fraud detection datasets.""" import logging from pathlib import Path from typing import Dict, List, Optional import pandas as pd from src.config.config import settings logger = logging.getLogger(__name__) class FraudDataProcessor: """Processor for fraud detection data.""" def __init__(self) -> None: """Initialize data processor.""" self.train_df: Optional[pd.DataFrame] = None def load_train_data(self, path: Optional[Path] = None) -> pd.DataFrame: """Load training data. Args: path: Path to training data CSV. If None, uses default path. Returns: Training dataframe. """ data_path = path or settings.train_data_path if not data_path.exists(): raise FileNotFoundError(f"Training data not found: {data_path}") try: logger.info(f"Loading training data from {data_path}") # Load full dataset for accurate statistics self.train_df = pd.read_csv(data_path) # Clean merchant names (remove 'fraud_' prefix common in synthetic datasets) if 'merchant' in self.train_df.columns: self.train_df['merchant'] = self.train_df['merchant'].str.replace('fraud_', '', regex=False) logger.info(f"Loaded {len(self.train_df)} rows from training data (merchant names cleaned)") return self.train_df except Exception as e: logger.error(f"Error loading training data: {str(e)}") raise def get_transaction_summary(self, transaction_id: Optional[int] = None) -> Dict: """Get summary of a transaction or all transactions. Args: transaction_id: Optional transaction ID. If None, returns overall summary. Returns: Transaction summary dictionary. """ if self.train_df is None: self.load_train_data() df = self.train_df if transaction_id is not None: transaction = df[df.index == transaction_id] if transaction.empty: raise ValueError(f"Transaction {transaction_id} not found") return transaction.iloc[0].to_dict() # Overall summary summary = { "total_transactions": len(df), "fraud_count": int(df["is_fraud"].sum()), "fraud_percentage": float(df["is_fraud"].mean() * 100), "total_amount": float(df["amt"].sum()), "average_amount": float(df["amt"].mean()), "categories": df["category"].value_counts().to_dict(), } return summary def format_transaction_for_llm(self, transaction: Dict) -> str: """Format a transaction dictionary for LLM analysis. Args: transaction: Transaction dictionary. Returns: Formatted string representation. """ formatted = f""" Transaction Details: - Date/Time: {transaction.get('trans_date_trans_time', 'N/A')} - Merchant: {str(transaction.get('merchant', 'N/A')).replace('fraud_', '')} - Category: {transaction.get('category', 'N/A')} - Amount: ${transaction.get('amt', 'N/A')} - Customer: {transaction.get('first', 'N/A')} {transaction.get('last', 'N/A')} - Gender: {transaction.get('gender', 'N/A')} - Location: {transaction.get('city', 'N/A')}, {transaction.get('state', 'N/A')} - Job: {transaction.get('job', 'N/A')} - City Population: {transaction.get('city_pop', 'N/A')} - Distance from Merchant: Calculated from coordinates """ return formatted.strip()