Fraud-Chatbot / src /data /processor.py
ahmzakif's picture
feat: add new project
fd99b61 verified
"""Data processor for fraud detection datasets."""
import logging
from pathlib import Path
from typing import Dict, List, Optional
import pandas as pd
from src.config.config import settings
logger = logging.getLogger(__name__)
class FraudDataProcessor:
    """Load, summarize, and format fraud detection transaction data.

    The training dataframe is loaded on first use and cached on the instance
    in ``train_df`` so later summaries reuse it instead of re-reading the CSV.
    """

    # Synthetic datasets tag merchant names with this literal leading marker.
    _MERCHANT_PREFIX = "fraud_"

    def __init__(self) -> None:
        """Initialize data processor with no data loaded yet."""
        self.train_df: Optional[pd.DataFrame] = None

    @classmethod
    def _strip_merchant_prefix(cls, name: str) -> str:
        """Return ``name`` without a leading ``fraud_`` marker, if present."""
        prefix = cls._MERCHANT_PREFIX
        # Only a leading marker is removed; the same text inside a name is kept.
        return name[len(prefix):] if name.startswith(prefix) else name

    @classmethod
    def _clean_merchant_names(cls, df: pd.DataFrame) -> pd.DataFrame:
        """Strip the leading ``fraud_`` marker from ``df['merchant']`` in place.

        Frames without a ``merchant`` column are returned unchanged.
        """
        if "merchant" in df.columns:
            # Anchored pattern so only the start of each name is touched. The
            # prefix contains no regex metacharacters, so no escaping is needed.
            df["merchant"] = df["merchant"].str.replace(
                "^" + cls._MERCHANT_PREFIX, "", regex=True
            )
        return df

    def load_train_data(self, path: Optional[Path] = None) -> pd.DataFrame:
        """Load training data.

        Args:
            path: Path to training data CSV. If None, uses
                ``settings.train_data_path``. A plain string is accepted too.

        Returns:
            Training dataframe (also cached on ``self.train_df``).

        Raises:
            FileNotFoundError: If the CSV file does not exist.
        """
        # Coerce so a str path (argument or configured default) supports .exists().
        data_path = Path(path or settings.train_data_path)
        if not data_path.exists():
            raise FileNotFoundError(f"Training data not found: {data_path}")

        try:
            logger.info("Loading training data from %s", data_path)
            # Load full dataset for accurate statistics
            loaded = self._clean_merchant_names(pd.read_csv(data_path))
            # Cache only after cleaning succeeded, so a failed load never
            # leaves a partially processed frame on the instance.
            self.train_df = loaded
            logger.info(
                "Loaded %d rows from training data (merchant names cleaned)",
                len(loaded),
            )
            return loaded
        except Exception as e:
            # logger.exception keeps the traceback alongside the message.
            logger.exception("Error loading training data: %s", e)
            raise

    def get_transaction_summary(self, transaction_id: Optional[int] = None) -> Dict:
        """Get summary of a transaction or all transactions.

        Loads the training data from the default path first if it has not
        been loaded yet.

        Args:
            transaction_id: Optional transaction ID, matched against the
                dataframe index. If None, returns overall summary.

        Returns:
            The matching row as a dictionary, or a dictionary of overall
            statistics (counts, fraud rate, amounts, category frequencies).

        Raises:
            ValueError: If ``transaction_id`` is given but no row has that
                index value.
        """
        df = self.train_df if self.train_df is not None else self.load_train_data()

        if transaction_id is not None:
            matches = df[df.index == transaction_id]
            if matches.empty:
                raise ValueError(f"Transaction {transaction_id} not found")
            return matches.iloc[0].to_dict()

        # Overall summary
        return {
            "total_transactions": len(df),
            "fraud_count": int(df["is_fraud"].sum()),
            "fraud_percentage": float(df["is_fraud"].mean() * 100),
            "total_amount": float(df["amt"].sum()),
            "average_amount": float(df["amt"].mean()),
            "categories": df["category"].value_counts().to_dict(),
        }

    def format_transaction_for_llm(self, transaction: Dict) -> str:
        """Format a transaction dictionary for LLM analysis.

        Missing keys are rendered as ``N/A``. The merchant name has a leading
        ``fraud_`` marker removed. The distance line is a fixed placeholder;
        no distance is computed here.

        Args:
            transaction: Transaction dictionary.

        Returns:
            Formatted string representation, one field per line.
        """

        def field(key: str) -> object:
            return transaction.get(key, 'N/A')

        merchant = self._strip_merchant_prefix(str(field('merchant')))
        lines = [
            "Transaction Details:",
            f"- Date/Time: {field('trans_date_trans_time')}",
            f"- Merchant: {merchant}",
            f"- Category: {field('category')}",
            f"- Amount: ${field('amt')}",
            f"- Customer: {field('first')} {field('last')}",
            f"- Gender: {field('gender')}",
            f"- Location: {field('city')}, {field('state')}",
            f"- Job: {field('job')}",
            f"- City Population: {field('city_pop')}",
            "- Distance from Merchant: Calculated from coordinates",
        ]
        return "\n".join(lines)