File size: 3,722 Bytes
fd99b61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
"""Data processor for fraud detection datasets."""

import logging
from pathlib import Path
from typing import Dict, List, Optional

import pandas as pd

from src.config.config import settings

# Logger named after this module (``__name__``); FraudDataProcessor uses it to
# report load progress and load errors.
logger = logging.getLogger(__name__)


class FraudDataProcessor:
    """Load a fraud-detection training CSV and summarise its transactions.

    The dataframe is cached on ``self.train_df`` after the first successful
    load, so later summary calls reuse it instead of re-reading the file.
    """

    # Prefix removed from merchant names (described in this file as common in
    # synthetic datasets). It is stripped only when it *starts* the name.
    _MERCHANT_PREFIX = "fraud_"
    # Anchored regex form of the prefix for vectorised pandas cleaning; the
    # prefix contains no regex metacharacters, so no escaping is needed.
    _MERCHANT_PREFIX_PATTERN = "^" + _MERCHANT_PREFIX

    def __init__(self) -> None:
        """Create a processor with no data loaded yet."""
        # None means "not loaded"; set by load_train_data() on success.
        self.train_df: Optional[pd.DataFrame] = None

    @classmethod
    def _strip_merchant_prefix(cls, name: str) -> str:
        """Return ``name`` without a leading merchant prefix, if present."""
        if name.startswith(cls._MERCHANT_PREFIX):
            return name[len(cls._MERCHANT_PREFIX):]
        return name

    def load_train_data(self, path: Optional[Path] = None) -> pd.DataFrame:
        """Load the training CSV, clean merchant names, and cache the result.

        Args:
            path: Path to the training data CSV. If None (or otherwise
                falsy), ``settings.train_data_path`` is used. A plain string
                is tolerated and converted to ``Path``.

        Returns:
            The loaded dataframe, which is also stored on ``self.train_df``.

        Raises:
            FileNotFoundError: If the resolved path does not exist.
            Exception: Any error raised while reading or cleaning the CSV is
                logged and re-raised unchanged.
        """
        # Coerce so that a str path (or a str-valued setting) still supports
        # the ``exists()`` check below.
        data_path = Path(path or settings.train_data_path)

        if not data_path.exists():
            raise FileNotFoundError(f"Training data not found: {data_path}")

        try:
            logger.info("Loading training data from %s", data_path)
            # The whole file is read (no sampling or chunking) so summary
            # statistics cover every row.
            frame = pd.read_csv(data_path)

            if "merchant" in frame.columns:
                # Anchored replace: only a leading prefix is removed, so a
                # name that merely contains the text elsewhere is untouched.
                frame["merchant"] = frame["merchant"].str.replace(
                    self._MERCHANT_PREFIX_PATTERN, "", regex=True
                )

            logger.info(
                "Loaded %d rows from training data (merchant names cleaned)",
                len(frame),
            )
        except Exception as e:
            logger.error("Error loading training data: %s", e)
            raise

        # Cache only after loading *and* cleaning succeeded, so a failure
        # never leaves a partially processed frame on the instance.
        self.train_df = frame
        return frame

    def get_transaction_summary(self, transaction_id: Optional[int] = None) -> Dict:
        """Summarise one transaction or the whole dataset.

        Loads the training data from the default location first if nothing
        has been loaded yet.

        Args:
            transaction_id: Index label of the transaction to return. If
                None, an overall summary of all transactions is returned.

        Returns:
            For a single transaction, the first row whose index equals
            ``transaction_id`` as a column -> value dictionary. Otherwise a
            dictionary with ``total_transactions``, ``fraud_count``,
            ``fraud_percentage``, ``total_amount``, ``average_amount`` and
            ``categories`` (category -> row count). For an empty dataset the
            percentage and average are 0.0 rather than NaN.

        Raises:
            ValueError: If ``transaction_id`` matches no index label.
            KeyError: If the ``is_fraud``, ``amt`` or ``category`` column is
                missing when the overall summary is requested.
        """
        if self.train_df is None:
            self.load_train_data()

        df = self.train_df

        if transaction_id is not None:
            matches = df[df.index == transaction_id]
            if matches.empty:
                raise ValueError(f"Transaction {transaction_id} not found")
            return matches.iloc[0].to_dict()

        total = len(df)
        if total:
            fraud_percentage = float(df["is_fraud"].mean() * 100)
            average_amount = float(df["amt"].mean())
        else:
            # The mean of an empty column is NaN; report zeros instead so the
            # summary stays numeric and serialisable.
            fraud_percentage = 0.0
            average_amount = 0.0

        return {
            "total_transactions": total,
            "fraud_count": int(df["is_fraud"].sum()),
            "fraud_percentage": fraud_percentage,
            "total_amount": float(df["amt"].sum()),
            "average_amount": average_amount,
            "categories": df["category"].value_counts().to_dict(),
        }

    def format_transaction_for_llm(self, transaction: Dict) -> str:
        """Render a transaction dictionary as text for LLM analysis.

        Args:
            transaction: Transaction dictionary. Missing keys are rendered
                as ``N/A``.

        Returns:
            A "Transaction Details:" header followed by one "- Label: value"
            entry per field, with a blank line between consecutive entries
            and no leading or trailing whitespace.
        """

        def field(key: str) -> object:
            # Single place for the "missing -> N/A" rule.
            return transaction.get(key, "N/A")

        # str() first: the merchant value may not be a string (e.g. a
        # missing value read from a dataframe row).
        merchant = self._strip_merchant_prefix(str(field("merchant")))

        entries = [
            "Transaction Details:",
            f"- Date/Time: {field('trans_date_trans_time')}",
            f"- Merchant: {merchant}",
            f"- Category: {field('category')}",
            f"- Amount: ${field('amt')}",
            f"- Customer: {field('first')} {field('last')}",
            f"- Gender: {field('gender')}",
            f"- Location: {field('city')}, {field('state')}",
            f"- Job: {field('job')}",
            f"- City Population: {field('city_pop')}",
            # Fixed placeholder text; no distance is computed here.
            "- Distance from Merchant: Calculated from coordinates",
        ]
        # Entries are separated by one blank line each.
        return "\n\n".join(entries)