File size: 6,997 Bytes
dcc24f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
"""
Create Strict Transaction Benchmark.

Only includes real transaction alerts with clear patterns.
Excludes marketing, bill notifications, and investment updates.

Author: Ranjit Behera
"""

import json
import re
import random
from pathlib import Path
from collections import defaultdict

# Input corpus: read line by line, each line parsed as one JSON object.
CORPUS_FILE = Path("data/corpus/emails/financial_emails.jsonl")
# Output benchmark: written as a single indented JSON document.
BENCHMARK_FILE = Path("data/benchmark/strict_benchmark.json")

# Regexes that indicate a real transaction alert. They are searched
# case-insensitively against the raw email body; a single hit is enough.
TRANSACTION_PATTERNS = [
    r'has been debited',
    r'has been credited', 
    r'is debited from',
    r'is credited to',
    r'Rs\.\s*[\d,]+.*debited',
    r'Rs\.\s*[\d,]+.*credited',
    r'INR\s*[\d,]+.*debited',
    r'INR\s*[\d,]+.*credited',
    r'UPI transaction reference',
    r'UPI Ref',
    r'IMPS Ref',
    r'NEFT Ref',
]

# Regexes that disqualify an email (marketing, bills, investments). They are
# searched against the lowercased body with no regex flags, so every entry
# must itself be written in lowercase to be able to match.
EXCLUDE_PATTERNS = [
    r'welcome to your',
    r'greetings of the day',
    r'unsubscribe from',
    r'skills that will get you',
    r'daily digest',
    r'top picks',
    r'mutual fund nav',
    r'market update',
    r'job opportunity',
    r'margin statement',
    r'password reset',
]


def is_transaction_email(body: str) -> bool:
    """Return True when *body* reads like a genuine transaction alert.

    The text has to hit at least one entry of TRANSACTION_PATTERNS
    (matched case-insensitively) and none of EXCLUDE_PATTERNS (matched
    against the lowercased text).
    """
    lowered = body.lower()

    # Guard 1: without any transaction wording the email is rejected outright.
    for pattern in TRANSACTION_PATTERNS:
        if re.search(pattern, body, re.IGNORECASE):
            break
    else:
        return False

    # Guard 2: one marketing / bill / investment marker disqualifies it.
    for pattern in EXCLUDE_PATTERNS:
        if re.search(pattern, lowered):
            return False

    return True


def detect_bank(body: str, sender: str = "") -> str:
    """Identify the issuing bank from the email body and sender.

    Both strings are joined and lowercased, then scanned for known
    substrings. Banks are tried in a fixed priority order and the first
    one with a hit wins. Returns the short bank code ('hdfc', 'icici',
    'sbi', 'axis', 'kotak') or '' when nothing matches.
    """
    haystack = " ".join((body, sender)).lower()

    # Ordered from highest to lowest priority; the order decides ties when
    # an email mentions more than one bank.
    markers_by_bank = (
        ('hdfc', ('hdfc bank', 'hdfcbank')),
        ('icici', ('icici bank',)),
        ('sbi', ('state bank', 'sbi:')),
        ('axis', ('axis bank',)),
        ('kotak', ('kotak',)),
    )

    for code, markers in markers_by_bank:
        if any(marker in haystack for marker in markers):
            return code

    return ''


def extract_entities(body: str, bank: str) -> dict:
    """Extract entities from a transaction email body.

    Args:
        body: Raw email text to scan.
        bank: Bank code, copied unchanged into the result.

    Returns:
        A dict with the string keys 'amount', 'type', 'date', 'account',
        'reference', 'merchant' and 'bank'. Every value is a string; a
        field that cannot be found is left as ''.
    """
    entities = {
        'amount': '',
        'type': '',
        'date': '',
        'account': '',
        'reference': '',
        'merchant': '',
        'bank': bank
    }

    # Amount: an "Rs" amount anywhere in the text wins over an "INR" one.
    # The word boundary stops "rs" from matching inside words such as
    # "Sirs," or "offers 2023"; the number must start with a digit (so a
    # bare comma is never captured) and a decimal point is only consumed
    # when digits follow it (so "Rs.500." yields "500", not "500.").
    amount_patterns = (
        r'\bRs\.?\s*(\d[\d,]*(?:\.\d+)?)',
        r'\bINR\s*(\d[\d,]*(?:\.\d+)?)',
    )
    for pattern in amount_patterns:
        match = re.search(pattern, body, re.IGNORECASE)
        if match:
            entities['amount'] = match.group(1).replace(',', '')
            break

    # Type: "debited" takes precedence when both words appear.
    body_lower = body.lower()
    if 'debited' in body_lower:
        entities['type'] = 'debit'
    elif 'credited' in body_lower:
        entities['type'] = 'credit'

    # Account: exactly 4 digits after a mask (XX, X, **) or the word
    # "account". The lookahead rejects the first 4 digits of a longer
    # unmasked number, which would not be the account suffix.
    match = re.search(
        r'(?:XX|X|\*\*|account\s*)(\d{4})(?!\d)', body, re.IGNORECASE
    )
    if match:
        entities['account'] = match.group(1)

    # Date: numeric d-m-y or d/m/y following "on" (case-sensitive).
    match = re.search(r'on\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})', body)
    if match:
        entities['date'] = match.group(1)

    # Reference: a run of 10 or more digits after a reference label.
    # The explicit "reference number is" phrasing is tried first.
    ref_patterns = [
        r'reference number is\s*(\d{10,})',
        r'(?:Ref(?:erence)?[:\s.]*|UPI\s*Ref[:\s]*|IMPS\s*Ref[:\s]*)(\d{10,})',
    ]
    for pattern in ref_patterns:
        match = re.search(pattern, body, re.IGNORECASE)
        if match:
            entities['reference'] = match.group(1)
            break

    # Merchant: the capitalised name that follows the VPA handle, up to
    # " on" or the next number; stored lowercased.
    match = re.search(r'VPA[:\s]+\S+\s+([A-Z][A-Za-z\s]+?)(?:\s+on|\s+\d)', body)
    if match:
        entities['merchant'] = match.group(1).strip().lower()

    return entities


def create_strict_benchmark():
    """Build the strictly filtered benchmark and write it to BENCHMARK_FILE.

    Reads CORPUS_FILE line by line (one JSON object per line) and keeps
    only emails that pass is_transaction_email, are at least 50
    characters long, have a detectable bank, and yield an amount, a type
    and a reference. Per bank, samples are de-duplicated by reference and
    at most 15 are drawn with a fixed random seed. The combined list is
    numbered, shuffled, saved as JSON and summarised on stdout.

    Returns:
        The list of benchmark sample dicts that was written to disk.

    Raises:
        OSError: if the corpus cannot be opened or the benchmark cannot
            be written (for example FileNotFoundError for a missing
            corpus file).
    """
    print("=" * 60)
    print("📊 CREATING STRICT TRANSACTION BENCHMARK")
    print("=" * 60)

    bank_transactions = defaultdict(list)
    skipped = 0

    # Explicit UTF-8: email bodies contain non-ASCII text and the platform
    # default encoding is not reliable across operating systems.
    with open(CORPUS_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue

            # Only a malformed record is skipped; genuine programming
            # errors and KeyboardInterrupt are no longer swallowed.
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                skipped += 1
                continue

            if not isinstance(data, dict):
                skipped += 1
                continue

            body = data.get('body', '')
            sender = data.get('sender', '')
            # Records with a non-string body or sender (e.g. JSON null)
            # cannot be scanned and are skipped.
            if not isinstance(body, str) or not isinstance(sender, str):
                skipped += 1
                continue

            # Strict filtering
            if len(body) < 50:
                continue

            if not is_transaction_email(body):
                continue

            # Detect bank
            bank = detect_bank(body, sender)
            if not bank:
                continue

            # Extract entities
            entities = extract_entities(body, bank)

            # Must have amount, type, and reference
            if entities['amount'] and entities['type'] and entities['reference']:
                bank_transactions[bank].append({
                    'text': body,
                    'expected_entities': entities,
                    'subject': data.get('subject', ''),
                    'verified': True
                })

    if skipped:
        print(f"\n⚠️  Skipped {skipped} malformed corpus lines")

    print("\n📊 Strict transactions per bank:")
    for bank, txns in sorted(bank_transactions.items()):
        print(f"   {bank.upper():10} {len(txns):4} transactions")

    # Sample and deduplicate; the fixed seed keeps the selection
    # reproducible for an unchanged corpus.
    random.seed(42)
    benchmark = []

    for bank, txns in bank_transactions.items():
        # Deduplicate by reference, keeping the first occurrence
        seen_refs = set()
        unique = []
        for t in txns:
            ref = t['expected_entities']['reference']
            if ref not in seen_refs:
                seen_refs.add(ref)
                unique.append(t)

        sampled = random.sample(unique, min(15, len(unique)))
        benchmark.extend(sampled)

    # Ids are assigned before shuffling, so they follow the per-bank
    # sampling order rather than the final position in the file.
    for i, s in enumerate(benchmark):
        s['id'] = i + 1

    random.shuffle(benchmark)

    # Save. ensure_ascii=False emits raw non-ASCII characters, so the
    # file must be opened as UTF-8 to avoid encode errors.
    BENCHMARK_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(BENCHMARK_FILE, 'w', encoding='utf-8') as f:
        json.dump(benchmark, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Saved {len(benchmark)} samples to {BENCHMARK_FILE}")

    # Stats
    bank_counts = defaultdict(int)
    for s in benchmark:
        bank_counts[s['expected_entities']['bank']] += 1

    print("\n📊 Benchmark composition:")
    for bank, count in sorted(bank_counts.items()):
        print(f"   {bank.upper():10} {count:3} samples")

    # Show samples
    print("\n📧 Sample transaction:")
    if benchmark:
        s = benchmark[0]
        print(f"   Bank: {s['expected_entities']['bank'].upper()}")
        print(f"   Amount: {s['expected_entities']['amount']}")
        print(f"   Type: {s['expected_entities']['type']}")
        print(f"   Reference: {s['expected_entities']['reference']}")
        print(f"   Text: {s['text'][:150]}...")

    return benchmark


# Build the benchmark only when this file is run as a script, not on import.
if __name__ == "__main__":
    create_strict_benchmark()