File size: 7,535 Bytes
dcc24f8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
"""
Create Held-Out Benchmark from Real Emails.

Extracts 100 real financial emails from the MBOX file,
ensures they were NOT used in training, and creates a
benchmark for measuring real-world performance.

Author: Ranjit Behera
"""

import json
import random
import re
from pathlib import Path

# Paths (relative, so they resolve against the current working directory)
# JSON Lines corpus of financial emails, read by load_corpus().
CORPUS_FILE = Path("data/corpus/emails/financial_emails.jsonl")
# JSON Lines training set, read by load_training_texts().
TRAIN_FILE = Path("data/training/train.jsonl")
# Output location of the benchmark JSON written by create_benchmark().
BENCHMARK_FILE = Path("data/benchmark/real_emails_benchmark.json")

def load_corpus(path=None):
    """Load the extracted financial emails from a JSON Lines file.

    Args:
        path: Optional path of the corpus file. When omitted, the
            module-level ``CORPUS_FILE`` is used.

    Returns:
        A list with one dict per well-formed JSON object line, in file
        order. Blank lines, lines that are not valid JSON, and JSON values
        that are not objects are skipped.

    Raises:
        OSError: If the corpus file cannot be opened.
    """
    source = CORPUS_FILE if path is None else path
    emails = []
    # Explicit UTF-8: the emails contain non-ASCII characters such as the
    # rupee sign, which the platform default encoding may not decode.
    with open(source, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Best-effort load: one malformed line must not abort the
                # run, but only parse errors are ignored (not Ctrl-C etc.).
                continue
            # Downstream code calls .get() on every record, so anything
            # that is not a JSON object is unusable.
            if isinstance(record, dict):
                emails.append(record)
    return emails

def load_training_texts(path=None):
    """Load fingerprints of the training texts to exclude from the benchmark.

    A fingerprint is the first 100 characters of a record's ``text`` field
    (the empty string when the field is missing).

    Args:
        path: Optional path of the training JSON Lines file. When omitted,
            the module-level ``TRAIN_FILE`` is used.

    Returns:
        A set of fingerprint strings. Lines that are not valid JSON, records
        that are not JSON objects, and records whose ``text`` is not a string
        are skipped.

    Raises:
        OSError: If the training file cannot be opened.
    """
    source = TRAIN_FILE if path is None else path
    texts = set()
    with open(source, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Skip malformed (or blank) lines only; other errors and
                # interrupts propagate.
                continue
            if not isinstance(record, dict):
                continue
            text = record.get('text', '')
            if isinstance(text, str):
                # Get first 100 chars as fingerprint
                texts.add(text[:100])
    return texts

# Ordered (label, keywords) pairs; the first entry with a keyword present in
# the lower-cased text wins, so order is significant.
_BANK_KEYWORDS = (
    ('hdfc', ('hdfc',)),
    ('icici', ('icici',)),
    ('sbi', ('sbi',)),
    ('axis', ('axis',)),
    ('kotak', ('kotak',)),
    ('phonepe', ('phonepe',)),
    ('gpay', ('gpay', 'google pay')),
    ('paytm', ('paytm',)),
)

# Whole-word transaction verbs, matched against lower-cased text. Word
# boundaries keep "presented" or "unpaid" from counting as "sent" / "paid".
_DEBIT_WORDS = re.compile(r'\b(?:debited|sent|paid)\b')
_CREDIT_WORDS = re.compile(r'\b(?:credited|received)\b')

# The captured number must start with a digit and may only contain a '.'
# when digits follow it. This stops a lone comma after a word ending in
# "rs" (e.g. "Customers,") from being taken as the amount, and keeps a
# sentence-ending period out of the value.
_AMOUNT_PATTERNS = (
    r'\bRs\.?\s*(\d[\d,]*(?:\.\d+)?)',
    r'\bINR\s*(\d[\d,]*(?:\.\d+)?)',
    r'₹\s*(\d[\d,]*(?:\.\d+)?)',
    r'(\d[\d,]*(?:\.\d+)?)\s*has been (?:debited|credited)',
)

# Last four digits of the account, most specific pattern first.
_ACCOUNT_PATTERNS = (
    r'account\s*(?:no\.?|number|#|XX|X)?\s*(\d{4})',
    r'A/c\s*(?:XX|X)?(\d{4})',
    r'a/c\s*(\d{4})',
    r'XX(\d{4})',
    r'X(\d{4})',
)

# Numeric dates (preferring one introduced by "on"), then "5 Jan 2024" style.
_DATE_PATTERNS = (
    r'on\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})',
    r'(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})',
    r'(\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+\d{4})',
)

# Transaction reference numbers.
_REFERENCE_PATTERNS = (
    r'(?:UPI\s*)?(?:Ref(?:erence)?(?:\s*(?:No|Number|#|:))?\.?\s*:?\s*)(\d{10,})',
    r'transaction reference number is\s*(\d+)',
    r'Txn[:\s]*(\d+)',
)

# Fallback merchant names, checked in this order when no VPA is present.
_KNOWN_MERCHANTS = (
    'swiggy', 'zomato', 'amazon', 'flipkart', 'uber', 'ola', 'rapido',
    'bigbasket', 'blinkit', 'zepto',
)


def _first_group(patterns, text):
    """Return group 1 of the first pattern (case-insensitive) found in text, else ''."""
    for pattern in patterns:
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            return match.group(1)
    return ''


def extract_entities_from_email(email_body: str) -> dict:
    """Auto-extract entities from email text for labeling.

    The extraction is heuristic (keyword and regex based) and is meant to
    produce draft labels, not verified ground truth.

    Args:
        email_body: Plain text of the email. ``None`` or an empty string
            yields a result with every field empty.

    Returns:
        A dict with the string keys ``amount``, ``type``, ``date``,
        ``account``, ``reference``, ``merchant`` and ``bank``. A field that
        could not be detected is the empty string. ``amount`` has thousands
        separators removed; ``type`` is ``'debit'`` or ``'credit'``;
        ``merchant`` and ``bank`` are lower-case.
    """
    text = email_body or ''
    text_lower = text.lower()

    entities = {
        'amount': '',
        'type': '',
        'date': '',
        'account': '',
        'reference': '',
        'merchant': '',
        'bank': ''
    }

    # Detect bank
    for label, keywords in _BANK_KEYWORDS:
        if any(keyword in text_lower for keyword in keywords):
            entities['bank'] = label
            break

    # Detect type; debit keywords take precedence when both kinds appear.
    if _DEBIT_WORDS.search(text_lower):
        entities['type'] = 'debit'
    elif _CREDIT_WORDS.search(text_lower):
        entities['type'] = 'credit'

    entities['amount'] = _first_group(_AMOUNT_PATTERNS, text).replace(',', '')
    entities['account'] = _first_group(_ACCOUNT_PATTERNS, text)
    entities['date'] = _first_group(_DATE_PATTERNS, text)
    entities['reference'] = _first_group(_REFERENCE_PATTERNS, text)

    # Extract merchant from VPA: the capitalised name that follows
    # "VPA <handle>@<provider>". This search is deliberately case-sensitive.
    vpa_match = re.search(r'VPA\s+(\w+)@\w+\s+([A-Z][A-Za-z\s]+)', text)
    if vpa_match:
        entities['merchant'] = vpa_match.group(2).strip().lower()
    else:
        # Try common merchants
        for name in _KNOWN_MERCHANTS:
            if name in text_lower:
                entities['merchant'] = name
                break

    return entities


def create_benchmark(n_samples=100):
    """Create held-out benchmark from real emails.

    Loads the corpus and the training fingerprints, keeps emails that look
    like transactions (at least 50 characters, a transaction keyword and a
    currency marker), drops any whose first 100 characters match a training
    fingerprint, auto-labels the rest, samples up to ``n_samples`` of them
    with a fixed seed, and writes the result to ``BENCHMARK_FILE`` as JSON.
    Progress and summary statistics are printed to stdout.

    Args:
        n_samples: Maximum number of emails in the benchmark. Fewer are
            returned when there are not enough candidates.

    Returns:
        The list of benchmark samples (dicts) that was written to disk.

    Raises:
        OSError: If an input file cannot be read or the output cannot be
            written.
    """
    print("=" * 60)
    print("📊 CREATING HELD-OUT BENCHMARK")
    print("=" * 60)

    # Load data
    print(f"\n1. Loading corpus from {CORPUS_FILE}...")
    corpus = load_corpus()
    print(f"   Found {len(corpus)} financial emails")

    print("\n2. Loading training data to exclude...")
    train_texts = load_training_texts()
    print(f"   Found {len(train_texts)} training samples to exclude")

    # Filter for transaction emails
    print("\n3. Filtering for transaction emails...")
    candidates = []
    excluded = 0
    for email in corpus:
        if not isinstance(email, dict):
            continue
        body = email.get('body') or ''
        if not isinstance(body, str):
            continue

        # Skip if too short
        if len(body) < 50:
            continue

        # Must have transaction keywords
        body_lower = body.lower()
        has_transaction = any(x in body_lower for x in ['debited', 'credited', 'received', 'sent'])
        has_amount = any(x in body_lower for x in ['rs.', 'rs ', 'inr', '₹'])
        if not (has_transaction and has_amount):
            continue

        # Held-out guarantee: drop emails already seen in training. The
        # fingerprint (first 100 chars) mirrors load_training_texts().
        if body[:100] in train_texts:
            excluded += 1
            continue

        candidates.append({
            'text': body,
            'subject': email.get('subject', ''),
            'sender': email.get('sender', ''),
            'date': email.get('date', ''),
            # Auto-extracted draft labels
            'expected_entities': extract_entities_from_email(body)
        })

    print(f"   Excluded {excluded} emails already present in training data")
    print(f"   Found {len(candidates)} transaction emails")

    # Sample randomly
    print(f"\n4. Sampling {n_samples} emails for benchmark...")
    random.seed(42)  # Reproducible
    sample_size = max(0, min(n_samples, len(candidates)))
    benchmark = random.sample(candidates, sample_size)

    # Add IDs; labels are machine-generated and not yet human-verified.
    for i, sample in enumerate(benchmark):
        sample['id'] = i + 1
        sample['auto_labeled'] = True
        sample['verified'] = False

    # Save benchmark. UTF-8 is required because ensure_ascii=False writes
    # characters such as the rupee sign verbatim.
    BENCHMARK_FILE.parent.mkdir(parents=True, exist_ok=True)
    with open(BENCHMARK_FILE, 'w', encoding='utf-8') as f:
        json.dump(benchmark, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Benchmark saved to {BENCHMARK_FILE}")
    print(f"   Total samples: {len(benchmark)}")

    # Stats
    banks = {}
    for s in benchmark:
        # 'bank' is always present but empty when undetected, so fall back
        # on emptiness rather than on a missing key.
        bank = s['expected_entities'].get('bank') or 'unknown'
        banks[bank] = banks.get(bank, 0) + 1

    print("\n📊 Benchmark by Bank:")
    for bank, count in sorted(banks.items()):
        print(f"   {bank.upper():10} {count}")

    # Show sample
    print("\n" + "=" * 60)
    print("📧 SAMPLE EMAIL FROM BENCHMARK:")
    print("=" * 60)
    if benchmark:
        sample = benchmark[0]
        print(f"Subject: {sample.get('subject', 'N/A')}")
        print(f"Text: {sample['text'][:300]}...")
        print("\nAuto-extracted entities:")
        for k, v in sample['expected_entities'].items():
            if v:
                print(f"   {k}: {v}")

    return benchmark


if __name__ == "__main__":
    # Script entry point: build the benchmark with 100 samples.
    create_benchmark(n_samples=100)