|
|
""" |
|
|
Create Held-Out Benchmark from Real Emails. |
|
|
|
|
|
Extracts 100 real financial emails from the MBOX file, |
|
|
ensures they were NOT used in training, and creates a |
|
|
benchmark for measuring real-world performance. |
|
|
|
|
|
Author: Ranjit Behera |
|
|
""" |
|
|
|
|
|
import json |
|
|
import random |
|
|
import re |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
# Source corpus: all extracted financial emails (JSONL, one email object per line).
CORPUS_FILE = Path("data/corpus/emails/financial_emails.jsonl")

# Training split (JSONL) whose samples must be excluded from the benchmark.
TRAIN_FILE = Path("data/training/train.jsonl")

# Output path for the held-out benchmark (single JSON array).
BENCHMARK_FILE = Path("data/benchmark/real_emails_benchmark.json")
|
|
|
|
|
def load_corpus():
    """Load the extracted financial emails.

    Reads CORPUS_FILE (JSONL: one JSON email object per line) and returns
    a list of dicts. Malformed lines are skipped rather than aborting the
    whole load.

    Returns:
        list[dict]: parsed email records.
    """
    emails = []
    # utf-8 explicitly: email bodies contain non-ASCII characters such as ₹.
    with open(CORPUS_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                emails.append(json.loads(line))
            except json.JSONDecodeError:
                # Only JSON parse failures are expected here; a bare except
                # would also hide real errors (e.g. KeyboardInterrupt).
                continue
    return emails
|
|
|
|
|
def load_training_texts():
    """Load training data to exclude from benchmark.

    Reads TRAIN_FILE (JSONL) and returns a set of fingerprints — the first
    100 characters of each sample's 'text' field — used to detect overlap
    between benchmark candidates and the training set.

    Returns:
        set[str]: 100-char text prefixes of every training sample.
    """
    texts = set()
    with open(TRAIN_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                data = json.loads(line)
                # The 100-char prefix is a cheap, order-stable fingerprint;
                # full-text comparison is unnecessary for dedup purposes.
                texts.add(data.get('text', '')[:100])
            except json.JSONDecodeError:
                # Skip malformed lines only; don't mask unrelated errors.
                continue
    return texts
|
|
|
|
|
def extract_entities_from_email(email_body: str) -> dict:
    """Auto-extract entities from email text for labeling.

    Scans the body with keyword checks and regexes to fill a fixed entity
    schema. Fields that cannot be detected stay as empty strings.

    Args:
        email_body: raw email body text.

    Returns:
        dict with keys: amount, type, date, account, reference,
        merchant, bank (all str, possibly empty).
    """
    entities = {key: '' for key in
                ('amount', 'type', 'date', 'account', 'reference', 'merchant', 'bank')}

    text = email_body
    text_lower = text.lower()

    # Bank / provider detection: first keyword group that hits wins,
    # checked in a fixed priority order.
    bank_rules = (
        ('hdfc', ('hdfc',)),
        ('icici', ('icici',)),
        ('sbi', ('sbi',)),
        ('axis', ('axis',)),
        ('kotak', ('kotak',)),
        ('phonepe', ('phonepe',)),
        ('gpay', ('gpay', 'google pay')),
        ('paytm', ('paytm',)),
    )
    for label, keywords in bank_rules:
        if any(kw in text_lower for kw in keywords):
            entities['bank'] = label
            break

    # Transaction direction: debit keywords take precedence over credit.
    if any(kw in text_lower for kw in ('debited', 'sent', 'paid')):
        entities['type'] = 'debit'
    elif any(kw in text_lower for kw in ('credited', 'received')):
        entities['type'] = 'credit'

    def first_match(patterns):
        """Return group(1) of the first pattern that matches, else None."""
        for pat in patterns:
            hit = re.search(pat, text, re.IGNORECASE)
            if hit:
                return hit.group(1)
        return None

    # Amount: currency-prefixed forms first, then the "... has been
    # debited/credited" phrasing. Thousands separators are stripped.
    amount = first_match((
        r'Rs\.?\s*([\d,]+\.?\d*)',
        r'INR\s*([\d,]+\.?\d*)',
        r'₹\s*([\d,]+\.?\d*)',
        r'([\d,]+\.?\d*)\s*has been (?:debited|credited)',
    ))
    if amount is not None:
        entities['amount'] = amount.replace(',', '')

    # Account: last-4-digit forms, from most to least specific.
    account = first_match((
        r'account\s*(?:no\.?|number|#|XX|X)?\s*(\d{4})',
        r'A/c\s*(?:XX|X)?(\d{4})',
        r'a/c\s*(\d{4})',
        r'XX(\d{4})',
        r'X(\d{4})',
    ))
    if account is not None:
        entities['account'] = account

    # Date: "on DD/MM/YYYY" preferred, then bare numeric, then month names.
    date = first_match((
        r'on\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})',
        r'(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})',
        r'(\d{1,2}\s+(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\w*\s+\d{4})',
    ))
    if date is not None:
        entities['date'] = date

    # Reference number: UPI ref, long-form sentence, or "Txn:" prefix.
    reference = first_match((
        r'(?:UPI\s*)?(?:Ref(?:erence)?(?:\s*(?:No|Number|#|:))?\.?\s*:?\s*)(\d{10,})',
        r'transaction reference number is\s*(\d+)',
        r'Txn[:\s]*(\d+)',
    ))
    if reference is not None:
        entities['reference'] = reference

    # Merchant: prefer the capitalised payee name after a VPA handle
    # (case-sensitive on purpose); fall back to a known-merchant list.
    vpa_hit = re.search(r'VPA\s+(\w+)@\w+\s+([A-Z][A-Za-z\s]+)', text)
    if vpa_hit:
        entities['merchant'] = vpa_hit.group(2).strip().lower()
    else:
        for name in ('swiggy', 'zomato', 'amazon', 'flipkart', 'uber',
                     'ola', 'rapido', 'bigbasket', 'blinkit', 'zepto'):
            if name in text_lower:
                entities['merchant'] = name
                break

    return entities
|
|
|
|
|
|
|
|
def create_benchmark(n_samples=100):
    """Create held-out benchmark from real emails.

    Samples up to *n_samples* transaction emails from the corpus that do
    NOT overlap with the training set, auto-labels each one with
    extract_entities_from_email, and writes the result to BENCHMARK_FILE.

    Args:
        n_samples: maximum number of emails to include (default 100).

    Returns:
        list[dict]: the benchmark samples that were written to disk.
    """
    print("=" * 60)
    print("📊 CREATING HELD-OUT BENCHMARK")
    print("=" * 60)

    print(f"\n1. Loading corpus from {CORPUS_FILE}...")
    corpus = load_corpus()
    print(f" Found {len(corpus)} financial emails")

    print(f"\n2. Loading training data to exclude...")
    train_texts = load_training_texts()
    print(f" Found {len(train_texts)} training samples to exclude")

    print(f"\n3. Filtering for transaction emails...")
    candidates = []
    for email in corpus:
        body = email.get('body', '')

        # Too short to be a real transaction notification.
        if len(body) < 50:
            continue

        # BUG FIX: train_texts was loaded but never consulted, so the
        # "held-out" benchmark could silently contain training samples.
        # load_training_texts() fingerprints each sample by its first
        # 100 characters of text; apply the same fingerprint here.
        if body[:100] in train_texts:
            continue

        body_lower = body.lower()
        has_transaction = any(x in body_lower for x in ['debited', 'credited', 'received', 'sent'])
        has_amount = any(x in body_lower for x in ['rs.', 'rs ', 'inr', '₹'])

        if has_transaction and has_amount:
            entities = extract_entities_from_email(body)

            candidates.append({
                'text': body,
                'subject': email.get('subject', ''),
                'sender': email.get('sender', ''),
                'date': email.get('date', ''),
                'expected_entities': entities
            })

    print(f" Found {len(candidates)} transaction emails")

    print(f"\n4. Sampling {n_samples} emails for benchmark...")
    random.seed(42)  # fixed seed so the benchmark is reproducible
    benchmark = random.sample(candidates, min(n_samples, len(candidates)))

    # Tag samples: entities are auto-labeled and still need human review.
    for i, sample in enumerate(benchmark):
        sample['id'] = i + 1
        sample['auto_labeled'] = True
        sample['verified'] = False

    BENCHMARK_FILE.parent.mkdir(parents=True, exist_ok=True)
    # encoding='utf-8' is required: ensure_ascii=False emits raw non-ASCII
    # (₹, emoji), which would crash on a non-UTF-8 default locale.
    with open(BENCHMARK_FILE, 'w', encoding='utf-8') as f:
        json.dump(benchmark, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Benchmark saved to {BENCHMARK_FILE}")
    print(f" Total samples: {len(benchmark)}")

    # Distribution summary by detected bank.
    banks = {}
    for s in benchmark:
        bank = s['expected_entities'].get('bank', 'unknown')
        banks[bank] = banks.get(bank, 0) + 1

    print("\n📊 Benchmark by Bank:")
    for bank, count in sorted(banks.items()):
        print(f" {bank.upper():10} {count}")

    print("\n" + "=" * 60)
    print("📧 SAMPLE EMAIL FROM BENCHMARK:")
    print("=" * 60)
    if benchmark:
        sample = benchmark[0]
        print(f"Subject: {sample.get('subject', 'N/A')}")
        print(f"Text: {sample['text'][:300]}...")
        print(f"\nAuto-extracted entities:")
        for k, v in sample['expected_entities'].items():
            if v:
                print(f" {k}: {v}")

    return benchmark
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: build a 100-sample held-out benchmark.
    create_benchmark(n_samples=100)
|
|
|