|
|
""" |
|
|
Create Multi-Bank Clean Benchmark. |
|
|
|
|
|
Creates a high-quality benchmark from real transaction emails |
|
|
across multiple banks: HDFC, ICICI, SBI, Axis, Kotak, PhonePe, GPay, Paytm. |
|
|
|
|
|
Author: Ranjit Behera |
|
|
""" |
|
|
|
|
|
import json |
|
|
import re |
|
|
import random |
|
|
from pathlib import Path |
|
|
from collections import defaultdict |
|
|
|
|
|
# Input corpus: read line by line, each line parsed as one JSON email record.
# The path is relative, so it is resolved against the current working directory.
CORPUS_FILE = Path("data/corpus/emails/financial_emails.jsonl")
# Output file: the sampled benchmark is dumped here as a single JSON array.
BENCHMARK_FILE = Path("data/benchmark/multi_bank_benchmark.json")
|
|
|
|
|
|
|
|
def detect_bank(body: str, sender: str = "") -> str:
    """Identify which bank or payment app an email belongs to.

    The body and sender are joined with a space, lower-cased, and scanned
    for known keywords using plain substring checks. Candidates are tried
    in a fixed priority order and the first one with a hit wins, so an
    email mentioning several banks is attributed to the earliest entry.

    Args:
        body: Email body text.
        sender: Sender address or display name (optional).

    Returns:
        The lowercase bank key (e.g. ``'hdfc'``), or ``''`` when no
        keyword is present.
    """
    haystack = " ".join([body, sender]).lower()

    # Order matters: earlier rows take precedence over later ones.
    keyword_table = (
        ('hdfc', ('hdfc',)),
        ('icici', ('icici',)),
        ('sbi', ('sbi', 'state bank')),
        ('axis', ('axis',)),
        ('kotak', ('kotak',)),
        ('phonepe', ('phonepe',)),
        ('gpay', ('gpay', 'google pay')),
        ('paytm', ('paytm',)),
    )

    for label, needles in keyword_table:
        for needle in needles:
            if needle in haystack:
                return label

    return ''
|
|
|
|
|
|
|
|
# A numeric amount: must start with a digit, may contain thousands commas,
# and only includes a decimal point when digits follow it (so a
# sentence-ending period such as "Rs 500." is not swallowed).
_AMOUNT_VALUE = r'(\d[\d,]*(?:\.\d+)?)'

# Currency markers, tried in priority order. The look-behind stops "Rs"/"INR"
# from matching inside ordinary words (e.g. "offers 2", "transfers 3").
_AMOUNT_PATTERNS = (
    re.compile(r'(?<![A-Za-z])Rs\.?\s*' + _AMOUNT_VALUE, re.IGNORECASE),
    re.compile(r'(?<![A-Za-z])INR\s*' + _AMOUNT_VALUE, re.IGNORECASE),
    re.compile(r'₹\s*' + _AMOUNT_VALUE),
)

# Substrings that mark a transaction as debit / credit. Debit is checked first.
_DEBIT_KEYWORDS = ('debited', 'sent', 'paid', 'payment of')
_CREDIT_KEYWORDS = ('credited', 'received', 'added')

# Patterns shared by the UPI apps (PhonePe, GPay, Paytm).
_UPI_APP_PATTERNS = (
    ('account', re.compile(r'(?:a/c|account)\s*(?:XX)?(\d{4})', re.IGNORECASE)),
    ('date', re.compile(r'(\d{1,2}[-/]\d{1,2}[-/]\d{2,4}|\d{1,2}\s+\w+\s+\d{4})')),
    ('reference', re.compile(r'(?:Ref|Txn\s*ID)[:\s]*(\d{10,})', re.IGNORECASE)),
)

# Per-bank (field, pattern) pairs; group 1 of each pattern is the field value.
_BANK_FIELD_PATTERNS = {
    'hdfc': (
        # The optional whitespace/asterisk mask applies to BOTH "account" and
        # "A/c", so "account **1234" and "account 1234" are recognised.
        ('account', re.compile(r'(?:account|A/c)\s*\**(\d{4})')),
        ('date', re.compile(r'on\s*(\d{1,2}-\d{1,2}-\d{2,4})')),
        ('reference', re.compile(r'reference number is\s*(\d{10,})')),
    ),
    'icici': (
        ('account', re.compile(r'(?:A/c|Acct)\s*XX(\d{4})', re.IGNORECASE)),
        ('date', re.compile(
            r'on\s*(\d{8}|\d{1,2}\s+\w+\s+\d{4}|\d{1,2}-\d{1,2}-\d{2,4})')),
        ('reference', re.compile(r'Ref[:\s]*(\d{10,})', re.IGNORECASE)),
    ),
    'sbi': (
        ('account', re.compile(r'[Aa]/c\s*X+(\d{4})')),
        ('date', re.compile(
            r'on\s*(\d{1,2}-\d{1,2}-\d{2,4}|\d{1,2}/\d{1,2}/\d{4}|\d{1,2}\s+\w+\s+\d{4})')),
        ('reference', re.compile(r'Ref\s*(\d{10,})', re.IGNORECASE)),
    ),
    'axis': (
        ('account', re.compile(r'(?:Acct|A/c)\s*XX(\d{4})', re.IGNORECASE)),
        ('date', re.compile(
            r'on\s*(\d{8}|\d{1,2}-\d{1,2}-\d{4}|\d{1,2}/\d{1,2}/\d{4})')),
        ('reference', re.compile(r'Ref\s*(\d{10,})', re.IGNORECASE)),
    ),
    'kotak': (
        ('account', re.compile(r'A/c\s*(?:XX)?(\d{4})', re.IGNORECASE)),
        ('date', re.compile(
            r'on\s*(\d{8}|\d{1,2}-\d{1,2}-\d{2,4}|\d{1,2}\s+\w+\s+\d{4})')),
        ('reference', re.compile(r'Ref[:\s.]*(\d{10,})', re.IGNORECASE)),
    ),
    'phonepe': _UPI_APP_PATTERNS,
    'gpay': _UPI_APP_PATTERNS,
    'paytm': _UPI_APP_PATTERNS,
}

# Merchant name following a VPA handle, e.g. "VPA shop@bank CORNER STORE on ...".
_VPA_MERCHANT_RE = re.compile(
    r'VPA[:\s]+\S+\s+([A-Z][A-Za-z\s]+?)(?:\s+on|\s+\d|$)')
# Fallback: first alphabetic token after "UPI", e.g. "UPI-Swiggy".
_UPI_MERCHANT_RE = re.compile(r'UPI[:\s-]+([A-Za-z]+)')


def extract_entities(body: str, bank: str) -> dict:
    """Extract transaction entities from an email body using bank-specific rules.

    Args:
        body: Email body text.
        bank: Lowercase bank key as returned by ``detect_bank``. Unknown keys
            are accepted; only the bank-independent fields are filled then.

    Returns:
        A dict with the keys ``amount``, ``type``, ``date``, ``account``,
        ``reference``, ``merchant`` and ``bank``. Every value is a string;
        fields that could not be extracted are ``''``. ``amount`` has its
        thousands separators removed, ``type`` is ``'debit'`` or ``'credit'``,
        ``merchant`` is lower-cased, and ``bank`` echoes the argument.
    """
    entities = {
        'amount': '',
        'type': '',
        'date': '',
        'account': '',
        'reference': '',
        'merchant': '',
        'bank': bank,
    }

    # Amount: the first currency marker (in priority order) that matches wins.
    for pattern in _AMOUNT_PATTERNS:
        match = pattern.search(body)
        if match:
            entities['amount'] = match.group(1).replace(',', '')
            break

    # Transaction type: debit keywords take precedence over credit keywords.
    body_lower = body.lower()
    if any(word in body_lower for word in _DEBIT_KEYWORDS):
        entities['type'] = 'debit'
    elif any(word in body_lower for word in _CREDIT_KEYWORDS):
        entities['type'] = 'credit'

    # Bank-specific account / date / reference formats.
    for field, pattern in _BANK_FIELD_PATTERNS.get(bank, ()):
        match = pattern.search(body)
        if match:
            entities[field] = match.group(1)

    # Merchant: prefer the name following a VPA handle, if it has a sane length.
    vpa_match = _VPA_MERCHANT_RE.search(body)
    if vpa_match:
        merchant = vpa_match.group(1).strip().lower()
        if 2 < len(merchant) < 50:
            entities['merchant'] = merchant

    # Otherwise fall back to the token right after "UPI".
    if not entities['merchant']:
        upi_match = _UPI_MERCHANT_RE.search(body)
        if upi_match:
            entities['merchant'] = upi_match.group(1).lower()

    return entities
|
|
|
|
|
|
|
|
def _looks_like_transaction(body: str) -> bool:
    """Cheap pre-filter: body mentions a transaction verb and a currency marker and is > 50 chars."""
    body_lower = body.lower()
    has_transaction = any(
        word in body_lower
        for word in ('debited', 'credited', 'received', 'sent', 'paid', 'payment')
    )
    has_amount = any(marker in body_lower for marker in ('rs.', 'rs ', 'inr', '₹'))
    return has_transaction and has_amount and len(body) > 50


def _dedupe_by_reference(txns: list) -> list:
    """Keep the first sample per reference number; samples without a reference are all kept."""
    seen_refs = set()
    unique_txns = []
    for txn in txns:
        ref = txn['expected_entities'].get('reference', '')
        if ref:
            if ref in seen_refs:
                continue
            seen_refs.add(ref)
        unique_txns.append(txn)
    return unique_txns


def create_multi_bank_benchmark():
    """Build the multi-bank benchmark from the email corpus and write it to disk.

    Reads ``CORPUS_FILE`` (one JSON record per line), keeps emails that look
    like transactions, detects the bank, extracts entities, and groups the
    samples per bank. Within each bank, samples sharing a reference number
    are de-duplicated and at most 20 are drawn at random (fixed seed 42, so
    the result is reproducible for a given corpus). Samples get sequential
    ``id`` values, the list is shuffled, and the result is written to
    ``BENCHMARK_FILE`` as a JSON array. Progress and a short preview (one
    sample each for up to four banks) are printed to stdout.

    Returns:
        The list of benchmark samples, in the order written to the file.

    Raises:
        OSError: If the corpus cannot be read or the benchmark cannot be
            written (e.g. ``FileNotFoundError`` for a missing corpus).
    """
    print("=" * 60)
    print("📊 CREATING MULTI-BANK BENCHMARK")
    print("=" * 60)

    bank_transactions = defaultdict(list)
    malformed = 0

    # Explicit UTF-8: the corpus contains characters such as '₹', and the
    # platform default encoding is not guaranteed to handle them.
    with open(CORPUS_FILE, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue

            # Only a JSON syntax error is an expected, skippable condition;
            # anything else should surface instead of being swallowed.
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                malformed += 1
                continue

            if not isinstance(data, dict):
                malformed += 1
                continue

            body = data.get('body', '')
            sender = data.get('sender', '')
            if not isinstance(body, str) or not isinstance(sender, str):
                malformed += 1
                continue

            if not _looks_like_transaction(body):
                continue

            bank = detect_bank(body, sender)
            if not bank:
                continue

            entities = extract_entities(body, bank)

            # A usable sample needs at least an amount and a direction.
            if entities['amount'] and entities['type']:
                bank_transactions[bank].append({
                    'text': body,
                    'expected_entities': entities,
                    'subject': data.get('subject', ''),
                    'verified': True
                })

    if malformed:
        print(f"\n⚠️ Skipped {malformed} malformed record(s)")

    print("\n📊 Transactions found by bank:")
    for bank, txns in sorted(bank_transactions.items()):
        print(f" {bank.upper():10} {len(txns):4} transactions")

    # A private generator seeded with 42 yields the same sequence as
    # random.seed(42) without disturbing the process-wide RNG state.
    rng = random.Random(42)
    benchmark = []

    for bank, txns in bank_transactions.items():
        unique_txns = _dedupe_by_reference(txns)
        sampled = rng.sample(unique_txns, min(20, len(unique_txns)))
        benchmark.extend(sampled)

    # IDs are assigned before shuffling, so they follow per-bank sampling
    # order rather than the final file order.
    for i, sample in enumerate(benchmark):
        sample['id'] = i + 1

    rng.shuffle(benchmark)

    BENCHMARK_FILE.parent.mkdir(parents=True, exist_ok=True)
    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 regardless of the platform default.
    with open(BENCHMARK_FILE, 'w', encoding='utf-8') as f:
        json.dump(benchmark, f, indent=2, ensure_ascii=False)

    print(f"\n✅ Saved {len(benchmark)} samples to {BENCHMARK_FILE}")

    bank_counts = defaultdict(int)
    for s in benchmark:
        bank_counts[s['expected_entities']['bank']] += 1

    print("\n📊 Benchmark composition:")
    for bank, count in sorted(bank_counts.items()):
        print(f" {bank.upper():10} {count:3} samples")

    # Preview: first sample encountered for each bank, capped at four banks.
    print("\n📧 Sample from each bank:")
    shown_banks = set()
    for s in benchmark:
        bank = s['expected_entities']['bank']
        if bank not in shown_banks:
            shown_banks.add(bank)
            print(f"\n [{bank.upper()}]")
            print(f" Amount: {s['expected_entities']['amount']}")
            print(f" Type: {s['expected_entities']['type']}")
            print(f" Text: {s['text'][:100]}...")
            if len(shown_banks) >= 4:
                break

    return benchmark
|
|
|
|
|
|
|
|
# Script entry point: build the benchmark only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    create_multi_bank_benchmark()
|
|
|