finance-entity-extractor / scripts / create_strict_benchmark.py

Ranjit Behera

FinEE v1.0 - Finance Entity Extractor

dcc24f8 20 days ago

7 kB

	"""
	Create Strict Transaction Benchmark.

	Only includes real transaction alerts with clear patterns.
	Excludes marketing, bill notifications, and investment updates.

	Author: Ranjit Behera
	"""

	import json
	import re
	import random
	from pathlib import Path
	from collections import defaultdict

	# Input corpus: opened for reading and consumed line by line, each line
	# being parsed as one JSON object.
	CORPUS_FILE = Path("data/corpus/emails/financial_emails.jsonl")
	# Output location: the sampled benchmark is dumped here as a single JSON
	# document; the parent directory is created on demand before writing.
	BENCHMARK_FILE = Path("data/benchmark/strict_benchmark.json")

	# Transaction patterns that indicate real transactions.
	# They are searched case-insensitively against the raw email body; a single
	# hit is enough for the body to count as a transaction candidate.
	#
	# The amount-based patterns accept "Rs"/"Rs."/"INR", any amount of
	# whitespace before the number, an optional decimal part, and one or more
	# non-word separators before the verb, so "Rs.500 debited",
	# "Rs 500 debited" and "INR 1,250.75 credited" all qualify. The verb must
	# follow the amount directly: "Rs. 500 will be debited" (an upcoming-bill
	# notice) deliberately does not match.
	TRANSACTION_PATTERNS = [
	    r'has been debited',
	    r'has been credited',
	    r'is debited from',
	    r'is credited to',
	    r'\bRs\.?\s*[\d,]+(?:\.\d+)?\W+debited',
	    r'\bRs\.?\s*[\d,]+(?:\.\d+)?\W+credited',
	    r'\bINR\s*[\d,]+(?:\.\d+)?\W+debited',
	    r'\bINR\s*[\d,]+(?:\.\d+)?\W+credited',
	    r'UPI transaction reference',
	    r'UPI Ref',
	    r'IMPS Ref',
	    r'NEFT Ref',
	]

	# Exclude patterns (marketing, bills, investments).
	# Any single hit disqualifies an email, even if it also matches a
	# transaction pattern. Entries must stay lowercase: is_transaction_email()
	# searches them against the lowercased body without re.IGNORECASE.
	EXCLUDE_PATTERNS = [
	r'welcome to your',
	r'greetings of the day',
	r'unsubscribe from',
	r'skills that will get you',
	r'daily digest',
	r'top picks',
	r'mutual fund nav',
	r'market update',
	r'job opportunity',
	r'margin statement',
	r'password reset',
	]


	def is_transaction_email(body: str) -> bool:
	    """Tell whether an email body is a genuine transaction alert.

	    A body qualifies when at least one entry of ``TRANSACTION_PATTERNS`` is
	    found in it (case-insensitive search on the raw text) and no entry of
	    ``EXCLUDE_PATTERNS`` is found in its lowercased form.

	    Args:
	        body: Email body text.

	    Returns:
	        True for a transaction alert, False otherwise.
	    """
	    lowered = body.lower()

	    # Guard: without any transaction marker the email is rejected outright.
	    for pattern in TRANSACTION_PATTERNS:
	        if re.search(pattern, body, re.IGNORECASE):
	            break
	    else:
	        return False

	    # A transaction marker was seen; any exclusion marker vetoes it.
	    return not any(re.search(pattern, lowered) for pattern in EXCLUDE_PATTERNS)


	def detect_bank(body: str, sender: str = "") -> str:
	    """Identify the bank an email belongs to.

	    The body and sender are joined with a space, lowercased, and scanned
	    for known bank markers in priority order; the first bank with a marker
	    present wins.

	    Args:
	        body: Email body text.
	        sender: Sender string; optional.

	    Returns:
	        One of 'hdfc', 'icici', 'sbi', 'axis', 'kotak', or '' when no
	        marker is found.
	    """
	    haystack = " ".join((body, sender)).lower()

	    # Ordered lookup table: earlier rows take precedence over later ones.
	    markers_by_bank = (
	        ('hdfc', ('hdfc bank', 'hdfcbank')),
	        ('icici', ('icici bank',)),
	        ('sbi', ('state bank', 'sbi:')),
	        ('axis', ('axis bank',)),
	        ('kotak', ('kotak',)),
	    )
	    for bank, markers in markers_by_bank:
	        if any(marker in haystack for marker in markers):
	            return bank

	    return ''


	def extract_entities(body: str, bank: str) -> dict:
	    """Extract the expected entities from a transaction email body.

	    Args:
	        body: Raw email body text.
	        bank: Bank identifier; copied unchanged into the result.

	    Returns:
	        A dict with the keys ``amount``, ``type``, ``date``, ``account``,
	        ``reference``, ``merchant`` and ``bank``. Every field that cannot
	        be found in ``body`` is left as an empty string.
	    """
	    entities = {
	        'amount': '',
	        'type': '',
	        'date': '',
	        'account': '',
	        'reference': '',
	        'merchant': '',
	        'bank': bank
	    }

	    # Amount: an "Rs" amount anywhere in the body is preferred over an
	    # "INR" one. The leading \b stops "rs" inside words such as
	    # "transfers" from being read as a currency marker; the optional
	    # "(?:\.\d+)?" keeps the full decimal part ("1,500.00" -> "1500.00")
	    # and still accepts single-digit amounts.
	    for currency in (r'Rs\.?', r'INR'):
	        match = re.search(
	            r'\b' + currency + r'\s*(\d[\d,]*(?:\.\d+)?)',
	            body,
	            re.IGNORECASE,
	        )
	        if match:
	            entities['amount'] = match.group(1).replace(',', '')
	            break

	    # Type: "debited" wins when both words appear.
	    body_lower = body.lower()
	    if 'debited' in body_lower:
	        entities['type'] = 'debit'
	    elif 'credited' in body_lower:
	        entities['type'] = 'credit'

	    # Account (4 digits after XX or **, or directly after the word
	    # "account"). The alternation uses a real "|"; an escaped "\|" would
	    # only match a literal pipe character.
	    match = re.search(r'(?:XX|\*\*|account\s*)(\d{4})', body, re.IGNORECASE)
	    if match:
	        entities['account'] = match.group(1)

	    # Date: numeric d-m-y or d/m/y following "on" (case-sensitive).
	    match = re.search(r'on\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})', body)
	    if match:
	        entities['date'] = match.group(1)

	    # Reference (10+ digit number). The search is unanchored, so the
	    # second pattern also covers "UPI Ref", "IMPS Ref" and "NEFT Ref",
	    # with an optional "No"/"Number" and any mix of ':', '.' and spaces
	    # before the digits.
	    ref_patterns = [
	        r'reference number is\s*(\d{10,})',
	        r'Ref(?:erence)?(?:\s*(?:No|Number))?[:\s.]*(\d{10,})',
	    ]
	    for pattern in ref_patterns:
	        match = re.search(pattern, body, re.IGNORECASE)
	        if match:
	            entities['reference'] = match.group(1)
	            break

	    # Merchant from VPA: the capitalised name that follows the VPA handle,
	    # ending at the word "on" or at the next number.
	    match = re.search(
	        r'VPA[:\s]+\S+\s+([A-Z][A-Za-z\s]+?)(?:\s+on\b|\s+\d)',
	        body,
	    )
	    if match:
	        entities['merchant'] = match.group(1).strip().lower()

	    return entities


	def _load_strict_transactions(corpus_file):
	    """Read the corpus and group qualifying transaction samples by bank.

	    Returns a ``(grouped, skipped)`` pair: ``grouped`` maps a bank
	    identifier to its list of sample dicts, ``skipped`` counts the lines
	    that could not be processed.
	    """
	    grouped = defaultdict(list)
	    skipped = 0

	    # Explicit UTF-8 so the result does not depend on the platform's
	    # default encoding.
	    with open(corpus_file, 'r', encoding='utf-8') as f:
	        for line in f:
	            if not line.strip():
	                continue
	            try:
	                data = json.loads(line)
	                body = data.get('body', '')
	                sender = data.get('sender', '')

	                # Strict filtering
	                if not is_transaction_email(body):
	                    continue

	                if len(body) < 50:
	                    continue

	                # Detect bank
	                bank = detect_bank(body, sender)
	                if not bank:
	                    continue

	                # Extract entities
	                entities = extract_entities(body, bank)
	            except (ValueError, AttributeError, TypeError):
	                # Best effort: malformed JSON (ValueError) or a record
	                # that is not a dict with string fields is skipped, but
	                # KeyboardInterrupt and genuine bugs still propagate.
	                skipped += 1
	                continue

	            # Must have amount, type, and reference
	            if entities['amount'] and entities['type'] and entities['reference']:
	                grouped[bank].append({
	                    'text': body,
	                    'expected_entities': entities,
	                    'subject': data.get('subject', ''),
	                    'verified': True
	                })

	    return grouped, skipped


	def _dedupe_by_reference(txns):
	    """Keep the first sample seen for each reference, preserving order."""
	    seen_refs = set()
	    unique = []
	    for t in txns:
	        ref = t['expected_entities']['reference']
	        if ref not in seen_refs:
	            seen_refs.add(ref)
	            unique.append(t)
	    return unique


	def create_strict_benchmark():
	    """Create strictly filtered benchmark.

	    Reads ``CORPUS_FILE`` (one JSON object per line) and keeps only emails
	    that pass ``is_transaction_email``, are at least 50 characters long,
	    have a detectable bank, and yield an amount, a type and a reference.
	    Per bank, samples are de-duplicated by reference and up to 15 are
	    drawn with a fixed seed (42). The combined list is numbered, shuffled,
	    written to ``BENCHMARK_FILE`` as JSON, and summarised on stdout.

	    Returns:
	        The list of benchmark sample dicts, in the order written. Each has
	        the keys ``text``, ``expected_entities``, ``subject``,
	        ``verified`` and ``id``.

	    Raises:
	        OSError: If the corpus cannot be opened or the benchmark file
	            cannot be written.
	    """
	    print("=" * 60)
	    print("📊 CREATING STRICT TRANSACTION BENCHMARK")
	    print("=" * 60)

	    bank_transactions, skipped = _load_strict_transactions(CORPUS_FILE)
	    if skipped:
	        print(f"\n⚠️ Skipped {skipped} unreadable corpus lines")

	    print("\n📊 Strict transactions per bank:")
	    for bank, txns in sorted(bank_transactions.items()):
	        print(f" {bank.upper():10} {len(txns):4} transactions")

	    # Sample and deduplicate
	    random.seed(42)
	    benchmark = []

	    for bank, txns in bank_transactions.items():
	        unique = _dedupe_by_reference(txns)
	        sampled = random.sample(unique, min(15, len(unique)))
	        benchmark.extend(sampled)

	    # Ids are assigned before shuffling, so they follow bank grouping
	    # rather than the final position in the file.
	    for i, s in enumerate(benchmark):
	        s['id'] = i + 1

	    random.shuffle(benchmark)

	    # Save. ensure_ascii=False emits non-ASCII text verbatim, so the file
	    # must be opened as UTF-8 instead of the platform default.
	    BENCHMARK_FILE.parent.mkdir(parents=True, exist_ok=True)
	    with open(BENCHMARK_FILE, 'w', encoding='utf-8') as f:
	        json.dump(benchmark, f, indent=2, ensure_ascii=False)

	    print(f"\n✅ Saved {len(benchmark)} samples to {BENCHMARK_FILE}")

	    # Stats
	    bank_counts = defaultdict(int)
	    for s in benchmark:
	        bank_counts[s['expected_entities']['bank']] += 1

	    print("\n📊 Benchmark composition:")
	    for bank, count in sorted(bank_counts.items()):
	        print(f" {bank.upper():10} {count:3} samples")

	    # Show samples
	    print("\n📧 Sample transaction:")
	    if benchmark:
	        s = benchmark[0]
	        print(f" Bank: {s['expected_entities']['bank'].upper()}")
	        print(f" Amount: {s['expected_entities']['amount']}")
	        print(f" Type: {s['expected_entities']['type']}")
	        print(f" Reference: {s['expected_entities']['reference']}")
	        print(f" Text: {s['text'][:150]}...")

	    return benchmark


	# Script entry point: build the benchmark only when executed directly,
	# not when this module is imported.
	if __name__ == "__main__":
	    create_strict_benchmark()