Ranjit Behera

Clean up repo structure and add benchmark

6a76e07 21 days ago

9.72 kB

	#!/usr/bin/env python3
	"""
	FinEE Benchmark Script
	======================

	Run this to verify accuracy on your own data.

	Usage:
	python benchmark.py # Run built-in tests
	python benchmark.py --file data.jsonl # Test on your data
	python benchmark.py --torture # Run edge case tests

	Author: Ranjit Behera
	"""

	import json
	import time
	import argparse
	from typing import Dict, List, Any
	from dataclasses import dataclass

	# The benchmark is meaningless without the package under test, so a missing
	# install is reported with a short hint instead of a raw traceback.
	try:
	    from finee import extract, FinEE
	    from finee.schema import ExtractionConfig
	except ImportError:
	    print("Install finee first: pip install finee")
	    # `exit()` is a helper injected by the `site` module for interactive use
	    # and is not guaranteed to exist; SystemExit always is.
	    raise SystemExit(1)


	@dataclass
	class BenchmarkResult:
	    """Aggregate outcome of a single benchmark run.

	    Instances start out zeroed and are filled in by the benchmark runner.
	    """

	    # Number of samples in the dataset that was evaluated.
	    total: int = 0
	    # Number of samples for which every expected field matched.
	    correct: int = 0
	    # Per-field accuracy as a percentage, keyed by field name. ``None`` is
	    # only a placeholder default; it is replaced in ``__post_init__``.
	    field_accuracy: Dict[str, float] = None
	    # Mean extraction latency per sample, in milliseconds.
	    avg_latency_ms: float = 0

	    def __post_init__(self):
	        """Give each instance its own empty accuracy dict when none is supplied."""
	        # A literal ``{}`` default is rejected by dataclasses (mutable default),
	        # so the dict is created here, once per instance.
	        if self.field_accuracy is None:
	            self.field_accuracy = {}


	# ============================================================================
	# BUILT-IN BENCHMARK DATA
	# ============================================================================

	# Each sample pairs a raw message ("text") with the field values the
	# extractor is expected to produce for it ("expected").
	BENCHMARK_DATA = [
	    # --- HDFC Bank ---
	    {
	        "text": "HDFC Bank: Rs.2500.00 debited from A/c XX3545 on 28-12-2025 to VPA swiggy@ybl. UPI Ref: 534567891234",
	        "expected": {
	            "amount": 2500.0,
	            "type": "debit",
	            "account": "3545",
	            "merchant": "Swiggy",
	            "category": "food",
	        },
	    },
	    {
	        "text": "HDFC: INR 15000 credited to A/c 9876 on 15-01-2025. NEFT from RAHUL SHARMA. Ref: HDFC25011512345",
	        "expected": {
	            "amount": 15000.0,
	            "type": "credit",
	            "account": "9876",
	        },
	    },
	    # --- ICICI Bank ---
	    {
	        "text": "ICICI: Rs.1,250.50 debited from Acct XX4321 on 10-01-25 to amazon@apl. Ref: 987654321012",
	        "expected": {
	            "amount": 1250.50,
	            "type": "debit",
	            "account": "4321",
	            "merchant": "Amazon",
	            "category": "shopping",
	        },
	    },
	    # --- SBI ---
	    {
	        "text": "SBI: Rs.350 debited from a/c XX1234 on 10-01-25. UPI txn to zomato@paytm. Ref: 456789012345",
	        "expected": {
	            "amount": 350.0,
	            "type": "debit",
	            "account": "1234",
	            "merchant": "Zomato",
	            "category": "food",
	        },
	    },
	    # --- Axis Bank ---
	    {
	        "text": "Axis Bank: INR 800.00 debited from A/c 5678 on 05-01-2025. Info: UPI-UBER. Bal: Rs.12,500",
	        "expected": {
	            "amount": 800.0,
	            "type": "debit",
	            "account": "5678",
	            "merchant": "Uber",
	            "category": "transport",
	        },
	    },
	    # --- Kotak ---
	    {
	        "text": "Rs.2000 credited to Kotak A/c XX4321 on 20-01-2025 from rahul.sharma@okicici. Ref: 321654987012",
	        "expected": {
	            "amount": 2000.0,
	            "type": "credit",
	            "account": "4321",
	        },
	    },
	    # --- Payment apps ---
	    {
	        "text": "PhonePe: Paid Rs.150 to swiggy@ybl from A/c XX1234. UPI Ref: 123456789012",
	        "expected": {
	            "amount": 150.0,
	            "type": "debit",
	            "merchant": "Swiggy",
	            "category": "food",
	        },
	    },
	    {
	        "text": "GPay: Sent Rs.500 to uber@paytm from HDFC Bank XX9876. Txn ID: GPY987654321",
	        "expected": {
	            "amount": 500.0,
	            "type": "debit",
	            "merchant": "Uber",
	            "category": "transport",
	        },
	    },
	]


	# ============================================================================
	# TORTURE TEST DATA (Edge Cases)
	# ============================================================================

	# Deliberately awkward inputs. Besides "text" and "expected", every sample
	# carries a "difficulty" label naming the edge case it exercises.
	TORTURE_TESTS = [
	    {
	        "difficulty": "Missing spaces",
	        "text": "Rs.500.00debited from HDFC A/c1234 on01-01-25",
	        "expected": {"amount": 500.0, "type": "debit", "account": "1234"},
	    },
	    {
	        "difficulty": "Non-standard formatting",
	        "text": "HDFC:Rs 2,500/-debited A/c XX3545 dt:28/12/25 VPA-swiggy@ybl Ref534567891234",
	        "expected": {"amount": 2500.0, "type": "debit", "account": "3545"},
	    },
	    {
	        "difficulty": "Mixed case",
	        "text": "Your A/C XXXX1234 is DEBITED for RS. 1500 on 15-JAN-25. VPA: SWIGGY@YBL",
	        "expected": {"amount": 1500.0, "type": "debit", "account": "1234"},
	    },
	    {
	        "difficulty": "Truncated message",
	        "text": "Rs.2500 debited from A/c...3545 to swi...",
	        "expected": {"amount": 2500.0, "type": "debit"},
	    },
	    {
	        "difficulty": "Extra noise/marketing",
	        "text": "ALERT! Dear Customer, Rs.500.00 has been debited from your account XX1234 on 01-01-2025. For disputes call 1800-XXX-XXXX. Ignore if done by you.",
	        "expected": {"amount": 500.0, "type": "debit", "account": "1234"},
	    },
	    {
	        "difficulty": "Multiple amounts (balance, due)",
	        "text": "Rs.500 debited from A/c 1234. Bal: Rs.15,000. Min due: Rs.2000",
	        "expected": {"amount": 500.0, "type": "debit", "account": "1234"},
	    },
	    {
	        "difficulty": "Unicode symbols (₹, •)",
	        "text": "₹2,500 debited from A/c •••• 3545 on 28-12-25",
	        "expected": {"amount": 2500.0, "type": "debit", "account": "3545"},
	    },
	    {
	        "difficulty": "Lakhs notation",
	        "text": "INR 1.5 Lakh credited to your A/c 9876 on 15-01-25",
	        "expected": {"amount": 150000.0, "type": "credit", "account": "9876"},
	    },
	]


	def normalize(val):
	    """Normalize a value so expected and extracted fields compare equal.

	    Args:
	        val: ``None``, a number, a string, or any object exposing a ``value``
	            attribute (e.g. an ``Enum`` member).

	    Returns:
	        ``None`` for ``None``; a ``float`` for any int/float; otherwise the
	        lower-cased, whitespace-stripped string form of the value. Objects
	        with a ``value`` attribute are unwrapped first and their value is
	        normalized by the same rules.
	    """
	    # Unwrap enum-like objects up front so that a numeric or None payload is
	    # handled by the regular branches instead of failing on ``.lower()``.
	    if hasattr(val, "value"):
	        val = val.value
	    if val is None:
	        return None
	    if isinstance(val, (int, float)):
	        return float(val)
	    return str(val).lower().strip()


	def compare(expected: Dict, result) -> Dict[str, bool]:
	    """Check every expected field against the extraction result.

	    Args:
	        expected: Mapping of field name to the value the sample should yield.
	        result: Extraction result; each field is read as an attribute of the
	            same name, and a missing attribute is treated as ``None``.

	    Returns:
	        Mapping of field name to ``True`` when the normalized expected and
	        actual values are equal, ``False`` otherwise. Only fields present in
	        ``expected`` are checked.
	    """
	    return {
	        field: normalize(exp_val) == normalize(getattr(result, field, None))
	        for field, exp_val in expected.items()
	    }


	def run_benchmark(data: List[Dict], name: str = "Benchmark") -> BenchmarkResult:
	    """Run the extractor over a dataset, print a report and return the totals.

	    Args:
	        data: Samples to evaluate. Each sample is a dict with a "text" string
	            and an "expected" dict of field values; an optional "difficulty"
	            label is used as the sample's title in the report.
	        name: Heading printed above the report and the summary.

	    Returns:
	        A ``BenchmarkResult`` holding the sample count, the number of samples
	        with all fields correct, per-field accuracy in percent and the mean
	        latency in milliseconds. An empty dataset yields zeros rather than a
	        division error.
	    """
	    result = BenchmarkResult()
	    result.total = len(data)

	    field_correct = {}
	    field_total = {}
	    latencies = []

	    print(f"\n{'='*70}")
	    print(f"📊 {name} ({len(data)} samples)")
	    print(f"{'='*70}\n")

	    for sample_no, sample in enumerate(data, start=1):
	        text = sample["text"]
	        expected = sample["expected"]
	        difficulty = sample.get("difficulty", "")

	        # perf_counter is monotonic and high-resolution, unlike wall-clock
	        # time.time(), so short extraction calls are timed reliably.
	        start = time.perf_counter()
	        r = extract(text)
	        latencies.append((time.perf_counter() - start) * 1000)

	        matches = compare(expected, r)
	        all_match = all(matches.values())

	        if all_match:
	            result.correct += 1
	            status = "✅"
	        else:
	            status = "❌"

	        # Track field accuracy
	        for field, matched in matches.items():
	            field_total[field] = field_total.get(field, 0) + 1
	            field_correct[field] = field_correct.get(field, 0) + (1 if matched else 0)

	        # Print result
	        if difficulty:
	            print(f"{status} [{difficulty}]")
	        else:
	            print(f"{status} Sample {sample_no}")

	        if not all_match:
	            # Only add the ellipsis when the message was actually cut short.
	            preview = text if len(text) <= 60 else text[:60] + "..."
	            print(f" Input: {preview}")
	            for field, matched in matches.items():
	                if not matched:
	                    actual = getattr(r, field, None)
	                    exp = expected[field]
	                    print(f" {field}: expected={exp}, got={actual}")
	            print()

	    # Calculate field accuracy
	    result.field_accuracy = {
	        field: field_correct[field] / field_total[field] * 100
	        for field in field_total
	    }
	    # Guard both averages: an empty dataset must not raise ZeroDivisionError.
	    result.avg_latency_ms = sum(latencies) / len(latencies) if latencies else 0.0
	    overall_pct = result.correct / result.total * 100 if result.total else 0.0

	    # Print summary
	    print(f"\n{'='*70}")
	    print(f"📈 SUMMARY: {name}")
	    print(f"{'='*70}")
	    print(f"Overall Accuracy: {result.correct}/{result.total} ({overall_pct:.1f}%)")
	    print(f"Average Latency: {result.avg_latency_ms:.2f}ms")
	    print("\nField Accuracy:")
	    for field, acc in sorted(result.field_accuracy.items()):
	        status = "✅" if acc >= 90 else "⚠️" if acc >= 70 else "❌"
	        print(f" {field:12} {acc:5.1f}% {status}")
	    print(f"{'='*70}\n")

	    return result


	def run_user_file(filepath: str) -> BenchmarkResult:
	    """Run the benchmark on a user-supplied JSONL file.

	    Args:
	        filepath: Path to a file with one JSON object per line. Blank lines
	            are skipped.

	    Returns:
	        The ``BenchmarkResult`` produced by ``run_benchmark`` for the loaded
	        samples.

	    Raises:
	        OSError: If the file cannot be opened.
	        json.JSONDecodeError: If a non-blank line is not valid JSON.
	    """
	    # Decode as UTF-8 explicitly: messages may contain characters such as the
	    # rupee sign, and the platform default encoding is not always UTF-8.
	    with open(filepath, encoding="utf-8") as f:
	        data = [json.loads(line) for line in f if line.strip()]
	    return run_benchmark(data, f"User Data ({filepath})")


	def main():
	    """Parse the command line and run the selected benchmark(s).

	    Selection precedence: ``--file`` runs only the user's data; otherwise
	    ``--all`` runs the standard benchmark followed by the torture tests;
	    otherwise ``--torture`` runs only the torture tests; with no option the
	    standard benchmark runs.
	    """
	    parser = argparse.ArgumentParser(description="FinEE Benchmark")
	    parser.add_argument("--file", "-f", help="Path to JSONL file with test data")
	    parser.add_argument("--torture", "-t", action="store_true", help="Run torture tests (edge cases)")
	    parser.add_argument("--all", "-a", action="store_true", help="Run all benchmarks")
	    args = parser.parse_args()

	    print("\n" + "="*70)
	    print("🏦 FinEE BENCHMARK SUITE")
	    print("="*70)
	    print("Testing extraction accuracy on Indian banking messages...")

	    if args.file:
	        run_user_file(args.file)
	    elif args.all:
	        # Checked before --torture so that "--all --torture" still runs
	        # everything instead of silently dropping the standard benchmark.
	        run_benchmark(BENCHMARK_DATA, "Standard Benchmark")
	        run_benchmark(TORTURE_TESTS, "Torture Tests (Edge Cases)")
	    elif args.torture:
	        run_benchmark(TORTURE_TESTS, "Torture Tests (Edge Cases)")
	    else:
	        run_benchmark(BENCHMARK_DATA, "Standard Benchmark")

	    print("\n✅ Benchmark complete!")
	    print("To test on your own data:")
	    print(' python benchmark.py --file your_data.jsonl')
	    print("\nJSONL format:")
	    print(' {"text": "Rs.500 debited...", "expected": {"amount": 500, "type": "debit"}}')


	# Run the CLI only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()