#!/usr/bin/env python3
"""
FinEE Benchmark Script
======================

Run this to verify accuracy on your own data.

Usage:
    python benchmark.py                    # Run built-in tests
    python benchmark.py --file data.jsonl  # Test on your data
    python benchmark.py --torture          # Run edge case tests

Author: Ranjit Behera
"""

import argparse
import json
import math
import time
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, List, Optional

try:
    from finee import extract, FinEE
    from finee.schema import ExtractionConfig
except ImportError:
    # Do not kill the interpreter at import time: the pure helpers below
    # (normalize, compare, run_benchmark with a custom extractor) are usable
    # without finee. The hard failure happens in _require_finee().
    extract = None
    FinEE = None
    ExtractionConfig = None


def _require_finee() -> Callable[[str], Any]:
    """Return finee's ``extract``; print install help and exit(1) if missing."""
    if extract is None:
        print("Install finee first: pip install finee")
        raise SystemExit(1)
    return extract


@dataclass
class BenchmarkResult:
    """Aggregate outcome of one benchmark run."""

    # Number of samples evaluated.
    total: int = 0
    # Number of samples for which every expected field matched.
    correct: int = 0
    # Per-field accuracy in percent (0-100), keyed by field name.
    field_accuracy: Dict[str, float] = field(default_factory=dict)
    # Mean wall-clock time per extraction call, in milliseconds.
    avg_latency_ms: float = 0

    def __post_init__(self):
        # Callers written against the old ``= None`` default may still pass
        # None explicitly; keep turning that into an empty dict.
        if self.field_accuracy is None:
            self.field_accuracy = {}


# ============================================================================
# BUILT-IN BENCHMARK DATA
# ============================================================================

BENCHMARK_DATA = [
    # HDFC Bank
    {
        "text": "HDFC Bank: Rs.2500.00 debited from A/c XX3545 on 28-12-2025 to VPA swiggy@ybl. UPI Ref: 534567891234",
        "expected": {"amount": 2500.0, "type": "debit", "account": "3545", "merchant": "Swiggy", "category": "food"}
    },
    {
        "text": "HDFC: INR 15000 credited to A/c 9876 on 15-01-2025. NEFT from RAHUL SHARMA. Ref: HDFC25011512345",
        "expected": {"amount": 15000.0, "type": "credit", "account": "9876"}
    },
    # ICICI Bank
    {
        "text": "ICICI: Rs.1,250.50 debited from Acct XX4321 on 10-01-25 to amazon@apl. Ref: 987654321012",
        "expected": {"amount": 1250.50, "type": "debit", "account": "4321", "merchant": "Amazon", "category": "shopping"}
    },
    # SBI
    {
        "text": "SBI: Rs.350 debited from a/c XX1234 on 10-01-25. UPI txn to zomato@paytm. Ref: 456789012345",
        "expected": {"amount": 350.0, "type": "debit", "account": "1234", "merchant": "Zomato", "category": "food"}
    },
    # Axis Bank
    {
        "text": "Axis Bank: INR 800.00 debited from A/c 5678 on 05-01-2025. Info: UPI-UBER. Bal: Rs.12,500",
        "expected": {"amount": 800.0, "type": "debit", "account": "5678", "merchant": "Uber", "category": "transport"}
    },
    # Kotak
    {
        "text": "Rs.2000 credited to Kotak A/c XX4321 on 20-01-2025 from rahul.sharma@okicici. Ref: 321654987012",
        "expected": {"amount": 2000.0, "type": "credit", "account": "4321"}
    },
    # Payment Apps
    {
        "text": "PhonePe: Paid Rs.150 to swiggy@ybl from A/c XX1234. UPI Ref: 123456789012",
        "expected": {"amount": 150.0, "type": "debit", "merchant": "Swiggy", "category": "food"}
    },
    {
        "text": "GPay: Sent Rs.500 to uber@paytm from HDFC Bank XX9876. Txn ID: GPY987654321",
        "expected": {"amount": 500.0, "type": "debit", "merchant": "Uber", "category": "transport"}
    },
]


# ============================================================================
# TORTURE TEST DATA (Edge Cases)
# ============================================================================

TORTURE_TESTS = [
    # Missing spaces
    {
        "text": "Rs.500.00debited from HDFC A/c1234 on01-01-25",
        "expected": {"amount": 500.0, "type": "debit", "account": "1234"},
        "difficulty": "Missing spaces"
    },
    # Weird formatting
    {
        "text": "HDFC:Rs 2,500/-debited A/c XX3545 dt:28/12/25 VPA-swiggy@ybl Ref534567891234",
        "expected": {"amount": 2500.0, "type": "debit", "account": "3545"},
        "difficulty": "Non-standard formatting"
    },
    # Mixed case
    {
        "text": "Your A/C XXXX1234 is DEBITED for RS. 1500 on 15-JAN-25. VPA: SWIGGY@YBL",
        "expected": {"amount": 1500.0, "type": "debit", "account": "1234"},
        "difficulty": "Mixed case"
    },
    # Truncated SMS
    {
        "text": "Rs.2500 debited from A/c...3545 to swi...",
        "expected": {"amount": 2500.0, "type": "debit"},
        "difficulty": "Truncated message"
    },
    # Extra noise
    {
        "text": "ALERT! Dear Customer, Rs.500.00 has been debited from your account XX1234 on 01-01-2025. For disputes call 1800-XXX-XXXX. Ignore if done by you.",
        "expected": {"amount": 500.0, "type": "debit", "account": "1234"},
        "difficulty": "Extra noise/marketing"
    },
    # Multiple amounts
    {
        "text": "Rs.500 debited from A/c 1234. Bal: Rs.15,000. Min due: Rs.2000",
        "expected": {"amount": 500.0, "type": "debit", "account": "1234"},
        "difficulty": "Multiple amounts (balance, due)"
    },
    # Unicode symbols
    {
        "text": "₹2,500 debited from A/c •••• 3545 on 28-12-25",
        "expected": {"amount": 2500.0, "type": "debit", "account": "3545"},
        "difficulty": "Unicode symbols (₹, •)"
    },
    # Lakhs notation
    {
        "text": "INR 1.5 Lakh credited to your A/c 9876 on 15-01-25",
        "expected": {"amount": 150000.0, "type": "credit", "account": "9876"},
        "difficulty": "Lakhs notation"
    },
]


def normalize(val):
    """Normalize a value for comparison.

    Returns:
        ``None`` for ``None``; a ``float`` for ints/floats; otherwise the
        lower-cased, stripped string form. Objects exposing a ``.value``
        attribute (enum-like) are normalized through that attribute.
    """
    if val is None:
        return None
    if isinstance(val, (int, float)):
        return float(val)
    if hasattr(val, 'value'):  # Enum
        inner = val.value
        # An enum-like value is not guaranteed to be a string; a bare
        # ``.lower()`` would raise AttributeError on numeric members.
        if isinstance(inner, (int, float)):
            return float(inner)
        return str(inner).lower().strip()
    return str(val).lower().strip()


def _values_match(expected, actual) -> bool:
    """Return True when two already-normalized values should count as equal."""
    if isinstance(expected, float) and isinstance(actual, float):
        # Amounts are floats; tolerate representation noise instead of
        # demanding bit-exact equality.
        return math.isclose(expected, actual, rel_tol=1e-9, abs_tol=1e-9)
    return expected == actual


def compare(expected: Dict, result) -> Dict[str, bool]:
    """Compare expected vs actual.

    Args:
        expected: Mapping of field name to expected value.
        result: Object whose attributes hold the extracted values; a missing
            attribute is treated as ``None``.

    Returns:
        Mapping of each expected field name to whether it matched after
        normalization.
    """
    return {
        name: _values_match(
            normalize(exp_val), normalize(getattr(result, name, None))
        )
        for name, exp_val in expected.items()
    }


def run_benchmark(
    data: List[Dict],
    name: str = "Benchmark",
    extractor: Optional[Callable[[str], Any]] = None,
) -> BenchmarkResult:
    """Run benchmark on dataset.

    Args:
        data: Samples, each a dict with ``"text"`` and ``"expected"`` keys and
            an optional ``"difficulty"`` label used in the per-sample output.
        name: Title printed in the report header and summary.
        extractor: Callable taking the message text and returning an object
            whose attributes are compared against ``expected``. Defaults to
            ``finee.extract``; if finee is not installed and no extractor is
            given, the install hint is printed and the process exits with 1.

    Returns:
        BenchmarkResult with totals, per-field accuracy (percent) and average
        latency in milliseconds. An empty dataset yields zeros rather than
        raising ZeroDivisionError.
    """
    extract_fn = extractor if extractor is not None else _require_finee()

    result = BenchmarkResult()
    result.total = len(data)

    field_correct: Dict[str, int] = {}
    field_total: Dict[str, int] = {}
    latencies: List[float] = []

    print(f"\n{'='*70}")
    print(f"📊 {name} ({len(data)} samples)")
    print(f"{'='*70}\n")

    for i, sample in enumerate(data):
        text = sample["text"]
        expected = sample["expected"]
        difficulty = sample.get("difficulty", "")

        # perf_counter is monotonic and high-resolution, unlike time.time().
        start = time.perf_counter()
        r = extract_fn(text)
        latencies.append((time.perf_counter() - start) * 1000)

        matches = compare(expected, r)
        all_match = all(matches.values())

        if all_match:
            result.correct += 1
            status = "✅"
        else:
            status = "❌"

        # Track field accuracy
        for fname, matched in matches.items():
            field_total[fname] = field_total.get(fname, 0) + 1
            field_correct[fname] = field_correct.get(fname, 0) + (1 if matched else 0)

        # Print result
        if difficulty:
            print(f"{status} [{difficulty}]")
        else:
            print(f"{status} Sample {i+1}")

        if not all_match:
            # Only show the ellipsis when the input really was cut off.
            preview = text if len(text) <= 60 else text[:60] + "..."
            print(f" Input: {preview}")
            for fname, matched in matches.items():
                if not matched:
                    actual = getattr(r, fname, None)
                    exp = expected[fname]
                    print(f" {fname}: expected={exp}, got={actual}")
            print()

    # Calculate field accuracy
    result.field_accuracy = {
        fname: field_correct[fname] / field_total[fname] * 100
        for fname in field_total
    }
    # Guard both averages: an empty dataset must not crash the report.
    result.avg_latency_ms = sum(latencies) / len(latencies) if latencies else 0.0
    overall_pct = result.correct / result.total * 100 if result.total else 0.0

    # Print summary
    print(f"\n{'='*70}")
    print(f"📈 SUMMARY: {name}")
    print(f"{'='*70}")
    print(f"Overall Accuracy: {result.correct}/{result.total} ({overall_pct:.1f}%)")
    print(f"Average Latency: {result.avg_latency_ms:.2f}ms")
    print("\nField Accuracy:")
    for fname, acc in sorted(result.field_accuracy.items()):
        status = "✅" if acc >= 90 else "⚠️" if acc >= 70 else "❌"
        print(f" {fname:12} {acc:5.1f}% {status}")
    print(f"{'='*70}\n")

    return result


def run_user_file(filepath: str) -> BenchmarkResult:
    """Run benchmark on user's JSONL file.

    Blank lines are skipped; every other line must be one JSON object.

    Raises:
        json.JSONDecodeError: A line is not valid JSON; the message includes
            the file path and line number.
    """
    data = []
    # Explicit UTF-8: the messages contain characters such as '₹' and '•'
    # that the platform default encoding may not decode.
    with open(filepath, encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                data.append(json.loads(line))
            except json.JSONDecodeError as err:
                raise json.JSONDecodeError(
                    f"{filepath}, line {lineno}: {err.msg}", err.doc, err.pos
                ) from err

    return run_benchmark(data, f"User Data ({filepath})")


def main():
    """Command-line entry point: parse arguments and run the chosen suite."""
    # Fail fast with the install hint before doing anything else.
    _require_finee()

    parser = argparse.ArgumentParser(description="FinEE Benchmark")
    parser.add_argument("--file", "-f", help="Path to JSONL file with test data")
    parser.add_argument("--torture", "-t", action="store_true", help="Run torture tests (edge cases)")
    parser.add_argument("--all", "-a", action="store_true", help="Run all benchmarks")
    args = parser.parse_args()

    print("\n" + "="*70)
    print("🏦 FinEE BENCHMARK SUITE")
    print("="*70)
    print("Testing extraction accuracy on Indian banking messages...")

    # Options are checked in priority order: --file, then --torture, then --all.
    if args.file:
        run_user_file(args.file)
    elif args.torture:
        run_benchmark(TORTURE_TESTS, "Torture Tests (Edge Cases)")
    elif args.all:
        run_benchmark(BENCHMARK_DATA, "Standard Benchmark")
        run_benchmark(TORTURE_TESTS, "Torture Tests (Edge Cases)")
    else:
        run_benchmark(BENCHMARK_DATA, "Standard Benchmark")

    print("\n✅ Benchmark complete!")
    print("To test on your own data:")
    print(' python benchmark.py --file your_data.jsonl')
    print("\nJSONL format:")
    print(' {"text": "Rs.500 debited...", "expected": {"amount": 500, "type": "debit"}}')


if __name__ == "__main__":
    main()