Spaces:
Sleeping
Sleeping
| """ | |
| test_extractor.py — Assert-based tests for extractor.py using hardcoded OCR strings. | |
| Run with: pytest test_extractor.py -v | |
| """ | |
| import sys | |
| from pathlib import Path | |
| sys.path.insert(0, str(Path(__file__).parent)) | |
| from extractor import ( | |
| extract_vendor, | |
| extract_date, | |
| extract_invoice_number, | |
| extract_amounts, | |
| parse_invoice, | |
| ) | |
| # --------------------------------------------------------------------------- | |
| # Realistic OCR output strings simulating real bill scans | |
| # --------------------------------------------------------------------------- | |
# Bill 1: grocery receipt. A generic 'TAX INVOICE' header sits above the
# vendor name; the date is DD/MM/YYYY; amounts carry a '$' prefix.
# 4.50 + 2.00 + 3.50 = 10.00 subtotal, + 0.50 GST = 10.50 total.
SAMPLE_BILL_1 = """TAX INVOICE
SuperMart Inc.
123 Main St, Springfield
Date: 15/01/2024
Invoice No. INV-2024-001
Apples $4.50
Bread $2.00
Milk $3.50
Subtotal: $10.00
GST (5%): $0.50
Total: $10.50"""

# Bill 2: restaurant bill. The date is DD-Mon-YYYY, the reference uses an
# 'INV#' label, line amounts have no currency symbol, and the total label is
# upper-case. 15.00 + 5.00 + 3.00 = 23.00 subtotal, + 2.00 tax = 25.00 total.
SAMPLE_BILL_2 = """RESTAURANT BILL
Joe's Diner
Date: 22-Feb-2024
INV# 99824
Burger 15.00
Fries 5.00
Cola 3.00
Sub Total: 23.00
Tax: 2.00
TOTAL: $25.00"""

# Bill 3: rupee invoice. The date is 'DD Mon YYYY', amounts use 'Rs.' and the
# rupee sign with thousands separators, and the total label has a space
# before its colon. Line items are kept consistent with the subtotal:
# 1,000 + 2,500 = 3,500 subtotal, + 350 GST (10%) = 3,850 total.
SAMPLE_BILL_3 = """TECH GADGETS LLC
Invoice No: TECH-882
Date: 05 Mar 2024
Mouse Rs.1,000
Keyboard Rs.2,500
Subtotal Rs.3,500
GST 10% Rs.350
Total : ₹3,850"""
| # --------------------------------------------------------------------------- | |
| # Test 1: extract_vendor skips 'TAX INVOICE' header and returns company name | |
| # --------------------------------------------------------------------------- | |
def test_extract_vendor_skips_header():
    """Check that extract_vendor looks past the generic document header.

    SAMPLE_BILL_1 begins with a 'TAX INVOICE' line followed by the company
    name, so the returned vendor is expected to contain 'SuperMart'.
    """
    vendor = extract_vendor(SAMPLE_BILL_1)

    # The None check comes first so a missing result fails with its own
    # message rather than a TypeError from the substring test below.
    assert vendor is not None, "Vendor must not be None"
    assert "SuperMart" in vendor, (
        f"Expected 'SuperMart' in vendor, got: {vendor}"
    )

    print(f"PASS: test_extract_vendor_skips_header → {vendor}")
| # --------------------------------------------------------------------------- | |
| # Test 2: extract_date handles multiple formats | |
| # --------------------------------------------------------------------------- | |
def test_extract_date_multiple_formats():
    """Date extractor must handle DD/MM/YYYY, DD-Mon-YYYY, and DD Mon YYYY.

    Each sample bill carries its date in a different layout:
    bill 1 uses '15/01/2024', bill 2 uses '22-Feb-2024' and bill 3 uses
    '05 Mar 2024'. The checks only require a non-None result that still
    carries a recognisable part of the date.
    """
    date1 = extract_date(SAMPLE_BILL_1)
    assert date1 is not None and "2024" in date1, f"Bill 1 date failed: {date1}"

    date2 = extract_date(SAMPLE_BILL_2)
    assert date2 is not None, f"Bill 2 date (DD-Mon-YYYY) failed: {date2}"

    date3 = extract_date(SAMPLE_BILL_3)
    # Grouped explicitly: `and` binds tighter than `or`, so the ungrouped form
    # read as "(not None and has Mar) or (truthy and has 2024)". The intent is
    # "not None, and contains either the month name or the year".
    assert date3 is not None and ("Mar" in date3 or "2024" in date3), \
        f"Bill 3 date (DD Mon YYYY) failed: {date3}"

    print(f"PASS: test_extract_date_multiple_formats → {date1} | {date2} | {date3}")
| # --------------------------------------------------------------------------- | |
| # Test 3: extract_invoice_number returns correct reference | |
| # --------------------------------------------------------------------------- | |
def test_extract_invoice_number():
    """Check invoice-reference extraction on bills 1 and 3.

    Bill 1 labels its reference 'Invoice No.' (INV-2024-001) and bill 3
    labels it 'Invoice No:' (TECH-882); both references must appear in the
    extracted value.
    """
    # Bill 1: 'Invoice No. INV-2024-001'
    inv1 = extract_invoice_number(SAMPLE_BILL_1)
    assert inv1 is not None, "Invoice number must not be None for bill 1"
    assert "INV-2024-001" in inv1, (
        f"Expected INV-2024-001, got: {inv1}"
    )

    # Bill 3: 'Invoice No: TECH-882'
    inv3 = extract_invoice_number(SAMPLE_BILL_3)
    assert inv3 is not None, "Invoice number must not be None for bill 3"
    assert "TECH-882" in inv3, (
        f"Expected TECH-882, got: {inv3}"
    )

    print(f"PASS: test_extract_invoice_number → {inv1} | {inv3}")
| # --------------------------------------------------------------------------- | |
| # Test 4: extract_amounts correctly extracts total (case-insensitive, with space before colon) | |
| # --------------------------------------------------------------------------- | |
def test_extract_amounts_total():
    """Check the 'total' entry returned by extract_amounts for all three bills.

    The bills spell the label three ways: 'Total:' (bill 1), upper-case
    'TOTAL:' (bill 2) and 'Total :' with a space before the colon (bill 3).
    """
    # Bill 1: 'Total: $10.50'
    amounts1 = extract_amounts(SAMPLE_BILL_1)
    assert amounts1["total"] == 10.50, (
        f"Bill 1 total: expected 10.50, got {amounts1['total']}"
    )

    # Bill 2: 'TOTAL: $25.00'
    amounts2 = extract_amounts(SAMPLE_BILL_2)
    assert amounts2["total"] == 25.00, (
        f"Bill 2 total (UPPERCASE): expected 25.00, got {amounts2['total']}"
    )

    # Bill 3: 'Total : ₹3,850' (thousands separator, rupee sign)
    amounts3 = extract_amounts(SAMPLE_BILL_3)
    assert amounts3["total"] == 3850.00, (
        f"Bill 3 total (space before colon): expected 3850.00, got {amounts3['total']}"
    )

    print(f"PASS: test_extract_amounts_total → {amounts1['total']} | {amounts2['total']} | {amounts3['total']}")
| # --------------------------------------------------------------------------- | |
| # Test 5: parse_invoice returns complete dict with all required keys | |
| # --------------------------------------------------------------------------- | |
def test_parse_invoice_returns_complete_dict():
    """parse_invoice must return a dict with exactly the required keys.

    Also checks that 'raw_text' echoes the input unchanged and that the
    parsed total for SAMPLE_BILL_1 is 10.50.
    """
    result = parse_invoice(SAMPLE_BILL_1)
    required_keys = {"vendor", "date", "invoice_number", "subtotal", "gst", "total", "raw_text"}

    # The check is set equality, so it can fail on extra keys as well as on
    # missing ones; report both so the failure message is never an empty set.
    actual_keys = set(result.keys())
    missing_keys = required_keys - actual_keys
    unexpected_keys = actual_keys - required_keys
    assert required_keys == actual_keys, (
        f"Missing keys: {missing_keys}; unexpected keys: {unexpected_keys}"
    )

    assert result["raw_text"] == SAMPLE_BILL_1, "raw_text must be the original input"
    assert result["total"] == 10.50, f"Expected total 10.50, got: {result['total']}"

    print(f"PASS: test_parse_invoice_returns_complete_dict → {result}")
if __name__ == "__main__":
    # Script mode: run every test in file order without pytest. An uncaught
    # AssertionError from any test stops the run before the final message.
    for run_test in (
        test_extract_vendor_skips_header,
        test_extract_date_multiple_formats,
        test_extract_invoice_number,
        test_extract_amounts_total,
        test_parse_invoice_returns_complete_dict,
    ):
        run_test()
    print("\nAll extractor tests passed!")