File size: 5,306 Bytes
b0bec61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
"""
test_extractor.py — Assert-based tests for extractor.py using hardcoded OCR strings.

Run with: pytest test_extractor.py -v
"""

import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent))

from extractor import (
    extract_vendor,
    extract_date,
    extract_invoice_number,
    extract_amounts,
    parse_invoice,
)

# ---------------------------------------------------------------------------
# Realistic OCR output strings simulating real bill scans
# ---------------------------------------------------------------------------

# Bill 1: opens with a generic "TAX INVOICE" banner, uses a DD/MM/YYYY date,
# an "Invoice No." reference and dollar-prefixed amounts.
SAMPLE_BILL_1 = "\n".join(
    (
        "TAX INVOICE",
        "SuperMart Inc.",
        "123 Main St, Springfield",
        "Date: 15/01/2024",
        "Invoice No. INV-2024-001",
        "Apples $4.50",
        "Bread $2.00",
        "Milk $3.50",
        "Subtotal: $10.00",
        "GST (5%): $0.50",
        "Total: $10.50",
    )
)

# Bill 2: DD-Mon-YYYY date, "INV#" reference, mostly bare amounts and an
# upper-case "TOTAL:" label.
SAMPLE_BILL_2 = "\n".join(
    (
        "RESTAURANT BILL",
        "Joe's Diner",
        "Date: 22-Feb-2024",
        "INV# 99824",
        "Burger 15.00",
        "Fries 5.00",
        "Cola 3.00",
        "Sub Total: 23.00",
        "Tax: 2.00",
        "TOTAL: $25.00",
    )
)

# Bill 3: "DD Mon YYYY" date, Rs./₹ amounts with thousands separators and a
# space before the colon in "Total :".
SAMPLE_BILL_3 = "\n".join(
    (
        "TECH GADGETS LLC",
        "Invoice No: TECH-882",
        "Date: 05 Mar 2024",
        "Mouse Rs.1,250",
        "Keyboard Rs.2,500",
        "Subtotal Rs.3,500",
        "GST 10% Rs.350",
        "Total : ₹3,850",
    )
)


# ---------------------------------------------------------------------------
# Test 1: extract_vendor skips 'TAX INVOICE' header and returns company name
# ---------------------------------------------------------------------------
def test_extract_vendor_skips_header():
    """The 'TAX INVOICE' banner on bill 1 must not be reported as the vendor."""
    expected_fragment = "SuperMart"
    detected = extract_vendor(SAMPLE_BILL_1)

    # Check for None first so the substring test below cannot raise TypeError.
    assert detected is not None, "Vendor must not be None"
    assert expected_fragment in detected, (
        f"Expected 'SuperMart' in vendor, got: {detected}"
    )

    print(f"PASS: test_extract_vendor_skips_header → {detected}")


# ---------------------------------------------------------------------------
# Test 2: extract_date handles multiple formats
# ---------------------------------------------------------------------------
def test_extract_date_multiple_formats():
    """Check that extract_date finds a date in each of the three sample bills.

    The samples write their dates as ``15/01/2024`` (DD/MM/YYYY),
    ``22-Feb-2024`` (DD-Mon-YYYY) and ``05 Mar 2024`` (DD Mon YYYY).
    """
    date1 = extract_date(SAMPLE_BILL_1)
    assert date1 is not None and "2024" in date1, f"Bill 1 date failed: {date1}"

    # Bill 2 only requires that some date was found; its content is not checked.
    date2 = extract_date(SAMPLE_BILL_2)
    assert date2 is not None, f"Bill 2 date (DD-Mon-YYYY) failed: {date2}"

    date3 = extract_date(SAMPLE_BILL_3)
    # Grouped explicitly: `and` binds tighter than `or`, so the ungrouped form
    # parsed as `(A and B) or (C and D)`. The None check must guard both
    # substring tests, which accept either the month name or the year.
    assert date3 is not None and ("Mar" in date3 or "2024" in date3), (
        f"Bill 3 date (DD Mon YYYY) failed: {date3}"
    )

    print(f"PASS: test_extract_date_multiple_formats → {date1} | {date2} | {date3}")


# ---------------------------------------------------------------------------
# Test 3: extract_invoice_number returns correct reference
# ---------------------------------------------------------------------------
def test_extract_invoice_number():
    """Bills 1 and 3 must yield their INV-2024-001 and TECH-882 references."""
    # (OCR text, reference that must appear, message when nothing is found)
    cases = (
        (SAMPLE_BILL_1, "INV-2024-001", "Invoice number must not be None for bill 1"),
        (SAMPLE_BILL_3, "TECH-882", "Invoice number must not be None for bill 3"),
    )

    found = []
    for bill_text, wanted, none_message in cases:
        reference = extract_invoice_number(bill_text)
        assert reference is not None, none_message
        assert wanted in reference, f"Expected {wanted}, got: {reference}"
        found.append(reference)

    print(f"PASS: test_extract_invoice_number → {found[0]} | {found[1]}")


# ---------------------------------------------------------------------------
# Test 4: extract_amounts correctly extracts total (case-insensitive, with space before colon)
# ---------------------------------------------------------------------------
def test_extract_amounts_total():
    """The 'total' amount must be found for all three label spellings.

    Bill 1 uses ``Total:``, bill 2 the upper-case ``TOTAL:`` and bill 3
    ``Total :`` with a space before the colon.
    """
    total1 = extract_amounts(SAMPLE_BILL_1)["total"]
    assert total1 == 10.50, f"Bill 1 total: expected 10.50, got {total1}"

    total2 = extract_amounts(SAMPLE_BILL_2)["total"]
    assert total2 == 25.00, f"Bill 2 total (UPPERCASE): expected 25.00, got {total2}"

    total3 = extract_amounts(SAMPLE_BILL_3)["total"]
    assert total3 == 3850.00, (
        f"Bill 3 total (space before colon): expected 3850.00, got {total3}"
    )

    print(f"PASS: test_extract_amounts_total → {total1} | {total2} | {total3}")


# ---------------------------------------------------------------------------
# Test 5: parse_invoice returns complete dict with all required keys
# ---------------------------------------------------------------------------
def test_parse_invoice_returns_complete_dict():
    """parse_invoice must return a dict with exactly the required keys.

    Also checks that ``raw_text`` echoes the input unchanged and that the
    parsed total for bill 1 is 10.50.
    """
    result = parse_invoice(SAMPLE_BILL_1)
    required_keys = {"vendor", "date", "invoice_number", "subtotal", "gst", "total", "raw_text"}
    actual_keys = set(result.keys())

    # The check is an exact match, so report both directions of the difference;
    # an extra key would otherwise fail with a misleading "Missing keys: set()".
    assert required_keys == actual_keys, (
        f"Key mismatch — missing: {required_keys - actual_keys}, "
        f"unexpected: {actual_keys - required_keys}"
    )
    assert result["raw_text"] == SAMPLE_BILL_1, "raw_text must be the original input"
    assert result["total"] == 10.50, f"Expected total 10.50, got {result['total']}"
    print(f"PASS: test_parse_invoice_returns_complete_dict → {result}")


if __name__ == "__main__":
    # Script mode: run every test in definition order without pytest.
    # The first failing assertion aborts the run with its AssertionError.
    for run_test in (
        test_extract_vendor_skips_header,
        test_extract_date_multiple_formats,
        test_extract_invoice_number,
        test_extract_amounts_total,
        test_parse_invoice_returns_complete_dict,
    ):
        run_test()
    print("\nAll extractor tests passed!")