"""
test_extractor.py — Assert-based tests for extractor.py using hardcoded OCR strings.

Run with: pytest test_extractor.py -v
"""

import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent))

from extractor import (
    extract_vendor,
    extract_date,
    extract_invoice_number,
    extract_amounts,
    parse_invoice,
)

# ---------------------------------------------------------------------------
# Realistic OCR output strings simulating real bill scans
# ---------------------------------------------------------------------------

SAMPLE_BILL_1 = """TAX INVOICE
SuperMart Inc.
123 Main St, Springfield
Date: 15/01/2024
Invoice No. INV-2024-001
Apples $4.50
Bread $2.00
Milk $3.50
Subtotal: $10.00
GST (5%): $0.50
Total: $10.50"""

SAMPLE_BILL_2 = """RESTAURANT BILL
Joe's Diner
Date: 22-Feb-2024
INV# 99824
Burger 15.00
Fries 5.00
Cola 3.00
Sub Total: 23.00
Tax: 2.00
TOTAL: $25.00"""

SAMPLE_BILL_3 = """TECH GADGETS LLC
Invoice No: TECH-882
Date: 05 Mar 2024
Mouse Rs.1,250
Keyboard Rs.2,500
Subtotal Rs.3,500
GST 10% Rs.350
Total : ₹3,850"""


# ---------------------------------------------------------------------------
# Test 1: extract_vendor skips 'TAX INVOICE' header and returns company name
# ---------------------------------------------------------------------------
def test_extract_vendor_skips_header():
    """Vendor extraction must skip generic headers and return first real company name."""
    vendor = extract_vendor(SAMPLE_BILL_1)
    assert vendor is not None, "Vendor must not be None"
    assert "SuperMart" in vendor, f"Expected 'SuperMart' in vendor, got: {vendor}"
    print(f"PASS: test_extract_vendor_skips_header → {vendor}")


# ---------------------------------------------------------------------------
# Test 2: extract_date handles multiple formats
# ---------------------------------------------------------------------------
def test_extract_date_multiple_formats():
    """Date extractor must handle DD/MM/YYYY, DD-Mon-YYYY, and DD Mon YYYY."""
    date1 = extract_date(SAMPLE_BILL_1)
    assert date1 is not None and "2024" in date1, f"Bill 1 date failed: {date1}"

    date2 = extract_date(SAMPLE_BILL_2)
    assert date2 is not None, f"Bill 2 date (DD-Mon-YYYY) failed: {date2}"

    date3 = extract_date(SAMPLE_BILL_3)
    assert date3 is not None and "Mar" in date3 or (date3 and "2024" in date3), \
        f"Bill 3 date (DD Mon YYYY) failed: {date3}"

    print(f"PASS: test_extract_date_multiple_formats → {date1} | {date2} | {date3}")


# ---------------------------------------------------------------------------
# Test 3: extract_invoice_number returns correct reference
# ---------------------------------------------------------------------------
def test_extract_invoice_number():
    """Invoice number extractor must identify INV-XXXX and TECH-XXX patterns."""
    inv1 = extract_invoice_number(SAMPLE_BILL_1)
    assert inv1 is not None, "Invoice number must not be None for bill 1"
    assert "INV-2024-001" in inv1, f"Expected INV-2024-001, got: {inv1}"

    inv3 = extract_invoice_number(SAMPLE_BILL_3)
    assert inv3 is not None, "Invoice number must not be None for bill 3"
    assert "TECH-882" in inv3, f"Expected TECH-882, got: {inv3}"

    print(f"PASS: test_extract_invoice_number → {inv1} | {inv3}")


# ---------------------------------------------------------------------------
# Test 4: extract_amounts correctly extracts total (case-insensitive, with space before colon)
# ---------------------------------------------------------------------------
def test_extract_amounts_total():
    """Total must be extracted case-insensitively and with space before colon."""
    amounts1 = extract_amounts(SAMPLE_BILL_1)
    assert amounts1["total"] == 10.50, f"Bill 1 total: expected 10.50, got {amounts1['total']}"

    amounts2 = extract_amounts(SAMPLE_BILL_2)
    assert amounts2["total"] == 25.00, f"Bill 2 total (UPPERCASE): expected 25.00, got {amounts2['total']}"

    amounts3 = extract_amounts(SAMPLE_BILL_3)
    assert amounts3["total"] == 3850.00, f"Bill 3 total (space before colon): expected 3850.00, got {amounts3['total']}"

    print(f"PASS: test_extract_amounts_total → {amounts1['total']} | {amounts2['total']} | {amounts3['total']}")


# ---------------------------------------------------------------------------
# Test 5: parse_invoice returns complete dict with all required keys
# ---------------------------------------------------------------------------
def test_parse_invoice_returns_complete_dict():
    """parse_invoice must return dict with all required keys."""
    result = parse_invoice(SAMPLE_BILL_1)
    required_keys = {"vendor", "date", "invoice_number", "subtotal", "gst", "total", "raw_text"}
    assert required_keys == set(result.keys()), f"Missing keys: {required_keys - set(result.keys())}"
    assert result["raw_text"] == SAMPLE_BILL_1, "raw_text must be the original input"
    assert result["total"] == 10.50
    print(f"PASS: test_parse_invoice_returns_complete_dict → {result}")


if __name__ == "__main__":
    test_extract_vendor_skips_header()
    test_extract_date_multiple_formats()
    test_extract_invoice_number()
    test_extract_amounts_total()
    test_parse_invoice_returns_complete_dict()
    print("\nAll extractor tests passed!")