import json
import base64
import io
import zipfile
from pathlib import Path
from fastapi.testclient import TestClient
from app import app
from backend.glossary_service import GlossaryService
# Directory one level above the folder that contains this file (``parents[1]``);
# tests build data and requirements paths relative to it.
ROOT = Path(__file__).parents[1]
# Single TestClient wrapping the imported ``app``, shared by the HTTP tests.
client = TestClient(app)
def test_health():
    """GET /health answers with HTTP 200 and a JSON ``status`` of ``"ok"``."""
    health = client.get("/health")
    assert health.status_code == 200
    body = health.json()
    assert body["status"] == "ok"
def test_definition_exact_and_arabic_alias():
    """Look up a definition by exact term and by an Arabic alias.

    The exact term ``MAWB`` must be served with HTTP 200 and a ``source`` of
    ``"verified_glossary"``; the Arabic alias must resolve to the term
    ``"HS Code"``.
    """
    exact = client.get("/definition/MAWB")
    assert exact.status_code == 200
    assert exact.json()["source"] == "verified_glossary"

    alias = client.get("/definition/رمز النظام المنسق")
    # Check the status before reading the body so a failed alias lookup is
    # reported as a wrong status code rather than as a KeyError on "term".
    assert alias.status_code == 200
    assert alias.json()["term"] == "HS Code"
def test_fuzzy_ocr_match():
    """``find_match`` maps ``"Commercia1 Invoice"`` (digit 1 instead of the
    final "l") to the canonical term ``"Commercial Invoice"``.
    """
    data_files = (
        "data/glossary.json",
        "data/user_corrections.json",
        "data/sme_approved_definitions.json",
    )
    glossary = GlossaryService(*(ROOT / name for name in data_files))
    found = glossary.find_match("Commercia1 Invoice")
    assert found
    assert found["canonical"] == "Commercial Invoice"
def test_invalid_image_rejected():
    """POST /analyze-frame with the image payload ``"bad"`` yields HTTP 400."""
    payload = {"image_base64": "bad", "frame_width": 100, "frame_height": 100}
    rejected = client.post("/analyze-frame", json=payload)
    assert rejected.status_code == 400
def test_vlm_disabled_is_graceful():
    """POST /analyze-document-vlm returns HTTP 200 with a ``status`` that is
    either ``"unavailable"`` or ``"configured"``.
    """
    reply = client.post("/analyze-document-vlm", json={"user_requested": True})
    assert reply.status_code == 200
    accepted_statuses = {"unavailable", "configured"}
    assert reply.json()["status"] in accepted_statuses
def test_feedback_and_sme_priority(tmp_path):
    """The approved definition is returned when all three stores define a term.

    Three JSON files are written under ``tmp_path`` (glossary, corrections,
    approved), each with an entry for ``MAWB``. ``definition("MAWB")`` must
    return the text from the approved file with source ``"sme_approved"``.
    """
    # Insertion order matters: it is the positional order GlossaryService takes.
    stores = {
        "glossary.json": {"MAWB": {"definition": "verified", "source": "verified_glossary"}},
        "corrections.json": {"MAWB": {"corrected_definition": "user", "status": "pending_sme_review"}},
        "approved.json": {"MAWB": {"definition": "approved"}},
    }
    store_paths = []
    for filename, content in stores.items():
        target = tmp_path / filename
        target.write_text(json.dumps(content))
        store_paths.append(target)

    glossary = GlossaryService(*store_paths)
    resolved = glossary.definition("MAWB")
    assert resolved["definition"] == "approved"
    assert resolved["source"] == "sme_approved"
def test_docx_upload_returns_clickable_term_data():
    """Uploading a minimal ``.docx`` archive returns detected terms with data.

    The archive holds a single ``word/document.xml`` member. The response must
    be HTTP 200, list at least ``MAWB`` and ``Packing List`` among the detected
    terms, and give every detected term a truthy ``bbox`` and ``definition``.
    """
    # NOTE(review): this payload is plain text with no XML markup — confirm
    # that is intentional and not markup lost in a copy.
    document_xml = "MAWB with Packing List"
    docx_bytes = io.BytesIO()
    with zipfile.ZipFile(docx_bytes, "w") as docx:
        docx.writestr("word/document.xml", document_xml)
    encoded = base64.b64encode(docx_bytes.getvalue()).decode()

    upload = client.post(
        "/analyze-document",
        json={
            "file_base64": encoded,
            "filename": "air-cargo.docx",
            "language_preference": "en",
        },
    )
    assert upload.status_code == 200

    detected = upload.json()["detected_terms"]
    names = {entry["term"] for entry in detected}
    assert names >= {"MAWB", "Packing List"}
    for entry in detected:
        assert entry["bbox"] and entry["definition"]
def test_unsupported_legacy_word_is_clear():
    """Uploading a file named ``legacy.doc`` is rejected with HTTP 400 and an
    error ``detail`` that mentions ``DOCX``.
    """
    request_body = {
        "file_base64": base64.b64encode(b"legacy").decode(),
        "filename": "legacy.doc",
        "language_preference": "en",
    }
    rejected = client.post("/analyze-document", json=request_body)
    assert rejected.status_code == 400
    detail = rejected.json()["detail"]
    assert "DOCX" in detail
def test_selection_insight_has_summary_and_business_meaning():
    """POST /explain-selection returns a non-empty ``summary`` and
    ``business_meaning`` and recognizes the three terms in the selected text.
    """
    request_body = {
        "text": "Commercial Invoice with HS Code and Customs Duty",
        "language_preference": "en",
    }
    reply = client.post("/explain-selection", json=request_body)
    assert reply.status_code == 200

    insight = reply.json()
    assert insight["summary"]
    assert insight["business_meaning"]

    expected_terms = {"Commercial Invoice", "HS Code", "Customs Duty"}
    recognized = {entry["term"] for entry in insight["recognized_terms"]}
    assert recognized >= expected_terms
def test_unknown_selection_is_clearly_unverified():
    """POST /explain-selection for ``"Internal reference XYZ-998"`` returns
    HTTP 200 with ``source`` set to ``"ai_generated_unverified"``.
    """
    request_body = {
        "text": "Internal reference XYZ-998",
        "language_preference": "en",
    }
    reply = client.post("/explain-selection", json=request_body)
    assert reply.status_code == 200
    assert reply.json()["source"] == "ai_generated_unverified"
def test_contextual_fallback_prevents_empty_dot_state():
    """``contextual_fallbacks`` returns exactly one entry for a single region,
    keeping that region's ``bbox`` and marking it ``ai_generated_unverified``.
    """
    glossary = GlossaryService(
        ROOT / "data/glossary.json",
        ROOT / "data/user_corrections.json",
        ROOT / "data/sme_approved_definitions.json",
    )
    region = {
        "text": "Port Reference ZX-2048",
        "bbox": [20, 30, 280, 60],
        "confidence": 0.91,
        "language": "en",
    }
    fallbacks = glossary.contextual_fallbacks([region])
    assert len(fallbacks) == 1
    only_entry = fallbacks[0]
    assert only_entry["bbox"] == [20, 30, 280, 60]
    assert only_entry["source"] == "ai_generated_unverified"
def test_scanner_serves_scroll_info_and_clean_status_controls():
    """The page at ``/`` contains the scroller and info element ids and none
    of the three metric element ids.
    """
    page = client.get("/").text

    must_be_present = ('id="documentScroller"', 'id="documentInfo"')
    for marker in must_be_present:
        assert marker in page

    must_be_absent = ('id="stabilityMetric"', 'id="lightMetric"', 'id="ocrMetric"')
    for marker in must_be_absent:
        assert marker not in page
def test_multipage_pdf_is_rendered_as_one_scroll_surface():
    """A two-page PDF is analyzed as one tall frame covering both pages.

    A PDF with one line of text on each of two pages is built in memory and
    uploaded. The response must report two pages, both analyzed, a frame more
    than twice as tall as it is wide, and at least the terms ``HS Code`` and
    ``Commercial Invoice``.
    """
    import fitz

    document = fitz.open()
    try:
        first = document.new_page()
        first.insert_text((72, 100), "Commercial Invoice and HS Code", fontsize=16)
        second = document.new_page()
        second.insert_text((72, 100), "Packing List and Customs Duty", fontsize=16)
        pdf_bytes = document.tobytes()
    finally:
        # Close the in-memory PDF as soon as its bytes are extracted, even if
        # page construction fails, so the document handle is not leaked.
        document.close()

    response = client.post("/analyze-document", json={
        "file_base64": base64.b64encode(pdf_bytes).decode(),
        "filename": "two-pages.pdf",
        "language_preference": "en",
    })
    assert response.status_code == 200
    data = response.json()
    assert data["page_count"] == 2
    assert data["pages_analyzed"] == 2
    assert data["frame_height"] > data["frame_width"] * 2
    # NOTE(review): only first-page terms are asserted here; consider also
    # asserting the second-page terms if the detector is expected to find them.
    assert {item["term"] for item in data["detected_terms"]} >= {"HS Code", "Commercial Invoice"}
def test_document_cache_makes_repeat_upload_fast_path():
    """Posting the same document payload twice succeeds both times, and the
    second response reports ``cached`` as ``True``.
    """
    document_xml = "Delivery Order"
    docx_bytes = io.BytesIO()
    with zipfile.ZipFile(docx_bytes, "w") as docx:
        docx.writestr("word/document.xml", document_xml)

    request_body = {
        "file_base64": base64.b64encode(docx_bytes.getvalue()).decode(),
        "filename": "repeat.docx",
        "language_preference": "en",
    }
    first_upload = client.post("/analyze-document", json=request_body)
    assert first_upload.status_code == 200

    repeat_upload = client.post("/analyze-document", json=request_body)
    assert repeat_upload.status_code == 200
    assert repeat_upload.json()["cached"] is True
def test_confidence_calibration_does_not_multiply_down_verified_match():
    """``match_regions`` reports a confidence above the region's own 0.8 for
    the region text ``"HS Code"``.
    """
    glossary = GlossaryService(
        ROOT / "data/glossary.json",
        ROOT / "data/user_corrections.json",
        ROOT / "data/sme_approved_definitions.json",
    )
    region = {"text": "HS Code", "bbox": [0, 0, 100, 20], "confidence": .8, "language": "en"}
    matched = glossary.match_regions([region])
    top_match = matched[0]
    assert top_match["confidence"] > .8
def test_fs_merged_logo_is_served_accessibly():
    """The page at ``/`` contains the ``mark-f`` and ``mark-s`` logo fragments
    and the ``FalconScan home`` aria-label.
    """
    page = client.get("/").text
    expected_fragments = (
        'class="mark-f">F',
        'class="mark-s">S',
        'aria-label="FalconScan home"',
    )
    for fragment in expected_fragments:
        assert fragment in page
def test_default_deployment_does_not_install_paddle_cold_start():
    """The Paddle packages appear only in the advanced requirements file.

    ``requirements.txt`` must not mention ``paddlepaddle`` or ``paddleocr``,
    while ``requirements-advanced.txt`` must mention both. The checks are
    plain substring matches on the file text.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    requirements = (ROOT / "requirements.txt").read_text(encoding="utf-8")
    advanced = (ROOT / "requirements-advanced.txt").read_text(encoding="utf-8")
    assert "paddlepaddle" not in requirements
    assert "paddleocr" not in requirements
    assert "paddlepaddle" in advanced
    assert "paddleocr" in advanced