FalconScan / tests /test_services.py
Rajeev Pandey
Add FS identity and fast CPU OCR startup
119acad
Raw
History Blame Contribute Delete
7.4 kB
import json
import base64
import io
import zipfile
from pathlib import Path
from fastapi.testclient import TestClient
from app import app
from backend.glossary_service import GlossaryService
# Directory one level above the folder that contains this test module;
# the tests below resolve "data/..." and requirements files against it.
ROOT = Path(__file__).parents[1]
# Single TestClient wrapping the application, shared by every HTTP-level test.
client = TestClient(app)
def test_health():
    """GET /health answers 200 and reports a status of "ok"."""
    health = client.get("/health")
    assert health.status_code == 200
    body = health.json()
    assert body["status"] == "ok"
def test_definition_exact_and_arabic_alias():
    """Look a definition up by its exact term and by an Arabic alias.

    The exact lookup of "MAWB" must succeed and be attributed to the
    "verified_glossary" source. The Arabic phrase must resolve to the
    entry whose term is "HS Code".
    """
    exact = client.get("/definition/MAWB")
    assert exact.status_code == 200
    assert exact.json()["source"] == "verified_glossary"

    alias = client.get("/definition/رمز النظام المنسق")
    # Check the status before reading the body, as the other tests do, so a
    # failed alias lookup is reported as a bad status code rather than as a
    # missing "term" key in an error payload.
    assert alias.status_code == 200
    assert alias.json()["term"] == "HS Code"
def test_fuzzy_ocr_match():
    """A term with an OCR-style typo ("1" for "l") maps to its canonical name."""
    glossary = GlossaryService(
        ROOT / "data/glossary.json",
        ROOT / "data/user_corrections.json",
        ROOT / "data/sme_approved_definitions.json",
    )
    hit = glossary.find_match("Commercia1 Invoice")
    # A falsy result means no match was found at all.
    assert hit
    assert hit["canonical"] == "Commercial Invoice"
def test_invalid_image_rejected():
    """POST /analyze-frame with "bad" as the image payload is answered with 400."""
    bad_frame = {"image_base64": "bad", "frame_width": 100, "frame_height": 100}
    result = client.post("/analyze-frame", json=bad_frame)
    assert result.status_code == 400
def test_vlm_disabled_is_graceful():
    """POST /analyze-document-vlm returns 200 with a known status value.

    Either "unavailable" or "configured" is accepted, so the test passes
    regardless of which of the two states the endpoint reports.
    """
    request_body = {"user_requested": True}
    reply = client.post("/analyze-document-vlm", json=request_body)
    assert reply.status_code == 200
    accepted_statuses = {"unavailable", "configured"}
    assert reply.json()["status"] in accepted_statuses
def test_feedback_and_sme_priority(tmp_path):
    """The approved definition wins when all three data files define "MAWB".

    Three JSON files are written under ``tmp_path``: a glossary, a pending
    user correction, and an approved definition, each with its own text
    for "MAWB". The service must return the approved text and label its
    source "sme_approved".
    """
    fixtures = {
        "glossary.json": {"MAWB": {"definition": "verified", "source": "verified_glossary"}},
        "corrections.json": {"MAWB": {"corrected_definition": "user", "status": "pending_sme_review"}},
        "approved.json": {"MAWB": {"definition": "approved"}},
    }
    written = []
    for filename, content in fixtures.items():
        target = tmp_path / filename
        target.write_text(json.dumps(content))
        written.append(target)

    # Dicts keep insertion order, so this unpacks as glossary, corrections, approved.
    glossary_file, corrections_file, approved_file = written
    service = GlossaryService(glossary_file, corrections_file, approved_file)

    entry = service.definition("MAWB")
    assert entry["definition"] == "approved"
    assert entry["source"] == "sme_approved"
def test_docx_upload_returns_clickable_term_data():
    """Uploading a minimal DOCX yields detected terms with a bbox and a definition.

    The DOCX is built in memory as a zip archive holding only
    ``word/document.xml``, whose single paragraph mentions "MAWB" and
    "Packing List".
    """
    document_xml = '''<?xml version="1.0"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:body><w:p><w:r><w:t>MAWB with Packing List</w:t></w:r></w:p></w:body></w:document>'''
    docx_bytes = io.BytesIO()
    with zipfile.ZipFile(docx_bytes, "w") as docx:
        docx.writestr("word/document.xml", document_xml)

    # Read the buffer only after the archive is closed, so the zip is complete.
    encoded = base64.b64encode(docx_bytes.getvalue()).decode()
    upload = {
        "file_base64": encoded,
        "filename": "air-cargo.docx",
        "language_preference": "en",
    }
    response = client.post("/analyze-document", json=upload)
    assert response.status_code == 200

    detected = response.json()["detected_terms"]
    found = {entry["term"] for entry in detected}
    assert found >= {"MAWB", "Packing List"}
    # Every detected term needs a non-empty bbox and definition.
    for entry in detected:
        assert entry["bbox"] and entry["definition"]
def test_unsupported_legacy_word_is_clear():
    """A ``.doc`` upload is rejected with 400 and a detail that mentions "DOCX"."""
    legacy_upload = {
        "file_base64": base64.b64encode(b"legacy").decode(),
        "filename": "legacy.doc",
        "language_preference": "en",
    }
    response = client.post("/analyze-document", json=legacy_upload)
    assert response.status_code == 400
    detail = response.json()["detail"]
    assert "DOCX" in detail
def test_selection_insight_has_summary_and_business_meaning():
    """POST /explain-selection returns a summary, a business meaning and the terms.

    The selected text names three terms; all of them must appear among the
    recognized terms, and both narrative fields must be non-empty.
    """
    selection = {
        "text": "Commercial Invoice with HS Code and Customs Duty",
        "language_preference": "en",
    }
    response = client.post("/explain-selection", json=selection)
    assert response.status_code == 200

    insight = response.json()
    assert insight["summary"]
    assert insight["business_meaning"]

    expected_terms = {"Commercial Invoice", "HS Code", "Customs Duty"}
    recognized = {entry["term"] for entry in insight["recognized_terms"]}
    assert recognized >= expected_terms
def test_unknown_selection_is_clearly_unverified():
    """The selection "Internal reference XYZ-998" is explained with an unverified source.

    The endpoint must still answer 200, with the result's source set to
    "ai_generated_unverified".
    """
    selection = {
        "text": "Internal reference XYZ-998",
        "language_preference": "en",
    }
    response = client.post("/explain-selection", json=selection)
    assert response.status_code == 200
    source = response.json()["source"]
    assert source == "ai_generated_unverified"
def test_contextual_fallback_prevents_empty_dot_state():
    """One OCR region yields exactly one fallback term.

    The fallback must keep the region's bbox unchanged and carry the
    "ai_generated_unverified" source.
    """
    glossary = GlossaryService(
        ROOT / "data/glossary.json",
        ROOT / "data/user_corrections.json",
        ROOT / "data/sme_approved_definitions.json",
    )
    region = {
        "text": "Port Reference ZX-2048",
        "bbox": [20, 30, 280, 60],
        "confidence": 0.91,
        "language": "en",
    }
    fallbacks = glossary.contextual_fallbacks([region])
    assert len(fallbacks) == 1

    only = fallbacks[0]
    assert only["bbox"] == [20, 30, 280, 60]
    assert only["source"] == "ai_generated_unverified"
def test_scanner_serves_scroll_info_and_clean_status_controls():
    """The root page has the scroller and info elements but none of the metric ids."""
    page = client.get("/").text

    required_ids = ('id="documentScroller"', 'id="documentInfo"')
    for marker in required_ids:
        assert marker in page

    removed_ids = ('id="stabilityMetric"', 'id="lightMetric"', 'id="ocrMetric"')
    for marker in removed_ids:
        assert marker not in page
def test_multipage_pdf_is_rendered_as_one_scroll_surface():
    """A two-page PDF is analyzed as a single tall frame covering both pages.

    The PDF is built in memory with one line of text on each page. The
    response must report two pages, both analyzed, a frame more than twice
    as tall as it is wide, and the terms written on the first page.
    """
    # Imported inside the test, as in the original; presumably so the rest of
    # this module does not need the PDF library at import time.
    import fitz

    document = fitz.open()
    try:
        first = document.new_page()
        first.insert_text((72, 100), "Commercial Invoice and HS Code", fontsize=16)
        second = document.new_page()
        second.insert_text((72, 100), "Packing List and Customs Duty", fontsize=16)
        pdf_bytes = document.tobytes()
    finally:
        # Release the document even if building the PDF fails; the original
        # never closed it.
        document.close()

    response = client.post("/analyze-document", json={
        "file_base64": base64.b64encode(pdf_bytes).decode(),
        "filename": "two-pages.pdf",
        "language_preference": "en",
    })
    assert response.status_code == 200

    data = response.json()
    assert data["page_count"] == 2
    assert data["pages_analyzed"] == 2
    assert data["frame_height"] > data["frame_width"] * 2
    assert {item["term"] for item in data["detected_terms"]} >= {"HS Code", "Commercial Invoice"}
def test_document_cache_makes_repeat_upload_fast_path():
    """Posting the same DOCX twice marks the second response as cached.

    Both uploads must answer 200; only the second is required to carry
    ``"cached": true``.
    """
    document_xml = '''<?xml version="1.0"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:body><w:p><w:r><w:t>Delivery Order</w:t></w:r></w:p></w:body></w:document>'''
    docx_bytes = io.BytesIO()
    with zipfile.ZipFile(docx_bytes, "w") as docx:
        docx.writestr("word/document.xml", document_xml)

    payload = {
        "file_base64": base64.b64encode(docx_bytes.getvalue()).decode(),
        "filename": "repeat.docx",
        "language_preference": "en",
    }

    first = client.post("/analyze-document", json=payload)
    assert first.status_code == 200

    repeat = client.post("/analyze-document", json=payload)
    assert repeat.status_code == 200
    assert repeat.json()["cached"] is True
def test_confidence_calibration_does_not_multiply_down_verified_match():
    """A matched "HS Code" region reports confidence above its input 0.8."""
    glossary = GlossaryService(
        ROOT / "data/glossary.json",
        ROOT / "data/user_corrections.json",
        ROOT / "data/sme_approved_definitions.json",
    )
    region = {
        "text": "HS Code",
        "bbox": [0, 0, 100, 20],
        "confidence": 0.8,
        "language": "en",
    }
    matched = glossary.match_regions([region])
    first_match = matched[0]
    assert first_match["confidence"] > 0.8
def test_fs_merged_logo_is_served_accessibly():
    """The root page contains the "F" and "S" mark spans and the home aria-label."""
    markup = client.get("/").text
    expected_fragments = (
        'class="mark-f">F</span>',
        'class="mark-s">S</span>',
        'aria-label="FalconScan home"',
    )
    for fragment in expected_fragments:
        assert fragment in markup
def test_default_deployment_does_not_install_paddle_cold_start():
    """The Paddle packages appear only in the advanced requirements file.

    ``requirements.txt`` must mention neither "paddlepaddle" nor
    "paddleocr", while ``requirements-advanced.txt`` must mention both.
    The checks are plain case-sensitive substring searches over each file.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding.
    requirements = (ROOT / "requirements.txt").read_text(encoding="utf-8")
    advanced = (ROOT / "requirements-advanced.txt").read_text(encoding="utf-8")

    assert "paddlepaddle" not in requirements
    assert "paddleocr" not in requirements
    assert "paddlepaddle" in advanced
    assert "paddleocr" in advanced