Spaces:

rajvivan
/

FalconScan

Sleeping

Rajeev Pandey

Add FS identity and fast CPU OCR startup

119acad 9 days ago

7.4 kB

	import json
	import base64
	import io
	import zipfile
	from pathlib import Path

	from fastapi.testclient import TestClient

	from app import app
	from backend.glossary_service import GlossaryService


	# Directory one level above the folder that holds this test module; the
	# tests below resolve data/ and the requirements files relative to it.
	ROOT = Path(__file__).parents[1]
	# In-process HTTP client bound to the FastAPI app, shared by the endpoint
	# tests in this module.
	client = TestClient(app)


	def test_health():
	    """GET /health must answer 200 and report a status of "ok"."""
	    reply = client.get("/health")
	    assert reply.status_code == 200
	    payload = reply.json()
	    assert payload["status"] == "ok"


	def test_definition_exact_and_arabic_alias():
	    """Look up a definition by its exact name and by an Arabic alias.

	    "MAWB" must be served with the source "verified_glossary", and the
	    Arabic alias must come back with the term "HS Code".
	    """
	    response = client.get("/definition/MAWB")
	    assert response.status_code == 200
	    assert response.json()["source"] == "verified_glossary"
	    response = client.get("/definition/رمز النظام المنسق")
	    # Check the status first so a failed alias lookup is reported as an
	    # HTTP failure rather than a KeyError on the error payload.
	    assert response.status_code == 200
	    assert response.json()["term"] == "HS Code"


	def test_fuzzy_ocr_match():
	    """The misspelling "Commercia1 Invoice" still matches "Commercial Invoice"."""
	    glossary = GlossaryService(
	        ROOT / "data/glossary.json",
	        ROOT / "data/user_corrections.json",
	        ROOT / "data/sme_approved_definitions.json",
	    )
	    found = glossary.find_match("Commercia1 Invoice")
	    # A falsy result means no match was found at all.
	    assert found
	    assert found["canonical"] == "Commercial Invoice"


	def test_invalid_image_rejected():
	    """A frame request carrying the image payload "bad" is refused with HTTP 400."""
	    frame_request = {
	        "image_base64": "bad",
	        "frame_width": 100,
	        "frame_height": 100,
	    }
	    reply = client.post("/analyze-frame", json=frame_request)
	    assert reply.status_code == 400


	def test_vlm_disabled_is_graceful():
	    """The VLM endpoint answers 200 with a status of "unavailable" or "configured"."""
	    reply = client.post("/analyze-document-vlm", json={"user_requested": True})
	    assert reply.status_code == 200
	    reported = reply.json()["status"]
	    acceptable = {"unavailable", "configured"}
	    assert reported in acceptable


	def test_feedback_and_sme_priority(tmp_path):
	    """An approved "MAWB" entry wins over the glossary and a pending correction.

	    Three JSON stores with conflicting "MAWB" entries are written under
	    ``tmp_path``; the lookup must return the approved definition and label
	    its source "sme_approved".
	    """
	    # Insertion order matters: glossary, corrections, approved is the
	    # positional order GlossaryService is constructed with.
	    fixtures = {
	        "glossary.json": {"MAWB": {"definition": "verified", "source": "verified_glossary"}},
	        "corrections.json": {"MAWB": {"corrected_definition": "user", "status": "pending_sme_review"}},
	        "approved.json": {"MAWB": {"definition": "approved"}},
	    }
	    stored_paths = []
	    for filename, content in fixtures.items():
	        target = tmp_path / filename
	        target.write_text(json.dumps(content))
	        stored_paths.append(target)
	    service = GlossaryService(*stored_paths)
	    answer = service.definition("MAWB")
	    assert answer["definition"] == "approved"
	    assert answer["source"] == "sme_approved"


	def test_docx_upload_returns_clickable_term_data():
	    """A minimal DOCX upload yields terms that each carry a bbox and a definition."""
	    xml = '''<?xml version="1.0"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:body><w:p><w:r><w:t>MAWB with Packing List</w:t></w:r></w:p></w:body></w:document>'''
	    # A zip archive holding only word/document.xml stands in for the DOCX.
	    docx_bytes = io.BytesIO()
	    with zipfile.ZipFile(docx_bytes, "w") as archive:
	        archive.writestr("word/document.xml", xml)
	    encoded = base64.b64encode(docx_bytes.getvalue()).decode()
	    request_body = {
	        "file_base64": encoded,
	        "filename": "air-cargo.docx",
	        "language_preference": "en",
	    }
	    reply = client.post("/analyze-document", json=request_body)
	    assert reply.status_code == 200
	    detected = reply.json()["detected_terms"]
	    found_names = {entry["term"] for entry in detected}
	    assert found_names.issuperset({"MAWB", "Packing List"})
	    for entry in detected:
	        assert entry["bbox"] and entry["definition"]


	def test_unsupported_legacy_word_is_clear():
	    """A ``.doc`` upload is refused with HTTP 400 and a detail mentioning "DOCX"."""
	    encoded = base64.b64encode(b"legacy").decode()
	    request_body = {
	        "file_base64": encoded,
	        "filename": "legacy.doc",
	        "language_preference": "en",
	    }
	    reply = client.post("/analyze-document", json=request_body)
	    assert reply.status_code == 400
	    detail = reply.json()["detail"]
	    assert "DOCX" in detail


	def test_selection_insight_has_summary_and_business_meaning():
	    """Explaining a selection returns a summary, a business meaning and its terms."""
	    request_body = {
	        "text": "Commercial Invoice with HS Code and Customs Duty",
	        "language_preference": "en",
	    }
	    reply = client.post("/explain-selection", json=request_body)
	    assert reply.status_code == 200
	    insight = reply.json()
	    # Both narrative fields must be present and non-empty.
	    assert insight["summary"]
	    assert insight["business_meaning"]
	    recognized = {entry["term"] for entry in insight["recognized_terms"]}
	    expected = {"Commercial Invoice", "HS Code", "Customs Duty"}
	    assert recognized.issuperset(expected)


	def test_unknown_selection_is_clearly_unverified():
	    """The selection "Internal reference XYZ-998" is labelled "ai_generated_unverified"."""
	    request_body = {
	        "text": "Internal reference XYZ-998",
	        "language_preference": "en",
	    }
	    reply = client.post("/explain-selection", json=request_body)
	    assert reply.status_code == 200
	    reported_source = reply.json()["source"]
	    assert reported_source == "ai_generated_unverified"


	def test_contextual_fallback_prevents_empty_dot_state():
	    """One input region yields one fallback term that keeps its bbox and is marked unverified."""
	    glossary = GlossaryService(
	        ROOT / "data/glossary.json",
	        ROOT / "data/user_corrections.json",
	        ROOT / "data/sme_approved_definitions.json",
	    )
	    region = {
	        "text": "Port Reference ZX-2048",
	        "bbox": [20, 30, 280, 60],
	        "confidence": 0.91,
	        "language": "en",
	    }
	    fallbacks = glossary.contextual_fallbacks([region])
	    assert len(fallbacks) == 1
	    only = fallbacks[0]
	    assert only["bbox"] == [20, 30, 280, 60]
	    assert only["source"] == "ai_generated_unverified"


	def test_scanner_serves_scroll_info_and_clean_status_controls():
	    """The index page has the scroller and info elements and none of the metric elements."""
	    page = client.get("/").text
	    must_be_present = (
	        'id="documentScroller"',
	        'id="documentInfo"',
	    )
	    must_be_absent = (
	        'id="stabilityMetric"',
	        'id="lightMetric"',
	        'id="ocrMetric"',
	    )
	    for marker in must_be_present:
	        assert marker in page
	    for marker in must_be_absent:
	        assert marker not in page


	def test_multipage_pdf_is_rendered_as_one_scroll_surface():
	    """A two-page PDF is analysed as one tall surface covering both pages.

	    The PDF is built in memory with PyMuPDF. The response must report two
	    pages present and two analysed, a frame more than twice as tall as it
	    is wide, and at least the terms "HS Code" and "Commercial Invoice".
	    """
	    import fitz
	    document = fitz.open()
	    try:
	        first = document.new_page()
	        first.insert_text((72, 100), "Commercial Invoice and HS Code", fontsize=16)
	        second = document.new_page()
	        second.insert_text((72, 100), "Packing List and Customs Duty", fontsize=16)
	        pdf_bytes = document.tobytes()
	    finally:
	        # Release the PyMuPDF document as soon as its bytes are captured,
	        # even if building a page raised.
	        document.close()
	    response = client.post("/analyze-document", json={
	        "file_base64": base64.b64encode(pdf_bytes).decode(),
	        "filename": "two-pages.pdf",
	        "language_preference": "en",
	    })
	    assert response.status_code == 200
	    data = response.json()
	    assert data["page_count"] == 2
	    assert data["pages_analyzed"] == 2
	    # Two stacked pages make the combined frame much taller than wide.
	    assert data["frame_height"] > data["frame_width"] * 2
	    assert {item["term"] for item in data["detected_terms"]} >= {"HS Code", "Commercial Invoice"}

	def test_document_cache_makes_repeat_upload_fast_path():
	    """Posting the same DOCX twice succeeds both times; the second reply has ``cached`` True."""
	    xml = '''<?xml version="1.0"?><w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"><w:body><w:p><w:r><w:t>Delivery Order</w:t></w:r></w:p></w:body></w:document>'''
	    docx_bytes = io.BytesIO()
	    with zipfile.ZipFile(docx_bytes, "w") as archive:
	        archive.writestr("word/document.xml", xml)
	    encoded = base64.b64encode(docx_bytes.getvalue()).decode()
	    payload = {
	        "file_base64": encoded,
	        "filename": "repeat.docx",
	        "language_preference": "en",
	    }
	    first_reply = client.post("/analyze-document", json=payload)
	    assert first_reply.status_code == 200
	    # An identical second upload is the one expected to be flagged as cached.
	    second_reply = client.post("/analyze-document", json=payload)
	    assert second_reply.status_code == 200
	    assert second_reply.json()["cached"] is True


	def test_confidence_calibration_does_not_multiply_down_verified_match():
	    """Matching "HS Code" at input confidence 0.8 yields a confidence above 0.8."""
	    glossary = GlossaryService(
	        ROOT / "data/glossary.json",
	        ROOT / "data/user_corrections.json",
	        ROOT / "data/sme_approved_definitions.json",
	    )
	    region = {
	        "text": "HS Code",
	        "bbox": [0, 0, 100, 20],
	        "confidence": 0.8,
	        "language": "en",
	    }
	    matches = glossary.match_regions([region])
	    top_match = matches[0]
	    assert top_match["confidence"] > 0.8


	def test_fs_merged_logo_is_served_accessibly():
	    """The index page has the F and S logo spans and the "FalconScan home" aria-label."""
	    page = client.get("/").text
	    expected_fragments = (
	        'class="mark-f">F</span>',
	        'class="mark-s">S</span>',
	        'aria-label="FalconScan home"',
	    )
	    for fragment in expected_fragments:
	        assert fragment in page


	def test_default_deployment_does_not_install_paddle_cold_start():
	    """Paddle packages appear only in requirements-advanced.txt, never in requirements.txt.

	    Both files are read as UTF-8 so the result does not depend on the
	    platform's default encoding, and they are lower-cased before the check
	    because pip compares distribution names case-insensitively (a
	    "PaddlePaddle" entry would otherwise go unnoticed).
	    """
	    requirements = (ROOT / "requirements.txt").read_text(encoding="utf-8").lower()
	    advanced = (ROOT / "requirements-advanced.txt").read_text(encoding="utf-8").lower()
	    assert "paddlepaddle" not in requirements
	    assert "paddleocr" not in requirements
	    assert "paddlepaddle" in advanced
	    assert "paddleocr" in advanced