Spaces:

Abdou-19
/

pharminds-ocr-api

Running

abdorenouni

feat(ocr): Florence-2 as primary handwritten engine — TrOCR as fallback

ce0c24a 12 days ago

7.64 kB

	"""
	Pydantic schemas for the PharMinds OCR pipeline.

	Used by:
	- ocr-model/spaces/app.py (API request/response contracts)
	- ocr-model/postprocess/matcher.py (drug-DB match results)
	- ocr-model/eval.py (eval-input validation)
	- ocr-model/data/promote.py (annotation ingest validation)

	Strict-mode: extra fields are rejected; type coercion is minimal.
	"""
	from __future__ import annotations

	import re
	from typing import Optional, List, Literal

	from pydantic import BaseModel, Field, ConfigDict, field_validator


	# ── Validation regexes ────────────────────────────────────────────────────────
	# Algerian prescriptions are written in French (majority) AND Arabic (patient
	# names, occasionally body text). Both languages are valid. What we still
	# reject:
	# - LLM/Gemini commentary artifacts (`**`, `###`, "Doctor's Stamp:", etc.)
	# - Multi-line text crammed into a single-line crop
	# - Mostly-non-printable noise rows
	# - Other-script bleed (CJK / Cyrillic / Hebrew / Devanagari / etc.)
	#
	# Character ranges are spelled as \uXXXX escapes rather than literal
	# characters: range endpoints that are unassigned or invisible code points are
	# easily lost by editors and copy/paste, which silently changes the class
	# (a dangling "-" before "]" makes the ASCII hyphen a member of the class).

	# Latin letters, including Latin-1 accented letters. U+00D7 (multiplication
	# sign) and U+00F7 (division sign) sit inside the À-ÿ span but are not letters,
	# so the range is split around them.
	LATIN_RE = re.compile(r"[A-Za-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF]")
	ARABIC_RE = re.compile(
	    r"["
	    r"\u0600-\u06FF"  # Arabic
	    r"\u0750-\u077F"  # Arabic Supplement
	    r"\u08A0-\u08FF"  # Arabic Extended-A
	    r"\uFB50-\uFDFF"  # Arabic Presentation Forms-A
	    r"\uFE70-\uFEFC"  # Arabic Presentation Forms-B (stops before U+FEFF, the BOM)
	    r"]"
	)
	# Scripts that are NOT expected in an Algerian prescription (catches OCR noise)
	OTHER_SCRIPT_RE = re.compile(
	    r"["
	    r"\u0400-\u04FF"  # Cyrillic
	    r"\u4E00-\u9FFF"  # CJK Unified Ideographs
	    r"\u3040-\u309F"  # Hiragana
	    r"\u30A0-\u30FF"  # Katakana
	    r"\u0590-\u05FF"  # Hebrew
	    r"\u0900-\u097F"  # Devanagari
	    r"\u0E00-\u0E7F"  # Thai
	    r"]"
	)
	# Alternation uses a bare "|". An escaped "\|" matches a literal pipe and would
	# collapse the whole pattern into one long literal that never matches.
	ARTIFACT_RE = re.compile(
	    r"\*\*|###|---|"
	    r"Doctor's Stamp:|Red Stamp:|Institutional Stamp:|"
	    r"Printed Institutional Text:|Handwritten Entry:|"
	    r"Plaintext it is!|\[Stylized Signature",
	    re.IGNORECASE,
	)
	# Number (decimal point or French decimal comma), optional whitespace, unit.
	# Group 1 is the optional fractional part, group 2 is the unit.
	DOSAGE_RE = re.compile(r"\d+([.,]\d+)?\s*(mg|g|ml|mL|UI|μg|mcg|%)", re.IGNORECASE)


	def detect_language(text: str) -> str:
	    """Classify *text* by which of the two expected scripts it contains.

	    The result depends only on whether LATIN_RE and ARABIC_RE each find at
	    least one match in *text*:

	    - both match        -> 'mixed'
	    - only ARABIC_RE    -> 'ar'
	    - only LATIN_RE     -> 'fr'
	    - neither           -> 'unknown'
	    """
	    latin_found = LATIN_RE.search(text) is not None
	    arabic_found = ARABIC_RE.search(text) is not None
	    # Keyed by (latin_found, arabic_found).
	    labels = {
	        (True, True): 'mixed',
	        (False, True): 'ar',
	        (True, False): 'fr',
	        (False, False): 'unknown',
	    }
	    return labels[(latin_found, arabic_found)]


	# ── Data layer (annotations / training) ──────────────────────────────────────
	class LineAnnotation(BaseModel):
	    """One annotated line crop, as produced by `dataset_tool.py /review`.

	    Unknown fields are rejected and string values are whitespace-stripped
	    (see ``model_config``). The ``text`` field is additionally screened by
	    ``text_must_be_clean``.
	    """
	    model_config = ConfigDict(extra='forbid', str_strip_whitespace=True)

	    # Name of the crop file; must be non-empty.
	    file_name: str = Field(..., min_length=1)
	    # Transcription of the crop; blank allowed if status='blank'.
	    text: str = Field(..., min_length=0)
	    status: Literal['ok', 'blank', 'skip', 'trash'] = 'ok'

	    @field_validator('text')
	    @classmethod
	    def text_must_be_clean(cls, v: str, info) -> str:
	        """Return *v* unchanged, or raise ValueError if it looks corrupted.

	        French and Arabic (or both on one line) are accepted. An empty string
	        passes untouched. The checks run in this order and the first failure
	        raises:

	        1. a character matched by OTHER_SCRIPT_RE (foreign script)
	        2. a match of ARTIFACT_RE (LLM/OCR commentary artifact)
	        3. a newline character (label must be a single line)
	        4. fewer than 80% of the characters are printable
	        """
	        if v == '':
	            return v

	        # Error messages only ever show the first 50 characters.
	        snippet = v[:50]

	        if OTHER_SCRIPT_RE.search(v) is not None:
	            raise ValueError(f'Foreign script detected (only fr/ar expected): {snippet!r}')

	        if ARTIFACT_RE.search(v) is not None:
	            raise ValueError(f'OCR artifact pattern detected: {snippet!r}')

	        if v.find('\n') != -1:
	            raise ValueError(f'Multi-line label not allowed (single line crop only): {snippet!r}')

	        # Noise rows: reject when under 80% of the characters are printable,
	        # i.e. more than 20% are non-printable.
	        printable_count = len([ch for ch in v if ch.isprintable()])
	        printable_ratio = printable_count / max(1, len(v))
	        if printable_ratio < 0.8:
	            noise_pct = (1 - printable_ratio) * 100
	            raise ValueError(f'Too many non-printable chars ({noise_pct:.0f}%): {snippet!r}')

	        return v


	# ── Inference layer (TrOCR + LLM + drug-DB match output) ─────────────────────
	# Closed set of labels recording how a token was matched against the drug DB;
	# 'unmatched' is the default used by the models in this file.
	MatchStrategy = Literal[
	    'exact',
	    'fuzzy',
	    'phonetic',
	    'atc',
	    'unmatched',
	]


	class MatchResult(BaseModel):
	    """Result record returned by `postprocess/matcher.py::DrugMatcher.match()`.

	    Unknown fields are rejected. Only ``raw_token`` is required; the other
	    fields default to an "unmatched" result with zero confidence.
	    """
	    model_config = ConfigDict(extra='forbid')

	    # The token that was looked up.
	    raw_token: str
	    matched_name: Optional[str] = None
	    # UUID from drugs table
	    drug_id: Optional[str] = None
	    # Constrained to the closed interval [0.0, 1.0].
	    confidence: float = Field(0.0, ge=0.0, le=1.0)
	    strategy: MatchStrategy = 'unmatched'


	class Medication(BaseModel):
	    """A single medication entry of a structured prescription response.

	    Unknown fields are rejected. ``name`` is the only required field and
	    must be non-empty; every other field is optional free text or carries
	    a default.
	    """
	    model_config = ConfigDict(extra='forbid')

	    name: str = Field(..., min_length=1)
	    # populated by DrugMatcher
	    drug_id: Optional[str] = None
	    # e.g. "500mg", "1g"
	    dosage: Optional[str] = None
	    # e.g. "1 cp x 3/j"
	    frequency: Optional[str] = None
	    # e.g. "7 jours", "03 mois"
	    duration: Optional[str] = None
	    # e.g. "01 bte"
	    quantity: Optional[str] = None
	    # any extra notes
	    instructions: Optional[str] = None
	    # Constrained to the closed interval [0.0, 1.0].
	    confidence: float = Field(0.0, ge=0.0, le=1.0)
	    match_strategy: MatchStrategy = 'unmatched'


	class LineCrop(BaseModel):
	    """One text line of a scanned prescription, kept for trace/feedback.

	    Unknown fields are rejected; every field has a default.
	    """
	    model_config = ConfigDict(extra='forbid')

	    # [x1, y1, x2, y2] or None
	    bbox: Optional[List[int]] = None
	    text: str = ''
	    # Constrained to the closed interval [0.0, 1.0].
	    confidence: float = Field(0.0, ge=0.0, le=1.0)

	    # Legacy compat with /scan endpoint — needed by active-learning feedback UI
	    line_index: Optional[int] = None
	    # JPEG-encoded line crop
	    image_base64: Optional[str] = None


	class Prescription(BaseModel):
	    """Full response of the OCR + LLM-structuring + DB-matching pipeline.

	    Unknown fields are rejected; every field has a default, so an empty
	    payload validates to a "successful, trocr, zero-confidence" response.
	    """
	    model_config = ConfigDict(extra='forbid')

	    success: bool = True
	    # Label of the engine (or engine combination) that produced the result.
	    method: Literal[
	        'trocr',
	        'trocr-printed',
	        'trocr+florence2',
	        'florence2-handwritten',
	        'florence2-printed',
	        'paddle-printed',
	        'paddle+florence2',
	        'vlm_fallback',
	        'hybrid',
	    ] = 'trocr'
	    # Constrained to the closed interval [0.0, 1.0].
	    confidence: float = Field(0.0, ge=0.0, le=1.0)

	    # Structured fields
	    doctor_name: Optional[str] = None
	    patient_name: Optional[str] = None
	    # DD/MM/YYYY string
	    prescription_date: Optional[str] = None
	    medications: List[Medication] = Field(default_factory=list)
	    notes: Optional[str] = None

	    # Trace
	    line_crops: List[LineCrop] = Field(default_factory=list)
	    # joined raw lines, debug only
	    raw_ocr_text: Optional[str] = None

	    # Telemetry
	    processing_ms: int = 0
	    model_version: str = 'v1'
	    dataset_version: str = 'v1'
	    error: Optional[str] = None


	# ── Feedback loop (active learning) ──────────────────────────────────────────
	class FeedbackCorrection(BaseModel):
	    """A single line-level correction sent through /v2/feedback.

	    Unknown fields are rejected; all three fields are required.
	    """
	    model_config = ConfigDict(extra='forbid')

	    file_name: str
	    # human-corrected ground truth
	    text: str
	    status: Literal['ok', 'blank', 'skip']


	class FeedbackPayload(BaseModel):
	    """Request body accepted by POST /v2/feedback.

	    Unknown fields are rejected. ``image_id`` and ``corrections`` are
	    required; the reviewer and timestamp are optional.
	    """
	    model_config = ConfigDict(extra='forbid')

	    image_id: str
	    corrections: List[FeedbackCorrection]
	    reviewer_id: Optional[str] = None
	    # ISO timestamp
	    submitted_at: Optional[str] = None


	# ── Eval input (one row of hold-out ground truth) ────────────────────────────
	class EvalSample(BaseModel):
	    """A single labeled hold-out line consumed by `eval.py`.

	    Unknown fields are rejected. ``file_name`` and ``text`` are required.
	    """
	    model_config = ConfigDict(extra='forbid')

	    file_name: str
	    # ground truth
	    text: str
	    # if known, for Drug-name accuracy metric
	    expected_drug: Optional[str] = None