""" Pydantic schemas for the PharMinds OCR pipeline. Used by: - ocr-model/spaces/app.py (API request/response contracts) - ocr-model/postprocess/matcher.py (drug-DB match results) - ocr-model/eval.py (eval-input validation) - ocr-model/data/promote.py (annotation ingest validation) Strict-mode: extra fields are rejected; type coercion is minimal. """ from __future__ import annotations import re from typing import Optional, List, Literal from pydantic import BaseModel, Field, ConfigDict, field_validator # ── Validation regexes ──────────────────────────────────────────────────────── # Algerian prescriptions are written in French (majority) AND Arabic (patient # names, occasionally body text). Both languages are valid. What we still # reject: # - LLM/Gemini commentary artifacts (`**`, `###`, "Doctor's Stamp:", etc.) # - Multi-line text crammed into a single-line crop # - Mostly-non-printable noise rows # - Other-script bleed (CJK / Cyrillic / Hebrew / Devanagari / etc.) LATIN_RE = re.compile(r"[A-Za-zÀ-ÿ]") ARABIC_RE = re.compile(r"[؀-ۿݐ-ݿࢠ-ࣿﭐ-﷿ﹰ-]") # Scripts that are NOT expected in an Algerian prescription (catches OCR noise) OTHER_SCRIPT_RE = re.compile( r"[" r"Ѐ-ӿ" # Cyrillic r"一-鿿" # CJK Unified Ideographs r"぀-ゟ" # Hiragana r"゠-ヿ" # Katakana r"֐-׿" # Hebrew r"ऀ-ॿ" # Devanagari r"฀-๿" # Thai r"]" ) ARTIFACT_RE = re.compile( r"\*\*|###|---|" r"Doctor's Stamp:|Red Stamp:|Institutional Stamp:|" r"Printed Institutional Text:|Handwritten Entry:|" r"Plaintext it is!|\[Stylized Signature", re.IGNORECASE, ) DOSAGE_RE = re.compile(r"\d+(\.\d+)?\s*(mg|g|ml|mL|UI|μg|mcg|%)", re.IGNORECASE) def detect_language(text: str) -> str: """Coarse language detection for Algerian-prescription content. Returns 'fr', 'ar', 'mixed', or 'unknown'.""" has_latin = bool(LATIN_RE.search(text)) has_arabic = bool(ARABIC_RE.search(text)) if has_latin and has_arabic: return 'mixed' if has_arabic: return 'ar' if has_latin: return 'fr' return 'unknown' # ── Data layer (annotations / training) ────────────────────────────────────── class LineAnnotation(BaseModel): """A single annotated line crop, output of `dataset_tool.py /review`.""" model_config = ConfigDict(extra='forbid', str_strip_whitespace=True) file_name: str = Field(..., min_length=1) text: str = Field(..., min_length=0) # blank allowed if status='blank' status: Literal['ok', 'blank', 'skip', 'trash'] = 'ok' @field_validator('text') @classmethod def text_must_be_clean(cls, v: str, info) -> str: # Algerian prescriptions are valid in French OR Arabic (or both, e.g. an # Arabic patient name on the same line as French dosage). What we # reject is unambiguous corruption: foreign scripts, LLM artifacts, # multi-line content, or near-pure noise. if not v: return v if OTHER_SCRIPT_RE.search(v): raise ValueError(f'Foreign script detected (only fr/ar expected): {v[:50]!r}') if ARTIFACT_RE.search(v): raise ValueError(f'OCR artifact pattern detected: {v[:50]!r}') if '\n' in v: raise ValueError(f'Multi-line label not allowed (single line crop only): {v[:50]!r}') # Reject if >80% non-printable (catches OCR-noise rows) printable_ratio = sum(1 for c in v if c.isprintable()) / max(1, len(v)) if printable_ratio < 0.8: raise ValueError(f'Too many non-printable chars ({(1-printable_ratio)*100:.0f}%): {v[:50]!r}') return v # ── Inference layer (TrOCR + LLM + drug-DB match output) ───────────────────── MatchStrategy = Literal['exact', 'fuzzy', 'phonetic', 'atc', 'unmatched'] class MatchResult(BaseModel): """Output of `postprocess/matcher.py::DrugMatcher.match()`.""" model_config = ConfigDict(extra='forbid') raw_token: str matched_name: Optional[str] = None drug_id: Optional[str] = None # UUID from drugs table confidence: float = Field(0.0, ge=0.0, le=1.0) strategy: MatchStrategy = 'unmatched' class Medication(BaseModel): """One medication row in a structured prescription response.""" model_config = ConfigDict(extra='forbid') name: str = Field(..., min_length=1) drug_id: Optional[str] = None # populated by DrugMatcher dosage: Optional[str] = None # e.g. "500mg", "1g" frequency: Optional[str] = None # e.g. "1 cp x 3/j" duration: Optional[str] = None # e.g. "7 jours", "03 mois" quantity: Optional[str] = None # e.g. "01 bte" instructions: Optional[str] = None # any extra notes confidence: float = Field(0.0, ge=0.0, le=1.0) match_strategy: MatchStrategy = 'unmatched' class LineCrop(BaseModel): """A single line of a scanned prescription (for trace/feedback).""" model_config = ConfigDict(extra='forbid') bbox: Optional[List[int]] = None # [x1, y1, x2, y2] or None text: str = '' confidence: float = Field(0.0, ge=0.0, le=1.0) # Legacy compat with /scan endpoint — needed by active-learning feedback UI line_index: Optional[int] = None image_base64: Optional[str] = None # JPEG-encoded line crop class Prescription(BaseModel): """End-to-end OCR + LLM-structuring + DB-matching response.""" model_config = ConfigDict(extra='forbid') success: bool = True method: Literal['trocr', 'trocr-printed', 'trocr+florence2', 'florence2-handwritten', 'florence2-printed', 'paddle-printed', 'paddle+florence2', 'vlm_fallback', 'hybrid'] = 'trocr' confidence: float = Field(0.0, ge=0.0, le=1.0) # Structured fields doctor_name: Optional[str] = None patient_name: Optional[str] = None prescription_date: Optional[str] = None # DD/MM/YYYY string medications: List[Medication] = Field(default_factory=list) notes: Optional[str] = None # Trace line_crops: List[LineCrop] = Field(default_factory=list) raw_ocr_text: Optional[str] = None # joined raw lines, debug only # Telemetry processing_ms: int = 0 model_version: str = 'v1' dataset_version: str = 'v1' error: Optional[str] = None # ── Feedback loop (active learning) ────────────────────────────────────────── class FeedbackCorrection(BaseModel): """One line correction submitted via /v2/feedback.""" model_config = ConfigDict(extra='forbid') file_name: str text: str # human-corrected ground truth status: Literal['ok', 'blank', 'skip'] class FeedbackPayload(BaseModel): """POST /v2/feedback request body.""" model_config = ConfigDict(extra='forbid') image_id: str corrections: List[FeedbackCorrection] reviewer_id: Optional[str] = None submitted_at: Optional[str] = None # ISO timestamp # ── Eval input (one row of hold-out ground truth) ──────────────────────────── class EvalSample(BaseModel): """One labeled hold-out line for `eval.py`.""" model_config = ConfigDict(extra='forbid') file_name: str text: str # ground truth expected_drug: Optional[str] = None # if known, for Drug-name accuracy metric