pharminds-ocr-api / schema.py
abdorenouni
feat(ocr): Florence-2 as primary handwritten engine — TrOCR as fallback
ce0c24a
"""
Pydantic schemas for the PharMinds OCR pipeline.
Used by:
- ocr-model/spaces/app.py (API request/response contracts)
- ocr-model/postprocess/matcher.py (drug-DB match results)
- ocr-model/eval.py (eval-input validation)
- ocr-model/data/promote.py (annotation ingest validation)
Extra fields are rejected on every model (`extra='forbid'`). Pydantic's
`strict=True` is not set, so its default (lax) type coercion still applies.
"""
from __future__ import annotations
import re
from typing import Optional, List, Literal
from pydantic import BaseModel, Field, ConfigDict, field_validator
# ── Validation regexes ────────────────────────────────────────────────────────
# Algerian prescriptions are written in French (majority) AND Arabic (patient
# names, occasionally body text). Both languages are valid. What we still
# reject:
# - LLM/Gemini commentary artifacts (`**`, `###`, "Doctor's Stamp:", etc.)
# - Multi-line text crammed into a single-line crop
# - Mostly-non-printable noise rows
# - Other-script bleed (CJK / Cyrillic / Hebrew / Devanagari / etc.)
# NOTE: every non-ASCII range endpoint below is written as a \uXXXX escape
# instead of a literal character. Literal endpoints are fragile: if the file is
# ever re-saved through the wrong encoding they turn into multi-character
# garbage whose ranges are reversed, and `re.compile` then fails at import time
# with "bad character range".

# Latin letters: ASCII A-Z / a-z plus U+00C0-U+00FF (the accented letters used
# in French live in this range).
LATIN_RE = re.compile(r"[A-Za-z\u00C0-\u00FF]")

# Arabic-script code points.
ARABIC_RE = re.compile(
    r"["
    r"\u0600-\u06FF"  # Arabic
    r"\u0750-\u077F"  # Arabic Supplement
    r"\u08A0-\u08FF"  # Arabic Extended-A
    r"\uFB50-\uFDFF"  # Arabic Presentation Forms-A
    r"\uFE70-\uFEFF"  # Arabic Presentation Forms-B
    r"]"
)

# Scripts that are NOT expected in an Algerian prescription (catches OCR noise)
OTHER_SCRIPT_RE = re.compile(
    r"["
    r"\u0400-\u04FF"  # Cyrillic
    r"\u4E00-\u9FFF"  # CJK Unified Ideographs
    r"\u3040-\u309F"  # Hiragana
    r"\u30A0-\u30FF"  # Katakana
    r"\u0590-\u05FF"  # Hebrew
    r"\u0900-\u097F"  # Devanagari
    r"\u0E00-\u0E7F"  # Thai
    r"]"
)

# Markdown markers and caption phrases left behind by LLM/Gemini commentary.
# Matched case-insensitively.
ARTIFACT_RE = re.compile(
    r"\*\*|###|---|"
    r"Doctor's Stamp:|Red Stamp:|Institutional Stamp:|"
    r"Printed Institutional Text:|Handwritten Entry:|"
    r"Plaintext it is!|\[Stylized Signature",
    re.IGNORECASE,
)

# A number (optional ".digits" part) followed by optional whitespace and a
# unit. \u03BC is the Greek small letter mu, i.e. the micro prefix in "μg".
# Matched case-insensitively.
DOSAGE_RE = re.compile(
    r"\d+(\.\d+)?\s*(mg|g|ml|mL|UI|\u03BCg|mcg|%)",
    re.IGNORECASE,
)
def detect_language(text: str) -> str:
    """Classify *text* by which of the two expected scripts it contains.

    The check is purely script-based: any Latin letter counts as French and
    any Arabic-script character counts as Arabic.

    Returns:
        'mixed' if both scripts are present, 'ar' if only Arabic is, 'fr' if
        only Latin is, and 'unknown' if neither is found.
    """
    latin_found = LATIN_RE.search(text) is not None
    arabic_found = ARABIC_RE.search(text) is not None

    # Keyed by (latin_found, arabic_found).
    label_by_scripts = {
        (True, True): 'mixed',
        (False, True): 'ar',
        (True, False): 'fr',
        (False, False): 'unknown',
    }
    return label_by_scripts[(latin_found, arabic_found)]
# ── Data layer (annotations / training) ──────────────────────────────────────
class LineAnnotation(BaseModel):
    """One annotated line crop, as emitted by `dataset_tool.py /review`.

    Unknown fields are rejected, and leading/trailing whitespace is stripped
    from string fields (see ``model_config``).
    """

    model_config = ConfigDict(extra='forbid', str_strip_whitespace=True)

    # Non-empty file name of the annotated crop.
    file_name: str = Field(min_length=1)
    # Label text. May be empty: blank is allowed if status='blank'. Note that
    # the validator below does not itself cross-check `status`.
    text: str = Field(min_length=0)
    # Review outcome; defaults to 'ok'.
    status: Literal['ok', 'blank', 'skip', 'trash'] = 'ok'

    @field_validator('text')
    @classmethod
    def text_must_be_clean(cls, v: str, info) -> str:
        """Reject labels that are unambiguously corrupted.

        French and Arabic (or both on the same line) are accepted. An empty
        label is returned unchanged. Checks run in this order and the first
        failure wins:

        1. characters from a script matched by ``OTHER_SCRIPT_RE``
        2. LLM/markdown artifacts matched by ``ARTIFACT_RE``
        3. an embedded newline (labels describe a single-line crop)
        4. fewer than 80% printable characters

        Raises:
            ValueError: if any check fails; the message includes the first
                50 characters of the label.
        """
        if len(v) == 0:
            return v

        preview = v[:50]

        if OTHER_SCRIPT_RE.search(v) is not None:
            raise ValueError(f'Foreign script detected (only fr/ar expected): {preview!r}')

        if ARTIFACT_RE.search(v) is not None:
            raise ValueError(f'OCR artifact pattern detected: {preview!r}')

        if v.find('\n') != -1:
            raise ValueError(f'Multi-line label not allowed (single line crop only): {preview!r}')

        # Reject when more than 20% of the characters are non-printable
        # (catches OCR-noise rows). `v` is non-empty here, so the division
        # is safe.
        printable_count = sum(map(str.isprintable, v))
        printable_ratio = printable_count / len(v)
        if printable_ratio < 0.8:
            non_printable_pct = (1 - printable_ratio) * 100
            raise ValueError(f'Too many non-printable chars ({non_printable_pct:.0f}%): {preview!r}')

        return v
# ── Inference layer (TrOCR + LLM + drug-DB match output) ─────────────────────
# Closed set of labels describing how a token was matched against the drug
# DB; 'unmatched' is the default used by the models below.
MatchStrategy = Literal[
    'exact',
    'fuzzy',
    'phonetic',
    'atc',
    'unmatched',
]
class MatchResult(BaseModel):
    """Result returned by `postprocess/matcher.py::DrugMatcher.match()`.

    Unknown fields are rejected.
    """

    model_config = ConfigDict(extra='forbid')

    # Required; the token the result refers to.
    raw_token: str
    # Optional matched drug name; defaults to None.
    matched_name: Optional[str] = Field(default=None)
    # UUID from drugs table; defaults to None.
    drug_id: Optional[str] = Field(default=None)
    # Constrained to the closed interval [0.0, 1.0]; defaults to 0.0.
    confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    # Which matching strategy produced this result.
    strategy: MatchStrategy = Field(default='unmatched')
class Medication(BaseModel):
    """A single medication row of a structured prescription response.

    Only `name` is required; unknown fields are rejected.
    """

    model_config = ConfigDict(extra='forbid')

    # Required, non-empty medication name.
    name: str = Field(min_length=1)
    # Populated by DrugMatcher.
    drug_id: Optional[str] = Field(default=None)
    # e.g. "500mg", "1g"
    dosage: Optional[str] = Field(default=None)
    # e.g. "1 cp x 3/j"
    frequency: Optional[str] = Field(default=None)
    # e.g. "7 jours", "03 mois"
    duration: Optional[str] = Field(default=None)
    # e.g. "01 bte"
    quantity: Optional[str] = Field(default=None)
    # Any extra notes.
    instructions: Optional[str] = Field(default=None)
    # Constrained to the closed interval [0.0, 1.0]; defaults to 0.0.
    confidence: float = Field(default=0.0, ge=0.0, le=1.0)
    # Strategy label for the drug-DB match; defaults to 'unmatched'.
    match_strategy: MatchStrategy = Field(default='unmatched')
class LineCrop(BaseModel):
    """One line of a scanned prescription, kept for trace/feedback.

    Every field has a default; unknown fields are rejected.
    """

    model_config = ConfigDict(extra='forbid')

    # [x1, y1, x2, y2] or None. The list length is not enforced here.
    bbox: Optional[List[int]] = Field(default=None)
    # Text for this line; defaults to the empty string.
    text: str = Field(default='')
    # Constrained to the closed interval [0.0, 1.0]; defaults to 0.0.
    confidence: float = Field(default=0.0, ge=0.0, le=1.0)

    # Legacy compat with /scan endpoint — needed by active-learning feedback UI
    line_index: Optional[int] = Field(default=None)
    # JPEG-encoded line crop.
    image_base64: Optional[str] = Field(default=None)
class Prescription(BaseModel):
    """Response of the end-to-end OCR + LLM-structuring + DB-matching pipeline.

    Every field has a default; unknown fields are rejected.
    """

    model_config = ConfigDict(extra='forbid')

    success: bool = Field(default=True)
    # Label of the engine (or engine combination) that produced the result.
    method: Literal[
        'trocr',
        'trocr-printed',
        'trocr+florence2',
        'florence2-handwritten',
        'florence2-printed',
        'paddle-printed',
        'paddle+florence2',
        'vlm_fallback',
        'hybrid',
    ] = 'trocr'
    # Constrained to the closed interval [0.0, 1.0]; defaults to 0.0.
    confidence: float = Field(default=0.0, ge=0.0, le=1.0)

    # --- Structured fields ---
    doctor_name: Optional[str] = Field(default=None)
    patient_name: Optional[str] = Field(default=None)
    # DD/MM/YYYY string. The format is not validated by this model.
    prescription_date: Optional[str] = Field(default=None)
    medications: List[Medication] = Field(default_factory=list)
    notes: Optional[str] = Field(default=None)

    # --- Trace ---
    line_crops: List[LineCrop] = Field(default_factory=list)
    # Joined raw lines, debug only.
    raw_ocr_text: Optional[str] = Field(default=None)

    # --- Telemetry ---
    processing_ms: int = Field(default=0)
    model_version: str = Field(default='v1')
    dataset_version: str = Field(default='v1')
    error: Optional[str] = Field(default=None)
# ── Feedback loop (active learning) ──────────────────────────────────────────
class FeedbackCorrection(BaseModel):
    """A single line correction submitted through /v2/feedback.

    All three fields are required; unknown fields are rejected.
    """

    model_config = ConfigDict(extra='forbid')

    file_name: str
    # Human-corrected ground truth.
    text: str
    # NOTE(review): LineAnnotation.status also accepts 'trash', which is not
    # accepted here — confirm this difference is intentional.
    status: Literal[
        'ok',
        'blank',
        'skip',
    ]
class FeedbackPayload(BaseModel):
    """Request body of POST /v2/feedback.

    `image_id` and `corrections` are required; unknown fields are rejected.
    """

    model_config = ConfigDict(extra='forbid')

    image_id: str
    # Required; an empty list is accepted by this schema.
    corrections: List[FeedbackCorrection]
    reviewer_id: Optional[str] = Field(default=None)
    # ISO timestamp, kept as a plain string (not parsed or validated here).
    submitted_at: Optional[str] = Field(default=None)
# ── Eval input (one row of hold-out ground truth) ────────────────────────────
class EvalSample(BaseModel):
    """A single labeled hold-out line consumed by `eval.py`.

    Unknown fields are rejected.
    """

    model_config = ConfigDict(extra='forbid')

    file_name: str
    # Ground truth.
    text: str
    # If known, for Drug-name accuracy metric.
    expected_drug: Optional[str] = Field(default=None)