"""Testimony parser: free-text witness description -> attribute slots. Tier A (this file, always available): deterministic synonym matcher, EN+ES. Tier B (model.py, when deployed): MiniCPM5-1B slot-filling — handles messy real language ("kind of a roundish face, hadn't slept in days, big caterpillar eyebrows"). Tier A doubles as validation fallback when the model output fails. Output contract for BOTH tiers: dict[attr -> value|None], values from VOCAB. None = the witness never mentioned it (that silence powers the lineup). """ from __future__ import annotations import re import unicodedata from .face import VOCAB # attr -> value -> list of trigger phrases (EN + ES), checked longest-first SYNONYMS: dict[str, dict[str, list[str]]] = { "sex": { "male": ["a man", "a guy", "a dude", "male", "he was", "un hombre", "un tio", "un tipo", "un chico", "varon"], "female": ["a woman", "a lady", "a girl", "female", "she was", "una mujer", "una tia", "una chica", "una senora"], }, "age": { "young": ["young", "a kid", "teenager", "in his twenties", "in her twenties", "joven", "un chaval", "una chavala", "veintipocos"], "adult": ["middle aged", "middle-aged", "adult", "de mediana edad", "adulto", "cuarenton", "treintanero"], "old": ["old", "elderly", "a senior", "retiree", "in his seventies", "mayor", "anciano", "anciana", "un abuelo", "una abuela", "viejo", "vieja", "jubilado"], }, "face_shape": { "oval": ["oval face", "cara ovalada", "oval"], "round": ["round face", "roundish", "chubby face", "cara redonda", "regordeta", "mofletudo"], "square": ["square face", "square jaw", "jawline", "cara cuadrada", "mandibula marcada"], "long": ["long face", "narrow face", "thin face", "cara alargada", "cara larga", "cara fina"], }, "skin": { "light": ["pale", "light skin", "fair skin", "palido", "piel clara", "blanquito"], "medium": ["tan", "medium skin", "olive skin", "moreno claro", "piel media", "tostado"], "dark": ["dark skin", "dark-skinned", "piel oscura", "piel negra", "moreno oscuro"], }, "hair_style": { "bald": ["bald", "no hair", "shaved head", "calvo", "sin pelo", "rapado al cero"], "buzz": ["buzz cut", "buzzcut", "crew cut", "very short hair", "rapado", "pelo al uno"], "short_messy": ["messy hair", "short hair", "bedhead", "pelo corto", "despeinado", "pelo revuelto"], "slick_back": ["slicked back", "slick back", "slicked his hair back", "hair back like a banker", "gelled", "combed back", "engominado", "peinado hacia atras", "repeinado", "pelo hacia atras"], "curly": ["curly", "curls", "afro", "rizado", "rizos", "pelo chino"], "long": ["long hair", "hair down", "melena", "pelo largo"], "ponytail": ["ponytail", "pony tail", "coleta"], "mohawk": ["mohawk", "mohican", "cresta"], }, "hair_color": { "black": ["black hair", "pelo negro", "moreno de pelo"], "brown": ["brown hair", "brunette", "pelo castano", "castano", "pelo marron"], "blond": ["blond", "blonde", "rubio", "pelo amarillo"], "red": ["red hair", "redhead", "ginger", "pelirrojo", "pelo rojo"], "gray": ["gray hair", "grey hair", "white hair", "silver hair", "canoso", "pelo gris", "pelo blanco", "canas"], }, "brows": { "thin": ["thin eyebrows", "thin brows", "cejas finas", "cejas depiladas"], "thick": ["thick eyebrows", "thick brows", "strong brows", "cejas gruesas", "cejas marcadas"], "bushy": ["bushy eyebrows", "bushy brows", "caterpillar", "cejas pobladas", "cejas de oruga", "cejudo"], "unibrow": ["unibrow", "monobrow", "uniceja", "cejijunto", "una sola ceja"], }, "eyes": { "narrow": ["narrow eyes", "squinty", "squinting", "ojos pequenos", "ojos entrecerrados", "ojos rasgados"], "big": ["big eyes", "wide eyes", "bulging", "ojos grandes", "ojos saltones", "ojazos"], "droopy": ["droopy eyes", "tired eyes", "sleepy eyes", "sad eyes", "hadn't slept", "ojos caidos", "ojos tristes", "ojos cansados", "ojeras"], "normal": ["normal eyes", "ojos normales"], }, "glasses": { "round": ["round glasses", "circular glasses", "grandpa glasses", "gafas redondas", "lentes redondos", "gafas de abuelo", "gafas de esas redondas"], "square": ["square glasses", "rectangular glasses", "gafas cuadradas", "gafas de pasta"], "sunglasses": ["sunglasses", "shades", "dark glasses", "gafas de sol", "gafas oscuras", "lentes oscuros"], "none": ["no glasses", "sin gafas"], }, "nose": { "small": ["small nose", "little nose", "button nose", "nariz pequena", "naricilla", "nariz chata"], "big": ["big nose", "large nose", "huge nose", "narizon", "nariz grande", "napia"], "hooked": ["hooked nose", "hook nose", "roman nose", "beak", "nariz aguilena", "nariz de gancho", "nariz curva"], "wide": ["wide nose", "broad nose", "flat nose", "nariz ancha"], }, "mouth": { "smirk": ["smirk", "smirking", "smug smile", "half smile", "sonrisilla", "sonrisa de lado", "media sonrisa", "sonrisa chulesca"], "frown": ["frown", "frowning", "scowl", "angry mouth", "ceno", "boca enfadada", "mueca"], "open": ["mouth open", "open mouth", "gasping", "boca abierta"], "neutral": ["neutral mouth", "boca normal"], }, "facial_hair": { "none": ["clean shaven", "clean-shaven", "no beard", "afeitado", "sin barba", "bien afeitado"], "stubble": ["stubble", "five o'clock shadow", "unshaven", "scruffy", "barba de tres dias", "sin afeitar", "barba incipiente"], "mustache": ["mustache", "moustache", "bigote", "mostacho"], "goatee": ["goatee", "perilla", "chivo"], "full_beard": ["full beard", "big beard", "huge beard", "thick beard", "bearded", "barba", "barbudo", "barba cerrada", "barbaza"], }, "hat": { "beanie": ["beanie", "wool hat", "knit hat", "gorro", "gorro de lana"], "cap": ["baseball cap", "cap", "gorra", "visera"], "fedora": ["fedora", "brimmed hat", "trilby", "sombrero"], "none": ["no hat", "sin gorro", "sin sombrero", "sin gorra"], }, "extra": { "scar_cheek": ["scar", "cicatriz", "marca en la cara"], "earring": ["earring", "ear ring", "pendiente", "arete", "aro en la oreja"], "neck_tattoo": ["neck tattoo", "tattoo", "tatuaje", "tattoo en el cuello"], "mole": ["mole", "beauty mark", "lunar"], "none": [], }, } # precedence quirks: "barba" matches full_beard but "barba de tres dias" is stubble — # the longest-first matching below handles it. def _normalize(text: str) -> str: text = unicodedata.normalize("NFD", text.lower()) text = "".join(ch for ch in text if unicodedata.category(ch) != "Mn") return re.sub(r"\s+", " ", text) _MATCHERS: list[tuple[str, str, str]] = sorted( ((phrase, attr, value) for attr, values in SYNONYMS.items() for value, phrases in values.items() for phrase in phrases), key=lambda t: len(t[0]), reverse=True, ) def parse_testimony(text: str) -> dict[str, str | None]: """Tier A: longest-phrase-first deterministic matching, EN+ES.""" norm = _normalize(text) out: dict[str, str | None] = {attr: None for attr in VOCAB} consumed: list[tuple[int, int]] = [] for phrase, attr, value in _MATCHERS: if out[attr] is not None: continue i = norm.find(phrase) while i != -1: span = (i, i + len(phrase)) if not any(s < span[1] and span[0] < e for s, e in consumed): out[attr] = value consumed.append(span) break i = norm.find(phrase, i + 1) # bare-color heuristic: "rubio"/"blond" w/o the word hair still means hair color return out