eyewitness / game /parser.py
Fcabla's picture
Upload folder using huggingface_hub
e1b885e verified
Raw
History Blame Contribute Delete
7.92 kB
"""Testimony parser: free-text witness description -> attribute slots.
Tier A (this file, always available): deterministic synonym matcher, EN+ES.
Tier B (model.py, when deployed): MiniCPM5-1B slot-filling — handles messy
real language ("kind of a roundish face, hadn't slept in days, big caterpillar
eyebrows"). Tier A doubles as validation fallback when the model output fails.
Output contract for BOTH tiers: dict[attr -> value|None], values from VOCAB.
None = the witness never mentioned it (that silence powers the lineup).
"""
from __future__ import annotations
import re
import unicodedata
from .face import VOCAB
# attr -> value -> list of trigger phrases (EN + ES), checked longest-first.
# Phrases are stored already normalized: lowercase ASCII, accents stripped
# ("senora", "castano"), because they are compared against normalized text.
# Spanish adjectives are listed in both genders where the witness may be
# describing a woman, and common inflected English forms are spelled out
# explicitly rather than left to incidental partial matches.
SYNONYMS: dict[str, dict[str, list[str]]] = {
    "sex": {
        "male": [
            "a man", "a guy", "a dude", "male", "he was",
            "un hombre", "un tio", "un tipo", "un chico", "varon",
        ],
        "female": [
            "a woman", "a lady", "a girl", "female", "she was",
            "una mujer", "una tia", "una chica", "una senora",
        ],
    },
    "age": {
        "young": [
            "young", "a kid", "teenager", "in his twenties", "in her twenties",
            "joven", "un chaval", "una chavala", "veintipocos",
        ],
        "adult": [
            "middle aged", "middle-aged", "adult", "de mediana edad",
            "adulto", "cuarenton", "treintanero",
            "adulta", "treintanera",
        ],
        "old": [
            "old", "elderly", "a senior", "retiree", "in his seventies",
            "mayor", "anciano", "anciana", "un abuelo", "una abuela",
            "viejo", "vieja", "jubilado",
            "in her seventies", "older", "jubilada",
        ],
    },
    "face_shape": {
        "oval": ["oval face", "cara ovalada", "oval"],
        "round": [
            "round face", "roundish", "chubby face",
            "cara redonda", "regordeta", "mofletudo",
            "regordete", "mofletuda",
        ],
        "square": [
            "square face", "square jaw", "jawline",
            "cara cuadrada", "mandibula marcada",
        ],
        "long": [
            "long face", "narrow face", "thin face",
            "cara alargada", "cara larga", "cara fina",
        ],
    },
    "skin": {
        "light": [
            "pale", "light skin", "fair skin",
            "palido", "piel clara", "blanquito",
            "palida", "blanquita",
        ],
        "medium": [
            "tan", "medium skin", "olive skin",
            "moreno claro", "piel media", "tostado",
            "tanned", "morena clara",
        ],
        "dark": [
            "dark skin", "dark-skinned",
            "piel oscura", "piel negra", "moreno oscuro",
            "morena oscura",
        ],
    },
    "hair_style": {
        "bald": [
            "bald", "no hair", "shaved head",
            "calvo", "sin pelo", "rapado al cero",
            "calva",
        ],
        "buzz": [
            "buzz cut", "buzzcut", "crew cut", "very short hair",
            "rapado", "pelo al uno",
        ],
        "short_messy": [
            "messy hair", "short hair", "bedhead",
            "pelo corto", "despeinado", "pelo revuelto",
            "despeinada",
        ],
        "slick_back": [
            "slicked back", "slick back", "slicked his hair back",
            "hair back like a banker", "gelled", "combed back",
            "engominado", "peinado hacia atras", "repeinado", "pelo hacia atras",
        ],
        "curly": ["curly", "curls", "afro", "rizado", "rizos", "pelo chino"],
        "long": ["long hair", "hair down", "melena", "pelo largo"],
        "ponytail": ["ponytail", "pony tail", "coleta"],
        "mohawk": ["mohawk", "mohican", "cresta"],
    },
    "hair_color": {
        "black": ["black hair", "pelo negro", "moreno de pelo"],
        "brown": [
            "brown hair", "brunette", "pelo castano", "castano", "pelo marron",
            "castana",
        ],
        "blond": ["blond", "blonde", "rubio", "pelo amarillo", "rubia"],
        "red": [
            "red hair", "redhead", "ginger", "pelirrojo", "pelo rojo",
            "pelirroja",
        ],
        "gray": [
            "gray hair", "grey hair", "white hair", "silver hair",
            "canoso", "pelo gris", "pelo blanco", "canas",
            "canosa",
        ],
    },
    "brows": {
        "thin": ["thin eyebrows", "thin brows", "cejas finas", "cejas depiladas"],
        "thick": [
            "thick eyebrows", "thick brows", "strong brows",
            "cejas gruesas", "cejas marcadas",
        ],
        "bushy": [
            "bushy eyebrows", "bushy brows", "caterpillar",
            "cejas pobladas", "cejas de oruga", "cejudo",
        ],
        "unibrow": ["unibrow", "monobrow", "uniceja", "cejijunto", "una sola ceja"],
    },
    "eyes": {
        "narrow": [
            "narrow eyes", "squinty", "squinting",
            "ojos pequenos", "ojos entrecerrados", "ojos rasgados",
        ],
        "big": [
            "big eyes", "wide eyes", "bulging",
            "ojos grandes", "ojos saltones", "ojazos",
        ],
        "droopy": [
            "droopy eyes", "tired eyes", "sleepy eyes", "sad eyes", "hadn't slept",
            "ojos caidos", "ojos tristes", "ojos cansados", "ojeras",
        ],
        "normal": ["normal eyes", "ojos normales"],
    },
    "glasses": {
        "round": [
            "round glasses", "circular glasses", "grandpa glasses",
            "gafas redondas", "lentes redondos", "gafas de abuelo",
            "gafas de esas redondas",
        ],
        "square": [
            "square glasses", "rectangular glasses",
            "gafas cuadradas", "gafas de pasta",
        ],
        "sunglasses": [
            "sunglasses", "shades", "dark glasses",
            "gafas de sol", "gafas oscuras", "lentes oscuros",
        ],
        "none": ["no glasses", "sin gafas"],
    },
    "nose": {
        "small": [
            "small nose", "little nose", "button nose",
            "nariz pequena", "naricilla", "nariz chata",
        ],
        "big": [
            "big nose", "large nose", "huge nose",
            "narizon", "nariz grande", "napia",
        ],
        "hooked": [
            "hooked nose", "hook nose", "roman nose", "beak",
            "nariz aguilena", "nariz de gancho", "nariz curva",
        ],
        "wide": ["wide nose", "broad nose", "flat nose", "nariz ancha"],
    },
    "mouth": {
        "smirk": [
            "smirk", "smirking", "smug smile", "half smile",
            "sonrisilla", "sonrisa de lado", "media sonrisa", "sonrisa chulesca",
        ],
        "frown": [
            "frown", "frowning", "scowl", "angry mouth",
            "ceno", "boca enfadada", "mueca",
        ],
        "open": ["mouth open", "open mouth", "gasping", "boca abierta"],
        "neutral": ["neutral mouth", "boca normal"],
    },
    "facial_hair": {
        "none": [
            "clean shaven", "clean-shaven", "no beard",
            "afeitado", "sin barba", "bien afeitado",
        ],
        "stubble": [
            "stubble", "five o'clock shadow", "unshaven", "scruffy",
            "barba de tres dias", "sin afeitar", "barba incipiente",
        ],
        "mustache": ["mustache", "moustache", "bigote", "mostacho"],
        "goatee": ["goatee", "perilla", "chivo"],
        "full_beard": [
            "full beard", "big beard", "huge beard", "thick beard", "bearded",
            "barba", "barbudo", "barba cerrada", "barbaza",
        ],
    },
    "hat": {
        "beanie": ["beanie", "wool hat", "knit hat", "gorro", "gorro de lana"],
        "cap": ["baseball cap", "cap", "gorra", "visera"],
        "fedora": ["fedora", "brimmed hat", "trilby", "sombrero"],
        "none": ["no hat", "sin gorro", "sin sombrero", "sin gorra"],
    },
    "extra": {
        "scar_cheek": ["scar", "cicatriz", "marca en la cara", "scarred"],
        "earring": ["earring", "ear ring", "pendiente", "arete", "aro en la oreja"],
        "neck_tattoo": [
            "neck tattoo", "tattoo", "tatuaje", "tattoo en el cuello",
            "tattooed",
        ],
        "mole": ["mole", "beauty mark", "lunar"],
        # No trigger phrases: "none" is never produced by phrase matching.
        "none": [],
    },
}
# precedence quirks: "barba" matches full_beard but "barba de tres dias" is stubble —
# the longest-first matching below handles it.
def _normalize(text: str) -> str:
text = unicodedata.normalize("NFD", text.lower())
text = "".join(ch for ch in text if unicodedata.category(ch) != "Mn")
return re.sub(r"\s+", " ", text)
# Flatten SYNONYMS into (phrase, attr, value) triples, longest phrase first,
# so a specific trigger is tried before any shorter trigger it contains.
# list.sort is stable: equal-length phrases keep their SYNONYMS order.
_MATCHERS: list[tuple[str, str, str]] = [
    (trigger, attr_name, attr_value)
    for attr_name, by_value in SYNONYMS.items()
    for attr_value, triggers in by_value.items()
    for trigger in triggers
]
_MATCHERS.sort(key=lambda entry: -len(entry[0]))
# Lazily filled cache: trigger phrase -> compiled whole-word pattern.
_PHRASE_PATTERNS: dict[str, re.Pattern[str]] = {}


def _phrase_pattern(phrase: str) -> re.Pattern[str]:
    """Return the cached whole-word regex for one trigger phrase."""
    pattern = _PHRASE_PATTERNS.get(phrase)
    if pattern is None:
        # The lookarounds stop a trigger firing inside a longer word ("old" in
        # "told", "tan" in "standing"); the optional (?:e?s)? keeps plain
        # plurals such as "scars" or "lunares" matching.
        pattern = re.compile(rf"(?<!\w){re.escape(phrase)}(?:e?s)?(?!\w)")
        _PHRASE_PATTERNS[phrase] = pattern
    return pattern


def parse_testimony(text: str) -> dict[str, str | None]:
    """Tier A: longest-phrase-first deterministic matching, EN+ES.

    The text is normalized, then every trigger in ``_MATCHERS`` is tried in
    order. A trigger only counts when it appears as a whole word or phrase
    (optionally pluralized with "s"/"es"); other inflected forms must be
    listed as triggers themselves. The first trigger that matches for an
    attribute fixes that attribute's value, and the matched stretch of text is
    reserved so a later trigger cannot reuse it for a different attribute.

    Returns:
        One entry per attribute in ``VOCAB``; the value is the matched
        vocabulary value, or ``None`` when no trigger for that attribute
        was found.
    """
    norm = _normalize(text)
    out: dict[str, str | None] = {attr: None for attr in VOCAB}
    consumed: list[tuple[int, int]] = []  # [start, end) spans already used
    for phrase, attr, value in _MATCHERS:
        if out[attr] is not None:
            continue  # a longer trigger already decided this attribute
        for match in _phrase_pattern(phrase).finditer(norm):
            start, end = match.span()
            # Accept the first occurrence that overlaps no reserved span.
            if all(end <= s or e <= start for s, e in consumed):
                out[attr] = value
                consumed.append((start, end))
                break
    # Bare colour words ("rubio", "blond") need no special case here: they are
    # listed as hair-colour triggers in SYNONYMS.
    return out