Spaces:

build-small-hackathon
/

eyewitness

Sleeping

File size: 7,917 Bytes

"""Testimony parser: free-text witness description -> attribute slots.

Tier A (this file, always available): deterministic synonym matcher, EN+ES.
Tier B (model.py, when deployed): MiniCPM5-1B slot-filling — handles messy
real language ("kind of a roundish face, hadn't slept in days, big caterpillar
eyebrows"). Tier A doubles as validation fallback when the model output fails.

Output contract for BOTH tiers: dict[attr -> value|None], values from VOCAB.
None = the witness never mentioned it (that silence powers the lineup).
"""
from __future__ import annotations

import re
import unicodedata

from .face import VOCAB

# Trigger-phrase table for the Tier A matcher.
#
# Shape: attr -> value -> list of trigger phrases (EN + ES). The table is
# flattened into _MATCHERS and checked longest-phrase-first by parse_testimony.
#
# Rules for entries, as implied by how the table is consumed in this file:
#   * Phrases are searched for (str.find) in text that has already been through
#     _normalize(), so every entry must itself be lowercase and accent-free
#     ("senora", "castano", "mandibula"); an accented or capitalised entry
#     would never match.
#   * Every top-level key must also be a key of VOCAB, because parse_testimony
#     indexes its VOCAB-keyed result dict with these attr names.
#   * Short single-word triggers ("old", "tan", "cap") are the most likely to
#     fire on unrelated text; prefer longer phrases when adding entries.
#   * Among phrases of equal length, earlier entries in this table are tried
#     first (the sort that builds _MATCHERS is stable).
SYNONYMS: dict[str, dict[str, list[str]]] = {
    "sex": {
        "male": ["a man", "a guy", "a dude", "male", "he was", "un hombre", "un tio", "un tipo", "un chico", "varon"],
        "female": ["a woman", "a lady", "a girl", "female", "she was", "una mujer", "una tia", "una chica", "una senora"],
    },
    "age": {
        "young": ["young", "a kid", "teenager", "in his twenties", "in her twenties", "joven", "un chaval", "una chavala", "veintipocos"],
        "adult": ["middle aged", "middle-aged", "adult", "de mediana edad", "adulto", "cuarenton", "treintanero"],
        "old": ["old", "elderly", "a senior", "retiree", "in his seventies", "mayor", "anciano", "anciana", "un abuelo", "una abuela", "viejo", "vieja", "jubilado"],
    },
    "face_shape": {
        "oval": ["oval face", "cara ovalada", "oval"],
        "round": ["round face", "roundish", "chubby face", "cara redonda", "regordeta", "mofletudo"],
        "square": ["square face", "square jaw", "jawline", "cara cuadrada", "mandibula marcada"],
        "long": ["long face", "narrow face", "thin face", "cara alargada", "cara larga", "cara fina"],
    },
    "skin": {
        "light": ["pale", "light skin", "fair skin", "palido", "piel clara", "blanquito"],
        "medium": ["tan", "medium skin", "olive skin", "moreno claro", "piel media", "tostado"],
        "dark": ["dark skin", "dark-skinned", "piel oscura", "piel negra", "moreno oscuro"],
    },
    "hair_style": {
        "bald": ["bald", "no hair", "shaved head", "calvo", "sin pelo", "rapado al cero"],
        "buzz": ["buzz cut", "buzzcut", "crew cut", "very short hair", "rapado", "pelo al uno"],
        "short_messy": ["messy hair", "short hair", "bedhead", "pelo corto", "despeinado", "pelo revuelto"],
        "slick_back": ["slicked back", "slick back", "slicked his hair back", "hair back like a banker", "gelled", "combed back", "engominado", "peinado hacia atras", "repeinado", "pelo hacia atras"],
        "curly": ["curly", "curls", "afro", "rizado", "rizos", "pelo chino"],
        "long": ["long hair", "hair down", "melena", "pelo largo"],
        "ponytail": ["ponytail", "pony tail", "coleta"],
        "mohawk": ["mohawk", "mohican", "cresta"],
    },
    "hair_color": {
        "black": ["black hair", "pelo negro", "moreno de pelo"],
        "brown": ["brown hair", "brunette", "pelo castano", "castano", "pelo marron"],
        "blond": ["blond", "blonde", "rubio", "pelo amarillo"],
        "red": ["red hair", "redhead", "ginger", "pelirrojo", "pelo rojo"],
        "gray": ["gray hair", "grey hair", "white hair", "silver hair", "canoso", "pelo gris", "pelo blanco", "canas"],
    },
    "brows": {
        "thin": ["thin eyebrows", "thin brows", "cejas finas", "cejas depiladas"],
        "thick": ["thick eyebrows", "thick brows", "strong brows", "cejas gruesas", "cejas marcadas"],
        "bushy": ["bushy eyebrows", "bushy brows", "caterpillar", "cejas pobladas", "cejas de oruga", "cejudo"],
        "unibrow": ["unibrow", "monobrow", "uniceja", "cejijunto", "una sola ceja"],
    },
    "eyes": {
        "narrow": ["narrow eyes", "squinty", "squinting", "ojos pequenos", "ojos entrecerrados", "ojos rasgados"],
        "big": ["big eyes", "wide eyes", "bulging", "ojos grandes", "ojos saltones", "ojazos"],
        "droopy": ["droopy eyes", "tired eyes", "sleepy eyes", "sad eyes", "hadn't slept", "ojos caidos", "ojos tristes", "ojos cansados", "ojeras"],
        "normal": ["normal eyes", "ojos normales"],
    },
    "glasses": {
        "round": ["round glasses", "circular glasses", "grandpa glasses", "gafas redondas", "lentes redondos", "gafas de abuelo", "gafas de esas redondas"],
        "square": ["square glasses", "rectangular glasses", "gafas cuadradas", "gafas de pasta"],
        "sunglasses": ["sunglasses", "shades", "dark glasses", "gafas de sol", "gafas oscuras", "lentes oscuros"],
        "none": ["no glasses", "sin gafas"],
    },
    "nose": {
        "small": ["small nose", "little nose", "button nose", "nariz pequena", "naricilla", "nariz chata"],
        "big": ["big nose", "large nose", "huge nose", "narizon", "nariz grande", "napia"],
        "hooked": ["hooked nose", "hook nose", "roman nose", "beak", "nariz aguilena", "nariz de gancho", "nariz curva"],
        "wide": ["wide nose", "broad nose", "flat nose", "nariz ancha"],
    },
    "mouth": {
        "smirk": ["smirk", "smirking", "smug smile", "half smile", "sonrisilla", "sonrisa de lado", "media sonrisa", "sonrisa chulesca"],
        "frown": ["frown", "frowning", "scowl", "angry mouth", "ceno", "boca enfadada", "mueca"],
        "open": ["mouth open", "open mouth", "gasping", "boca abierta"],
        "neutral": ["neutral mouth", "boca normal"],
    },
    "facial_hair": {
        "none": ["clean shaven", "clean-shaven", "no beard", "afeitado", "sin barba", "bien afeitado"],
        "stubble": ["stubble", "five o'clock shadow", "unshaven", "scruffy", "barba de tres dias", "sin afeitar", "barba incipiente"],
        "mustache": ["mustache", "moustache", "bigote", "mostacho"],
        "goatee": ["goatee", "perilla", "chivo"],
        "full_beard": ["full beard", "big beard", "huge beard", "thick beard", "bearded", "barba", "barbudo", "barba cerrada", "barbaza"],
    },
    "hat": {
        "beanie": ["beanie", "wool hat", "knit hat", "gorro", "gorro de lana"],
        "cap": ["baseball cap", "cap", "gorra", "visera"],
        "fedora": ["fedora", "brimmed hat", "trilby", "sombrero"],
        "none": ["no hat", "sin gorro", "sin sombrero", "sin gorra"],
    },
    "extra": {
        "scar_cheek": ["scar", "cicatriz", "marca en la cara"],
        "earring": ["earring", "ear ring", "pendiente", "arete", "aro en la oreja"],
        "neck_tattoo": ["neck tattoo", "tattoo", "tatuaje", "tattoo en el cuello"],
        "mole": ["mole", "beauty mark", "lunar"],
        "none": [],  # no triggers, so the Tier A matcher never emits extra="none"
    },
}

# precedence quirks: "barba" matches full_beard but "barba de tres dias" is stubble —
# the longest-first matching below handles it.


def _normalize(text: str) -> str:
    text = unicodedata.normalize("NFD", text.lower())
    text = "".join(ch for ch in text if unicodedata.category(ch) != "Mn")
    return re.sub(r"\s+", " ", text)


# Every (phrase, attr, value) triple from SYNONYMS, flattened in table order and
# then arranged from the longest phrase to the shortest. The sort is stable, so
# phrases of equal length keep the order in which SYNONYMS lists them.
_MATCHERS: list[tuple[str, str, str]] = [
    (phrase, attr, value)
    for attr, values in SYNONYMS.items()
    for value, phrases in values.items()
    for phrase in phrases
]
_MATCHERS.sort(key=lambda triple: -len(triple[0]))


def parse_testimony(text: str) -> dict[str, str | None]:
    """Tier A: longest-phrase-first deterministic matching, EN+ES.

    The text is normalized with ``_normalize`` and then scanned for each
    trigger phrase in ``_MATCHERS`` (longest phrase first).

    A phrase is accepted only if:
      * its attribute has not already been filled by an earlier phrase;
      * it starts at a word start (beginning of text or after a
        non-alphanumeric character), so "old" no longer fires inside "told",
        "tan" inside "standing" or "a man" inside "la mano";
      * it does not overlap a span claimed by an earlier phrase, so "barba"
        cannot re-use the text already taken by "barba de tres dias".

    Characters *after* the phrase are not checked, so inflected forms such as
    "scars" or "tattoos" still match their singular trigger.

    Args:
        text: Free-text witness description.

    Returns:
        A dict with one key per attribute in ``VOCAB``. Each value is the
        matched value name, or ``None`` when no trigger phrase for that
        attribute was found.
    """
    norm = _normalize(text)
    out: dict[str, str | None] = {attr: None for attr in VOCAB}
    consumed: list[tuple[int, int]] = []  # [start, end) spans already claimed
    for phrase, attr, value in _MATCHERS:
        if out[attr] is not None:
            continue  # first (longest) hit for an attribute wins
        start = norm.find(phrase)
        while start != -1:
            end = start + len(phrase)
            at_word_start = start == 0 or not norm[start - 1].isalnum()
            overlaps = any(s < end and start < e for s, e in consumed)
            if at_word_start and not overlaps:
                out[attr] = value
                consumed.append((start, end))
                break
            # This occurrence is unusable; look for a later one.
            start = norm.find(phrase, start + 1)
    # No separate bare-colour pass is needed: colour words used without "hair"
    # ("rubio", "blond", "castano", ...) are listed directly as trigger phrases
    # in SYNONYMS["hair_color"] and are picked up by the loop above.
    return out