Spaces:
Sleeping
Sleeping
File size: 7,917 Bytes
"""Testimony parser: free-text witness description -> attribute slots.
Tier A (this file, always available): deterministic synonym matcher, EN+ES.
Tier B (model.py, when deployed): MiniCPM5-1B slot-filling — handles messy
real language ("kind of a roundish face, hadn't slept in days, big caterpillar
eyebrows"). Tier A doubles as validation fallback when the model output fails.
Output contract for BOTH tiers: dict[attr -> value|None], values from VOCAB.
None = the witness never mentioned it (that silence powers the lineup).
"""
from __future__ import annotations
import re
import unicodedata
from .face import VOCAB
# Trigger-phrase table: attribute -> vocabulary value -> phrases (EN + ES).
# Phrases are written lowercase and accent-free ("senora", "pequena").
# Matching is longest-phrase-first; among equal-length phrases the order in
# which they appear in this table decides, so keep the ordering stable.
SYNONYMS: dict[str, dict[str, list[str]]] = {
    # --- who -----------------------------------------------------------
    "sex": {
        "male": [
            "a man", "a guy", "a dude", "male", "he was",
            "un hombre", "un tio", "un tipo", "un chico", "varon",
        ],
        "female": [
            "a woman", "a lady", "a girl", "female", "she was",
            "una mujer", "una tia", "una chica", "una senora",
        ],
    },
    "age": {
        "young": [
            "young", "a kid", "teenager", "in his twenties", "in her twenties",
            "joven", "un chaval", "una chavala", "veintipocos",
        ],
        "adult": [
            "middle aged", "middle-aged", "adult",
            "de mediana edad", "adulto", "cuarenton", "treintanero",
        ],
        "old": [
            "old", "elderly", "a senior", "retiree", "in his seventies",
            "mayor", "anciano", "anciana", "un abuelo", "una abuela",
            "viejo", "vieja", "jubilado",
        ],
    },
    # --- face and skin ---------------------------------------------------
    "face_shape": {
        "oval": ["oval face", "cara ovalada", "oval"],
        "round": [
            "round face", "roundish", "chubby face",
            "cara redonda", "regordeta", "mofletudo",
        ],
        "square": [
            "square face", "square jaw", "jawline",
            "cara cuadrada", "mandibula marcada",
        ],
        "long": [
            "long face", "narrow face", "thin face",
            "cara alargada", "cara larga", "cara fina",
        ],
    },
    "skin": {
        "light": [
            "pale", "light skin", "fair skin",
            "palido", "piel clara", "blanquito",
        ],
        "medium": [
            "tan", "medium skin", "olive skin",
            "moreno claro", "piel media", "tostado",
        ],
        "dark": [
            "dark skin", "dark-skinned",
            "piel oscura", "piel negra", "moreno oscuro",
        ],
    },
    # --- hair ------------------------------------------------------------
    "hair_style": {
        "bald": [
            "bald", "no hair", "shaved head",
            "calvo", "sin pelo", "rapado al cero",
        ],
        "buzz": [
            "buzz cut", "buzzcut", "crew cut", "very short hair",
            "rapado", "pelo al uno",
        ],
        "short_messy": [
            "messy hair", "short hair", "bedhead",
            "pelo corto", "despeinado", "pelo revuelto",
        ],
        "slick_back": [
            "slicked back", "slick back", "slicked his hair back",
            "hair back like a banker", "gelled", "combed back",
            "engominado", "peinado hacia atras", "repeinado",
            "pelo hacia atras",
        ],
        "curly": ["curly", "curls", "afro", "rizado", "rizos", "pelo chino"],
        "long": ["long hair", "hair down", "melena", "pelo largo"],
        "ponytail": ["ponytail", "pony tail", "coleta"],
        "mohawk": ["mohawk", "mohican", "cresta"],
    },
    "hair_color": {
        "black": ["black hair", "pelo negro", "moreno de pelo"],
        "brown": [
            "brown hair", "brunette",
            "pelo castano", "castano", "pelo marron",
        ],
        "blond": ["blond", "blonde", "rubio", "pelo amarillo"],
        "red": ["red hair", "redhead", "ginger", "pelirrojo", "pelo rojo"],
        "gray": [
            "gray hair", "grey hair", "white hair", "silver hair",
            "canoso", "pelo gris", "pelo blanco", "canas",
        ],
    },
    # --- eyes, brows, glasses --------------------------------------------
    "brows": {
        "thin": ["thin eyebrows", "thin brows", "cejas finas", "cejas depiladas"],
        "thick": [
            "thick eyebrows", "thick brows", "strong brows",
            "cejas gruesas", "cejas marcadas",
        ],
        "bushy": [
            "bushy eyebrows", "bushy brows", "caterpillar",
            "cejas pobladas", "cejas de oruga", "cejudo",
        ],
        "unibrow": [
            "unibrow", "monobrow",
            "uniceja", "cejijunto", "una sola ceja",
        ],
    },
    "eyes": {
        "narrow": [
            "narrow eyes", "squinty", "squinting",
            "ojos pequenos", "ojos entrecerrados", "ojos rasgados",
        ],
        "big": [
            "big eyes", "wide eyes", "bulging",
            "ojos grandes", "ojos saltones", "ojazos",
        ],
        "droopy": [
            "droopy eyes", "tired eyes", "sleepy eyes", "sad eyes",
            "hadn't slept",
            "ojos caidos", "ojos tristes", "ojos cansados", "ojeras",
        ],
        "normal": ["normal eyes", "ojos normales"],
    },
    "glasses": {
        "round": [
            "round glasses", "circular glasses", "grandpa glasses",
            "gafas redondas", "lentes redondos", "gafas de abuelo",
            "gafas de esas redondas",
        ],
        "square": [
            "square glasses", "rectangular glasses",
            "gafas cuadradas", "gafas de pasta",
        ],
        "sunglasses": [
            "sunglasses", "shades", "dark glasses",
            "gafas de sol", "gafas oscuras", "lentes oscuros",
        ],
        "none": ["no glasses", "sin gafas"],
    },
    # --- nose, mouth, facial hair ----------------------------------------
    "nose": {
        "small": [
            "small nose", "little nose", "button nose",
            "nariz pequena", "naricilla", "nariz chata",
        ],
        "big": [
            "big nose", "large nose", "huge nose",
            "narizon", "nariz grande", "napia",
        ],
        "hooked": [
            "hooked nose", "hook nose", "roman nose", "beak",
            "nariz aguilena", "nariz de gancho", "nariz curva",
        ],
        "wide": ["wide nose", "broad nose", "flat nose", "nariz ancha"],
    },
    "mouth": {
        "smirk": [
            "smirk", "smirking", "smug smile", "half smile",
            "sonrisilla", "sonrisa de lado", "media sonrisa",
            "sonrisa chulesca",
        ],
        "frown": [
            "frown", "frowning", "scowl", "angry mouth",
            "ceno", "boca enfadada", "mueca",
        ],
        "open": ["mouth open", "open mouth", "gasping", "boca abierta"],
        "neutral": ["neutral mouth", "boca normal"],
    },
    "facial_hair": {
        "none": [
            "clean shaven", "clean-shaven", "no beard",
            "afeitado", "sin barba", "bien afeitado",
        ],
        "stubble": [
            "stubble", "five o'clock shadow", "unshaven", "scruffy",
            "barba de tres dias", "sin afeitar", "barba incipiente",
        ],
        "mustache": ["mustache", "moustache", "bigote", "mostacho"],
        "goatee": ["goatee", "perilla", "chivo"],
        "full_beard": [
            "full beard", "big beard", "huge beard", "thick beard", "bearded",
            "barba", "barbudo", "barba cerrada", "barbaza",
        ],
    },
    # --- accessories and marks -------------------------------------------
    "hat": {
        "beanie": ["beanie", "wool hat", "knit hat", "gorro", "gorro de lana"],
        "cap": ["baseball cap", "cap", "gorra", "visera"],
        "fedora": ["fedora", "brimmed hat", "trilby", "sombrero"],
        "none": ["no hat", "sin gorro", "sin sombrero", "sin gorra"],
    },
    "extra": {
        "scar_cheek": ["scar", "cicatriz", "marca en la cara"],
        "earring": [
            "earring", "ear ring",
            "pendiente", "arete", "aro en la oreja",
        ],
        "neck_tattoo": [
            "neck tattoo", "tattoo", "tatuaje", "tattoo en el cuello",
        ],
        "mole": ["mole", "beauty mark", "lunar"],
        # No trigger phrases: "none" is never produced by phrase matching.
        "none": [],
    },
}
# precedence quirks: "barba" matches full_beard but "barba de tres dias" is stubble —
# the longest-first matching below handles it.
# Typographic punctuation that keyboards and word processors substitute for
# the ASCII characters used in the trigger phrases ("hadn't slept",
# "five o'clock shadow", "clean-shaven"). Each mapping is one char -> one char.
_PUNCT_FOLD = str.maketrans({
    "\u2018": "'",  # left single quotation mark
    "\u2019": "'",  # right single quotation mark (the usual "smart" apostrophe)
    "\u02bc": "'",  # modifier letter apostrophe
    "\u2010": "-",  # hyphen
    "\u2011": "-",  # non-breaking hyphen
})


def _normalize(text: str) -> str:
    """Fold *text* into the plain form the trigger phrases are written in.

    The result is lowercase, has combining accents removed
    ("Señora" -> "senora"), has typographic apostrophes/hyphens replaced by
    their ASCII equivalents, and has every whitespace run collapsed to a
    single space. Leading/trailing whitespace is collapsed, not stripped.

    Args:
        text: Raw input text.

    Returns:
        The normalized string.
    """
    # NFD splits an accented letter into base letter + combining mark, so
    # dropping category "Mn" (nonspacing marks) leaves the bare letter.
    text = unicodedata.normalize("NFD", text.lower())
    text = "".join(ch for ch in text if unicodedata.category(ch) != "Mn")
    # Without this, "hadn\u2019t slept" could never match "hadn't slept".
    text = text.translate(_PUNCT_FOLD)
    return re.sub(r"\s+", " ", text)
# Flat (phrase, attr, value) view of SYNONYMS, ordered longest phrase first so
# that a specific phrase ("barba de tres dias") is tried before a shorter one
# it contains ("barba").
_MATCHERS: list[tuple[str, str, str]] = [
    (trigger, slot, label)
    for slot, labels in SYNONYMS.items()
    for label, triggers in labels.items()
    for trigger in triggers
]
# list.sort is stable, also with reverse=True: equal-length phrases stay in
# the order they were listed in SYNONYMS.
_MATCHERS.sort(key=lambda triple: len(triple[0]), reverse=True)
def parse_testimony(text: str) -> dict[str, str | None]:
    """Tier A: longest-phrase-first deterministic matching, EN+ES.

    The text is normalized with ``_normalize`` and every trigger phrase in
    ``_MATCHERS`` is looked up in it, longest phrase first. The first phrase
    that matches for an attribute fixes that attribute's value; later
    (shorter) phrases for the same attribute are ignored. A match is only
    accepted if it begins at the start of a word and does not overlap text
    already claimed by an earlier match.

    Args:
        text: Free-text witness description.

    Returns:
        A dict with one key per attribute in ``VOCAB``. Each value is the
        matched vocabulary value, or ``None`` when no trigger phrase for
        that attribute was found in the text.
    """
    norm = _normalize(text)
    out: dict[str, str | None] = {attr: None for attr in VOCAB}
    # (start, end) spans of the normalized text already claimed by a match.
    consumed: list[tuple[int, int]] = []

    for phrase, attr, value in _MATCHERS:
        if out[attr] is not None:
            # A longer phrase already decided this attribute.
            continue
        i = norm.find(phrase)
        while i != -1:
            span = (i, i + len(phrase))
            # Reject hits that begin in the middle of a word: a bare
            # substring search would read "told" as "old", "standing" as
            # "tan" and "escape" as "cap". Only the left edge is checked so
            # inflected forms ("scars", "tanned", "earrings") still match.
            at_word_start = i == 0 or not norm[i - 1].isalnum()
            overlaps = any(s < span[1] and span[0] < e for s, e in consumed)
            if at_word_start and not overlaps:
                out[attr] = value
                consumed.append(span)
                break
            i = norm.find(phrase, i + 1)

    # No separate bare-color pass is needed here: bare color words such as
    # "blond" / "rubio" are themselves trigger phrases in SYNONYMS.
    return out
|