# app.py — Traductor Español ↔ Neoíbero (BI-ONLY 1:1 estricto, determinista) # UI completa + CSS “íbero” + TTS + Línea ibérica (codificación appOld) # Requiere un ÚNICO CSV con superficies exactas (UTF-8) y columnas: # - source_es (o es/es_surface) # - target_ni (o ni/ni_surface) # - pair_id (opcional) # # El motor NO hace heurísticas ni morfología: 1:1 exacto por superficie. # Puntuación y números pasan tal cual. Desconocidos -> [SIN-LEX:...] / [?:...] # Determinismo NI→ES: entradas NI duplicadas (ambigüas) quedan bloqueadas y se rinden como [AMB-NI:...] import gradio as gr import os, csv, re, base64, unicodedata, gzip import torch from transformers import AutoProcessor, VitsModel import numpy as np from html import escape # ====== cache ====== os.environ['TRANSFORMERS_CACHE'] = os.environ.get('TRANSFORMERS_CACHE', '/tmp/cache') os.environ['HF_HOME'] = os.environ.get('HF_HOME', '/tmp/hf') DEBUG_MODE = False def debug_print(msg): if DEBUG_MODE: print(f"[DEBUG] {msg}") # ====== util ====== def _open_maybe_gzip(path): if str(path).endswith(".gz"): # CSV debe venir en UTF-8 (evita mojibake) return gzip.open(path, "rt", encoding="utf-8", newline="") return open(path, "r", encoding="utf-8", newline="") def norm(x): return (str(x).strip()) if x is not None else "" def lower(x): return norm(x).lower() def fold(s:str)->str: return ''.join(c for c in unicodedata.normalize('NFD', s or "") if unicodedata.category(c)!="Mn") # ====== rutas ====== def _cand(*names): for n in names: if os.path.exists(n): return n p = os.path.join("salida", n) if os.path.exists(p): return p return names[0] # último recurso para mensajes # Prioriza los “master/surface-ready”; luego retrocompatibles CSV_BI = _cand( "LEXICON_UNICO_1a1.csv.gz", "MASTER_SURFACE_READY.csv.gz", "MASTER_REEXTENDED.csv.gz", "BI_SURFACE_READY.csv.gz", "HF_Pairs_BI_REEXTENDED.csv.gz", "HF_Pairs_BI_EXPANDED1_EXTENDED_FILLED.csv.gz", "HF_Pairs_BI_EXPANDED1.csv.gz" ) # ====== estructuras strict BI ====== # Clave = superficie 
# lowercased. Value = (opposite original surface, pair_id)
ES2NI = {}        # es_surface_lower -> (ni_surface, pair_id)
NI2ES = {}        # ni_surface_lower -> (es_surface, pair_id)
# N-grams / phrases:
ESPHRASE2NI = {}  # "el saco" -> (ni_surface, pair_id)
NIPHRASE2ES = {}  # "…-ke ni etxe-ka" -> (es_surface, pair_id)
MAX_NGRAM = 3

# ====== signs / minimal tokenization ======
VISIBLE_PUNCT = set(",.;:!?¡¿…()[]{}\"'«»—–“”‘’")  # set(list(...)) simplified; same members
_num_re = re.compile(r"^\d+([.,]\d+)?$")

def is_number(tok: str) -> bool:
    """True for integers and single-separator decimals like '12,5' / '3.14'."""
    return bool(_num_re.fullmatch(tok or ""))

# --- clause separators + atomic placeholders ---
CLAUSE_BREAKS = {",", ";", "—", "–", ":"}
PLACEHOLDER_RE = re.compile(r"^\[[^\]]+\]$")

def is_placeholder(tok: str) -> bool:
    """True for atomic bracketed tokens such as [SIN-LEX:...]."""
    return bool(PLACEHOLDER_RE.match(tok or ""))

def _restore_brk(tok, protected):
    """Map a __BRKn__ sentinel (optionally suffixed -na/-ba) back to the protected text."""
    m = re.fullmatch(r"__BRK(\d+)__(?:-(na|ba))?", tok or "")
    if not m:
        return tok
    idx = int(m.group(1))
    suf = m.group(2)
    base = protected[idx] if 0 <= idx < len(protected) else tok
    return base + (f"-{suf}" if suf else "")

def simple_tokenize(text: str):
    """Minimal tokenization that never splits [ ... ] or [ ... ]-na/-ba."""
    if not text:
        return []
    protected = []
    def _repl(m):
        # Replace each bracketed span with an indexed sentinel so the
        # punctuation-splitting regex below cannot tear it apart.
        key = f"__BRK{len(protected)}__"
        protected.append(m.group(0))
        return key
    t = re.sub(r"\[[^\]]*\]", _repl, (text or "").strip())
    t = re.sub(r"\s+", " ", t)
    t = re.sub(r"([,.;:!?¡¿…()\[\]{}\"'«»—–“”‘’])", r" \1 ", t)
    toks = [tok for tok in t.split() if tok]
    for i, tok in enumerate(toks):
        if tok.startswith("__BRK") and "__" in tok:
            toks[i] = _restore_brk(tok, protected)
    return toks

def detokenize(tokens):
    """Join tokens and re-attach punctuation with Spanish spacing conventions."""
    s = " ".join(tokens)
    s = re.sub(r"\s+([,.;:!?])", r"\1", s)
    s = re.sub(r"([¿¡])\s+", r"\1", s)
    s = re.sub(r"\(\s+", "(", s)
    s = re.sub(r"\s+\)", ")", s)
    s = re.sub(r"\s{2,}", " ", s).strip()
    return s

# ====== Vascoid modality (-na / -ba) ======
MODAL_SUFFIX_ENABLE = True
MODAL_ONLY_ON_FINITE = True
MODAL_STRIP_QE_IN_NI = True
SENT_END = {".", "!", "?", "…"}
OPEN_FOR = {"?": "¿", "!": "¡"}
WRAP_PREFIX = set(list("«“‘([{\"'"))
PERS_ENDINGS = ("-n","-zu","-gu","-zuk","-zuek","-k")
TAM_FINITE = ("-ke","-bo","-ta","-ni","-tu")

def looks_like_finite_ni(tok: str) -> bool:
    """Heuristic-free suffix check: does the token end in a finite TAM marker
    (optionally followed by a personal ending), ignoring any -na/-ba tail?"""
    t = (tok or "").lower()
    if not t or t.startswith("["):
        return False
    base = re.sub(r"-(na|ba)$", "", t)
    for tam in TAM_FINITE:
        if base.endswith(tam) or any(base.endswith(tam + pe) for pe in PERS_ENDINGS):
            return True
    return False

def last_content_index(tokens, start, end_exclusive):
    """Index of the last non-punctuation token in [start, end_exclusive), or -1."""
    i = end_exclusive - 1
    while i >= start and tokens[i] in VISIBLE_PUNCT:
        i -= 1
    return i if i >= start else -1

def strip_qe_punct(tokens):
    """Drop ¿ ? ¡ ! — modality is carried by -na/-ba instead."""
    return [t for t in tokens if t not in ("¿","?","¡","!")]

# --- numeric helpers: never split decimals or clock times ---
def _is_numeric_comma(tokens, i):
    return (0 < i < len(tokens)-1 and tokens[i] == ","
            and is_number(tokens[i-1]) and is_number(tokens[i+1]))

def _is_time_colon(tokens, i):
    return (0 < i < len(tokens)-1 and tokens[i] == ":"
            and is_number(tokens[i-1]) and is_number(tokens[i+1]))

def _is_true_clause_break(tokens, i):
    """A clause separator that is not part of a decimal (12,5) or a time (18:30)."""
    if tokens[i] not in CLAUSE_BREAKS:
        return False
    if _is_numeric_comma(tokens, i):
        return False
    if _is_time_colon(tokens, i):
        return False
    return True

def add_modal_suffixes_es2ni(tokens):
    """Append -na/-ba to the last finite verb (or last constituent) per sentence."""
    if not MODAL_SUFFIX_ENABLE:
        return tokens
    out = tokens[:]
    n = len(out)
    i = 0
    sent_start = 0
    while i < n:
        if out[i] in ("?", "!"):
            closer = out[i]
            target = -1
            j = i - 1
            while j >= sent_start:
                if out[j] not in VISIBLE_PUNCT and (not MODAL_ONLY_ON_FINITE or looks_like_finite_ni(out[j])):
                    target = j; break
                j -= 1
            if target == -1:
                # no finite verb found: fall back to last content token
                target = last_content_index(out, sent_start, i)
            if target != -1:
                suf = "na" if closer == "?" else "ba"
                if not re.search(rf"-(?:{suf})$", out[target].lower()):
                    out[target] = out[target] + "-" + suf
            sent_start = i + 1
        elif out[i] in SENT_END:
            sent_start = i + 1
        i += 1
    if MODAL_STRIP_QE_IN_NI:
        out = strip_qe_punct(out)
    return out

def strip_modal_suffixes_ni(tokens):
    """
    Interpret -na/-ba as modality; sentences are only closed at sentence end
    (not at commas/":" unless explicit ?/! are already present).
    """
    if not MODAL_SUFFIX_ENABLE:
        return tokens
    out = []
    buf = []
    pending_end = None
    mode = None  # "?" / "!"
    def _emit(end_override=None, also_append=None):
        nonlocal buf, mode, pending_end, out
        local = [t for t in buf if t not in ("¿","?","¡","!")]
        if local:
            end_tok = end_override or ("?" if mode == "?" else "!" if mode == "!" else pending_end or ".")
            out.extend(local)
            out.append(end_tok)
        buf.clear(); mode = None; pending_end = None
        if also_append:
            out.append(also_append)
    toks = tokens + ["."]  # sentinel terminator so the last sentence is flushed
    for i, t in enumerate(toks):
        if t in ("¿", "¡"):
            _emit(); mode = "?" if t == "¿" else "!"
            continue
        if t in ("?", "!"):
            pending_end = t; _emit(); continue
        if t in SENT_END:
            pending_end = t; _emit(); continue
        # ✦ MODALITY: do not close yet at clause separators
        if t in CLAUSE_BREAKS and mode in ("?","!"):
            buf.append(t)
            continue
        m = re.search(r"-(na|ba)$", (t or "").lower())
        if m:
            if mode and buf:
                _emit()
            mode = "?" if m.group(1) == "na" else "!"
            t = t[:-len(m.group(0))]
        if t:
            buf.append(t)
    # the sentinel "." may leave a duplicate terminator
    if len(out) >= 2 and out[-1] == "." and out[-2] == ".":
        out.pop()
    return out

def add_inverted_openers(tokens):
    """Insert ¿/¡ at the start of each stretch ending in ?/!, ignoring numeric commas/colons."""
    out = tokens[:]
    def _is_true_start_break(idx):
        if out[idx] in SENT_END:
            return True
        if out[idx] in CLAUSE_BREAKS:
            return _is_true_clause_break(out, idx)
        return False
    i = 0
    while i < len(out):
        if out[i] in ("?", "!"):
            closer = out[i]; opener = OPEN_FOR[closer]
            j = i - 1
            while j >= 0 and not _is_true_start_break(j):
                j -= 1
            start = j + 1
            k = start
            # skip opening quotes/brackets so the opener lands inside them
            while k < i and out[k] in WRAP_PREFIX:
                k += 1
            if not (k < len(out) and out[k] == opener):
                out.insert(k, opener); i += 1
        i += 1
    return out

# ====== EXPANSIONS (deterministic, ES→NI only) ======
EXPANSION_ENABLE = True
FLAG_COLNAMES = ("flags","FLAGS","expand","EXPAND","tags","TAGS","morph","MORPH")
FLAG_PLURAL = ("S",)
FLAG_3PL = ("3","V3")
VOWELS = "aeiouáéíóúüAEIOUÁÉÍÓÚÜ"

def _has_flag(cell: str, wanted: tuple) -> bool:
    c = (cell or "")
    return any(w in c for w in wanted)

def _pluralize_es_form(s: str) -> str:
    """Regular Spanish plural, preserving the case of the final letter."""
    if not s:
        return s
    sl = s.lower()
    if sl.endswith("z"):
        return s[:-1] + ("ces" if s[-1].islower() else "CES")
    if s[-1] not in VOWELS:
        return s + ("es" if s[-1].islower() else "ES")
    return s + ("s" if s[-1].islower() else "S")

def _present_3pl_from_3sg(s: str) -> str:
    """3rd-person plural present from the 3sg form (append -n)."""
    if not s:
        return s
    return s + ("n" if s[-1].islower() else "N")

# ====== TTS (appOld) ======
print("Cargando modelo de voz (opcional)…")
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = model = None
try:
    processor = AutoProcessor.from_pretrained("facebook/mms-tts-spa")
    model = VitsModel.from_pretrained("facebook/mms-tts-spa").to(device)
    print("Modelo de voz cargado.")
except Exception as e:
    # TTS is optional: keep running without audio if the model cannot load.
    print(f"AVISO TTS: {e}")

def add_reading_pauses(text: str, level: int = 3) -> str:
    """Duplicate , and . so the TTS voice pauses at them (level<=1 disables)."""
    if level <= 1:
        return text
    t = re.sub(r",\s*", ", , ", text)
    # FIX: the second substitution previously ran on `text`, discarding the
    # comma pauses added above; it must chain on `t`.
    t = re.sub(r"\.\s*", ". . ", t)
    return re.sub(r'\s+', ' ', t).strip()

def hispanize_for_tts(ni_text: str) -> str:
    """Approximate NI with Spanish phonemes for the MMS-TTS Spanish voice."""
    text = (ni_text or "").lower()
    # note: .replace('eś','es') is a no-op after 'ś'→'s', kept for fidelity
    text = text.replace('ŕ','rr').replace('ś','s').replace('eś','es').replace('-', ' ')
    text = re.sub(r'\[.*?\]','',text); text = re.sub(r'\s+',' ',text).strip()
    return add_reading_pauses(text, 3)

def synthesize_speech(text):
    """Run VITS on the hispanized text; returns (rate, waveform) or None."""
    if not text or not text.strip() or model is None or processor is None:
        return None
    try:
        inputs = processor(text=hispanize_for_tts(text), return_tensors="pt").to(device)
        with torch.no_grad():
            output = model(**inputs).waveform
        speech_np = output.cpu().numpy().squeeze()
        mx = max(abs(speech_np.min()), abs(speech_np.max()))
        if mx > 0:
            speech_np = speech_np / mx * 0.9  # normalize with headroom
        return (16000, speech_np.astype(np.float32))
    except Exception as e:
        print(f"Error TTS: {e}"); return None

# ====== Iberian line (appOld) ======
V = "aeiou"
SYL_FOR = {
    "b": ["‹BA›","‹BE›","‹BI›","‹BO›","‹BU›"],
    "d": ["‹DA›","‹DE›","‹DI›","‹DO›","‹DU›"],
    "t": ["‹TA›","‹TE›","‹TI›","‹TO›","‹TU›"],
    # FIX: removed dead "... if False else ..." branch (it contained a ‹DO› typo)
    "g": ["‹GA›","‹GE›","‹GI›","‹GO›","‹GU›"],
    "k": ["‹KA›","‹KE›","‹KI›","‹KO›","‹KU›"]
}
ALPHA_FOR = {"a":"‹A›","e":"‹E›","i":"‹I›","o":"‹O›","u":"‹U›","s":"‹S›","ś":"‹Ś›",
             "l":"‹L›","r":"‹R›","ŕ":"‹Ŕ›","n":"‹N›","m":"‹M›"}
CODA_FOR = {"":"","n":"‹N›","s":"‹S›","ś":"‹Ś›","r":"‹R›","ŕ":"‹Ŕ›","l":"‹L›","m":"‹M›","k":"‹K›","t":"‹T›"}

def tokens_from_latin(ni: str) -> str:
    # NOTE(review): the loop body of this function was corrupted in the source
    # dump; reconstructed from SYL_FOR/ALPHA_FOR/CODA_FOR — confirm against the
    # original implementation.
    out = []; i = 0; ni = (ni or "").lower()
    while i < len(ni):
        ch = ni[i]
        nxt = ni[i+1] if i + 1 < len(ni) else ""
        if ch in SYL_FOR and nxt in V:
            out.append(SYL_FOR[ch][V.index(nxt)])  # CV syllabogram
            i += 2
        elif ch in ALPHA_FOR:
            out.append(ALPHA_FOR[ch])              # alphabetic sign
            i += 1
        elif ch in CODA_FOR and CODA_FOR[ch]:
            out.append(CODA_FOR[ch])               # coda consonant
            i += 1
        else:
            i += 1                                  # no mapping (e.g. '-'): skip
    return "".join(out)

# NOTE(review): the original definitions of KEYS_MODE / KEYS_OVERRIDE were lost
# in the dump; defaults reconstructed from how georgeos_keys uses them — verify.
KEYS_MODE = "full"     # "compact" collapses syllabograms to their first letter
KEYS_OVERRIDE = {}     # ni_lower -> pre-computed key string

def georgeos_keys(token_str, ni_plain) -> str:
    """Map ‹…› sign tokens to the keystrokes of the IberiaGeorgeos font."""
    low = (ni_plain or "").lower()
    if low in KEYS_OVERRIDE:
        return KEYS_OVERRIDE[low]
    m = re.findall(r"‹(.*?)›", token_str)
    out = []
    for t in m:
        if KEYS_MODE == "compact":
            if len(t) == 2 and t[0] in "BDTGK":
                out.append(t[0])
            elif t in ("A","E","I","O","U"):
                out.append(t)
            elif t == "Ś":
                out.append("X")
            elif t == "Ŕ":
                out.append("r")
            else:
                out.append(t[0].upper())
        else:
            if len(t) == 2 and t[0] in "BDTGK":
                out.append(t)
            elif t == "Ś":
                out.append("X")
            elif t == "Ŕ":
                out.append("r")
            else:
                out.append(t)
    return "".join(out)

TRIDOT = "/"
def render_ib_with_tridots(ib_toks):
    """Join Iberian-glyph tokens with the tridot word separator; punctuation untouched."""
    res = []
    prev_word = False
    for tk in ib_toks:
        is_punct = tk in VISIBLE_PUNCT
        if is_punct:
            res.append(" " + tk + " ")
            prev_word = False
        else:
            if prev_word:
                res.append(" " + TRIDOT + " ")
            res.append(tk)
            prev_word = True
    return "".join(res).strip()

# ====== BI loader + diagnostics ======
# ### ★ STRICT AND DETERMINISTIC MODE
STRICT_BI_ENFORCE = True   # if True, ambiguous NI surfaces are not admitted
AMBIG_NI = {}              # ni_lower -> set of conflicting ES surfaces
BI_DIAG_HTML = "Sin CSV cargado."

def load_bi_strict_and_diagnose():
    """Load the CSV, fill ES2NI/NI2ES and prepare an HTML diagnostics blob."""
    global BI_DIAG_HTML
    # empty all tables before loading (determinism)
    ES2NI.clear(); NI2ES.clear(); ESPHRASE2NI.clear(); NIPHRASE2ES.clear()
    AMBIG_NI.clear()
    if not os.path.exists(CSV_BI):
        msg = f"[ERROR] No se encontró el CSV bilingüe: {CSV_BI}"
        print(msg); BI_DIAG_HTML = f"Error: {escape(msg)}"
        return False
    rows = 0; dup_es = 0; dup_ni = 0; empty_pid = 0
    mismatch_backmap = 0
    mismatch_samples = []
    pid_seen = set()
    print(f"Detectado CSV bilingüe: {CSV_BI}")
    try:
        with _open_maybe_gzip(CSV_BI) as f:
            rd = csv.DictReader(f)
            flds = set(rd.fieldnames or [])
            # column aliases, newest naming first
            ES_COL = "source_es" if "source_es" in flds else "es_surface" if "es_surface" in flds else "es"
            NI_COL = "target_ni" if "target_ni" in flds else "ni_surface" if "ni_surface" in flds else "ni"
            IDCOL = "pair_id" if "pair_id" in flds else "id" if "id" in flds else None
            FLAGCOL = None
            for cand in FLAG_COLNAMES:
                if cand in flds:
                    FLAGCOL = cand; break
            base_rows = []
            for r in rd:
                es_orig = (r.get(ES_COL) or "").strip()
                ni_orig = (r.get(NI_COL) or "").strip()
                if not (es_orig and ni_orig):
                    continue
                pid = (r.get(IDCOL) or "").strip() if IDCOL else ""
                if not pid:
                    empty_pid += 1
                else:
                    pid_seen.add(pid)
                flags = (r.get(FLAGCOL) or "") if FLAGCOL else ""
                es = lower(es_orig)
                ni = lower(ni_orig)
                # Phrases
                if " " in es:
                    if es not in ESPHRASE2NI:  # deterministic: first row wins
                        ESPHRASE2NI[es] = (ni_orig, pid)
                if " " in ni:
                    if ni not in NIPHRASE2ES:
                        NIPHRASE2ES[ni] = (es_orig, pid)
                # ES→NI (deterministic: first row wins)
                if es in ES2NI:
                    dup_es += 1
                else:
                    ES2NI[es] = (ni_orig, pid)
                # NI→ES (deterministic + ambiguity blocking)
                if ni in NI2ES:
                    dup_ni += 1
                    # record the ambiguity
                    s = AMBIG_NI.get(ni, set())
                    s.add(NI2ES[ni][0]); s.add(es_orig)
                    AMBIG_NI[ni] = s
                    if STRICT_BI_ENFORCE:
                        NI2ES.pop(ni, None)  # invalidate the conflicting NI surface
                else:
                    if STRICT_BI_ENFORCE and ni in AMBIG_NI:
                        # already flagged ambiguous: do not reinsert
                        pass
                    else:
                        NI2ES[ni] = (es_orig, pid)
                base_rows.append((es_orig, ni_orig, pid, flags))
                rows += 1
        # Deterministic expansions (only add to ES2NI; NEVER touch NI2ES)
        if EXPANSION_ENABLE:
            for es_orig, ni_orig, pid, flags in base_rows:
                if not flags:
                    continue
                if _has_flag(flags, FLAG_PLURAL):
                    pl = _pluralize_es_form(es_orig)
                    pl_key = lower(pl)
                    if pl_key not in ES2NI:
                        ES2NI[pl_key] = (ni_orig, pid)
                if _has_flag(flags, FLAG_3PL):
                    p3 = _present_3pl_from_3sg(es_orig)
                    p3_key = lower(p3)
                    if p3_key not in ES2NI:
                        ES2NI[p3_key] = (ni_orig, pid)
        # Asymmetry diagnostics (does not affect determinism)
        for es_low, (ni_surf, _) in ES2NI.items():
            ni_low = lower(ni_surf)
            back = NI2ES.get(ni_low)
            if back and lower(back[0]) != es_low:
                mismatch_backmap += 1
                if len(mismatch_samples) < 10:
                    mismatch_samples.append((es_low, ni_low, lower(back[0])))
    except Exception as e:
        msg = f"[ERROR] Al leer {CSV_BI}: {e}"
        print(msg); BI_DIAG_HTML = f"Error: {escape(msg)}"
        return False
    es_unique = len(ES2NI)
    ni_unique = len(NI2ES)
    pid_unique = len(pid_seen)
    print(f"✓ BI-ONLY ESTRICTO cargado: {rows:,} filas.")
    if dup_es: print(f"[AVISO] {dup_es:,} duplicados ES (se usó la primera).")
    if dup_ni: print(f"[AVISO] {dup_ni:,} duplicados NI (bloqueados en modo estricto).")
    if empty_pid: print(f"[AVISO] {empty_pid:,} filas sin pair_id.")
    if mismatch_backmap: print(f"[ALERTA] {mismatch_backmap:,} asimetrías ES↔NI (misma NI apunta a otro ES).")
    # NOTE(review): the HTML markup of the diagnostics blob was stripped by the
    # source dump (the remaining fragments were syntactically invalid Python);
    # reconstructed minimally — confirm against the original layout.
    sam_html = ""
    if mismatch_samples:
        sam_rows = "".join(
            f"<li>{escape(es)} → {escape(ni)} → {escape(es2)}</li>"
            for es, ni, es2 in mismatch_samples
        )
        sam_html = f"<details><summary>Muestras</summary><ul>{sam_rows}</ul></details>"
    ambN = sum(len(v) > 1 for v in AMBIG_NI.values())
    ambList = ", ".join(f"{k}→{sorted(list(v))[:3]}" for k, v in list(AMBIG_NI.items())[:5])
    BI_DIAG_HTML = f"""
    <b>Diagnóstico del CSV BI</b><br>
    Archivo: {escape(CSV_BI)}<br>
    Filas base (CSV): {rows:,}<br>
    ES únicas (tras expansiones): {es_unique:,}  |  NI únicas: {ni_unique:,}  |  pair_id únicos: {pid_unique:,}<br>
    Duplicados ES: {dup_es:,}  |  Duplicados NI: {dup_ni:,} (bloqueados en estricto)  |  Sin pair_id: {empty_pid:,}<br>
    Asimetrías ES↔NI: {mismatch_backmap:,} {sam_html}<br>
    NI ambiguas bloqueadas: {ambN:,}{(' · ej.: ' + escape(ambList)) if ambN else ''}<br>
    Regla: el motor usa sólo tablas 1:1; NI duplicadas se bloquean y se muestran como [AMB-NI:...].
    """
    return rows > 0

print("Cargando léxico/pares (BI-estricto)…")
load_bi_strict_and_diagnose()

# ====== n-gram utility (longest-match, BI-only) ======
def _longest_match(tokens, i, phrase_map):
    """Return (span, surface) if a mapped phrase starts at i; else (0, None)."""
    if not phrase_map:
        return (0, None)
    max_span = 0; surface = None
    # deterministic: the longest span wins
    for span in range(1, MAX_NGRAM + 1):
        if i + span > len(tokens):
            break
        cand = " ".join(lower(t) for t in tokens[i:i+span])
        if cand in phrase_map:
            max_span = span
            surface = phrase_map[cand][0]
    return (max_span, surface)

# ====== ES post-processing (spacing + sentence capitalization) ======
def sentence_case_spanish(s: str) -> str:
    """Capitalize the first letter of each sentence, skipping [ ... ] spans."""
    out = []
    start = True
    in_br = False  # inside [ ... ]
    WRAPS = "¿¡\"'«(“‘["
    for ch in s:
        if ch == '[':
            in_br = True
        if not in_br and start:
            if ch.isspace():
                out.append(ch)
            elif ch in WRAPS:
                out.append(ch)  # openers do not consume the "start" state
            elif ch.isalpha():
                out.append(ch.upper()); start = False
            else:
                out.append(ch)
                start = ch in "¿¡"
        else:
            out.append(ch)
            if not in_br and ch in ".?!…":
                start = True
            elif not in_br and ch in "¿¡":
                start = True
        if ch == ']':
            in_br = False
    return "".join(out)
# ✦ FIX (upstream): never re-space times/decimals; add a space only after . ! ? ;
# (NOT after comma/":")
def postprocess_spanish(s: str) -> str:
    """Spacing/caps cleanup for ES output; keeps times (18:30) and decimals intact."""
    # 1) compact times and decimals
    s = re.sub(r"(\d)\s*:\s*(\d)", r"\1:\2", s)        # 18:30
    s = re.sub(r"(\d)\s*([.,])\s*(\d)", r"\1\2\3", s)  # 12,65 / 3.1415
    # 2) spacing and signs
    s = re.sub(r"\s+([,.;:!?])", r"\1", s)             # nothing before closing signs
    # add a space ONLY after . ! ? ; (NOT after comma/":")
    s = re.sub(r"([?.!;])(?!\s|$)([^\s])", r"\1 \2", s)
    # 3) inverted openers hug the following word
    s = re.sub(r"([¿¡])\s+", r"\1", s)
    # 4) collapse space runs
    s = re.sub(r"\s{2,}", " ", s).strip()
    # 5) sentence-initial capitalization
    return sentence_case_spanish(s)

# ====== Strict BI translation ======
def translate_es_to_ni_bi(text: str):
    """ES→NI: exact 1:1 surface replacement; returns (ni_text, ib_html)."""
    toks = simple_tokenize(text)
    out = []
    i = 0
    while i < len(toks):
        t = toks[i]
        if t in VISIBLE_PUNCT:
            out.append(t); i += 1; continue
        if is_placeholder(t):
            out.append(t); i += 1; continue
        span, ni_surface = _longest_match(toks, i, ESPHRASE2NI)
        if span > 1:
            out.append(ni_surface)
            i += span; continue
        key = lower(t)
        if key in ES2NI:
            out.append(ES2NI[key][0])
        elif is_number(key):
            out.append(t)
        else:
            out.append(f"[SIN-LEX:{t}]")  # unknown surface: explicit marker
        i += 1
    if MODAL_SUFFIX_ENABLE:
        out = add_modal_suffixes_es2ni(out)
    # FIX: the Iberian line is now built exactly once, AFTER the modal pass.
    # Previously it was also built token-by-token inside the loop above and
    # then unconditionally discarded and rebuilt — pure wasted work.
    ib_toks = []
    for tt in out:
        if tt in VISIBLE_PUNCT or tt.startswith("["):
            ib_toks.append(tt)  # punctuation/placeholders pass through as-is
        else:
            ib_toks.append(georgeos_keys(tokens_from_latin(tt), tt))
    ni_text = detokenize(out)
    # NOTE(review): the HTML wrapper was stripped by the source dump;
    # reconstructed from the .ib-line CSS class — confirm against the original.
    ib_html = "<div class='ib-line'>" + escape(render_ib_with_tridots(ib_toks)) + "</div>"
    return ni_text, ib_html

def translate_ni_to_es_bi(text: str):
    """NI→ES: exact 1:1 surface replacement with ambiguity blocking."""
    toks = simple_tokenize(text)
    if MODAL_SUFFIX_ENABLE:
        toks = strip_modal_suffixes_ni(toks)
    out = []
    i = 0
    while i < len(toks):
        t = toks[i]
        if t in VISIBLE_PUNCT:
            out.append(t); i += 1; continue
        if is_placeholder(t):
            out.append(t); i += 1; continue
        span, es_surface = _longest_match(toks, i, NIPHRASE2ES)
        if span > 1:
            out.append(es_surface); i += span; continue
        key = lower(t)
        if key in NI2ES:
            es = NI2ES[key][0] or ""
            out.append(es if es else f"[?:{t}]")
        elif key in AMBIG_NI and STRICT_BI_ENFORCE:
            # ★ deterministic: never pick arbitrarily among colliding NI surfaces
            out.append(f"[AMB-NI:{t}]")
        elif is_number(key):
            out.append(t)
        else:
            out.append(f"[?:{t}]")
        i += 1
    if MODAL_SUFFIX_ENABLE:
        out = add_inverted_openers(out)
    es_text = detokenize(out)
    es_text = postprocess_spanish(es_text)
    return es_text
# ====== Diagnostics ======
def diagnose_text(text, dir_label):
    """Coverage / ambiguity / asymmetry report for *text* in the given direction."""
    if not text or not text.strip():
        return "Introduce texto para diagnosticar."
    toks = simple_tokenize(text)
    unknown = set(); asym = set(); amb = set()
    total_tokens = 0; covered = 0
    if dir_label.startswith("ES"):
        head = "ES→NI"
        i = 0
        while i < len(toks):
            t = toks[i]
            if t in VISIBLE_PUNCT or is_number(t):
                i += 1; continue
            total_tokens += 1
            span, _ = _longest_match(toks, i, ESPHRASE2NI)
            if span > 1:
                covered += 1; i += span; continue
            k = lower(t)
            if k not in ES2NI:
                unknown.add(t); i += 1; continue
            covered += 1
            ni = ES2NI[k][0]
            back = NI2ES.get(lower(ni))
            if back and lower(back[0]) != k:
                # round-trip lands on a different ES surface
                asym.add(f"{t} → {ni} → {back[0]}")
            i += 1
    else:
        head = "NI→ES"
        i = 0
        while i < len(toks):
            t = toks[i]
            if t in VISIBLE_PUNCT or is_number(t):
                i += 1; continue
            total_tokens += 1
            span, _ = _longest_match(toks, i, NIPHRASE2ES)
            if span > 1:
                covered += 1; i += span; continue
            k = lower(t)
            if k in AMBIG_NI:
                amb.add(t); i += 1; continue
            if k not in NI2ES:
                unknown.add(t); i += 1; continue
            covered += 1
            es = NI2ES[k][0]
            back = ES2NI.get(lower(es))
            if back and lower(back[0]) != k:
                asym.add(f"{t} → {es} → {back[0]}")
            i += 1
    cov_pct = (covered / total_tokens * 100) if total_tokens else 100.0
    # NOTE(review): the HTML markup of this report was stripped by the source
    # dump (only bullet glyphs survived); reconstructed minimally — confirm
    # against the original layout.
    cov_html = (f"<p>Tokens (sin puntuación/numéricos): {total_tokens}  |  "
                f"Cubiertos: {covered} ({cov_pct:.1f}%)</p>")
    unk_html = "".join(f"<li>{escape(u)}</li>" for u in sorted(unknown, key=lambda x: lower(x))) or "<li>—</li>"
    amb_html = "".join(f"<li>{escape(a)}</li>" for a in sorted(amb, key=lambda x: lower(x))) or "<li>—</li>"
    asy_html = "".join(f"<li>{escape(a)}</li>" for a in sorted(asym)) or "<li>—</li>"
    return (f"<b>Diagnóstico {head}</b>{cov_html}"
            f"Ambiguas (NI duplicada):<ul>{amb_html}</ul>"
            f"Faltantes:<ul>{unk_html}</ul>"
            f"Asimetrías:<ul>{asy_html}</ul>")

# ====== UI (CSS / accordions / fonts) ======
# UI strings, per language. Keys are shared between "ES" and "EN" so
# switch_lang() can swap every label/placeholder/accordion title in one pass.
LABELS = {
    "ES": {
        "title": "Traductor Español ↔ Neoíbero",
        "subtitle": "CSV estricto (BI-only 1:1; sin heurísticas; .gz) — determinista",
        "in_label_es": "✏️ Entrada (Español)",
        "in_label_ni": "✏️ Entrada (Neoíbero)",
        "in_ph_es": "Escribe aquí. Ej.: Veo a Ana y doy pan a Marta.",
        "in_ph_ni": "Idatzi hemen. Adib.: nuker-ke ni etxe-ka.",
        "out_lat_esni": "📜 Salida: Neoíbero (latín)",
        "out_lat_nies": "📜 Salida: Español",
        "out_ib": "🗿 Línea ibérica",
        "out_audio": "🔊 Locución (Audio)",
        "btn": "🔄 Traducir",
        "combo": "🌍 Idioma (UI + explicación)",
        "dir": "🔁 Dirección",
        "dir_opts": ["ES → NI", "NI → ES"],
        "doc_header": "📚 Documentación y Referencia",
        # One title per accordion; order must match DOC["ES"].
        "acc_titles": [
            "🎓 Marco académico y decisiones del neoíbero",
            "🏛️ Herencia posible del íbero histórico",
            "🎨 Diseño de la conlang (neoíbero)",
            "⚙️ Pipeline del traductor (BI-estricto 1:1)",
            "🔤 Ortografía, línea ibérica y claves",
            "❓/❗ Modalidad vascoide (-na / -ba)",
            "🧩 Expansiones por CSV: plurales (S) y 3pl (3/V3)",
            "📖 Gramática de referencia (v1.2)",
            "📚 Bibliografía de base",
            "🧾 Siglas y glosario",
            "🔗 Simetría por pair_id (modo bilingüe)"
        ]
    },
    "EN": {
        "title": "Spanish ↔ Neo-Iberian Translator",
        "subtitle": "Strict BI-only (1:1 surfaces; no heuristics; .gz) — deterministic",
        "in_label_es": "✏️ Input (Spanish)",
        "in_label_ni": "✏️ Input (Neo-Iberian)",
        "in_ph_es": "Type here. E.g., Veo a Ana y doy pan a Marta.",
        "in_ph_ni": "Type here. E.g., nuker-ke ni etxe-ka.",
        "out_lat_esni": "📜 Output: Neo-Iberian (Latin)",
        "out_lat_nies": "📜 Output: Spanish",
        "out_ib": "🗿 Iberian line",
        "out_audio": "🔊 Speech (Audio)",
        "btn": "🔄 Translate",
        "combo": "🌍 Language (UI + docs)",
        "dir": "🔁 Direction",
        "dir_opts": ["ES → NI", "NI → ES"],
        "doc_header": "📚 Documentation & Reference",
        "acc_titles": [
            "🎓 Background & design choices",
            "🏛️ Possible inheritance from ancient Iberian",
            "🎨 Conlang design (Neo-Iberian)",
            "⚙️ Translator pipeline (strict 1:1)",
            "🔤 Orthography, Iberian line & keys",
            "❓/❗ Vascoid modality (-na / -ba)",
            "🧩 CSV-driven expansions: plurals (S) & 3pl (3/V3)",
            "📖 Reference grammar (v1.2)",
            "📚 Core references",
            "🧾 Acronyms & glossary",
            "🔗 Pair-id symmetry (bilingual mode)"
        ]
    }
}

# Accordion bodies (markdown), one entry per accordion title above.
DOC = {
    "ES": [
        "**Escritura y datos.** Un **único CSV con `pair_id`** y superficies exactas. La traducción ES↔NI es **1:1** por superficie.",
        "**Herencia plausible del íbero.** Fonotaxis CV(C); p→b; r/ŕ; casos -k/-te/-ka/-ar/-en/-i.",
        "**Diseño del neoíbero.** TAM: PRS -ke, PST -bo, FUT -ta, IPFV -ri, IMP -tu, COND/SBJV -ni, FUT_SBJV -ra.",
        "**Pipeline (BI-estricto 1:1).** Tokeniza; sustitución exacta; NI ambigua **se bloquea** y sale como `[AMB-NI:…]`.",
        "**Ortografía y línea ibérica.** Tokens BA/BE/…; tridots '/'; p→b; codas N/S/Ś/R/Ŕ/L/M/K/T.",
        "**Modalidad (-na/-ba).** ES→NI puede omitir ¿?¡! (si está activo). NI→ES inserta `¿…?`/`¡…!` al final de la oración marcada, **no en comas**.",
        "**Expansiones por CSV (deterministas).** `flags=S` plural regular; `flags=3|V3` 3ª plural del presente. Solo si lo marcas.",
        "**Gramática mínima.** Visualización; la gramática no se “calcula”.",
        "**Bibliografía.** Untermann; de Hoz; Ferrer i Jané; Correa…",
        "**Glosario & datasets.** Faltas → `[SIN-LEX:…]` / `[?:…]`. Ambiguas → `[AMB-NI:…]` (limpia tu CSV).",
        "**Simetría por pair_id.** El diagnóstico avisa si una NI apunta a dos ES distintos."
    ],
    "EN": [
        "One bilingual CSV with `pair_id` and exact surfaces. ES↔NI is strictly 1:1.",
        "Possible inheritance (non-palaeographic).",
        "Neo-Iberian design (phonology & morphology).",
        "Pipeline: tokenise → exact replacement. Ambiguous NI are **blocked** and rendered as `[AMB-NI:…]`.",
        "Orthography, Iberian line & keys.",
        "Modality (-na/-ba): ES→NI can drop ¿?¡!. NI→ES places them at sentence end, not at commas.",
        "CSV-driven expansions (deterministic): `S` plural; `3|V3` present 3pl.",
        "Minimal grammar (v1.2).",
        "Selected references.",
        "Glossary & datasets.",
        "Pair-id symmetry diagnostics."
    ]
}

# ====== CSS + font ======
def build_css():
    """Build the themed CSS; embeds Iberia-Georgeos.ttf as base64 if present."""
    b64 = None
    if os.path.exists("Iberia-Georgeos.ttf"):
        with open("Iberia-Georgeos.ttf", "rb") as f:
            b64 = base64.b64encode(f.read()).decode("ascii")
    # Fall back to a system font when the TTF is not shipped alongside the app.
    font_src = f"url(data:font/ttf;base64,{b64}) format('truetype')" if b64 else "local('sans-serif')"
    return f"""
@font-face {{ font-family: 'IberiaGeorgeos'; src: {font_src}; font-weight: normal; font-style: normal; }}
:root {{ --iberian-clay:#8B4513; --iberian-ochre:#CC7722; --iberian-stone:#5C5C5C; --iberian-sand:#D2B48C; --iberian-rust:#A0522D; --iberian-bronze:#CD7F32; }}
.gradio-container {{ background:linear-gradient(135deg,#f4e8d8 0%,#e8d5c4 50%,#d4c4b0 100%)!important; font-family:'Georgia','Times New Roman',serif!important; }}
.gradio-container h1, .gradio-container h2, .gradio-container h3 {{ color:var(--iberian-clay)!important; text-shadow:2px 2px 4px rgba(139,69,19,.15)!important; border-bottom:3px solid var(--iberian-bronze)!important; padding-bottom:.5rem!important; letter-spacing:.5px!important; }}
.gradio-container .gr-group {{ background:linear-gradient(to bottom,#f9f6f0,#ede6dc)!important; border:2px solid var(--iberian-sand)!important; border-radius:8px!important; box-shadow:0 4px 12px rgba(139,69,19,.2), inset 0 1px 0 rgba(255,255,255,.5)!important; padding:1.5rem!important; margin-bottom:1.5rem!important; }}
.gradio-container .gr-accordion {{ background:linear-gradient(145deg,#ebe3d5,#d9cec0)!important; border:2px solid var(--iberian-rust)!important; border-radius:6px!important; margin-bottom:.8rem!important; box-shadow:2px 2px 6px rgba(0,0,0,.15)!important; }}
.gradio-container .gr-accordion .label-wrap {{ background:linear-gradient(to right,var(--iberian-ochre),var(--iberian-rust))!important; color:#fff!important; font-weight:600!important; padding:.8rem 1rem!important; border-radius:4px!important; text-shadow:1px 1px 2px rgba(0,0,0,.3)!important; }}
.gradio-container .gr-textbox textarea, .gradio-container .gr-textbox input {{ background:linear-gradient(to bottom,#faf8f3,#f5f0e8)!important; border:2px solid var(--iberian-sand)!important; border-radius:6px!important; color:var(--iberian-stone)!important; font-family:'Georgia',serif!important; box-shadow:inset 2px 2px 4px rgba(139,69,19,.1)!important; }}
.gradio-container .gr-textbox textarea:focus, .gradio-container .gr-textbox input:focus {{ border-color:var(--iberian-bronze)!important; box-shadow:inset 2px 2px 4px rgba(139,69,19,.1), 0 0 8px rgba(205,127,50,.3)!important; }}
.gradio-container .gr-button.gr-button-primary {{ background:linear-gradient(145deg,var(--iberian-bronze),var(--iberian-rust))!important; border:2px solid var(--iberian-clay)!important; color:#fff!important; font-weight:bold!important; text-shadow:1px 2px 2px rgba(0,0,0,.4)!important; box-shadow:0 4px 8px rgba(139,69,19,.3), inset 0 1px 0 rgba(255,255,255,.2)!important; border-radius:8px!important; padding:.8rem 1.5rem!important; transition:all .3s ease!important; }}
.gradio-container .gr-button.gr-button-primary:hover {{ background:linear-gradient(145deg,var(--iberian-rust),var(--iberian-bronze))!important; transform:translateY(-2px)!important; box-shadow:0 6px 12px rgba(139,69,19,.4)!important; }}
.ib-line {{ font-family:'IberiaGeorgeos',monospace,sans-serif!important; font-size:1.9rem!important; line-height:2.4rem!important; white-space:pre-wrap!important; background:linear-gradient(135deg,#e8dcc8 0%,#d4c4a8 50%,#c4b098 100%)!important; padding:24px!important; border-radius:10px!important; border:3px solid var(--iberian-rust)!important; border-left:6px solid var(--iberian-bronze)!important; box-shadow:0 4px 15px rgba(139,69,19,.25), inset 0 2px 4px rgba(0,0,0,.1)!important; color:var(--iberian-clay)!important; position:relative!important; }}
.ib-line::before {{ content:''!important; position:absolute!important; inset:0!important; background-image:repeating-linear-gradient(0deg,transparent,transparent 2px, rgba(139,69,19,.03) 2px, rgba(139,69,19,.03) 4px)!important; pointer-events:none!important; border-radius:10px!important; }}
@media (max-width:768px) {{ .ib-line {{ font-size:1.5rem!important; line-height:2rem!important; padding:16px!important; }} .gradio-container .gr-group {{ padding:1rem!important; }} .gradio-container h1 {{ font-size:1.8rem!important; }} }}
@media (max-width:480px) {{ .ib-line {{ font-size:1.3rem!important; line-height:1.8rem!important; padding:12px!important; }} .gradio-container h1 {{ font-size:1.5rem!important; }} }}
"""

CSS = build_css()
with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple")) as demo:
    # --- header: title + language/direction selectors ---
    with gr.Group():
        title = gr.Markdown(f"# {LABELS['ES']['title']}")
        subtitle = gr.Markdown(f"*{LABELS['ES']['subtitle']}*")
        with gr.Row():
            combo = gr.Dropdown(choices=["ES","EN"], value="ES", label=LABELS["ES"]["combo"])
            direction = gr.Radio(choices=LABELS["ES"]["dir_opts"], value="ES → NI", label=LABELS["ES"]["dir"])
    # --- documentation accordions (one per LABELS[...]["acc_titles"] entry) ---
    with gr.Group():
        doc_header = gr.Markdown(f"## {LABELS['ES']['doc_header']}")
        acc_titles = LABELS["ES"]["acc_titles"]
        with gr.Accordion(acc_titles[0], open=False) as acc1:
            md1 = gr.Markdown(DOC["ES"][0])
        with gr.Accordion(acc_titles[1], open=False) as acc2:
            md2 = gr.Markdown(DOC["ES"][1])
        with gr.Accordion(acc_titles[2], open=False) as acc3:
            md3 = gr.Markdown(DOC["ES"][2])
        with gr.Accordion(acc_titles[3], open=False) as acc4:
            md4 = gr.Markdown(DOC["ES"][3])
        with gr.Accordion(acc_titles[4], open=False) as acc5:
            md5 = gr.Markdown(DOC["ES"][4])
        with gr.Accordion(acc_titles[5], open=False) as acc6:
            md6 = gr.Markdown(DOC["ES"][5])
        with gr.Accordion(acc_titles[6], open=False) as acc7:
            md7 = gr.Markdown(DOC["ES"][6])
        with gr.Accordion(acc_titles[7], open=False) as acc8:
            md8 = gr.Markdown(DOC["ES"][7])
        with gr.Accordion(acc_titles[8], open=False) as acc9:
            md9 = gr.Markdown(DOC["ES"][8])
        with gr.Accordion(acc_titles[9], open=False) as acc10:
            md10 = gr.Markdown(DOC["ES"][9])
        with gr.Accordion(acc_titles[10], open=False) as acc11:
            md11 = gr.Markdown(DOC["ES"][10])
        with gr.Accordion("🧪 Diagnóstico del CSV BI (al cargar)", open=False):
            bi_diag_box = gr.HTML(value=BI_DIAG_HTML)
    # --- translation panel ---
    with gr.Group():
        es_in = gr.Textbox(label=LABELS["ES"]["in_label_es"], placeholder=LABELS["ES"]["in_ph_es"], lines=5)
        with gr.Row():
            btn_tr = gr.Button(LABELS["ES"]["btn"], variant="primary")
            btn_diag = gr.Button("🔎 Diagnosticar BI con este texto", variant="secondary")
        with gr.Row():
            with gr.Column(scale=2):
                ni_out = gr.Textbox(label=LABELS["ES"]["out_lat_esni"], lines=5, interactive=False)
                loc_btn = gr.Button("🔊 Locutar", variant="secondary", visible=True)
                audio_out = gr.Audio(label=LABELS["ES"]["out_audio"], type="numpy")
            with gr.Column(scale=1):
                ib_out = gr.HTML(label=LABELS["ES"]["out_ib"])
        diag_out = gr.HTML(value="")

    def do_translate(text, dir_label):
        """Route the input through the right engine; clears audio/diagnostics.
        NOTE(review): the empty ib-line placeholder strings were stripped by the
        source dump; reconstructed from the .ib-line CSS class — confirm."""
        if not text or not text.strip():
            return (gr.update(value=""),
                    gr.update(value="<div class='ib-line'></div>"),
                    gr.update(visible=False), gr.update(value=None), gr.update(value=""))
        if dir_label.startswith("ES"):
            latin, ib = translate_es_to_ni_bi(text)
            return (gr.update(label=LABELS["ES"]["out_lat_esni"], value=latin),
                    gr.update(value=ib),
                    gr.update(visible=True), gr.update(value=None), gr.update(value=""))
        else:
            es_text = translate_ni_to_es_bi(text)
            return (gr.update(label=LABELS["ES"]["out_lat_nies"], value=es_text),
                    gr.update(value="<div class='ib-line'></div>"),
                    gr.update(visible=False), gr.update(value=None), gr.update(value=""))
    btn_tr.click(do_translate, [es_in, direction], [ni_out, ib_out, loc_btn, audio_out, diag_out])

    def run_locution(latin_text, dir_label):
        # TTS only makes sense for NI output (ES→NI direction)
        if dir_label.startswith("ES"):
            return synthesize_speech(latin_text)
        return None
    loc_btn.click(run_locution, [ni_out, direction], audio_out)

    def do_diagnose(text, dir_label):
        return gr.update(value=diagnose_text(text, dir_label))
    btn_diag.click(do_diagnose, [es_in, direction], [diag_out])

    def switch_lang(sel_lang, dir_label):
        """Relabel every UI element + accordion body for the selected language."""
        L = LABELS[sel_lang]; T = L["acc_titles"]; D = DOC[sel_lang]
        in_label = L["in_label_es"] if dir_label.startswith("ES") else L["in_label_ni"]
        in_ph = L["in_ph_es"] if dir_label.startswith("ES") else L["in_ph_ni"]
        out_lab = L["out_lat_esni"] if dir_label.startswith("ES") else L["out_lat_nies"]
        return (
            gr.update(value=f"# {L['title']}"),
            gr.update(value=f"*{L['subtitle']}*"),
            gr.update(label=L["combo"], value=sel_lang),
            gr.update(label=L["dir"], choices=L["dir_opts"], value=dir_label),
            gr.update(value=f"## {L['doc_header']}"),
            gr.update(label=T[0]), gr.update(value=D[0]),
            gr.update(label=T[1]), gr.update(value=D[1]),
            gr.update(label=T[2]), gr.update(value=D[2]),
            gr.update(label=T[3]), gr.update(value=D[3]),
            gr.update(label=T[4]), gr.update(value=D[4]),
            gr.update(label=T[5]), gr.update(value=D[5]),
            gr.update(label=T[6]), gr.update(value=D[6]),
            gr.update(label=T[7]), gr.update(value=D[7]),
            gr.update(label=T[8]), gr.update(value=D[8]),
            gr.update(label=T[9]), gr.update(value=D[9]),
            gr.update(label=T[10]), gr.update(value=D[10]),
            gr.update(label=in_label, placeholder=in_ph),
            gr.update(label=out_lab),
            gr.update(label=L["out_ib"]),
            gr.update(label=L["out_audio"]),
            gr.update(value=L["btn"])
        )
    combo.change(
        switch_lang, [combo, direction],
        [title, subtitle, combo, direction, doc_header,
         acc1, md1, acc2, md2, acc3, md3, acc4, md4, acc5, md5, acc6, md6,
         acc7, md7, acc8, md8, acc9, md9, acc10, md10, acc11, md11,
         es_in, ni_out, ib_out, audio_out, btn_tr]
    )

    def switch_direction(dir_label, sel_lang):
        """Swap input/output labels and reset outputs when the direction flips."""
        L = LABELS[sel_lang]
        in_label = L["in_label_es"] if dir_label.startswith("ES") else L["in_label_ni"]
        in_ph = L["in_ph_es"] if dir_label.startswith("ES") else L["in_ph_ni"]
        out_lab = L["out_lat_esni"] if dir_label.startswith("ES") else L["out_lat_nies"]
        # FIX: idiomatic boolean instead of `True if … else False`
        loc_vis = dir_label.startswith("ES")
        return (gr.update(label=in_label, placeholder=in_ph),
                gr.update(label=out_lab, value=""),
                gr.update(value="<div class='ib-line'></div>"),
                gr.update(visible=loc_vis),
                gr.update(value=None),
                gr.update(value=""))
    direction.change(
        switch_direction, [direction, combo],
        [es_in, ni_out, ib_out, loc_btn, audio_out, diag_out]
    )

# ====== optional smoke test ======
def _symmetry_smoketest():
    """Round-trip a few NI probes through NI→ES→NI and print the results."""
    print("\n[SMOKE] Prueba ES↔NI (BI-estricto, determinista)…")
    probes = [
        "nuker-ke ni etxe-ka ?",
        "¿Pagaste 12,75 en la cafetería?",
        "Marta llega a las 18:30.",
        "[SIN-LEX:Tomás]-na euŕak-ke !"
    ]
    for p in probes:
        es_from_ni = translate_ni_to_es_bi(p)
        ni_round, _ = translate_es_to_ni_bi(es_from_ni)
        print(" IN:", p)
        print(" ES:", es_from_ni)
        print(" NI:", ni_round)
        print("---")

if DEBUG_MODE:
    _symmetry_smoketest()

if __name__ == "__main__":
    demo.queue().launch()