LoloSemper's picture
Update app.py
6d901fa verified
# app.py — Traductor Español ↔ Neoíbero (BI-ONLY 1:1 estricto, determinista)
# UI completa + CSS “íbero” + TTS + Línea ibérica (codificación appOld)
# Requiere un ÚNICO CSV con superficies exactas (UTF-8) y columnas:
# - source_es (o es/es_surface)
# - target_ni (o ni/ni_surface)
# - pair_id (opcional)
#
# El motor NO hace heurísticas ni morfología: 1:1 exacto por superficie.
# Puntuación y números pasan tal cual. Desconocidos -> [SIN-LEX:...] / [?:...]
# Determinismo NI→ES: entradas NI duplicadas (ambiguas) quedan bloqueadas y se muestran como [AMB-NI:...]
import gradio as gr
import os, csv, re, base64, unicodedata, gzip
import torch
from transformers import AutoProcessor, VitsModel
import numpy as np
from html import escape
# ====== cache ======
os.environ['TRANSFORMERS_CACHE'] = os.environ.get('TRANSFORMERS_CACHE', '/tmp/cache')
os.environ['HF_HOME'] = os.environ.get('HF_HOME', '/tmp/hf')
DEBUG_MODE = False


def debug_print(msg):
    """Print *msg* prefixed with ``[DEBUG]``, but only while DEBUG_MODE is on."""
    if not DEBUG_MODE:
        return
    print(f"[DEBUG] {msg}")
# ====== util ======
def _open_maybe_gzip(path):
if str(path).endswith(".gz"):
# CSV debe venir en UTF-8 (evita mojibake)
return gzip.open(path, "rt", encoding="utf-8", newline="")
return open(path, "r", encoding="utf-8", newline="")
def norm(x):
    """Return ``str(x)`` with surrounding whitespace removed; ``None`` becomes ``""``."""
    if x is None:
        return ""
    return str(x).strip()
def lower(x):
    """Return the stripped, lower-cased string form of *x*; ``None`` becomes ``""``."""
    stripped = "" if x is None else str(x).strip()
    return stripped.lower()
def fold(s: str) -> str:
    """Strip combining marks (Unicode category ``Mn``) from *s* after NFD decomposition.

    A falsy *s* is treated as the empty string.
    """
    decomposed = unicodedata.normalize("NFD", s or "")
    kept = filter(lambda ch: unicodedata.category(ch) != "Mn", decomposed)
    return "".join(kept)
# ====== rutas ======
def _cand(*names):
for n in names:
if os.path.exists(n): return n
p = os.path.join("salida", n)
if os.path.exists(p): return p
return names[0] # último recurso para mensajes
# Path of the bilingual CSV, resolved once at import time by _cand().
# Candidates are listed in priority order: the “master/surface-ready” files
# first, then the backward-compatible ones. If none exists, _cand() returns
# the first name so error messages can mention it.
CSV_BI = _cand(
"LEXICON_UNICO_1a1.csv.gz",
"MASTER_SURFACE_READY.csv.gz",
"MASTER_REEXTENDED.csv.gz",
"BI_SURFACE_READY.csv.gz",
"HF_Pairs_BI_REEXTENDED.csv.gz",
"HF_Pairs_BI_EXPANDED1_EXTENDED_FILLED.csv.gz",
"HF_Pairs_BI_EXPANDED1.csv.gz"
)
# ====== strict BI structures ======
# Key = exact surface form, lower-cased. Value = (original opposite-side surface, pair_id)
ES2NI = {} # es_surface_lower -> (ni_surface, pair_id)
NI2ES = {} # ni_surface_lower -> (es_surface, pair_id)
# N-grams / phrases (only surfaces that contain a space are stored here):
ESPHRASE2NI = {} # "el saco" -> (ni_surface, pair_id)
NIPHRASE2ES = {} # "…-ke ni etxe-ka" -> (es_surface, pair_id)
# Longest phrase, in tokens, that _longest_match() will try.
MAX_NGRAM = 3
# ====== punctuation / minimal tokenisation ======
VISIBLE_PUNCT = set(",.;:!?¡¿…()[]{}\"'«»—–“”‘’")
_num_re = re.compile(r"^\d+([.,]\d+)?$")


def is_number(tok: str) -> bool:
    """Tell whether *tok* is a digit run with an optional ``.``/``,`` fraction.

    A falsy token is treated as the empty string and therefore not a number.
    """
    return _num_re.fullmatch(tok or "") is not None
# --- clause separators + atomic placeholders ---
CLAUSE_BREAKS = {",", ";", "—", "–", ":"}
PLACEHOLDER_RE = re.compile(r"^\[[^\]]+\]$")


def is_placeholder(tok: str) -> bool:
    """Tell whether *tok* is a single bracketed placeholder such as ``[SIN-LEX:x]``.

    A falsy token is treated as the empty string.
    """
    return PLACEHOLDER_RE.match(tok or "") is not None
def _restore_brk(tok, protected):
m = re.fullmatch(r"__BRK(\d+)__(?:-(na|ba))?", tok or "")
if not m: return tok
idx = int(m.group(1))
suf = m.group(2)
base = protected[idx] if 0 <= idx < len(protected) else tok
return base + (f"-{suf}" if suf else "")
def simple_tokenize(text:str):
"""Tokenización mínima, sin romper [ ... ] ni [ ... ]-na/-ba."""
if not text:
return []
protected = []
def _repl(m):
key = f"__BRK{len(protected)}__"
protected.append(m.group(0))
return key
t = re.sub(r"\[[^\]]*\]", _repl, (text or "").strip())
t = re.sub(r"\s+"," ", t)
t = re.sub(r"([,.;:!?¡¿…()\[\]{}\"'«»—–“”‘’])", r" \1 ", t)
toks = [tok for tok in t.split() if tok]
for i, tok in enumerate(toks):
if tok.startswith("__BRK") and "__" in tok:
toks[i] = _restore_brk(tok, protected)
return toks
def detokenize(tokens):
    """Join *tokens* with spaces and tidy the spacing around punctuation.

    Whitespace is removed before ``, . ; : ! ?``, after ``¿``/``¡``, and just
    inside parentheses; runs of whitespace collapse to one space.
    """
    text = " ".join(tokens)
    cleanups = (
        (r"\s+([,.;:!?])", r"\1"),
        (r"([¿¡])\s+", r"\1"),
        (r"\(\s+", "("),
        (r"\s+\)", ")"),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in cleanups:
        text = re.sub(pattern, replacement, text)
    return text.strip()
# ====== Basque-style ("vascoide") modality (-na / -ba) ======
# Master switch: when False the modal helpers return their input unchanged.
MODAL_SUFFIX_ENABLE = True
# When True, add_modal_suffixes_es2ni first looks for a token accepted by
# looks_like_finite_ni before falling back to the last non-punctuation token.
MODAL_ONLY_ON_FINITE = True
# When True, ¿ ? ¡ ! are removed from the NI token list after suffixing.
MODAL_STRIP_QE_IN_NI = True
# Tokens treated as sentence-final.
SENT_END = {".", "!", "?", "…"}
# Closing sign -> matching Spanish opening sign.
OPEN_FOR = {"?": "¿", "!": "¡"}
# Opening quotes/brackets that add_inverted_openers steps over before
# placing ¿/¡.
WRAP_PREFIX = set(list("«“‘([{\"'"))
# Endings checked by looks_like_finite_ni: a TAM ending, optionally followed
# by one of the person endings.
PERS_ENDINGS = ("-n","-zu","-gu","-zuk","-zuek","-k")
TAM_FINITE = ("-ke","-bo","-ta","-ni","-tu")
def looks_like_finite_ni(tok: str) -> bool:
    """Tell whether *tok* ends like a finite NI verb form.

    The token is lower-cased and a trailing ``-na``/``-ba`` is ignored. It
    qualifies when it ends in one of TAM_FINITE, alone or followed by one of
    PERS_ENDINGS. Empty tokens and tokens starting with ``[`` never qualify.
    """
    word = (tok or "").lower()
    if not word or word.startswith("["):
        return False
    stem = re.sub(r"-(na|ba)$", "", word)
    endings = tuple(TAM_FINITE) + tuple(
        tam + person for tam in TAM_FINITE for person in PERS_ENDINGS
    )
    return stem.endswith(endings)
def last_content_index(tokens, start, end_exclusive):
    """Return the index of the last non-punctuation token in
    ``tokens[start:end_exclusive]``, or ``-1`` when there is none."""
    for idx in range(end_exclusive - 1, start - 1, -1):
        if tokens[idx] not in VISIBLE_PUNCT:
            return idx
    return -1
def strip_qe_punct(tokens):
    """Return *tokens* without question/exclamation signs (``¿ ? ¡ !``)."""
    marks = ("¿", "?", "¡", "!")
    return list(filter(lambda tok: tok not in marks, tokens))
# --- numeric helpers so decimals / clock times are not split ---
def _sign_between_numbers(tokens, i, sign):
    """True when ``tokens[i]`` is *sign* and both neighbours are numbers."""
    return (
        0 < i < len(tokens) - 1
        and tokens[i] == sign
        and is_number(tokens[i - 1])
        and is_number(tokens[i + 1])
    )


def _is_numeric_comma(tokens, i):
    """True when the comma at *i* sits between two numbers (decimal comma)."""
    return _sign_between_numbers(tokens, i, ",")


def _is_time_colon(tokens, i):
    """True when the colon at *i* sits between two numbers (clock time)."""
    return _sign_between_numbers(tokens, i, ":")


def _is_true_clause_break(tokens, i):
    """True when ``tokens[i]`` is a clause separator that is not part of a
    decimal number or a clock time."""
    if tokens[i] not in CLAUSE_BREAKS:
        return False
    return not (_is_numeric_comma(tokens, i) or _is_time_colon(tokens, i))
def add_modal_suffixes_es2ni(tokens):
    """Attach ``-na`` (question) or ``-ba`` (exclamation) to one token per sentence.

    For every ``?``/``!`` the host token is searched backwards inside the
    current sentence: the nearest non-punctuation token, which must also pass
    looks_like_finite_ni when MODAL_ONLY_ON_FINITE is set; failing that, the
    last non-punctuation token of the sentence. The suffix is not added twice.
    When MODAL_STRIP_QE_IN_NI is set the ``¿ ? ¡ !`` signs are then dropped.

    Returns *tokens* itself when MODAL_SUFFIX_ENABLE is off, otherwise a new
    list.
    """
    if not MODAL_SUFFIX_ENABLE:
        return tokens
    result = list(tokens)
    sentence_begin = 0
    for pos in range(len(result)):
        mark = result[pos]
        if mark not in ("?", "!"):
            if mark in SENT_END:
                sentence_begin = pos + 1
            continue
        # Nearest eligible token to the left, within the current sentence.
        host = next(
            (
                j
                for j in range(pos - 1, sentence_begin - 1, -1)
                if result[j] not in VISIBLE_PUNCT
                and (not MODAL_ONLY_ON_FINITE or looks_like_finite_ni(result[j]))
            ),
            -1,
        )
        if host == -1:
            host = last_content_index(result, sentence_begin, pos)
        if host != -1:
            suffix = "na" if mark == "?" else "ba"
            if not re.search(rf"-(?:{suffix})$", result[host].lower()):
                result[host] = result[host] + "-" + suffix
        sentence_begin = pos + 1
    if MODAL_STRIP_QE_IN_NI:
        result = strip_qe_punct(result)
    return result
def strip_modal_suffixes_ni(tokens):
"""
Interpret -na/-ba as modality markers and rebuild explicit ?/! punctuation.

Sentences are closed ONLY at sentence end. (Commas/“:” do not close a
sentence while a modality is pending; explicit ?/! still do.)

Returns the input unchanged when MODAL_SUFFIX_ENABLE is off, otherwise a
new token list in which every emitted sentence carries a closing token.
"""
if not MODAL_SUFFIX_ENABLE:
return tokens
out = []
buf = []
pending_end = None
mode = None # "?" / "!"
# Flush buf into out (minus any ¿ ? ¡ !) followed by one closing token,
# chosen in this order: end_override, the pending modality (? or !),
# pending_end, and finally ".". Nothing is emitted for an empty buffer.
# NOTE(review): also_append is never passed by the visible callers.
def _emit(end_override=None, also_append=None):
nonlocal buf, mode, pending_end, out
local = [t for t in buf if t not in ("¿","?","¡","!")]
if local:
end_tok = end_override or ("?" if mode == "?" else "!" if mode == "!" else pending_end or ".")
out.extend(local)
out.append(end_tok)
buf.clear(); mode = None; pending_end = None
if also_append:
out.append(also_append)
# A sentinel "." is appended so the last buffered sentence is always flushed.
toks = tokens + ["."]
for i, t in enumerate(toks):
# An explicit opener flushes what came before and sets the modality.
if t in ("¿", "¡"):
_emit(); mode = "?" if t == "¿" else "!"
continue
if t in ("?", "!"):
pending_end = t; _emit(); continue
if t in SENT_END:
pending_end = t; _emit(); continue
# ✦ MODALITY: at clause separators we do NOT close yet:
if t in CLAUSE_BREAKS and mode in ("?","!"):
buf.append(t)
continue
# A trailing -na/-ba sets the modality and is stripped from the token.
m = re.search(r"-(na|ba)$", (t or "").lower())
if m:
if mode and buf: _emit()
mode = "?" if m.group(1) == "na" else "!"
t = t[:-len(m.group(0))]
if t:
buf.append(t)
# Defensive: collapse a doubled final "." (presumably guards against the
# sentinel duplicating a period — TODO confirm it can actually occur).
if len(out) >= 2 and out[-1] == "." and out[-2] == ".": out.pop()
return out
def add_inverted_openers(tokens):
    """Insert ``¿``/``¡`` at the start of every span that ends in ``?``/``!``.

    A span starts right after the previous sentence-final token or true
    clause break (numeric commas and clock-time colons do not count).
    Leading quotes/brackets listed in WRAP_PREFIX are stepped over, and no
    opener is inserted when the matching one is already there.
    Returns a new list; *tokens* is not modified.
    """
    result = list(tokens)

    def _starts_new_span(idx):
        tok = result[idx]
        if tok in SENT_END:
            return True
        return tok in CLAUSE_BREAKS and _is_true_clause_break(result, idx)

    pos = 0
    while pos < len(result):
        closer = result[pos]
        if closer in ("?", "!"):
            opener = OPEN_FOR[closer]
            # Walk left to the first token of the span.
            begin = pos
            while begin > 0 and not _starts_new_span(begin - 1):
                begin -= 1
            # Keep opening quotes/brackets in front of the inverted sign.
            while begin < pos and result[begin] in WRAP_PREFIX:
                begin += 1
            if not (begin < len(result) and result[begin] == opener):
                result.insert(begin, opener)
                pos += 1  # the closer moved one slot to the right
        pos += 1
    return result
# ====== EXPANSIONES (deterministas, sólo ES→NI) ======
EXPANSION_ENABLE = True
FLAG_COLNAMES = ("flags","FLAGS","expand","EXPAND","tags","TAGS","morph","MORPH")
FLAG_PLURAL = ("S",)
FLAG_3PL = ("3","V3")
VOWELS = "aeiouáéíóúüAEIOUÁÉÍÓÚÜ"
def _has_flag(cell:str, wanted:tuple)->bool:
c = (cell or "")
return any(w in c for w in wanted)
def _pluralize_es_form(s: str) -> str:
if not s: return s
sl = s.lower()
if sl.endswith("z"):
return s[:-1] + ("ces" if s[-1].islower() else "CES")
if s[-1] not in VOWELS:
return s + ("es" if s[-1].islower() else "ES")
return s + ("s" if s[-1].islower() else "S")
def _present_3pl_from_3sg(s: str) -> str:
if not s: return s
return s + ("n" if s[-1].islower() else "N")
# ====== TTS (appOld) ======
# The voice model is optional: if loading fails the app keeps running and
# synthesize_speech() simply returns None.
print("Cargando modelo de voz (opcional)…")
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
processor = None
model = None
try:
    processor = AutoProcessor.from_pretrained("facebook/mms-tts-spa")
    model = VitsModel.from_pretrained("facebook/mms-tts-spa").to(device)
except Exception as e:
    print(f"AVISO TTS: {e}")
else:
    print("Modelo de voz cargado.")
def add_reading_pauses(text: str, level: int = 3) -> str:
    """Double commas and periods so the TTS voice pauses at them.

    With ``level <= 1`` the text is returned untouched. Otherwise every
    comma becomes ``", , "`` and every period ``". . "``, and whitespace is
    then collapsed.
    """
    if level <= 1:
        return text
    t = re.sub(r",\s*", ", , ", text)
    # Chain on ``t``: reading ``text`` again here would discard the comma pauses.
    t = re.sub(r"\.\s*", ". . ", t)
    return re.sub(r"\s+", " ", t).strip()
def hispanize_for_tts(ni_text: str) -> str:
    """Rewrite NI text so the Spanish TTS voice can read it.

    The text is lower-cased, ``ŕ``/``ś`` are respelled, hyphens become
    spaces, bracketed spans are removed, whitespace is collapsed and
    reading pauses (level 3) are added.
    """
    spoken = (ni_text or "").lower()
    respellings = (('ŕ', 'rr'), ('ś', 's'), ('eś', 'es'), ('-', ' '))
    for old, new in respellings:
        spoken = spoken.replace(old, new)
    spoken = re.sub(r'\[.*?\]', '', spoken)
    spoken = re.sub(r'\s+', ' ', spoken).strip()
    return add_reading_pauses(spoken, 3)
def synthesize_speech(text):
    """Synthesize *text* with the loaded TTS model.

    Returns ``(sample_rate, float32 waveform)`` or ``None`` when there is
    nothing to say, the model/processor is unavailable, or synthesis fails.
    The waveform is peak-normalised to 0.9.
    """
    if not text or not text.strip() or model is None or processor is None:
        return None
    try:
        spoken = hispanize_for_tts(text)
        if not spoken:
            # Only placeholders/punctuation were left: nothing to synthesize.
            return None
        inputs = processor(text=spoken, return_tensors="pt").to(device)
        with torch.no_grad():
            waveform = model(**inputs).waveform
        speech_np = waveform.cpu().numpy().squeeze()
        peak = max(abs(speech_np.min()), abs(speech_np.max()))
        if peak > 0:
            speech_np = speech_np / peak * 0.9
        # Take the rate from the model config; 16000 is kept as the fallback.
        rate = getattr(getattr(model, "config", None), "sampling_rate", None) or 16000
        return (rate, speech_np.astype(np.float32))
    except Exception as e:
        # Best effort: a TTS failure must not break the translation UI.
        print(f"Error TTS: {e}")
        return None
# ====== Iberian line (appOld) ======
V = "aeiou"
# Syllabic signs: one sign per stop consonant + vowel, indexed like V.
SYL_FOR = {
    "b": ["‹BA›", "‹BE›", "‹BI›", "‹BO›", "‹BU›"],
    "d": ["‹DA›", "‹DE›", "‹DI›", "‹DO›", "‹DU›"],
    "t": ["‹TA›", "‹TE›", "‹TI›", "‹TO›", "‹TU›"],
    "g": ["‹GA›", "‹GE›", "‹GI›", "‹GO›", "‹GU›"],
    "k": ["‹KA›", "‹KE›", "‹KI›", "‹KO›", "‹KU›"],
}
# Alphabetic signs for vowels and continuants.
ALPHA_FOR = {
    "a": "‹A›", "e": "‹E›", "i": "‹I›", "o": "‹O›", "u": "‹U›",
    "s": "‹S›", "ś": "‹Ś›", "l": "‹L›", "r": "‹R›", "ŕ": "‹Ŕ›",
    "n": "‹N›", "m": "‹M›",
}
# Signs appended after a syllabic sign for the consonant that follows it.
CODA_FOR = {
    "": "", "n": "‹N›", "s": "‹S›", "ś": "‹Ś›", "r": "‹R›", "ŕ": "‹Ŕ›",
    "l": "‹L›", "m": "‹M›", "k": "‹K›", "t": "‹T›",
}


def tokens_from_latin(ni: str) -> str:
    """Encode a Latin-script NI string as a run of ``‹…›`` sign tokens.

    The text is lower-cased and ``p`` is read as ``b``. A stop consonant
    followed by a vowel yields its syllabic sign, plus a coda sign when the
    next character is listed in CODA_FOR. A hyphen becomes ``—``. Any other
    character yields its ALPHA_FOR sign, or the bare upper-cased character
    when it has none. A falsy input gives ``""``.
    """
    text = (ni or "").lower()
    size = len(text)
    pieces = []
    pos = 0
    while pos < size:
        ch = "b" if text[pos] == "p" else text[pos]
        if ch == "-":
            pieces.append("—")
            pos += 1
            continue
        if ch in SYL_FOR and pos + 1 < size and text[pos + 1] in V:
            sign = SYL_FOR[ch][V.index(text[pos + 1])]
            pos += 2
            # NOTE(review): the coda is taken greedily, even when the
            # consonant is followed by a vowel ("kate" -> KA + T + E).
            if pos < size and text[pos] in CODA_FOR:
                sign += CODA_FOR[text[pos]]
                pos += 1
            pieces.append(sign)
            continue
        # Vowels, continuants and anything without a syllabic reading.
        pieces.append(ALPHA_FOR.get(ch, ch.upper()))
        pos += 1
    return "".join(pieces)
KEYS_MODE = "compact"
KEYS_OVERRIDE = {}


def georgeos_keys(token_str: str, ni_plain: str) -> str:
    """Turn a ``‹…›`` token string into key strokes for the Iberian font.

    A lower-cased *ni_plain* found in KEYS_OVERRIDE wins outright. Otherwise
    each ``‹…›`` token is mapped on its own (text outside the angle marks is
    ignored): in ``compact`` mode syllabic signs keep only their consonant
    and other signs their first letter upper-cased; in any other mode tokens
    are kept whole. ``Ś`` maps to ``X`` and ``Ŕ`` to ``r`` in both modes.
    """
    plain = (ni_plain or "").lower()
    if plain in KEYS_OVERRIDE:
        return KEYS_OVERRIDE[plain]
    compact = KEYS_MODE == "compact"
    special = {"Ś": "X", "Ŕ": "r"}

    def _key_for(sign):
        if len(sign) == 2 and sign[0] in "BDTGK":
            return sign[0] if compact else sign
        if compact and sign in ("A", "E", "I", "O", "U"):
            return sign
        if sign in special:
            return special[sign]
        return sign[0].upper() if compact else sign

    return "".join(_key_for(sign) for sign in re.findall(r"‹(.*?)›", token_str))
TRIDOT = "/"


def render_ib_with_tridots(ib_toks):
    """Join Iberian-line tokens, separating consecutive words with TRIDOT.

    Punctuation tokens are padded with a space on each side and reset the
    word run, so no separator is placed next to them. The result is stripped.
    """
    pieces = []
    after_word = False
    for tok in ib_toks:
        if tok in VISIBLE_PUNCT:
            pieces.append(" " + tok + " ")
            after_word = False
            continue
        if after_word:
            pieces.append(" " + TRIDOT + " ")
        pieces.append(tok)
        after_word = True
    return "".join(pieces).strip()
# ====== BI loader + diagnostics ======
# ### ★ STRICT, DETERMINISTIC MODE
STRICT_BI_ENFORCE = True # if True, ambiguous NI surfaces are not accepted
AMBIG_NI = {} # ni_lower -> set of conflicting ES surfaces
BI_DIAG_HTML = "<em>Sin CSV cargado.</em>"
def load_bi_strict_and_diagnose():
"""Load the CSV, fill ES2NI/NI2ES and prepare a diagnostic HTML snippet.

Clears and refills the module-level tables (ES2NI, NI2ES, ESPHRASE2NI,
NIPHRASE2ES, AMBIG_NI) and rebinds BI_DIAG_HTML.

Returns False when the file is missing or cannot be read, otherwise
True if at least one row was loaded.
"""
global BI_DIAG_HTML
# empty the structures before loading (determinism)
ES2NI.clear(); NI2ES.clear(); ESPHRASE2NI.clear(); NIPHRASE2ES.clear()
AMBIG_NI.clear()
if not os.path.exists(CSV_BI):
msg=f"[ERROR] No se encontró el CSV bilingüe: {CSV_BI}"
print(msg); BI_DIAG_HTML=f"<b>Error:</b> {escape(msg)}"
return False
rows=0; dup_es=0; dup_ni=0; empty_pid=0
mismatch_backmap = 0
mismatch_samples = []
pid_seen=set()
print(f"Detectado CSV bilingüe: {CSV_BI}")
try:
with _open_maybe_gzip(CSV_BI) as f:
rd = csv.DictReader(f)
flds=set(rd.fieldnames or [])
# Column names are resolved by preference; the last alternative is used
# even when it is absent from the header.
ES_COL = "source_es" if "source_es" in flds else "es_surface" if "es_surface" in flds else "es"
NI_COL = "target_ni" if "target_ni" in flds else "ni_surface" if "ni_surface" in flds else "ni"
IDCOL = "pair_id" if "pair_id" in flds else "id" if "id" in flds else None
# First flag-like column present in the header, if any.
FLAGCOL = None
for cand in FLAG_COLNAMES:
if cand in flds:
FLAGCOL = cand; break
base_rows = []
for r in rd:
es_orig = (r.get(ES_COL) or "").strip()
ni_orig = (r.get(NI_COL) or "").strip()
# Rows missing either surface are skipped and not counted.
if not (es_orig and ni_orig): continue
pid = (r.get(IDCOL) or "").strip() if IDCOL else ""
if not pid: empty_pid += 1
else: pid_seen.add(pid)
flags = (r.get(FLAGCOL) or "") if FLAGCOL else ""
es = lower(es_orig)
ni = lower(ni_orig)
# Phrases
if " " in es:
if es not in ESPHRASE2NI: # deterministic: the first row wins
ESPHRASE2NI[es] = (ni_orig, pid)
if " " in ni:
if ni not in NIPHRASE2ES:
NIPHRASE2ES[ni] = (es_orig, pid)
# ES→NI (deterministic: the first row wins)
if es in ES2NI:
dup_es += 1
else:
ES2NI[es] = (ni_orig, pid)
# NI→ES (deterministic + ambiguity blocking)
if ni in NI2ES:
dup_ni += 1
# record the ambiguity
# NOTE(review): an exact duplicate row (same ES) also lands here, giving a
# one-element set and, in strict mode, a blocked NI surface.
s = AMBIG_NI.get(ni, set())
s.add(NI2ES[ni][0]); s.add(es_orig)
AMBIG_NI[ni] = s
if STRICT_BI_ENFORCE:
NI2ES.pop(ni, None) # invalidate the conflicting NI surface
else:
if STRICT_BI_ENFORCE and ni in AMBIG_NI:
# already flagged as ambiguous: do not insert
# NOTE(review): such later rows neither bump dup_ni nor add their ES
# surface to the AMBIG_NI set.
pass
else:
NI2ES[ni] = (es_orig, pid)
base_rows.append((es_orig, ni_orig, pid, flags))
rows += 1
# Deterministic expansions (they only add to ES2NI; they do NOT touch NI2ES)
if EXPANSION_ENABLE:
for es_orig, ni_orig, pid, flags in base_rows:
if not flags: continue
if _has_flag(flags, FLAG_PLURAL):
pl = _pluralize_es_form(es_orig)
pl_key = lower(pl)
if pl_key not in ES2NI:
ES2NI[pl_key] = (ni_orig, pid)
if _has_flag(flags, FLAG_3PL):
p3 = _present_3pl_from_3sg(es_orig)
p3_key = lower(p3)
if p3_key not in ES2NI:
ES2NI[p3_key] = (ni_orig, pid)
# Asymmetry diagnostics (does not affect determinism): count ES entries
# whose NI surface maps back to a different ES surface; keep up to 10 samples.
for es_low, (ni_surf, _) in ES2NI.items():
ni_low = lower(ni_surf)
back = NI2ES.get(ni_low)
if back and lower(back[0]) != es_low:
mismatch_backmap += 1
if len(mismatch_samples) < 10:
mismatch_samples.append((es_low, ni_low, lower(back[0])))
except Exception as e:
msg=f"[ERROR] Al leer {CSV_BI}: {e}"
print(msg); BI_DIAG_HTML=f"<b>Error:</b> {escape(msg)}"
return False
es_unique = len(ES2NI)
ni_unique = len(NI2ES)
pid_unique = len(pid_seen)
print(f"✓ BI-ONLY ESTRICTO cargado: {rows:,} filas.")
if dup_es: print(f"[AVISO] {dup_es:,} duplicados ES (se usó la primera).")
if dup_ni: print(f"[AVISO] {dup_ni:,} duplicados NI (bloqueados en modo estricto).")
if empty_pid: print(f"[AVISO] {empty_pid:,} filas sin pair_id.")
if mismatch_backmap:
print(f"[ALERTA] {mismatch_backmap:,} asimetrías ES↔NI (misma NI apunta a otro ES).")
sam_html = ""
if mismatch_samples:
sam_rows = "".join(
f"<li><code>{escape(es)}</code> → <code>{escape(ni)}</code> → <code>{escape(es2)}</code></li>"
for es,ni,es2 in mismatch_samples
)
sam_html = f"<details><summary>Muestras</summary><ul>{sam_rows}</ul></details>"
# Only NI surfaces with more than one distinct ES are counted here.
ambN = sum(len(v) > 1 for v in AMBIG_NI.values())
ambList = ", ".join(f"{k}{sorted(list(v))[:3]}" for k,v in list(AMBIG_NI.items())[:5])
BI_DIAG_HTML = f"""
<div style="font-family:Georgia,serif">
<b>Diagnóstico del CSV BI</b><br>
Archivo: <b>{escape(CSV_BI)}</b><br>
Filas base (CSV): <b>{rows:,}</b><br>
ES únicas (tras expansiones): <b>{es_unique:,}</b> &nbsp;|&nbsp; NI únicas: <b>{ni_unique:,}</b> &nbsp;|&nbsp; pair_id únicos: <b>{pid_unique:,}</b><br>
Duplicados ES: <b>{dup_es:,}</b> &nbsp;|&nbsp; Duplicados NI: <b>{dup_ni:,}</b> (bloqueados en estricto) &nbsp;|&nbsp; Sin pair_id: <b>{empty_pid:,}</b><br>
Asimetrías ES↔NI: <b>{mismatch_backmap:,}</b>
{sam_html}
<hr style="border:0;border-top:1px solid #caa">
<small>NI ambiguas bloqueadas: <b>{ambN:,}</b>{(' · ej.: ' + escape(ambList)) if ambN else ''}</small><br>
<small>Regla: el motor usa <b>sólo</b> tablas 1:1; NI duplicadas se bloquean y se muestran como <code>[AMB-NI:...]</code>.</small>
</div>
"""
return rows > 0
# The lexicon is loaded once, at import time.
print("Cargando léxico/pares (BI-estricto)…")
load_bi_strict_and_diagnose()
# ====== N-gram helper (longest match, BI-only) ======
def _longest_match(tokens, i, phrase_map):
    """Return ``(span, surface)`` for the longest phrase starting at *i*.

    Up to MAX_NGRAM tokens are lower-cased, joined with single spaces and
    looked up in *phrase_map*; the widest hit wins. ``(0, None)`` is returned
    when the map is empty or nothing matches.
    """
    if not phrase_map:
        return (0, None)
    widest = min(MAX_NGRAM, len(tokens) - i)
    # Deterministic: try the widest span first and stop at the first hit.
    for span in range(widest, 0, -1):
        key = " ".join(lower(tok) for tok in tokens[i:i + span])
        if key in phrase_map:
            return (span, phrase_map[key][0])
    return (0, None)
# ====== Spanish post-processing (spacing + sentence capitalisation) ======
def sentence_case_spanish(s: str) -> str:
    """Upper-case the first letter of each sentence in *s*.

    A sentence starts at the beginning of the text and after ``. ? ! …`` or
    ``¿ ¡``. Whitespace and opening quotes/brackets are skipped while waiting
    for that first letter; any other non-letter cancels the capitalisation.
    Text inside ``[ ... ]`` is copied untouched.
    """
    openers = "¿¡"
    closers = ".?!…"
    wrappers = "¿¡\"'«(“‘["
    pieces = []
    capitalize_next = True
    inside_brackets = False
    for ch in s:
        if ch == "[":
            inside_brackets = True
        if inside_brackets or not capitalize_next:
            pieces.append(ch)
        elif ch.isspace() or ch in wrappers:
            pieces.append(ch)
        elif ch.isalpha():
            pieces.append(ch.upper())
            capitalize_next = False
        else:
            pieces.append(ch)
            capitalize_next = ch in openers
        if not inside_brackets and (ch in closers or ch in openers):
            capitalize_next = True
        if ch == "]":
            inside_brackets = False
    return "".join(pieces)


# ✦ FIX: do not re-space clock times/decimals, and add no space after “:”/“,”
def postprocess_spanish(s: str) -> str:
    """Normalise spacing around punctuation and apply sentence case.

    Clock times (``18:30``) and decimals (``12,65`` / ``3.1415``) are
    compacted and stay compact: a period between two digits never gets a
    space added after it.
    """
    # 1) compact clock times and decimals
    s = re.sub(r"(\d)\s*:\s*(\d)", r"\1:\2", s)  # 18:30
    s = re.sub(r"(\d)\s*([.,])\s*(\d)", r"\1\2\3", s)  # 12,65 / 3.1415
    # 2) spacing around signs: nothing before them
    s = re.sub(r"\s+([,.;:!?])", r"\1", s)
    # add a space ONLY after . ! ? ; (NOT after comma/“:”), and never after
    # a period that sits between two digits (it was just compacted above)
    s = re.sub(r"([?!;]|\.(?!\d)|(?<!\d)\.)(?!\s|$)([^\s])", r"\1 \2", s)
    # 3) inverted signs hug the following word
    s = re.sub(r"([¿¡])\s+", r"\1", s)
    # 4) collapse runs of whitespace
    s = re.sub(r"\s{2,}", " ", s).strip()
    # 5) sentence-initial capital
    return sentence_case_spanish(s)
# ====== Strict BI translation ======
def translate_es_to_ni_bi(text: str):
    """Translate Spanish to NI by exact surface lookup.

    Punctuation, bracketed placeholders and numbers pass through; phrases of
    up to MAX_NGRAM tokens are tried before single words; unknown words
    become ``[SIN-LEX:…]``. Returns ``(ni_text, ib_html)``: the NI line and
    the escaped Iberian line wrapped in ``<div class='ib-line'>``.
    """
    toks = simple_tokenize(text)
    out = []
    ib_toks = []
    i = 0
    while i < len(toks):
        t = toks[i]
        if t in VISIBLE_PUNCT or is_placeholder(t):
            out.append(t)
            ib_toks.append(t)
            i += 1
            continue
        span, ni_surface = _longest_match(toks, i, ESPHRASE2NI)
        if span > 1:
            out.append(ni_surface)
            ib_toks.append(georgeos_keys(tokens_from_latin(ni_surface), ni_surface))
            i += span
            continue
        key = lower(t)
        if key in ES2NI:
            ni = ES2NI[key][0]
            out.append(ni)
            ib_toks.append(georgeos_keys(tokens_from_latin(ni), ni))
        elif is_number(key):
            out.append(t)
            ib_toks.append(t)
        else:
            ph = f"[SIN-LEX:{t}]"
            out.append(ph)
            ib_toks.append(ph)
        i += 1
    if MODAL_SUFFIX_ENABLE:
        out = add_modal_suffixes_es2ni(out)
        # The suffixes changed some tokens, so the Iberian line is rebuilt
        # from the final NI tokens. Numbers must pass through here exactly
        # as in the first pass: georgeos_keys() would turn them into "".
        ib_toks = [
            tt
            if (tt in VISIBLE_PUNCT or tt.startswith("[") or is_number(tt))
            else georgeos_keys(tokens_from_latin(tt), tt)
            for tt in out
        ]
    ni_text = detokenize(out)
    ib_html = "<div class='ib-line'>" + escape(render_ib_with_tridots(ib_toks)) + "</div>"
    return ni_text, ib_html
def translate_ni_to_es_bi(text: str):
    """Translate NI to Spanish by exact surface lookup.

    Modal suffixes are first turned into explicit ``?``/``!``. Punctuation,
    placeholders and numbers pass through; phrases are tried before single
    words; blocked ambiguous NI surfaces become ``[AMB-NI:…]`` and unknown
    ones ``[?:…]``. The result is detokenized and post-processed.
    """
    tokens = simple_tokenize(text)
    if MODAL_SUFFIX_ENABLE:
        tokens = strip_modal_suffixes_ni(tokens)
    rendered = []
    pos = 0
    total = len(tokens)
    while pos < total:
        tok = tokens[pos]
        if tok in VISIBLE_PUNCT or is_placeholder(tok):
            rendered.append(tok)
            pos += 1
            continue
        span, es_surface = _longest_match(tokens, pos, NIPHRASE2ES)
        if span > 1:
            rendered.append(es_surface)
            pos += span
            continue
        key = lower(tok)
        entry = NI2ES.get(key)
        if entry is not None:
            rendered.append(entry[0] or f"[?:{tok}]")
        elif key in AMBIG_NI and STRICT_BI_ENFORCE:
            # ★ deterministic: never pick at random among colliding NI surfaces
            rendered.append(f"[AMB-NI:{tok}]")
        elif is_number(key):
            rendered.append(tok)
        else:
            rendered.append(f"[?:{tok}]")
        pos += 1
    if MODAL_SUFFIX_ENABLE:
        rendered = add_inverted_openers(rendered)
    return postprocess_spanish(detokenize(rendered))
# ====== Diagnostics ======
def _scan_coverage(toks, phrase_map, forward, backward, blocked):
    """Walk *toks* once and classify every non-punctuation, non-numeric token.

    Returns ``(total, covered, unknown, ambiguous, asym)``. A phrase hit
    counts as one covered token. A key found in *blocked* is reported as
    ambiguous; a key missing from *forward* as unknown; a covered key whose
    translation maps back (through *backward*) to a different key is
    reported as an asymmetry.
    """
    unknown, ambiguous, asym = set(), set(), set()
    total = covered = 0
    i = 0
    while i < len(toks):
        t = toks[i]
        if t in VISIBLE_PUNCT or is_number(t):
            i += 1
            continue
        total += 1
        span, _ = _longest_match(toks, i, phrase_map)
        if span > 1:
            covered += 1
            i += span
            continue
        i += 1
        k = lower(t)
        if k in blocked:
            ambiguous.add(t)
            continue
        if k not in forward:
            unknown.add(t)
            continue
        covered += 1
        other = forward[k][0]
        back = backward.get(lower(other))
        if back and lower(back[0]) != k:
            # Arrows keep the three surfaces readable in the report.
            asym.add(f"{t} → {other} → {back[0]}")
    return total, covered, unknown, ambiguous, asym


def diagnose_text(text, dir_label):
    """Report, as HTML, how well the lexicon covers *text*.

    *dir_label* starting with ``"ES"`` selects ES→NI; anything else NI→ES.
    The report lists token coverage, ambiguous NI surfaces (NI→ES only),
    missing words and ES↔NI asymmetries.
    """
    if not text or not text.strip():
        return "<em>Introduce texto para diagnosticar.</em>"
    toks = simple_tokenize(text)
    if dir_label.startswith("ES"):
        head = "ES→NI"
        scan = _scan_coverage(toks, ESPHRASE2NI, ES2NI, NI2ES, {})
    else:
        head = "NI→ES"
        scan = _scan_coverage(toks, NIPHRASE2ES, NI2ES, ES2NI, AMBIG_NI)
    total_tokens, covered, unknown, amb, asym = scan
    cov_pct = (covered / total_tokens * 100) if total_tokens else 100.0
    cov_html = f"<div><b>Tokens (sin puntuación/numéricos):</b> {total_tokens} &nbsp;|&nbsp; <b>Cubiertos:</b> {covered} ({cov_pct:.1f}%)</div>"
    unk_html = "".join(f"<li><code>{escape(u)}</code></li>" for u in sorted(unknown, key=lambda x: lower(x))) or "<li><i>—</i></li>"
    amb_html = "".join(f"<li><code>{escape(a)}</code></li>" for a in sorted(amb, key=lambda x: lower(x))) or "<li><i>—</i></li>"
    asy_html = "".join(f"<li><code>{escape(a)}</code></li>" for a in sorted(asym)) or "<li><i>—</i></li>"
    return f"<b>Diagnóstico {head}</b>{cov_html}<b>Ambiguas (NI duplicada):</b><ul>{amb_html}</ul><b>Faltantes:</b><ul>{unk_html}</ul><b>Asimetrías:</b><ul>{asy_html}</ul>"
# ====== UI (CSS / acordeones / fuentes) ======
LABELS={
"ES":{
"title":"Traductor Español ↔ Neoíbero",
"subtitle":"CSV estricto (BI-only 1:1; sin heurísticas; .gz) — determinista",
"in_label_es":"✏️ Entrada (Español)",
"in_label_ni":"✏️ Entrada (Neoíbero)",
"in_ph_es":"Escribe aquí. Ej.: Veo a Ana y doy pan a Marta.",
"in_ph_ni":"Idatzi hemen. Adib.: nuker-ke ni etxe-ka.",
"out_lat_esni":"📜 Salida: Neoíbero (latín)",
"out_lat_nies":"📜 Salida: Español",
"out_ib":"🗿 Línea ibérica",
"out_audio":"🔊 Locución (Audio)",
"btn":"🔄 Traducir",
"combo":"🌍 Idioma (UI + explicación)",
"dir":"🔁 Dirección",
"dir_opts":["ES → NI","NI → ES"],
"doc_header":"📚 Documentación y Referencia",
"acc_titles":[
"🎓 Marco académico y decisiones del neoíbero",
"🏛️ Herencia posible del íbero histórico",
"🎨 Diseño de la conlang (neoíbero)",
"⚙️ Pipeline del traductor (BI-estricto 1:1)",
"🔤 Ortografía, línea ibérica y claves",
"❓/❗ Modalidad vascoide (-na / -ba)",
"🧩 Expansiones por CSV: plurales (S) y 3pl (3/V3)",
"📖 Gramática de referencia (v1.2)",
"📚 Bibliografía de base",
"🧾 Siglas y glosario",
"🔗 Simetría por pair_id (modo bilingüe)"
]
},
"EN":{
"title":"Spanish ↔ Neo-Iberian Translator",
"subtitle":"Strict BI-only (1:1 surfaces; no heuristics; .gz) — deterministic",
"in_label_es":"✏️ Input (Spanish)",
"in_label_ni":"✏️ Input (Neo-Iberian)",
"in_ph_es":"Type here. E.g., Veo a Ana y doy pan a Marta.",
"in_ph_ni":"Type here. E.g., nuker-ke ni etxe-ka.",
"out_lat_esni":"📜 Output: Neo-Iberian (Latin)",
"out_lat_nies":"📜 Output: Spanish",
"out_ib":"🗿 Iberian line",
"out_audio":"🔊 Speech (Audio)",
"btn":"🔄 Translate",
"combo":"🌍 Language (UI + docs)",
"dir":"🔁 Direction",
"dir_opts":["ES → NI","NI → ES"],
"doc_header":"📚 Documentation & Reference",
"acc_titles":[
"🎓 Background & design choices",
"🏛️ Possible inheritance from ancient Iberian",
"🎨 Conlang design (Neo-Iberian)",
"⚙️ Translator pipeline (strict 1:1)",
"🔤 Orthography, Iberian line & keys",
"❓/❗ Vascoid modality (-na / -ba)",
"🧩 CSV-driven expansions: plurals (S) & 3pl (3/V3)",
"📖 Reference grammar (v1.2)",
"📚 Core references",
"🧾 Acronyms & glossary",
"🔗 Pair-id symmetry (bilingual mode)"
]
}
}
DOC = {
"ES":[
"**Escritura y datos.** Un **único CSV con `pair_id`** y superficies exactas. La traducción ES↔NI es **1:1** por superficie.",
"**Herencia plausible del íbero.** Fonotaxis CV(C); p→b; r/ŕ; casos -k/-te/-ka/-ar/-en/-i.",
"**Diseño del neoíbero.** TAM: PRS -ke, PST -bo, FUT -ta, IPFV -ri, IMP -tu, COND/SBJV -ni, FUT_SBJV -ra.",
"**Pipeline (BI-estricto 1:1).** Tokeniza; sustitución exacta; NI ambigua **se bloquea** y sale como `[AMB-NI:…]`.",
"**Ortografía y línea ibérica.** Tokens BA/BE/…; tridots '/'; p→b; codas N/S/Ś/R/Ŕ/L/M/K/T.",
"**Modalidad (-na/-ba).** ES→NI puede omitir ¿?¡! (si está activo). NI→ES inserta `¿…?`/`¡…!` al final de la oración marcada, **no en comas**.",
"**Expansiones por CSV (deterministas).** `flags=S` plural regular; `flags=3|V3` 3ª plural del presente. Solo si lo marcas.",
"**Gramática mínima.** Visualización; la gramática no se “calcula”.",
"**Bibliografía.** Untermann; de Hoz; Ferrer i Jané; Correa…",
"**Glosario & datasets.** Faltas → `[SIN-LEX:…]` / `[?:…]`. Ambiguas → `[AMB-NI:…]` (limpia tu CSV).",
"**Simetría por pair_id.** El diagnóstico avisa si una NI apunta a dos ES distintos."
],
"EN":[
"One bilingual CSV with `pair_id` and exact surfaces. ES↔NI is strictly 1:1.",
"Possible inheritance (non-palaeographic).",
"Neo-Iberian design (phonology & morphology).",
"Pipeline: tokenise → exact replacement. Ambiguous NI are **blocked** and rendered as `[AMB-NI:…]`.",
"Orthography, Iberian line & keys.",
"Modality (-na/-ba): ES→NI can drop ¿?¡!. NI→ES places them at sentence end, not at commas.",
"CSV-driven expansions (deterministic): `S` plural; `3|V3` present 3pl.",
"Minimal grammar (v1.2).",
"Selected references.",
"Glossary & datasets.",
"Pair-id symmetry diagnostics."
]
}
# ====== CSS + fuente ======
def build_css():
b64=None
if os.path.exists("Iberia-Georgeos.ttf"):
with open("Iberia-Georgeos.ttf","rb") as f:
b64=base64.b64encode(f.read()).decode("ascii")
font_src = f"url(data:font/ttf;base64,{b64}) format('truetype')" if b64 else "local('sans-serif')"
return f"""
@font-face {{
font-family: 'IberiaGeorgeos';
src: {font_src};
font-weight: normal; font-style: normal;
}}
:root {{
--iberian-clay:#8B4513; --iberian-ochre:#CC7722; --iberian-stone:#5C5C5C;
--iberian-sand:#D2B48C; --iberian-rust:#A0522D; --iberian-bronze:#CD7F32;
}}
.gradio-container {{ background:linear-gradient(135deg,#f4e8d8 0%,#e8d5c4 50%,#d4c4b0 100%)!important;
font-family:'Georgia','Times New Roman',serif!important; }}
.gradio-container h1, .gradio-container h2, .gradio-container h3 {{
color:var(--iberian-clay)!important; text-shadow:2px 2px 4px rgba(139,69,19,.15)!important;
border-bottom:3px solid var(--iberian-bronze)!important; padding-bottom:.5rem!important; letter-spacing:.5px!important;
}}
.gradio-container .gr-group {{ background:linear-gradient(to bottom,#f9f6f0,#ede6dc)!important;
border:2px solid var(--iberian-sand)!important; border-radius:8px!important; box-shadow:0 4px 12px rgba(139,69,19,.2), inset 0 1px 0 rgba(255,255,255,.5)!important;
padding:1.5rem!important; margin-bottom:1.5rem!important; }}
.gradio-container .gr-accordion {{ background:linear-gradient(145deg,#ebe3d5,#d9cec0)!important;
border:2px solid var(--iberian-rust)!important; border-radius:6px!important; margin-bottom:.8rem!important; box-shadow:2px 2px 6px rgba(0,0,0,.15)!important; }}
.gradio-container .gr-accordion .label-wrap {{ background:linear-gradient(to right,var(--iberian-ochre),var(--iberian-rust))!important;
color:#fff!important; font-weight:600!important; padding:.8rem 1rem!important; border-radius:4px!important; text-shadow:1px 1px 2px rgba(0,0,0,.3)!important; }}
.gradio-container .gr-textbox textarea, .gradio-container .gr-textbox input {{ background:linear-gradient(to bottom,#faf8f3,#f5f0e8)!important;
border:2px solid var(--iberian-sand)!important; border-radius:6px!important; color:var(--iberian-stone)!important;
font-family:'Georgia',serif!important; box-shadow:inset 2px 2px 4px rgba(139,69,19,.1)!important; }}
.gradio-container .gr-textbox textarea:focus, .gradio-container .gr-textbox input:focus {{
border-color:var(--iberian-bronze)!important; box-shadow:inset 2px 2px 4px rgba(139,69,19,.1), 0 0 8px rgba(205,127,50,.3)!important; }}
.gradio-container .gr-button.gr-button-primary {{ background:linear-gradient(145deg,var(--iberian-bronze),var(--iberian-rust))!important;
border:2px solid var(--iberian-clay)!important; color:#fff!important; font-weight:bold!important; text-shadow:1px 2px 2px rgba(0,0,0,.4)!important;
box-shadow:0 4px 8px rgba(139,69,19,.3), inset 0 1px 0 rgba(255,255,255,.2)!important; border-radius:8px!important; padding:.8rem 1.5rem!important; transition:all .3s ease!important; }}
.gradio-container .gr-button.gr-button-primary:hover {{ background:linear-gradient(145deg,var(--iberian-rust),var(--iberian-bronze))!important;
transform:translateY(-2px)!important; box-shadow:0 6px 12px rgba(139,69,19,.4)!important; }}
.ib-line {{ font-family:'IberiaGeorgeos',monospace,sans-serif!important; font-size:1.9rem!important; line-height:2.4rem!important; white-space:pre-wrap!important;
background:linear-gradient(135deg,#e8dcc8 0%,#d4c4a8 50%,#c4b098 100%)!important; padding:24px!important; border-radius:10px!important;
border:3px solid var(--iberian-rust)!important; border-left:6px solid var(--iberian-bronze)!important;
box-shadow:0 4px 15px rgba(139,69,19,.25), inset 0 2px 4px rgba(0,0,0,.1)!important; color:var(--iberian-clay)!important; position:relative!important; }}
.ib-line::before {{ content:''!important; position:absolute!important; inset:0!important;
background-image:repeating-linear-gradient(0deg,transparent,transparent 2px, rgba(139,69,19,.03) 2px, rgba(139,69,19,.03) 4px)!important;
pointer-events:none!important; border-radius:10px!important; }}
@media (max-width:768px) {{
.ib-line {{ font-size:1.5rem!important; line-height:2rem!important; padding:16px!important; }}
.gradio-container .gr-group {{ padding:1rem!important; }}
.gradio-container h1 {{ font-size:1.8rem!important; }}
}}
@media (max-width:480px) {{
.ib-line {{ font-size:1.3rem!important; line-height:1.8rem!important; padding:12px!important; }}
.gradio-container h1 {{ font-size:1.5rem!important; }}
}}
"""
CSS = build_css()
with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple")) as demo:
with gr.Group():
title = gr.Markdown(f"# {LABELS['ES']['title']}")
subtitle = gr.Markdown(f"*{LABELS['ES']['subtitle']}*")
with gr.Row():
combo = gr.Dropdown(choices=["ES","EN"], value="ES", label=LABELS["ES"]["combo"])
direction = gr.Radio(choices=LABELS["ES"]["dir_opts"], value="ES → NI", label=LABELS["ES"]["dir"])
with gr.Group():
doc_header = gr.Markdown(f"## {LABELS['ES']['doc_header']}")
acc_titles = LABELS["ES"]["acc_titles"]
with gr.Accordion(acc_titles[0], open=False) as acc1: md1 = gr.Markdown(DOC["ES"][0])
with gr.Accordion(acc_titles[1], open=False) as acc2: md2 = gr.Markdown(DOC["ES"][1])
with gr.Accordion(acc_titles[2], open=False) as acc3: md3 = gr.Markdown(DOC["ES"][2])
with gr.Accordion(acc_titles[3], open=False) as acc4: md4 = gr.Markdown(DOC["ES"][3])
with gr.Accordion(acc_titles[4], open=False) as acc5: md5 = gr.Markdown(DOC["ES"][4])
with gr.Accordion(acc_titles[5], open=False) as acc6: md6 = gr.Markdown(DOC["ES"][5])
with gr.Accordion(acc_titles[6], open=False) as acc7: md7 = gr.Markdown(DOC["ES"][6])
with gr.Accordion(acc_titles[7], open=False) as acc8: md8 = gr.Markdown(DOC["ES"][7])
with gr.Accordion(acc_titles[8], open=False) as acc9: md9 = gr.Markdown(DOC["ES"][8])
with gr.Accordion(acc_titles[9], open=False) as acc10: md10 = gr.Markdown(DOC["ES"][9])
with gr.Accordion(acc_titles[10], open=False) as acc11: md11 = gr.Markdown(DOC["ES"][10])
with gr.Accordion("🧪 Diagnóstico del CSV BI (al cargar)", open=False):
bi_diag_box = gr.HTML(value=BI_DIAG_HTML)
with gr.Group():
es_in = gr.Textbox(label=LABELS["ES"]["in_label_es"], placeholder=LABELS["ES"]["in_ph_es"], lines=5)
with gr.Row():
btn_tr = gr.Button(LABELS["ES"]["btn"], variant="primary")
btn_diag = gr.Button("🔎 Diagnosticar BI con este texto", variant="secondary")
with gr.Row():
with gr.Column(scale=2):
ni_out = gr.Textbox(label=LABELS["ES"]["out_lat_esni"], lines=5, interactive=False)
loc_btn = gr.Button("🔊 Locutar", variant="secondary", visible=True)
audio_out = gr.Audio(label=LABELS["ES"]["out_audio"], type="numpy")
with gr.Column(scale=1):
ib_out = gr.HTML(label=LABELS["ES"]["out_ib"])
diag_out = gr.HTML(value="")
def do_translate(text, dir_label, sel_lang="ES"):
    """Translate the input text and build the updates for the output widgets.

    Args:
        text: Raw text from the input box.
        dir_label: Current direction label. A label starting with "ES"
            selects ES → NI; anything else selects NI → ES.
        sel_lang: UI language key into LABELS (the dropdown offers "ES" and
            "EN"). Defaults to "ES" so two-argument callers keep working.

    Returns:
        A 5-tuple of gr.update objects, in order, for ni_out, ib_out,
        loc_btn, audio_out and diag_out.
    """
    # Use the labels of the selected UI language so the output label does not
    # revert to Spanish after a translation; fall back to "ES" if unknown.
    labels = LABELS[sel_lang] if sel_lang in LABELS else LABELS["ES"]
    empty_ib = "<div class='ib-line'></div>"

    # Blank input: clear every output and hide the speech button.
    if not text or not text.strip():
        return (gr.update(value=""),
                gr.update(value=empty_ib),
                gr.update(visible=False),
                gr.update(value=None),
                gr.update(value=""))

    if dir_label.startswith("ES"):
        # ES → NI yields both the text result and the HTML line for ib_out.
        latin, ib = translate_es_to_ni_bi(text)
        return (gr.update(label=labels["out_lat_esni"], value=latin),
                gr.update(value=ib),
                gr.update(visible=True),
                gr.update(value=None),
                gr.update(value=""))

    # NI → ES: plain text only, so the HTML line is reset and speech is hidden.
    es_text = translate_ni_to_es_bi(text)
    return (gr.update(label=labels["out_lat_nies"], value=es_text),
            gr.update(value=empty_ib),
            gr.update(visible=False),
            gr.update(value=None),
            gr.update(value=""))
# The language selector is passed as a third input so labels stay localized.
btn_tr.click(do_translate, [es_in, direction, combo], [ni_out, ib_out, loc_btn, audio_out, diag_out])
def run_locution(latin_text, dir_label):
    """Synthesize speech for the output text, only in the ES → NI direction.

    Returns whatever synthesize_speech produces when the direction label
    starts with "ES"; otherwise returns None.
    """
    if not dir_label.startswith("ES"):
        return None
    return synthesize_speech(latin_text)
loc_btn.click(fn=run_locution, inputs=[ni_out, direction], outputs=audio_out)
def do_diagnose(text, dir_label):
    """Run diagnose_text on the input and wrap its result as an update for diag_out."""
    report = diagnose_text(text, dir_label)
    return gr.update(value=report)
btn_diag.click(fn=do_diagnose, inputs=[es_in, direction], outputs=[diag_out])
def switch_lang(sel_lang, dir_label):
    """Re-label the whole UI for the selected language.

    Args:
        sel_lang: Key into LABELS and DOC for the chosen UI language.
        dir_label: Current direction label; a label starting with "ES"
            selects the ES → NI variants of the input/output labels.

    Returns:
        A tuple of 32 gr.update objects matching the outputs wired below:
        five header/control updates, eleven (accordion label, markdown value)
        pairs, and five input/output/button updates.
    """
    labels = LABELS[sel_lang]
    titles = labels["acc_titles"]
    docs = DOC[sel_lang]

    # Pick the direction-dependent captions.
    if dir_label.startswith("ES"):
        in_label = labels["in_label_es"]
        in_ph = labels["in_ph_es"]
        out_lab = labels["out_lat_esni"]
    else:
        in_label = labels["in_label_ni"]
        in_ph = labels["in_ph_ni"]
        out_lab = labels["out_lat_nies"]

    updates = [
        gr.update(value=f"# {labels['title']}"),
        gr.update(value=f"*{labels['subtitle']}*"),
        gr.update(label=labels["combo"], value=sel_lang),
        gr.update(label=labels["dir"], choices=labels["dir_opts"], value=dir_label),
        gr.update(value=f"## {labels['doc_header']}"),
    ]

    # Eleven documentation accordions: title on the accordion, text on its markdown.
    for idx in range(11):
        updates.append(gr.update(label=titles[idx]))
        updates.append(gr.update(value=docs[idx]))

    updates.extend([
        gr.update(label=in_label, placeholder=in_ph),
        gr.update(label=out_lab),
        gr.update(label=labels["out_ib"]),
        gr.update(label=labels["out_audio"]),
        gr.update(value=labels["btn"]),
    ])
    return tuple(updates)
combo.change(
    fn=switch_lang,
    inputs=[combo, direction],
    outputs=[
        title, subtitle, combo, direction, doc_header,
        acc1, md1, acc2, md2, acc3, md3, acc4, md4,
        acc5, md5, acc6, md6, acc7, md7, acc8, md8,
        acc9, md9, acc10, md10, acc11, md11,
        es_in, ni_out, ib_out, audio_out, btn_tr,
    ],
)
def switch_direction(dir_label, sel_lang):
    """Reset the input/output widgets after the translation direction changes.

    Args:
        dir_label: Newly selected direction label; a label starting with
            "ES" means ES → NI.
        sel_lang: Key into LABELS for the current UI language.

    Returns:
        A 6-tuple of gr.update objects for es_in, ni_out, ib_out, loc_btn,
        audio_out and diag_out: captions are swapped, previous results are
        cleared, and the speech button is shown only for ES → NI.
    """
    labels = LABELS[sel_lang]
    es_to_ni = dir_label.startswith("ES")

    if es_to_ni:
        in_label = labels["in_label_es"]
        in_ph = labels["in_ph_es"]
        out_lab = labels["out_lat_esni"]
    else:
        in_label = labels["in_label_ni"]
        in_ph = labels["in_ph_ni"]
        out_lab = labels["out_lat_nies"]

    input_update = gr.update(label=in_label, placeholder=in_ph)
    output_update = gr.update(label=out_lab, value="")
    ib_update = gr.update(value="<div class='ib-line'></div>")
    speak_update = gr.update(visible=es_to_ni)
    audio_update = gr.update(value=None)
    diag_update = gr.update(value="")
    return (input_update, output_update, ib_update,
            speak_update, audio_update, diag_update)
direction.change(
    fn=switch_direction,
    inputs=[direction, combo],
    outputs=[es_in, ni_out, ib_out, loc_btn, audio_out, diag_out],
)
# ====== optional smoke test ======
def _symmetry_smoketest():
    """Print an NI → ES → NI round trip for a few fixed probe sentences.

    Each probe goes through translate_ni_to_es_bi, and that result is fed
    back through translate_es_to_ni_bi. Nothing is asserted; the three
    strings are only printed for manual inspection.
    """
    print("\n[SMOKE] Prueba ES↔NI (BI-estricto, determinista)…")
    probe_sentences = (
        "nuker-ke ni etxe-ka ?",
        "¿Pagaste 12,75 en la cafetería?",
        "Marta llega a las 18:30.",
        "[SIN-LEX:Tomás]-na euŕak-ke !",
    )
    for probe in probe_sentences:
        spanish = translate_ni_to_es_bi(probe)
        # The second element of the ES → NI result is not needed here.
        roundtrip, _unused = translate_es_to_ni_bi(spanish)
        for tag, shown in ((" IN:", probe), (" ES:", spanish), (" NI:", roundtrip)):
            print(tag, shown)
        print("---")


# Run the smoke test at import time only when debugging is enabled.
if DEBUG_MODE:
    _symmetry_smoketest()
# Script entry point: when this file is run directly, enable the queue on
# `demo` (defined earlier in the file) and launch the app.
if __name__ == "__main__":
demo.queue().launch()