Spaces:

LoloSemper
/

Spanish_NeoIberian_TranslatorInversionUltimate

Sleeping

App Files Files Community

LoloSemper committed on Nov 8, 2025

Commit

6d901fa

verified ·

1 Parent(s): 310343a

Update app.py

Browse files

Files changed (1) hide show

app.py +118 -142

app.py CHANGED Viewed

@@ -1,13 +1,13 @@
-# app.py — Traductor Español ↔ Neoíbero (BI-ONLY 1:1 estricto)
 # UI completa + CSS “íbero” + TTS + Línea ibérica (codificación appOld)
 # Requiere un ÚNICO CSV con superficies exactas (UTF-8) y columnas:
-#   - source_es (o es/es_surface)   ← superficies ES usadas para ES→NI
 #   - target_ni (o ni/ni_surface)
-#   - target_es (opcional pero RECOMENDADO) ← superficies ES usadas para NI→ES
 #   - pair_id (opcional)
 #
 # El motor NO hace heurísticas ni morfología: 1:1 exacto por superficie.
 # Puntuación y números pasan tal cual. Desconocidos -> [SIN-LEX:...] / [?:...]
 import gradio as gr
 import os, csv, re, base64, unicodedata, gzip
@@ -57,13 +57,13 @@ CSV_BI = _cand(
 # ====== estructuras strict BI ======
 # Clave = superficie exacta en minúsculas. Valor = (superficie_original_opuesta, pair_id)
-ES2NI = {}   # es_surface_lower -> (ni_surface, pair_id)       [para ES→NI]
-NI2ES = {}   # ni_surface_lower -> (es_surface_FOR_NI, pair_id) [para NI→ES]
-# <<< NGRAM: diccionarios de frases (si el CSV trae claves con espacios)
 ESPHRASE2NI = {}  # "el saco" -> (ni_surface, pair_id)
-NIPHRASE2ES = {}  # "…-ke ni etxe-ka" -> (es_surface_FOR_NI, pair_id)
-MAX_NGRAM = 3     # buscamos hasta 3 tokens; sube si tu CSV trae sintagmas largos
 # ====== signos / tokenización mínima ======
 VISIBLE_PUNCT = set(list(",.;:!?¡¿…()[]{}\"'«»—–“”‘’"))
@@ -71,16 +71,12 @@ _num_re = re.compile(r"^\d+([.,]\d+)?$")
 def is_number(tok:str)->bool: return bool(_num_re.fullmatch(tok or ""))
 # --- separadores de cláusula + placeholders atómicos ---
-CLAUSE_BREAKS = {",", ";", "—", "–", ":"}  # cortes fuertes (no fin de oración)
 PLACEHOLDER_RE = re.compile(r"^\[[^\]]+\]$")
 def is_placeholder(tok: str) -> bool:
     return bool(PLACEHOLDER_RE.match(tok or ""))
 def _restore_brk(tok, protected):
-    """
-    Restaura __BRKn__ y también __BRKn__-na / __BRKn__-ba a su forma original,
-    manteniendo el sufijo modal si existe (p.ej. '[SIN-LEX:Tomás]-na').
-    """
     m = re.fullmatch(r"__BRK(\d+)__(?:-(na|ba))?", tok or "")
     if not m: return tok
     idx = int(m.group(1))
@@ -97,39 +93,29 @@ def simple_tokenize(text:str):
         key = f"__BRK{len(protected)}__"
         protected.append(m.group(0))
         return key
-    # protegemos bloques [ ... ]
     t = re.sub(r"\[[^\]]*\]", _repl, (text or "").strip())
     t = re.sub(r"\s+"," ", t)
     t = re.sub(r"([,.;:!?¡¿…()\[\]{}\"'«»—–“”‘’])", r" \1 ", t)
     toks = [tok for tok in t.split() if tok]
-    # restaura bloques protegidos (con soporte -na/-ba adheridos)
     for i, tok in enumerate(toks):
-        # si viene pegado el sufijo modal, no se habrá restaurado; hacemos la restauración robusta
         if tok.startswith("__BRK") and "__" in tok:
             toks[i] = _restore_brk(tok, protected)
     return toks
 def detokenize(tokens):
     s = " ".join(tokens)
-    # cerrar espacios antes de .,;:!?
     s = re.sub(r"\s+([,.;:!?])", r"\1", s)
-    # quitar espacio tras abridores invertidos
     s = re.sub(r"([¿¡])\s+", r"\1", s)
-    # paréntesis
     s = re.sub(r"\(\s+", "(", s)
     s = re.sub(r"\s+\)", ")", s)
     s = re.sub(r"\s{2,}", " ", s).strip()
     return s
 # ====== Modalidad vascoide (-na / -ba) ======
-# Configuración
 MODAL_SUFFIX_ENABLE = True
 MODAL_ONLY_ON_FINITE = True
 MODAL_STRIP_QE_IN_NI = True
-# Conjuntos y ayudas
 SENT_END = {".", "!", "?", "…"}
 OPEN_FOR = {"?": "¿", "!": "¡"}
 WRAP_PREFIX = set(list("«“‘([{\"'"))
@@ -202,8 +188,8 @@ def add_modal_suffixes_es2ni(tokens):
 def strip_modal_suffixes_ni(tokens):
     """
-    Interpreta -na/-ba como modalidad; cierra antes de separadores fuertes,
-    excepto cuando la coma/“:” son numéricos (12,75 / 18:30).
     """
     if not MODAL_SUFFIX_ENABLE:
         return tokens
@@ -226,25 +212,19 @@ def strip_modal_suffixes_ni(tokens):
     toks = tokens + ["."]
     for i, t in enumerate(toks):
-        # Abridores explícitos
         if t in ("¿", "¡"):
             _emit(); mode = "?" if t == "¿" else "!"
             continue
-        # Cierres explícitos
         if t in ("?", "!"):
             pending_end = t; _emit(); continue
-        # Final de oración
         if t in SENT_END:
             pending_end = t; _emit(); continue
-        # Separadores fuertes (no numéricos)
         if t in CLAUSE_BREAKS and mode in ("?","!"):
-            if not _is_true_clause_break(toks, i):
-                # es decimal/hora -> no cerrar
-                pass
-            else:
-                _emit(also_append=t); continue
-        # Sufijos -na/-ba (en cualquier token, incl. placeholders)
         m = re.search(r"-(na|ba)$", (t or "").lower())
         if m:
             if mode and buf: _emit()
@@ -270,7 +250,6 @@ def add_inverted_openers(tokens):
     while i < len(out):
         if out[i] in ("?", "!"):
             closer = out[i]; opener = OPEN_FOR[closer]
-            # inicio del tramo = después del último fin de oración o separador FALSO/VERDADERO
             j = i - 1
             while j >= 0 and not _is_true_start_break(j):
                 j -= 1
@@ -283,7 +262,7 @@ def add_inverted_openers(tokens):
         i += 1
     return out
-# ====== EXPANSIONES CONTROLADAS POR CSV (deterministas) ======
 EXPANSION_ENABLE = True
 FLAG_COLNAMES = ("flags","FLAGS","expand","EXPAND","tags","TAGS","morph","MORPH")
 FLAG_PLURAL = ("S",)
@@ -408,100 +387,104 @@ def render_ib_with_tridots(ib_toks):
     return "".join(res).strip()
 # ====== BI loader + diagnóstico ======
 BI_DIAG_HTML = "<em>Sin CSV cargado.</em>"
 def load_bi_strict_and_diagnose():
     """Carga el CSV, llena ES2NI/NI2ES y prepara un HTML de diagnóstico."""
     global BI_DIAG_HTML
     if not os.path.exists(CSV_BI):
         msg=f"[ERROR] No se encontró el CSV bilingüe: {CSV_BI}"
         print(msg); BI_DIAG_HTML=f"<b>Error:</b> {escape(msg)}"
         return False
-    def _choose_col(flds, prefer_list, fallback=None):
-        for c in prefer_list:
-            if c in flds: return c
-        return fallback
     rows=0; dup_es=0; dup_ni=0; empty_pid=0
     mismatch_backmap = 0
     mismatch_samples = []
     pid_seen=set()
-    exp_plurals = 0
-    exp_3pl = 0
     print(f"Detectado CSV bilingüe: {CSV_BI}")
     try:
         with _open_maybe_gzip(CSV_BI) as f:
             rd = csv.DictReader(f)
             flds=set(rd.fieldnames or [])
-            # --- OJO: columnas separadas para cada dirección ---
-            # ES_SURF_COL: superficies ES para ES→NI (keys de ES2NI)
-            ES_SURF_COL   = _choose_col(flds, ["source_es","es_surface","target_es","es"], "es")
-            # NI_COL: superficies NI
-            NI_COL        = _choose_col(flds, ["target_ni","ni_surface","ni"], "ni")
-            # ES_FOR_NI_COL: superficies ES para NI→ES (valores de NI2ES)
-            ES_FOR_NI_COL = _choose_col(flds, ["target_es","es_surface","source_es","es"], "es")
             FLAGCOL = None
             for cand in FLAG_COLNAMES:
                 if cand in flds:
                     FLAGCOL = cand; break
-            IDCOL  = "pair_id" if "pair_id" in flds else ("id" if "id" in flds else None)
-            base_rows = []  # guardamos para expansiones (si las hay)
             for r in rd:
-                es_surf_orig   = norm(r.get(ES_SURF_COL))     # superficie para ES→NI
-                ni_orig        = norm(r.get(NI_COL))
-                es_for_ni_orig = norm(r.get(ES_FOR_NI_COL)) or es_surf_orig  # preferir target_es si existe
-                if not (es_surf_orig and ni_orig):
-                    continue
-                pid = (norm(r.get(IDCOL)) if IDCOL else "")
                 if not pid: empty_pid += 1
                 else: pid_seen.add(pid)
                 flags = (r.get(FLAGCOL) or "") if FLAGCOL else ""
-                es_key = lower(es_surf_orig)
-                ni_key = lower(ni_orig)
-                # Frases (n-gram)
-                if " " in es_key:
-                    ESPHRASE2NI[es_key] = (es_for_ni_orig and ni_orig, pid) if es_for_ni_orig else (ni_orig, pid)
-                if " " in ni_key:
-                    NIPHRASE2ES[ni_key] = (es_for_ni_orig, pid)
-                # Diccionarios base
-                if es_key in ES2NI: dup_es += 1
-                else: ES2NI[es_key] = (ni_orig, pid)
-                if ni_key in NI2ES: dup_ni += 1
-                else: NI2ES[ni_key] = (es_for_ni_orig, pid)
-                base_rows.append((es_surf_orig, ni_orig, pid, flags))
                 rows += 1
-        # Expansiones deterministas por flags (si están activadas)
         if EXPANSION_ENABLE:
-            for es_surf_orig, ni_orig, pid, flags in base_rows:
                 if not flags: continue
                 if _has_flag(flags, FLAG_PLURAL):
-                    pl = _pluralize_es_form(es_surf_orig)
                     pl_key = lower(pl)
                     if pl_key not in ES2NI:
                         ES2NI[pl_key] = (ni_orig, pid)
-                        exp_plurals += 1
                 if _has_flag(flags, FLAG_3PL):
-                    p3 = _present_3pl_from_3sg(es_surf_orig)
                     p3_key = lower(p3)
                     if p3_key not in ES2NI:
                         ES2NI[p3_key] = (ni_orig, pid)
-                        exp_3pl += 1
-        # Diagnóstico de asimetrías 1:1
         for es_low, (ni_surf, _) in ES2NI.items():
             ni_low = lower(ni_surf)
             back = NI2ES.get(ni_low)
@@ -521,7 +504,7 @@ def load_bi_strict_and_diagnose():
     print(f"✓ BI-ONLY ESTRICTO cargado: {rows:,} filas.")
     if dup_es: print(f"[AVISO] {dup_es:,} duplicados ES (se usó la primera).")
-    if dup_ni: print(f"[AVISO] {dup_ni:,} duplicados NI (se usó la primera).")
     if empty_pid: print(f"[AVISO] {empty_pid:,} filas sin pair_id.")
     if mismatch_backmap:
         print(f"[ALERTA] {mismatch_backmap:,} asimetrías ES↔NI (misma NI apunta a otro ES).")
@@ -534,22 +517,8 @@ def load_bi_strict_and_diagnose():
         )
         sam_html = f"<details><summary>Muestras</summary><ul>{sam_rows}</ul></details>"
-    # HTML de diagnóstico enriquecido: muestra qué columnas se han usado
-    cols_html = f"""
-      <div style="margin-top:.5rem">
-        <b>Columnas usadas</b>:
-        ES→NI = <code>{escape(ES_SURF_COL)}</code> &nbsp;|&nbsp;
-        NI→ES = <code>{escape(ES_FOR_NI_COL)}</code> &nbsp;|&nbsp;
-        NI = <code>{escape(NI_COL)}</code>
-      </div>
-    """
-    warn_inf = ""
-    if "es" in (ES_SURF_COL,) and ES_FOR_NI_COL == "es":
-        warn_inf = "<div style='color:#a00'><b>⚠ Aviso:</b> Se detectó que el CSV sólo tiene <code>es</code>. Si ese campo es <i>lema</i>, la inversa podría irse a infinitivo. Este motor ya intenta usar <code>target_es</code>/<code>es_surface</code> si existen.</div>"
-    exp_html = ""
-    if EXPANSION_ENABLE and (exp_plurals or exp_3pl):
-        exp_html = f"<div>Expansiones aplicadas → Plurales: <b>{exp_plurals}</b> · 3pl: <b>{exp_3pl}</b></div>"
     BI_DIAG_HTML = f"""
     <div style="font-family:Georgia,serif">
@@ -557,15 +526,12 @@ def load_bi_strict_and_diagnose():
       Archivo: <b>{escape(CSV_BI)}</b><br>
       Filas base (CSV): <b>{rows:,}</b><br>
       ES únicas (tras expansiones): <b>{es_unique:,}</b> &nbsp;|&nbsp; NI únicas: <b>{ni_unique:,}</b> &nbsp;|&nbsp; pair_id únicos: <b>{pid_unique:,}</b><br>
-      Duplicados ES: <b>{dup_es:,}</b> &nbsp;|&nbsp; Duplicados NI: <b>{dup_ni:,}</b> &nbsp;|&nbsp; Sin pair_id: <b>{empty_pid:,}</b><br>
-      Asimetrías ES↔NI: <b>{mismatch_backmap:,}</b><br>
-      {cols_html}
-      {warn_inf}
-      <hr style="border:0;border-top:1px solid #caa">
-      <small>Regla: el motor usa <b>sólo</b> tablas 1:1 y expansiones <b>explícitas por bandera</b> (flags) en el CSV.
-      Nada “adivina”.</small>
-      {exp_html}
       {sam_html}
     </div>
     """
     return rows > 0
@@ -578,6 +544,7 @@ def _longest_match(tokens, i, phrase_map):
     """Devuelve (span, surface) si hay frase que comience en i."""
     if not phrase_map: return (0, None)
     max_span = 0; surface = None
     for span in range(1, MAX_NGRAM+1):
         if i+span > len(tokens): break
         cand = " ".join(lower(t) for t in tokens[i:i+span])
@@ -619,16 +586,21 @@ def sentence_case_spanish(s: str) -> str:
     return "".join(out)
 def postprocess_spanish(s: str) -> str:
-    # compactar horas y decimales
-    s = re.sub(r"(\d)\s*:\s*(\d)", r"\1:\2", s)
-    s = re.sub(r"(\d)\s*([.,])\s*(\d)", r"\1\2\3", s)
-    # espacios y signos
-    s = re.sub(r"\s{2,}", " ", s)
-    s = re.sub(r"\s+([,.;:!?])", r"\1", s)
-    s = re.sub(r"([?!.:,;])([^\s])", r"\1 \2", s)
     s = re.sub(r"([¿¡])\s+", r"\1", s)
-    return sentence_case_spanish(s).strip()
 # ====== Traducción BI estricta ======
 def translate_es_to_ni_bi(text:str):
@@ -694,6 +666,9 @@ def translate_ni_to_es_bi(text:str):
         if key in NI2ES:
             es = NI2ES[key][0] or ""
             out.append(es if es else f"[?:{t}]")
         elif is_number(key):
             out.append(t)
         else:
@@ -713,7 +688,7 @@ def diagnose_text(text, dir_label):
         return "<em>Introduce texto para diagnosticar.</em>"
     toks = simple_tokenize(text)
-    unknown=set(); asym=set()
     total_tokens=0; covered=0
     if dir_label.startswith("ES"):
@@ -748,6 +723,8 @@ def diagnose_text(text, dir_label):
             if span > 1:
                 covered += 1; i += span; continue
             k=lower(t)
             if k not in NI2ES:
                 unknown.add(t); i+=1; continue
             covered += 1
@@ -761,15 +738,16 @@ def diagnose_text(text, dir_label):
     cov_html = f"<div><b>Tokens (sin puntuación/numéricos):</b> {total_tokens} &nbsp;|&nbsp; <b>Cubiertos:</b> {covered} ({cov_pct:.1f}%)</div>"
     unk_html = "".join(f"<li><code>{escape(u)}</code></li>" for u in sorted(unknown, key=lambda x: lower(x))) or "<li><i>—</i></li>"
     asy_html = "".join(f"<li><code>{escape(a)}</code></li>" for a in sorted(asym)) or "<li><i>—</i></li>"
-    return f"<b>Diagnóstico {head}</b>{cov_html}<b>Faltantes:</b><ul>{unk_html}</ul><b>Asimetrías:</b><ul>{asy_html}</ul>"
 # ====== UI (CSS / acordeones / fuentes) ======
 LABELS={
     "ES":{
         "title":"Traductor Español ↔ Neoíbero",
-        "subtitle":"CSV estricto (BI-only 1:1; sin heurísticas; .gz)",
         "in_label_es":"✏️ Entrada (Español)",
         "in_label_ni":"✏️ Entrada (Neoíbero)",
         "in_ph_es":"Escribe aquí. Ej.: Veo a Ana y doy pan a Marta.",
@@ -799,7 +777,7 @@ LABELS={
     },
     "EN":{
         "title":"Spanish ↔ Neo-Iberian Translator",
-        "subtitle":"Strict BI-only (1:1 surfaces; no heuristics; .gz)",
         "in_label_es":"✏️ Input (Spanish)",
         "in_label_ni":"✏️ Input (Neo-Iberian)",
         "in_ph_es":"Type here. E.g., Veo a Ana y doy pan a Marta.",
@@ -817,7 +795,7 @@ LABELS={
             "🎓 Background & design choices",
             "🏛️ Possible inheritance from ancient Iberian",
             "🎨 Conlang design (Neo-Iberian)",
-            "⚙️ Translator pipeline (strict BI 1:1)",
             "🔤 Orthography, Iberian line & keys",
             "❓/❗ Vascoid modality (-na / -ba)",
             "🧩 CSV-driven expansions: plurals (S) & 3pl (3/V3)",
@@ -833,27 +811,27 @@ DOC = {
         "**Escritura y datos.** Un **único CSV con `pair_id`** y superficies exactas. La traducción ES↔NI es **1:1** por superficie.",
         "**Herencia plausible del íbero.** Fonotaxis CV(C); p→b; r/ŕ; casos -k/-te/-ka/-ar/-en/-i.",
         "**Diseño del neoíbero.** TAM: PRS -ke, PST -bo, FUT -ta, IPFV -ri, IMP -tu, COND/SBJV -ni, FUT_SBJV -ra.",
-        "**Pipeline (BI-estricto 1:1).** Tokeniza; sustitución exacta vía diccionarios ES2NI/NI2ES; línea ibérica opcional.",
         "**Ortografía y línea ibérica.** Tokens BA/BE/…; tridots '/'; p→b; codas N/S/Ś/R/Ŕ/L/M/K/T.",
-        "**Modalidad (-na/-ba).** En **NI basta el sufijo**: `-na` (interrogativa) y `-ba` (exclamativa). Se une al **último verbo finito** de la oración (o al último constituyente si no hay finito). En **ES→NI** se puede **omitir ¿?¡!** (se eliminan si está activado). En **NI→ES** se **insertan automáticamente** `¿…?` / `¡…!` aunque el NI no tenga signos explícitos.",
-        "**Expansiones por CSV (deterministas).** Si en la fila añades **`flags`** con **`S`**, se crea el **plural regular** en ES (mismo NI). Si añades **`3`** o **`V3`**, se crea la **3ª del plural (presente)** añadiendo `n` a la 3ª singular. **Sólo si lo marcas**; sin banderas, no hay expansión.",
         "**Gramática mínima.** Visualización; la gramática no se “calcula”.",
         "**Bibliografía.** Untermann; de Hoz; Ferrer i Jané; Correa…",
-        "**Glosario & datasets.** Faltas → `[SIN-LEX:…]` / `[?:…]`. Arreglar en el CSV.",
         "**Simetría por pair_id.** El diagnóstico avisa si una NI apunta a dos ES distintos."
     ],
     "EN":[
-        "Script & data. One **bilingual CSV with `pair_id`** and exact surfaces. ES↔NI is **1:1** by surface.",
         "Possible inheritance (non-palaeographic).",
         "Neo-Iberian design (phonology & morphology).",
-        "Pipeline (strict BI 1:1).",
         "Orthography, Iberian line & keys.",
-        "**Modality (-na/-ba).** In **NI the suffix alone is enough**: `-na` (question), `-ba` (exclamation). It attaches to the **last finite verb** (or last constituent). In **ES→NI**, `¿?¡!` can be **omitted** (removed if enabled). In **NI→ES**, **`¿…?`/`¡…!` are inserted** automatically.",
-        "**CSV-driven expansions (deterministic).** Add a **`flags`** cell with **`S`** to spawn **regular plurals** in ES (same NI). Add **`3`** or **`V3`** to spawn **3rd person plural (present)** by appending `n` to 3sg. **Only when flagged.**",
         "Minimal grammar (v1.2).",
         "Selected references.",
         "Glossary & datasets.",
-        "Pair-id symmetry diagnostics highlights collisions."
     ]
 }
@@ -1044,12 +1022,12 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="indigo", secondary_hue
 # ====== smoke opcional ======
 def _symmetry_smoketest():
-    print("\n[SMOKE] Prueba ES↔NI (BI-estricto)…")
     probes = [
-        "nuker-ke ni etxe-ka ?",                  # modal simple
-        "¿Pagaste 12,75 en la cafetería?",        # decimal
-        "Marta llega a las 18:30.",               # hora
-        "[SIN-LEX:Tomás]-na euŕak-ke !"           # placeholder + -na + cierre explícito
     ]
     for p in probes:
         es_from_ni = translate_ni_to_es_bi(p)
@@ -1066,5 +1044,3 @@ if __name__ == "__main__":
     demo.queue().launch()

+# app.py — Traductor Español ↔ Neoíbero (BI-ONLY 1:1 estricto, determinista)
 # UI completa + CSS “íbero” + TTS + Línea ibérica (codificación appOld)
 # Requiere un ÚNICO CSV con superficies exactas (UTF-8) y columnas:
+#   - source_es (o es/es_surface)
 #   - target_ni (o ni/ni_surface)
 #   - pair_id (opcional)
 #
 # El motor NO hace heurísticas ni morfología: 1:1 exacto por superficie.
 # Puntuación y números pasan tal cual. Desconocidos -> [SIN-LEX:...] / [?:...]
+# Determinismo NI→ES: entradas NI duplicadas (ambiguas) quedan bloqueadas y se muestran como [AMB-NI:...]
 import gradio as gr
 import os, csv, re, base64, unicodedata, gzip
 # ====== estructuras strict BI ======
 # Clave = superficie exacta en minúsculas. Valor = (superficie_original_opuesta, pair_id)
+ES2NI = {}   # es_surface_lower -> (ni_surface, pair_id)
+NI2ES = {}   # ni_surface_lower -> (es_surface, pair_id)
+# N-gramas/frases:
 ESPHRASE2NI = {}  # "el saco" -> (ni_surface, pair_id)
+NIPHRASE2ES = {}  # "…-ke ni etxe-ka" -> (es_surface, pair_id)
+MAX_NGRAM = 3
 # ====== signos / tokenización mínima ======
 VISIBLE_PUNCT = set(list(",.;:!?¡¿…()[]{}\"'«»—–“”‘’"))
 def is_number(tok: str) -> bool:
     """Return True when *tok* as a whole matches ``_num_re``; None/empty never does."""
     candidate = tok or ""
     return _num_re.fullmatch(candidate) is not None
 # --- separadores de cláusula + placeholders atómicos ---
+CLAUSE_BREAKS = {",", ";", "—", "–", ":"}
 # A placeholder is one whole token of the form "[...]" with a non-empty,
 # bracket-free interior (e.g. "[SIN-LEX:foo]").
 PLACEHOLDER_RE = re.compile(r"^\[[^\]]+\]$")
 def is_placeholder(tok: str) -> bool:
     """Tell whether *tok* is an atomic bracketed placeholder; None/empty is not."""
     candidate = tok if tok else ""
     return PLACEHOLDER_RE.match(candidate) is not None
 def _restore_brk(tok, protected):
     m = re.fullmatch(r"__BRK(\d+)__(?:-(na|ba))?", tok or "")
     if not m: return tok
     idx = int(m.group(1))
         key = f"__BRK{len(protected)}__"
         protected.append(m.group(0))
         return key
     t = re.sub(r"\[[^\]]*\]", _repl, (text or "").strip())
     t = re.sub(r"\s+"," ", t)
     t = re.sub(r"([,.;:!?¡¿…()\[\]{}\"'«»—–“”‘’])", r" \1 ", t)
     toks = [tok for tok in t.split() if tok]
     for i, tok in enumerate(toks):
         if tok.startswith("__BRK") and "__" in tok:
             toks[i] = _restore_brk(tok, protected)
     return toks
 def detokenize(tokens):
     """Join *tokens* with single spaces and tidy the spacing around punctuation.

     Returns the resulting string with leading/trailing whitespace removed.
     """
     # Applied in order; each entry is (pattern, replacement).
     spacing_rules = (
         (r"\s+([,.;:!?])", r"\1"),   # no whitespace before closing punctuation
         (r"([¿¡])\s+", r"\1"),       # no whitespace after inverted openers
         (r"\(\s+", "("),             # no whitespace right after "("
         (r"\s+\)", ")"),             # no whitespace right before ")"
         (r"\s{2,}", " "),            # collapse runs of whitespace
     )
     text = " ".join(tokens)
     for pattern, replacement in spacing_rules:
         text = re.sub(pattern, replacement, text)
     return text.strip()
 # ====== Modalidad vascoide (-na / -ba) ======
 MODAL_SUFFIX_ENABLE = True
 MODAL_ONLY_ON_FINITE = True
 MODAL_STRIP_QE_IN_NI = True
 SENT_END = {".", "!", "?", "…"}
 OPEN_FOR = {"?": "¿", "!": "¡"}
 WRAP_PREFIX = set(list("«“‘([{\"'"))
 def strip_modal_suffixes_ni(tokens):
     """
+    Interpreta -na/-ba como modalidad; ahora SOLO cerramos al final de oración.
+    (No cerramos en comas/“:”, salvo que ya haya ?/! explícitos.)
     """
     if not MODAL_SUFFIX_ENABLE:
         return tokens
     toks = tokens + ["."]
     for i, t in enumerate(toks):
         if t in ("¿", "¡"):
             _emit(); mode = "?" if t == "¿" else "!"
             continue
         if t in ("?", "!"):
             pending_end = t; _emit(); continue
         if t in SENT_END:
             pending_end = t; _emit(); continue
+        # ✦ MODALIDAD: en separadores de cláusula NO cerramos todavía:
         if t in CLAUSE_BREAKS and mode in ("?","!"):
+            buf.append(t)
+            continue
         m = re.search(r"-(na|ba)$", (t or "").lower())
         if m:
             if mode and buf: _emit()
     while i < len(out):
         if out[i] in ("?", "!"):
             closer = out[i]; opener = OPEN_FOR[closer]
             j = i - 1
             while j >= 0 and not _is_true_start_break(j):
                 j -= 1
         i += 1
     return out
+# ====== EXPANSIONES (deterministas, sólo ES→NI) ======
 EXPANSION_ENABLE = True
 FLAG_COLNAMES = ("flags","FLAGS","expand","EXPAND","tags","TAGS","morph","MORPH")
 FLAG_PLURAL = ("S",)
     return "".join(res).strip()
 # ====== BI loader + diagnóstico ======
+# ### ★ MODO ESTRICTO Y DETERMINISTA
+STRICT_BI_ENFORCE = True              # si True, no se admite NI ambigua
+AMBIG_NI = {}                         # ni_lower -> set de ES conflictivos
 BI_DIAG_HTML = "<em>Sin CSV cargado.</em>"
 def load_bi_strict_and_diagnose():
     """Carga el CSV, llena ES2NI/NI2ES y prepara un HTML de diagnóstico."""
     global BI_DIAG_HTML
+    # vaciar estructuras antes de cargar (determinismo)
+    ES2NI.clear(); NI2ES.clear(); ESPHRASE2NI.clear(); NIPHRASE2ES.clear()
+    AMBIG_NI.clear()
     if not os.path.exists(CSV_BI):
         msg=f"[ERROR] No se encontró el CSV bilingüe: {CSV_BI}"
         print(msg); BI_DIAG_HTML=f"<b>Error:</b> {escape(msg)}"
         return False
     rows=0; dup_es=0; dup_ni=0; empty_pid=0
     mismatch_backmap = 0
     mismatch_samples = []
     pid_seen=set()
     print(f"Detectado CSV bilingüe: {CSV_BI}")
     try:
         with _open_maybe_gzip(CSV_BI) as f:
             rd = csv.DictReader(f)
             flds=set(rd.fieldnames or [])
+            ES_COL = "source_es" if "source_es" in flds else "es_surface" if "es_surface" in flds else "es"
+            NI_COL = "target_ni" if "target_ni" in flds else "ni_surface" if "ni_surface" in flds else "ni"
+            IDCOL  = "pair_id"   if "pair_id"   in flds else "id" if "id" in flds else None
             FLAGCOL = None
             for cand in FLAG_COLNAMES:
                 if cand in flds:
                     FLAGCOL = cand; break
+            base_rows = []
             for r in rd:
+                es_orig = (r.get(ES_COL) or "").strip()
+                ni_orig = (r.get(NI_COL) or "").strip()
+                if not (es_orig and ni_orig): continue
+                pid = (r.get(IDCOL) or "").strip() if IDCOL else ""
                 if not pid: empty_pid += 1
                 else: pid_seen.add(pid)
                 flags = (r.get(FLAGCOL) or "") if FLAGCOL else ""
+                es = lower(es_orig)
+                ni = lower(ni_orig)
+                # Frases
+                if " " in es:
+                    if es not in ESPHRASE2NI:  # determinista: primera manda
+                        ESPHRASE2NI[es] = (ni_orig, pid)
+                if " " in ni:
+                    if ni not in NIPHRASE2ES:
+                        NIPHRASE2ES[ni] = (es_orig, pid)
+                # ES→NI (determinista: primera fila gana)
+                if es in ES2NI:
+                    dup_es += 1
+                else:
+                    ES2NI[es] = (ni_orig, pid)
+                # NI→ES (determinista + bloqueo de ambigüedad)
+                if ni in NI2ES:
+                    dup_ni += 1
+                    # registra ambigüedad
+                    s = AMBIG_NI.get(ni, set())
+                    s.add(NI2ES[ni][0]); s.add(es_orig)
+                    AMBIG_NI[ni] = s
+                    if STRICT_BI_ENFORCE:
+                        NI2ES.pop(ni, None)  # invalida la superficie NI conflictiva
+                else:
+                    if STRICT_BI_ENFORCE and ni in AMBIG_NI:
+                        # ya marcada ambigua: no insertar
+                        pass
+                    else:
+                        NI2ES[ni] = (es_orig, pid)
+                base_rows.append((es_orig, ni_orig, pid, flags))
                 rows += 1
+        # Expansiones deterministas (solo añaden ES2NI; NO tocan NI2ES)
         if EXPANSION_ENABLE:
+            for es_orig, ni_orig, pid, flags in base_rows:
                 if not flags: continue
                 if _has_flag(flags, FLAG_PLURAL):
+                    pl = _pluralize_es_form(es_orig)
                     pl_key = lower(pl)
                     if pl_key not in ES2NI:
                         ES2NI[pl_key] = (ni_orig, pid)
                 if _has_flag(flags, FLAG_3PL):
+                    p3 = _present_3pl_from_3sg(es_orig)
                     p3_key = lower(p3)
                     if p3_key not in ES2NI:
                         ES2NI[p3_key] = (ni_orig, pid)
+        # Diagnóstico asimetrías (no afecta determinismo)
         for es_low, (ni_surf, _) in ES2NI.items():
             ni_low = lower(ni_surf)
             back = NI2ES.get(ni_low)
     print(f"✓ BI-ONLY ESTRICTO cargado: {rows:,} filas.")
     if dup_es: print(f"[AVISO] {dup_es:,} duplicados ES (se usó la primera).")
+    if dup_ni: print(f"[AVISO] {dup_ni:,} duplicados NI (bloqueados en modo estricto).")
     if empty_pid: print(f"[AVISO] {empty_pid:,} filas sin pair_id.")
     if mismatch_backmap:
         print(f"[ALERTA] {mismatch_backmap:,} asimetrías ES↔NI (misma NI apunta a otro ES).")
         )
         sam_html = f"<details><summary>Muestras</summary><ul>{sam_rows}</ul></details>"
+    ambN = sum(len(v) > 1 for v in AMBIG_NI.values())
+    ambList = ", ".join(f"{k}→{sorted(list(v))[:3]}" for k,v in list(AMBIG_NI.items())[:5])
     BI_DIAG_HTML = f"""
     <div style="font-family:Georgia,serif">
       Archivo: <b>{escape(CSV_BI)}</b><br>
       Filas base (CSV): <b>{rows:,}</b><br>
       ES únicas (tras expansiones): <b>{es_unique:,}</b> &nbsp;|&nbsp; NI únicas: <b>{ni_unique:,}</b> &nbsp;|&nbsp; pair_id únicos: <b>{pid_unique:,}</b><br>
+      Duplicados ES: <b>{dup_es:,}</b> &nbsp;|&nbsp; Duplicados NI: <b>{dup_ni:,}</b> (bloqueados en estricto) &nbsp;|&nbsp; Sin pair_id: <b>{empty_pid:,}</b><br>
+      Asimetrías ES↔NI: <b>{mismatch_backmap:,}</b>
       {sam_html}
+      <hr style="border:0;border-top:1px solid #caa">
+      <small>NI ambiguas bloqueadas: <b>{ambN:,}</b>{(' · ej.: ' + escape(ambList)) if ambN else ''}</small><br>
+      <small>Regla: el motor usa <b>sólo</b> tablas 1:1; NI duplicadas se bloquean y se muestran como <code>[AMB-NI:...]</code>.</small>
     </div>
     """
     return rows > 0
     """Devuelve (span, surface) si hay frase que comience en i."""
     if not phrase_map: return (0, None)
     max_span = 0; surface = None
+    # determinista: prioriza el span más largo
     for span in range(1, MAX_NGRAM+1):
         if i+span > len(tokens): break
         cand = " ".join(lower(t) for t in tokens[i:i+span])
     return "".join(out)
+# ✦ FIX: no re-espaciar horas/decimales y no añadir espacios tras “:”/“,”
 def postprocess_spanish(s: str) -> str:
     """Normalize spacing and punctuation of a detokenized Spanish string.

     Steps, in order: re-join clock times and decimals that tokenization split
     apart, drop whitespace before closing punctuation, insert the missing
     space after sentence punctuation, glue inverted openers to the following
     word, collapse whitespace, and finally hand the result to
     ``sentence_case_spanish`` (defined elsewhere in this file).

     Args:
         s: Spanish text, typically the joined output of the NI→ES translator.

     Returns:
         The cleaned-up string as returned by ``sentence_case_spanish``.
     """
     # 1) Compact clock times and decimals that were split around ":" "," ".".
     s = re.sub(r"(\d)\s*:\s*(\d)", r"\1:\2", s)             # 18:30
     s = re.sub(r"(\d)\s*([.,])\s*(\d)", r"\1\2\3", s)       # 12,65 / 3.1415
     # 2) No whitespace before closing punctuation.
     s = re.sub(r"\s+([,.;:!?])", r"\1", s)
     # 3) Add a space ONLY after . ! ? ; (never after "," or ":").
     #    A "." sitting between two digits is a decimal point and is left alone;
     #    otherwise step 1 would be undone ("3.1415" -> "3. 1415").
     #    Nothing is inserted before further sentence punctuation ("...", "?!")
     #    or before a closing bracket/quote, so those runs stay intact.
     s = re.sub(r"([?!;]|\.(?!\d)|(?<!\d)\.)(?=[^\s.?!;)\]}»”’])", r"\1 ", s)
     # 4) Inverted openers attach directly to the following word.
     s = re.sub(r"([¿¡])\s+", r"\1", s)
     # 5) Collapse repeated whitespace and trim the ends.
     s = re.sub(r"\s{2,}", " ", s).strip()
     # 6) Sentence-initial capitalization is delegated to the shared helper.
     return sentence_case_spanish(s)
 # ====== Traducción BI estricta ======
 def translate_es_to_ni_bi(text:str):
         if key in NI2ES:
             es = NI2ES[key][0] or ""
             out.append(es if es else f"[?:{t}]")
+        elif key in AMBIG_NI and STRICT_BI_ENFORCE:
+            # ★ determinista: no elegimos al azar superficies NI con colisión
+            out.append(f"[AMB-NI:{t}]")
         elif is_number(key):
             out.append(t)
         else:
         return "<em>Introduce texto para diagnosticar.</em>"
     toks = simple_tokenize(text)
+    unknown=set(); asym=set(); amb=set()
     total_tokens=0; covered=0
     if dir_label.startswith("ES"):
             if span > 1:
                 covered += 1; i += span; continue
             k=lower(t)
+            if k in AMBIG_NI:
+                amb.add(t); i+=1; continue
             if k not in NI2ES:
                 unknown.add(t); i+=1; continue
             covered += 1
     cov_html = f"<div><b>Tokens (sin puntuación/numéricos):</b> {total_tokens} &nbsp;|&nbsp; <b>Cubiertos:</b> {covered} ({cov_pct:.1f}%)</div>"
     unk_html = "".join(f"<li><code>{escape(u)}</code></li>" for u in sorted(unknown, key=lambda x: lower(x))) or "<li><i>—</i></li>"
+    amb_html = "".join(f"<li><code>{escape(a)}</code></li>" for a in sorted(amb, key=lambda x: lower(x))) or "<li><i>—</i></li>"
     asy_html = "".join(f"<li><code>{escape(a)}</code></li>" for a in sorted(asym)) or "<li><i>—</i></li>"
+    return f"<b>Diagnóstico {head}</b>{cov_html}<b>Ambiguas (NI duplicada):</b><ul>{amb_html}</ul><b>Faltantes:</b><ul>{unk_html}</ul><b>Asimetrías:</b><ul>{asy_html}</ul>"
 # ====== UI (CSS / acordeones / fuentes) ======
 LABELS={
     "ES":{
         "title":"Traductor Español ↔ Neoíbero",
+        "subtitle":"CSV estricto (BI-only 1:1; sin heurísticas; .gz) — determinista",
         "in_label_es":"✏️ Entrada (Español)",
         "in_label_ni":"✏️ Entrada (Neoíbero)",
         "in_ph_es":"Escribe aquí. Ej.: Veo a Ana y doy pan a Marta.",
     },
     "EN":{
         "title":"Spanish ↔ Neo-Iberian Translator",
+        "subtitle":"Strict BI-only (1:1 surfaces; no heuristics; .gz) — deterministic",
         "in_label_es":"✏️ Input (Spanish)",
         "in_label_ni":"✏️ Input (Neo-Iberian)",
         "in_ph_es":"Type here. E.g., Veo a Ana y doy pan a Marta.",
             "🎓 Background & design choices",
             "🏛️ Possible inheritance from ancient Iberian",
             "🎨 Conlang design (Neo-Iberian)",
+            "⚙️ Translator pipeline (strict 1:1)",
             "🔤 Orthography, Iberian line & keys",
             "❓/❗ Vascoid modality (-na / -ba)",
             "🧩 CSV-driven expansions: plurals (S) & 3pl (3/V3)",
         "**Escritura y datos.** Un **único CSV con `pair_id`** y superficies exactas. La traducción ES↔NI es **1:1** por superficie.",
         "**Herencia plausible del íbero.** Fonotaxis CV(C); p→b; r/ŕ; casos -k/-te/-ka/-ar/-en/-i.",
         "**Diseño del neoíbero.** TAM: PRS -ke, PST -bo, FUT -ta, IPFV -ri, IMP -tu, COND/SBJV -ni, FUT_SBJV -ra.",
+        "**Pipeline (BI-estricto 1:1).** Tokeniza; sustitución exacta; NI ambigua **se bloquea** y sale como `[AMB-NI:…]`.",
         "**Ortografía y línea ibérica.** Tokens BA/BE/…; tridots '/'; p→b; codas N/S/Ś/R/Ŕ/L/M/K/T.",
+        "**Modalidad (-na/-ba).** ES→NI puede omitir ¿?¡! (si está activo). NI→ES inserta `¿…?`/`¡…!` al final de la oración marcada, **no en comas**.",
+        "**Expansiones por CSV (deterministas).** `flags=S` plural regular; `flags=3|V3` 3ª plural del presente. Solo si lo marcas.",
         "**Gramática mínima.** Visualización; la gramática no se “calcula”.",
         "**Bibliografía.** Untermann; de Hoz; Ferrer i Jané; Correa…",
+        "**Glosario & datasets.** Faltas → `[SIN-LEX:…]` / `[?:…]`. Ambiguas → `[AMB-NI:…]` (limpia tu CSV).",
         "**Simetría por pair_id.** El diagnóstico avisa si una NI apunta a dos ES distintos."
     ],
     "EN":[
+        "One bilingual CSV with `pair_id` and exact surfaces. ES↔NI is strictly 1:1.",
         "Possible inheritance (non-palaeographic).",
         "Neo-Iberian design (phonology & morphology).",
+        "Pipeline: tokenise → exact replacement. Ambiguous NI are **blocked** and rendered as `[AMB-NI:…]`.",
         "Orthography, Iberian line & keys.",
+        "Modality (-na/-ba): ES→NI can drop ¿?¡!. NI→ES places them at sentence end, not at commas.",
+        "CSV-driven expansions (deterministic): `S` plural; `3|V3` present 3pl.",
         "Minimal grammar (v1.2).",
         "Selected references.",
         "Glossary & datasets.",
+        "Pair-id symmetry diagnostics."
     ]
 }
 # ====== smoke opcional ======
 def _symmetry_smoketest():
+    print("\n[SMOKE] Prueba ES↔NI (BI-estricto, determinista)…")
     probes = [
+        "nuker-ke ni etxe-ka ?",
+        "¿Pagaste 12,75 en la cafetería?",
+        "Marta llega a las 18:30.",
+        "[SIN-LEX:Tomás]-na euŕak-ke !"
     ]
     for p in probes:
         es_from_ni = translate_ni_to_es_bi(p)
     demo.queue().launch()