Update app.py
Browse files
app.py
CHANGED
|
@@ -1,13 +1,13 @@
|
|
| 1 |
-
# app.py — Traductor Español ↔ Neoíbero (BI-ONLY 1:1 estricto)
|
| 2 |
# UI completa + CSS “íbero” + TTS + Línea ibérica (codificación appOld)
|
| 3 |
# Requiere un ÚNICO CSV con superficies exactas (UTF-8) y columnas:
|
| 4 |
-
# - source_es (o es/es_surface)
|
| 5 |
# - target_ni (o ni/ni_surface)
|
| 6 |
-
# - target_es (opcional pero RECOMENDADO) ← superficies ES usadas para NI→ES
|
| 7 |
# - pair_id (opcional)
|
| 8 |
#
|
| 9 |
# El motor NO hace heurísticas ni morfología: 1:1 exacto por superficie.
|
| 10 |
# Puntuación y números pasan tal cual. Desconocidos -> [SIN-LEX:...] / [?:...]
|
|
|
|
| 11 |
|
| 12 |
import gradio as gr
|
| 13 |
import os, csv, re, base64, unicodedata, gzip
|
|
@@ -57,13 +57,13 @@ CSV_BI = _cand(
|
|
| 57 |
|
| 58 |
# ====== estructuras strict BI ======
|
| 59 |
# Clave = superficie exacta en minúsculas. Valor = (superficie_original_opuesta, pair_id)
|
| 60 |
-
ES2NI = {} # es_surface_lower -> (ni_surface, pair_id)
|
| 61 |
-
NI2ES = {} # ni_surface_lower -> (
|
| 62 |
|
| 63 |
-
#
|
| 64 |
ESPHRASE2NI = {} # "el saco" -> (ni_surface, pair_id)
|
| 65 |
-
NIPHRASE2ES = {} # "…-ke ni etxe-ka" -> (
|
| 66 |
-
MAX_NGRAM = 3
|
| 67 |
|
| 68 |
# ====== signos / tokenización mínima ======
|
| 69 |
VISIBLE_PUNCT = set(list(",.;:!?¡¿…()[]{}\"'«»—–“”‘’"))
|
|
@@ -71,16 +71,12 @@ _num_re = re.compile(r"^\d+([.,]\d+)?$")
|
|
| 71 |
def is_number(tok: str) -> bool:
    """Tell whether ``tok`` is fully matched by the module-level ``_num_re``.

    ``None`` and other falsy values are treated as the empty string before
    matching, so they never raise.
    """
    candidate = tok if tok else ""
    return _num_re.fullmatch(candidate) is not None
|
| 72 |
|
| 73 |
# --- separadores de cláusula + placeholders atómicos ---
|
| 74 |
-
CLAUSE_BREAKS = {",", ";", "—", "–", ":"}
|
| 75 |
PLACEHOLDER_RE = re.compile(r"^\[[^\]]+\]$")


def is_placeholder(tok: str) -> bool:
    """Return True when ``tok`` is exactly one bracketed placeholder.

    A placeholder is ``[`` + at least one non-``]`` character + ``]`` and
    nothing else (e.g. ``[SIN-LEX:Tomás]``). ``None``/empty input yields False.
    """
    # fullmatch (not match): with match(), the trailing ``$`` would also accept
    # a token ending in a newline; this mirrors how is_number() validates tokens.
    return PLACEHOLDER_RE.fullmatch(tok or "") is not None
|
| 78 |
|
| 79 |
def _restore_brk(tok, protected):
|
| 80 |
-
"""
|
| 81 |
-
Restaura __BRKn__ y también __BRKn__-na / __BRKn__-ba a su forma original,
|
| 82 |
-
manteniendo el sufijo modal si existe (p.ej. '[SIN-LEX:Tomás]-na').
|
| 83 |
-
"""
|
| 84 |
m = re.fullmatch(r"__BRK(\d+)__(?:-(na|ba))?", tok or "")
|
| 85 |
if not m: return tok
|
| 86 |
idx = int(m.group(1))
|
|
@@ -97,39 +93,29 @@ def simple_tokenize(text:str):
|
|
| 97 |
key = f"__BRK{len(protected)}__"
|
| 98 |
protected.append(m.group(0))
|
| 99 |
return key
|
| 100 |
-
|
| 101 |
-
# protegemos bloques [ ... ]
|
| 102 |
t = re.sub(r"\[[^\]]*\]", _repl, (text or "").strip())
|
| 103 |
t = re.sub(r"\s+"," ", t)
|
| 104 |
t = re.sub(r"([,.;:!?¡¿…()\[\]{}\"'«»—–“”‘’])", r" \1 ", t)
|
| 105 |
toks = [tok for tok in t.split() if tok]
|
| 106 |
-
|
| 107 |
-
# restaura bloques protegidos (con soporte -na/-ba adheridos)
|
| 108 |
for i, tok in enumerate(toks):
|
| 109 |
-
# si viene pegado el sufijo modal, no se habrá restaurado; hacemos la restauración robusta
|
| 110 |
if tok.startswith("__BRK") and "__" in tok:
|
| 111 |
toks[i] = _restore_brk(tok, protected)
|
| 112 |
return toks
|
| 113 |
|
| 114 |
def detokenize(tokens):
    """Join ``tokens`` with single spaces and re-attach punctuation.

    Args:
        tokens: iterable of string tokens.

    Returns:
        The joined string with no space before closing marks, none after
        inverted openers or opening delimiters, and runs of whitespace
        collapsed; leading/trailing whitespace is stripped.
    """
    s = " ".join(tokens)
    # No space before closing marks. The ellipsis is included because the
    # tokenizer splits it off just like the other marks.
    s = re.sub(r"\s+([,.;:!?…])", r"\1", s)
    # No space after inverted openers.
    s = re.sub(r"([¿¡])\s+", r"\1", s)
    # Paired delimiters hug their content: (x), [x], {x}, «x».
    s = re.sub(r"([(\[{«])\s+", r"\1", s)
    s = re.sub(r"\s+([)\]}»])", r"\1", s)
    s = re.sub(r"\s{2,}", " ", s).strip()
    return s
|
| 125 |
|
| 126 |
# ====== Modalidad vascoide (-na / -ba) ======
|
| 127 |
-
# Configuración
|
| 128 |
MODAL_SUFFIX_ENABLE = True
|
| 129 |
MODAL_ONLY_ON_FINITE = True
|
| 130 |
MODAL_STRIP_QE_IN_NI = True
|
| 131 |
|
| 132 |
-
# Conjuntos y ayudas
|
| 133 |
SENT_END = {".", "!", "?", "…"}
|
| 134 |
OPEN_FOR = {"?": "¿", "!": "¡"}
|
| 135 |
WRAP_PREFIX = set(list("«“‘([{\"'"))
|
|
@@ -202,8 +188,8 @@ def add_modal_suffixes_es2ni(tokens):
|
|
| 202 |
|
| 203 |
def strip_modal_suffixes_ni(tokens):
|
| 204 |
"""
|
| 205 |
-
Interpreta -na/-ba como modalidad;
|
| 206 |
-
|
| 207 |
"""
|
| 208 |
if not MODAL_SUFFIX_ENABLE:
|
| 209 |
return tokens
|
|
@@ -226,25 +212,19 @@ def strip_modal_suffixes_ni(tokens):
|
|
| 226 |
|
| 227 |
toks = tokens + ["."]
|
| 228 |
for i, t in enumerate(toks):
|
| 229 |
-
# Abridores explícitos
|
| 230 |
if t in ("¿", "¡"):
|
| 231 |
_emit(); mode = "?" if t == "¿" else "!"
|
| 232 |
continue
|
| 233 |
-
# Cierres explícitos
|
| 234 |
if t in ("?", "!"):
|
| 235 |
pending_end = t; _emit(); continue
|
| 236 |
-
# Final de oración
|
| 237 |
if t in SENT_END:
|
| 238 |
pending_end = t; _emit(); continue
|
| 239 |
-
|
|
|
|
| 240 |
if t in CLAUSE_BREAKS and mode in ("?","!"):
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
pass
|
| 244 |
-
else:
|
| 245 |
-
_emit(also_append=t); continue
|
| 246 |
|
| 247 |
-
# Sufijos -na/-ba (en cualquier token, incl. placeholders)
|
| 248 |
m = re.search(r"-(na|ba)$", (t or "").lower())
|
| 249 |
if m:
|
| 250 |
if mode and buf: _emit()
|
|
@@ -270,7 +250,6 @@ def add_inverted_openers(tokens):
|
|
| 270 |
while i < len(out):
|
| 271 |
if out[i] in ("?", "!"):
|
| 272 |
closer = out[i]; opener = OPEN_FOR[closer]
|
| 273 |
-
# inicio del tramo = después del último fin de oración o separador FALSO/VERDADERO
|
| 274 |
j = i - 1
|
| 275 |
while j >= 0 and not _is_true_start_break(j):
|
| 276 |
j -= 1
|
|
@@ -283,7 +262,7 @@ def add_inverted_openers(tokens):
|
|
| 283 |
i += 1
|
| 284 |
return out
|
| 285 |
|
| 286 |
-
# ====== EXPANSIONES
|
| 287 |
EXPANSION_ENABLE = True
|
| 288 |
FLAG_COLNAMES = ("flags","FLAGS","expand","EXPAND","tags","TAGS","morph","MORPH")
|
| 289 |
FLAG_PLURAL = ("S",)
|
|
@@ -408,100 +387,104 @@ def render_ib_with_tridots(ib_toks):
|
|
| 408 |
return "".join(res).strip()
|
| 409 |
|
| 410 |
# ====== BI loader + diagnóstico ======
|
|
|
|
|
|
|
|
|
|
|
|
|
| 411 |
BI_DIAG_HTML = "<em>Sin CSV cargado.</em>"
|
| 412 |
|
| 413 |
def load_bi_strict_and_diagnose():
|
| 414 |
"""Carga el CSV, llena ES2NI/NI2ES y prepara un HTML de diagnóstico."""
|
| 415 |
global BI_DIAG_HTML
|
|
|
|
|
|
|
|
|
|
|
|
|
| 416 |
if not os.path.exists(CSV_BI):
|
| 417 |
msg=f"[ERROR] No se encontró el CSV bilingüe: {CSV_BI}"
|
| 418 |
print(msg); BI_DIAG_HTML=f"<b>Error:</b> {escape(msg)}"
|
| 419 |
return False
|
| 420 |
|
| 421 |
-
def _choose_col(flds, prefer_list, fallback=None):
|
| 422 |
-
for c in prefer_list:
|
| 423 |
-
if c in flds: return c
|
| 424 |
-
return fallback
|
| 425 |
-
|
| 426 |
rows=0; dup_es=0; dup_ni=0; empty_pid=0
|
| 427 |
mismatch_backmap = 0
|
| 428 |
mismatch_samples = []
|
| 429 |
pid_seen=set()
|
| 430 |
|
| 431 |
-
exp_plurals = 0
|
| 432 |
-
exp_3pl = 0
|
| 433 |
-
|
| 434 |
print(f"Detectado CSV bilingüe: {CSV_BI}")
|
| 435 |
try:
|
| 436 |
with _open_maybe_gzip(CSV_BI) as f:
|
| 437 |
rd = csv.DictReader(f)
|
| 438 |
flds=set(rd.fieldnames or [])
|
| 439 |
-
|
| 440 |
-
|
| 441 |
-
|
| 442 |
-
ES_SURF_COL = _choose_col(flds, ["source_es","es_surface","target_es","es"], "es")
|
| 443 |
-
# NI_COL: superficies NI
|
| 444 |
-
NI_COL = _choose_col(flds, ["target_ni","ni_surface","ni"], "ni")
|
| 445 |
-
# ES_FOR_NI_COL: superficies ES para NI→ES (valores de NI2ES)
|
| 446 |
-
ES_FOR_NI_COL = _choose_col(flds, ["target_es","es_surface","source_es","es"], "es")
|
| 447 |
-
|
| 448 |
FLAGCOL = None
|
| 449 |
for cand in FLAG_COLNAMES:
|
| 450 |
if cand in flds:
|
| 451 |
FLAGCOL = cand; break
|
| 452 |
-
IDCOL = "pair_id" if "pair_id" in flds else ("id" if "id" in flds else None)
|
| 453 |
|
| 454 |
-
base_rows = []
|
| 455 |
for r in rd:
|
| 456 |
-
|
| 457 |
-
ni_orig
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
if not (es_surf_orig and ni_orig):
|
| 461 |
-
continue
|
| 462 |
-
|
| 463 |
-
pid = (norm(r.get(IDCOL)) if IDCOL else "")
|
| 464 |
if not pid: empty_pid += 1
|
| 465 |
else: pid_seen.add(pid)
|
| 466 |
flags = (r.get(FLAGCOL) or "") if FLAGCOL else ""
|
| 467 |
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
# Frases
|
| 472 |
-
if " " in
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
else:
|
| 483 |
-
|
| 484 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 485 |
rows += 1
|
| 486 |
|
| 487 |
-
# Expansiones deterministas
|
| 488 |
if EXPANSION_ENABLE:
|
| 489 |
-
for
|
| 490 |
if not flags: continue
|
| 491 |
if _has_flag(flags, FLAG_PLURAL):
|
| 492 |
-
pl = _pluralize_es_form(
|
| 493 |
pl_key = lower(pl)
|
| 494 |
if pl_key not in ES2NI:
|
| 495 |
ES2NI[pl_key] = (ni_orig, pid)
|
| 496 |
-
exp_plurals += 1
|
| 497 |
if _has_flag(flags, FLAG_3PL):
|
| 498 |
-
p3 = _present_3pl_from_3sg(
|
| 499 |
p3_key = lower(p3)
|
| 500 |
if p3_key not in ES2NI:
|
| 501 |
ES2NI[p3_key] = (ni_orig, pid)
|
| 502 |
-
exp_3pl += 1
|
| 503 |
|
| 504 |
-
# Diagnóstico
|
| 505 |
for es_low, (ni_surf, _) in ES2NI.items():
|
| 506 |
ni_low = lower(ni_surf)
|
| 507 |
back = NI2ES.get(ni_low)
|
|
@@ -521,7 +504,7 @@ def load_bi_strict_and_diagnose():
|
|
| 521 |
|
| 522 |
print(f"✓ BI-ONLY ESTRICTO cargado: {rows:,} filas.")
|
| 523 |
if dup_es: print(f"[AVISO] {dup_es:,} duplicados ES (se usó la primera).")
|
| 524 |
-
if dup_ni: print(f"[AVISO] {dup_ni:,} duplicados NI (
|
| 525 |
if empty_pid: print(f"[AVISO] {empty_pid:,} filas sin pair_id.")
|
| 526 |
if mismatch_backmap:
|
| 527 |
print(f"[ALERTA] {mismatch_backmap:,} asimetrías ES↔NI (misma NI apunta a otro ES).")
|
|
@@ -534,22 +517,8 @@ def load_bi_strict_and_diagnose():
|
|
| 534 |
)
|
| 535 |
sam_html = f"<details><summary>Muestras</summary><ul>{sam_rows}</ul></details>"
|
| 536 |
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
<div style="margin-top:.5rem">
|
| 540 |
-
<b>Columnas usadas</b>:
|
| 541 |
-
ES→NI = <code>{escape(ES_SURF_COL)}</code> |
|
| 542 |
-
NI→ES = <code>{escape(ES_FOR_NI_COL)}</code> |
|
| 543 |
-
NI = <code>{escape(NI_COL)}</code>
|
| 544 |
-
</div>
|
| 545 |
-
"""
|
| 546 |
-
warn_inf = ""
|
| 547 |
-
if "es" in (ES_SURF_COL,) and ES_FOR_NI_COL == "es":
|
| 548 |
-
warn_inf = "<div style='color:#a00'><b>⚠ Aviso:</b> Se detectó que el CSV sólo tiene <code>es</code>. Si ese campo es <i>lema</i>, la inversa podría irse a infinitivo. Este motor ya intenta usar <code>target_es</code>/<code>es_surface</code> si existen.</div>"
|
| 549 |
-
|
| 550 |
-
exp_html = ""
|
| 551 |
-
if EXPANSION_ENABLE and (exp_plurals or exp_3pl):
|
| 552 |
-
exp_html = f"<div>Expansiones aplicadas → Plurales: <b>{exp_plurals}</b> · 3pl: <b>{exp_3pl}</b></div>"
|
| 553 |
|
| 554 |
BI_DIAG_HTML = f"""
|
| 555 |
<div style="font-family:Georgia,serif">
|
|
@@ -557,15 +526,12 @@ def load_bi_strict_and_diagnose():
|
|
| 557 |
Archivo: <b>{escape(CSV_BI)}</b><br>
|
| 558 |
Filas base (CSV): <b>{rows:,}</b><br>
|
| 559 |
ES únicas (tras expansiones): <b>{es_unique:,}</b> | NI únicas: <b>{ni_unique:,}</b> | pair_id únicos: <b>{pid_unique:,}</b><br>
|
| 560 |
-
Duplicados ES: <b>{dup_es:,}</b> | Duplicados NI: <b>{dup_ni:,}</b> | Sin pair_id: <b>{empty_pid:,}</b><br>
|
| 561 |
-
Asimetrías ES↔NI: <b>{mismatch_backmap:,}</b
|
| 562 |
-
{cols_html}
|
| 563 |
-
{warn_inf}
|
| 564 |
-
<hr style="border:0;border-top:1px solid #caa">
|
| 565 |
-
<small>Regla: el motor usa <b>sólo</b> tablas 1:1 y expansiones <b>explícitas por bandera</b> (flags) en el CSV.
|
| 566 |
-
Nada “adivina”.</small>
|
| 567 |
-
{exp_html}
|
| 568 |
{sam_html}
|
|
|
|
|
|
|
|
|
|
| 569 |
</div>
|
| 570 |
"""
|
| 571 |
return rows > 0
|
|
@@ -578,6 +544,7 @@ def _longest_match(tokens, i, phrase_map):
|
|
| 578 |
"""Devuelve (span, surface) si hay frase que comience en i."""
|
| 579 |
if not phrase_map: return (0, None)
|
| 580 |
max_span = 0; surface = None
|
|
|
|
| 581 |
for span in range(1, MAX_NGRAM+1):
|
| 582 |
if i+span > len(tokens): break
|
| 583 |
cand = " ".join(lower(t) for t in tokens[i:i+span])
|
|
@@ -619,16 +586,21 @@ def sentence_case_spanish(s: str) -> str:
|
|
| 619 |
|
| 620 |
return "".join(out)
|
| 621 |
|
|
|
|
| 622 |
def postprocess_spanish(s: str) -> str:
|
| 623 |
-
# compactar horas y decimales
|
| 624 |
-
s = re.sub(r"(\d)\s*:\s*(\d)", r"\1:\2", s)
|
| 625 |
-
s = re.sub(r"(\d)\s*([.,])\s*(\d)", r"\1\2\3", s)
|
| 626 |
-
# espacios y signos
|
| 627 |
-
s = re.sub(r"\s
|
| 628 |
-
|
| 629 |
-
s = re.sub(r"([
|
|
|
|
| 630 |
s = re.sub(r"([¿¡])\s+", r"\1", s)
|
| 631 |
-
|
|
|
|
|
|
|
|
|
|
| 632 |
|
| 633 |
# ====== Traducción BI estricta ======
|
| 634 |
def translate_es_to_ni_bi(text:str):
|
|
@@ -694,6 +666,9 @@ def translate_ni_to_es_bi(text:str):
|
|
| 694 |
if key in NI2ES:
|
| 695 |
es = NI2ES[key][0] or ""
|
| 696 |
out.append(es if es else f"[?:{t}]")
|
|
|
|
|
|
|
|
|
|
| 697 |
elif is_number(key):
|
| 698 |
out.append(t)
|
| 699 |
else:
|
|
@@ -713,7 +688,7 @@ def diagnose_text(text, dir_label):
|
|
| 713 |
return "<em>Introduce texto para diagnosticar.</em>"
|
| 714 |
|
| 715 |
toks = simple_tokenize(text)
|
| 716 |
-
unknown=set(); asym=set()
|
| 717 |
total_tokens=0; covered=0
|
| 718 |
|
| 719 |
if dir_label.startswith("ES"):
|
|
@@ -748,6 +723,8 @@ def diagnose_text(text, dir_label):
|
|
| 748 |
if span > 1:
|
| 749 |
covered += 1; i += span; continue
|
| 750 |
k=lower(t)
|
|
|
|
|
|
|
| 751 |
if k not in NI2ES:
|
| 752 |
unknown.add(t); i+=1; continue
|
| 753 |
covered += 1
|
|
@@ -761,15 +738,16 @@ def diagnose_text(text, dir_label):
|
|
| 761 |
cov_html = f"<div><b>Tokens (sin puntuación/numéricos):</b> {total_tokens} | <b>Cubiertos:</b> {covered} ({cov_pct:.1f}%)</div>"
|
| 762 |
|
| 763 |
unk_html = "".join(f"<li><code>{escape(u)}</code></li>" for u in sorted(unknown, key=lambda x: lower(x))) or "<li><i>—</i></li>"
|
|
|
|
| 764 |
asy_html = "".join(f"<li><code>{escape(a)}</code></li>" for a in sorted(asym)) or "<li><i>—</i></li>"
|
| 765 |
|
| 766 |
-
return f"<b>Diagnóstico {head}</b>{cov_html}<b>Faltantes:</b><ul>{unk_html}</ul><b>Asimetrías:</b><ul>{asy_html}</ul>"
|
| 767 |
|
| 768 |
# ====== UI (CSS / acordeones / fuentes) ======
|
| 769 |
LABELS={
|
| 770 |
"ES":{
|
| 771 |
"title":"Traductor Español ↔ Neoíbero",
|
| 772 |
-
"subtitle":"CSV estricto (BI-only 1:1; sin heurísticas; .gz)",
|
| 773 |
"in_label_es":"✏️ Entrada (Español)",
|
| 774 |
"in_label_ni":"✏️ Entrada (Neoíbero)",
|
| 775 |
"in_ph_es":"Escribe aquí. Ej.: Veo a Ana y doy pan a Marta.",
|
|
@@ -799,7 +777,7 @@ LABELS={
|
|
| 799 |
},
|
| 800 |
"EN":{
|
| 801 |
"title":"Spanish ↔ Neo-Iberian Translator",
|
| 802 |
-
"subtitle":"Strict BI-only (1:1 surfaces; no heuristics; .gz)",
|
| 803 |
"in_label_es":"✏️ Input (Spanish)",
|
| 804 |
"in_label_ni":"✏️ Input (Neo-Iberian)",
|
| 805 |
"in_ph_es":"Type here. E.g., Veo a Ana y doy pan a Marta.",
|
|
@@ -817,7 +795,7 @@ LABELS={
|
|
| 817 |
"🎓 Background & design choices",
|
| 818 |
"🏛️ Possible inheritance from ancient Iberian",
|
| 819 |
"🎨 Conlang design (Neo-Iberian)",
|
| 820 |
-
"⚙️ Translator pipeline (strict
|
| 821 |
"🔤 Orthography, Iberian line & keys",
|
| 822 |
"❓/❗ Vascoid modality (-na / -ba)",
|
| 823 |
"🧩 CSV-driven expansions: plurals (S) & 3pl (3/V3)",
|
|
@@ -833,27 +811,27 @@ DOC = {
|
|
| 833 |
"**Escritura y datos.** Un **único CSV con `pair_id`** y superficies exactas. La traducción ES↔NI es **1:1** por superficie.",
|
| 834 |
"**Herencia plausible del íbero.** Fonotaxis CV(C); p→b; r/ŕ; casos -k/-te/-ka/-ar/-en/-i.",
|
| 835 |
"**Diseño del neoíbero.** TAM: PRS -ke, PST -bo, FUT -ta, IPFV -ri, IMP -tu, COND/SBJV -ni, FUT_SBJV -ra.",
|
| 836 |
-
"**Pipeline (BI-estricto 1:1).** Tokeniza; sustitución exacta
|
| 837 |
"**Ortografía y línea ibérica.** Tokens BA/BE/…; tridots '/'; p→b; codas N/S/Ś/R/Ŕ/L/M/K/T.",
|
| 838 |
-
"**Modalidad (-na/-ba).**
|
| 839 |
-
"**Expansiones por CSV (deterministas).**
|
| 840 |
"**Gramática mínima.** Visualización; la gramática no se “calcula”.",
|
| 841 |
"**Bibliografía.** Untermann; de Hoz; Ferrer i Jané; Correa…",
|
| 842 |
-
"**Glosario & datasets.** Faltas → `[SIN-LEX:…]` / `[?:…]`.
|
| 843 |
"**Simetría por pair_id.** El diagnóstico avisa si una NI apunta a dos ES distintos."
|
| 844 |
],
|
| 845 |
"EN":[
|
| 846 |
-
"
|
| 847 |
"Possible inheritance (non-palaeographic).",
|
| 848 |
"Neo-Iberian design (phonology & morphology).",
|
| 849 |
-
"Pipeline
|
| 850 |
"Orthography, Iberian line & keys.",
|
| 851 |
-
"
|
| 852 |
-
"
|
| 853 |
"Minimal grammar (v1.2).",
|
| 854 |
"Selected references.",
|
| 855 |
"Glossary & datasets.",
|
| 856 |
-
"Pair-id symmetry diagnostics
|
| 857 |
]
|
| 858 |
}
|
| 859 |
|
|
@@ -1044,12 +1022,12 @@ with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="indigo", secondary_hue
|
|
| 1044 |
|
| 1045 |
# ====== smoke opcional ======
|
| 1046 |
def _symmetry_smoketest():
|
| 1047 |
-
print("\n[SMOKE] Prueba ES↔NI (BI-estricto)…")
|
| 1048 |
probes = [
|
| 1049 |
-
"nuker-ke ni etxe-ka ?",
|
| 1050 |
-
"¿Pagaste 12,75 en la cafetería?",
|
| 1051 |
-
"Marta llega a las 18:30.",
|
| 1052 |
-
"[SIN-LEX:Tomás]-na euŕak-ke !"
|
| 1053 |
]
|
| 1054 |
for p in probes:
|
| 1055 |
es_from_ni = translate_ni_to_es_bi(p)
|
|
@@ -1066,5 +1044,3 @@ if __name__ == "__main__":
|
|
| 1066 |
demo.queue().launch()
|
| 1067 |
|
| 1068 |
|
| 1069 |
-
|
| 1070 |
-
|
|
|
|
| 1 |
+
# app.py — Traductor Español ↔ Neoíbero (BI-ONLY 1:1 estricto, determinista)
|
| 2 |
# UI completa + CSS “íbero” + TTS + Línea ibérica (codificación appOld)
|
| 3 |
# Requiere un ÚNICO CSV con superficies exactas (UTF-8) y columnas:
|
| 4 |
+
# - source_es (o es/es_surface)
|
| 5 |
# - target_ni (o ni/ni_surface)
|
|
|
|
| 6 |
# - pair_id (opcional)
|
| 7 |
#
|
| 8 |
# El motor NO hace heurísticas ni morfología: 1:1 exacto por superficie.
|
| 9 |
# Puntuación y números pasan tal cual. Desconocidos -> [SIN-LEX:...] / [?:...]
|
| 10 |
+
# Determinismo NI→ES: entradas NI duplicadas (ambiguas) quedan bloqueadas y se muestran como [AMB-NI:...]
|
| 11 |
|
| 12 |
import gradio as gr
|
| 13 |
import os, csv, re, base64, unicodedata, gzip
|
|
|
|
| 57 |
|
| 58 |
# ====== estructuras strict BI ======
|
| 59 |
# Clave = superficie exacta en minúsculas. Valor = (superficie_original_opuesta, pair_id)
|
| 60 |
+
ES2NI = {} # es_surface_lower -> (ni_surface, pair_id)
|
| 61 |
+
NI2ES = {} # ni_surface_lower -> (es_surface, pair_id)
|
| 62 |
|
| 63 |
+
# N-gramas/frases:
|
| 64 |
ESPHRASE2NI = {} # "el saco" -> (ni_surface, pair_id)
|
| 65 |
+
NIPHRASE2ES = {} # "…-ke ni etxe-ka" -> (es_surface, pair_id)
|
| 66 |
+
MAX_NGRAM = 3
|
| 67 |
|
| 68 |
# ====== signos / tokenización mínima ======
|
| 69 |
VISIBLE_PUNCT = set(list(",.;:!?¡¿…()[]{}\"'«»—–“”‘’"))
|
|
|
|
| 71 |
def is_number(tok: str) -> bool:
    """Tell whether ``tok`` is fully matched by the module-level ``_num_re``.

    ``None`` and other falsy values are treated as the empty string before
    matching, so they never raise.
    """
    candidate = tok if tok else ""
    return _num_re.fullmatch(candidate) is not None
|
| 72 |
|
| 73 |
# --- separadores de cláusula + placeholders atómicos ---
|
| 74 |
+
CLAUSE_BREAKS = {",", ";", "—", "–", ":"}
|
| 75 |
PLACEHOLDER_RE = re.compile(r"^\[[^\]]+\]$")


def is_placeholder(tok: str) -> bool:
    """Return True when ``tok`` is exactly one bracketed placeholder.

    A placeholder is ``[`` + at least one non-``]`` character + ``]`` and
    nothing else (e.g. ``[SIN-LEX:Tomás]``). ``None``/empty input yields False.
    """
    # fullmatch (not match): with match(), the trailing ``$`` would also accept
    # a token ending in a newline; this mirrors how is_number() validates tokens.
    return PLACEHOLDER_RE.fullmatch(tok or "") is not None
|
| 78 |
|
| 79 |
def _restore_brk(tok, protected):
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
m = re.fullmatch(r"__BRK(\d+)__(?:-(na|ba))?", tok or "")
|
| 81 |
if not m: return tok
|
| 82 |
idx = int(m.group(1))
|
|
|
|
| 93 |
key = f"__BRK{len(protected)}__"
|
| 94 |
protected.append(m.group(0))
|
| 95 |
return key
|
|
|
|
|
|
|
| 96 |
t = re.sub(r"\[[^\]]*\]", _repl, (text or "").strip())
|
| 97 |
t = re.sub(r"\s+"," ", t)
|
| 98 |
t = re.sub(r"([,.;:!?¡¿…()\[\]{}\"'«»—–“”‘’])", r" \1 ", t)
|
| 99 |
toks = [tok for tok in t.split() if tok]
|
|
|
|
|
|
|
| 100 |
for i, tok in enumerate(toks):
|
|
|
|
| 101 |
if tok.startswith("__BRK") and "__" in tok:
|
| 102 |
toks[i] = _restore_brk(tok, protected)
|
| 103 |
return toks
|
| 104 |
|
| 105 |
def detokenize(tokens):
    """Join ``tokens`` with single spaces and re-attach punctuation.

    Args:
        tokens: iterable of string tokens.

    Returns:
        The joined string with no space before closing marks, none after
        inverted openers or opening delimiters, and runs of whitespace
        collapsed; leading/trailing whitespace is stripped.
    """
    s = " ".join(tokens)
    # No space before closing marks. The ellipsis is included because the
    # tokenizer splits it off just like the other marks.
    s = re.sub(r"\s+([,.;:!?…])", r"\1", s)
    # No space after inverted openers.
    s = re.sub(r"([¿¡])\s+", r"\1", s)
    # Paired delimiters hug their content: (x), [x], {x}, «x».
    s = re.sub(r"([(\[{«])\s+", r"\1", s)
    s = re.sub(r"\s+([)\]}»])", r"\1", s)
    s = re.sub(r"\s{2,}", " ", s).strip()
    return s
|
| 113 |
|
| 114 |
# ====== Modalidad vascoide (-na / -ba) ======
|
|
|
|
| 115 |
MODAL_SUFFIX_ENABLE = True
|
| 116 |
MODAL_ONLY_ON_FINITE = True
|
| 117 |
MODAL_STRIP_QE_IN_NI = True
|
| 118 |
|
|
|
|
| 119 |
SENT_END = {".", "!", "?", "…"}
|
| 120 |
OPEN_FOR = {"?": "¿", "!": "¡"}
|
| 121 |
WRAP_PREFIX = set(list("«“‘([{\"'"))
|
|
|
|
| 188 |
|
| 189 |
def strip_modal_suffixes_ni(tokens):
|
| 190 |
"""
|
| 191 |
+
Interpreta -na/-ba como modalidad; ahora SOLO cerramos al final de oración.
|
| 192 |
+
(No cerramos en comas/“:”, salvo que ya haya ?/! explícitos.)
|
| 193 |
"""
|
| 194 |
if not MODAL_SUFFIX_ENABLE:
|
| 195 |
return tokens
|
|
|
|
| 212 |
|
| 213 |
toks = tokens + ["."]
|
| 214 |
for i, t in enumerate(toks):
|
|
|
|
| 215 |
if t in ("¿", "¡"):
|
| 216 |
_emit(); mode = "?" if t == "¿" else "!"
|
| 217 |
continue
|
|
|
|
| 218 |
if t in ("?", "!"):
|
| 219 |
pending_end = t; _emit(); continue
|
|
|
|
| 220 |
if t in SENT_END:
|
| 221 |
pending_end = t; _emit(); continue
|
| 222 |
+
|
| 223 |
+
# ✦ MODALIDAD: en separadores de cláusula NO cerramos todavía:
|
| 224 |
if t in CLAUSE_BREAKS and mode in ("?","!"):
|
| 225 |
+
buf.append(t)
|
| 226 |
+
continue
|
|
|
|
|
|
|
|
|
|
| 227 |
|
|
|
|
| 228 |
m = re.search(r"-(na|ba)$", (t or "").lower())
|
| 229 |
if m:
|
| 230 |
if mode and buf: _emit()
|
|
|
|
| 250 |
while i < len(out):
|
| 251 |
if out[i] in ("?", "!"):
|
| 252 |
closer = out[i]; opener = OPEN_FOR[closer]
|
|
|
|
| 253 |
j = i - 1
|
| 254 |
while j >= 0 and not _is_true_start_break(j):
|
| 255 |
j -= 1
|
|
|
|
| 262 |
i += 1
|
| 263 |
return out
|
| 264 |
|
| 265 |
+
# ====== EXPANSIONES (deterministas, sólo ES→NI) ======
|
| 266 |
EXPANSION_ENABLE = True
|
| 267 |
FLAG_COLNAMES = ("flags","FLAGS","expand","EXPAND","tags","TAGS","morph","MORPH")
|
| 268 |
FLAG_PLURAL = ("S",)
|
|
|
|
| 387 |
return "".join(res).strip()
|
| 388 |
|
| 389 |
# ====== BI loader + diagnóstico ======
|
| 390 |
+
|
| 391 |
+
# ### ★ MODO ESTRICTO Y DETERMINISTA
|
| 392 |
+
STRICT_BI_ENFORCE = True # si True, no se admite NI ambigua
|
| 393 |
+
AMBIG_NI = {} # ni_lower -> set de ES conflictivos
|
| 394 |
BI_DIAG_HTML = "<em>Sin CSV cargado.</em>"
|
| 395 |
|
| 396 |
def load_bi_strict_and_diagnose():
|
| 397 |
"""Carga el CSV, llena ES2NI/NI2ES y prepara un HTML de diagnóstico."""
|
| 398 |
global BI_DIAG_HTML
|
| 399 |
+
# vaciar estructuras antes de cargar (determinismo)
|
| 400 |
+
ES2NI.clear(); NI2ES.clear(); ESPHRASE2NI.clear(); NIPHRASE2ES.clear()
|
| 401 |
+
AMBIG_NI.clear()
|
| 402 |
+
|
| 403 |
if not os.path.exists(CSV_BI):
|
| 404 |
msg=f"[ERROR] No se encontró el CSV bilingüe: {CSV_BI}"
|
| 405 |
print(msg); BI_DIAG_HTML=f"<b>Error:</b> {escape(msg)}"
|
| 406 |
return False
|
| 407 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
rows=0; dup_es=0; dup_ni=0; empty_pid=0
|
| 409 |
mismatch_backmap = 0
|
| 410 |
mismatch_samples = []
|
| 411 |
pid_seen=set()
|
| 412 |
|
|
|
|
|
|
|
|
|
|
| 413 |
print(f"Detectado CSV bilingüe: {CSV_BI}")
|
| 414 |
try:
|
| 415 |
with _open_maybe_gzip(CSV_BI) as f:
|
| 416 |
rd = csv.DictReader(f)
|
| 417 |
flds=set(rd.fieldnames or [])
|
| 418 |
+
ES_COL = "source_es" if "source_es" in flds else "es_surface" if "es_surface" in flds else "es"
|
| 419 |
+
NI_COL = "target_ni" if "target_ni" in flds else "ni_surface" if "ni_surface" in flds else "ni"
|
| 420 |
+
IDCOL = "pair_id" if "pair_id" in flds else "id" if "id" in flds else None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 421 |
FLAGCOL = None
|
| 422 |
for cand in FLAG_COLNAMES:
|
| 423 |
if cand in flds:
|
| 424 |
FLAGCOL = cand; break
|
|
|
|
| 425 |
|
| 426 |
+
base_rows = []
|
| 427 |
for r in rd:
|
| 428 |
+
es_orig = (r.get(ES_COL) or "").strip()
|
| 429 |
+
ni_orig = (r.get(NI_COL) or "").strip()
|
| 430 |
+
if not (es_orig and ni_orig): continue
|
| 431 |
+
pid = (r.get(IDCOL) or "").strip() if IDCOL else ""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
if not pid: empty_pid += 1
|
| 433 |
else: pid_seen.add(pid)
|
| 434 |
flags = (r.get(FLAGCOL) or "") if FLAGCOL else ""
|
| 435 |
|
| 436 |
+
es = lower(es_orig)
|
| 437 |
+
ni = lower(ni_orig)
|
| 438 |
+
|
| 439 |
+
# Frases
|
| 440 |
+
if " " in es:
|
| 441 |
+
if es not in ESPHRASE2NI: # determinista: primera manda
|
| 442 |
+
ESPHRASE2NI[es] = (ni_orig, pid)
|
| 443 |
+
if " " in ni:
|
| 444 |
+
if ni not in NIPHRASE2ES:
|
| 445 |
+
NIPHRASE2ES[ni] = (es_orig, pid)
|
| 446 |
+
|
| 447 |
+
# ES→NI (determinista: primera fila gana)
|
| 448 |
+
if es in ES2NI:
|
| 449 |
+
dup_es += 1
|
| 450 |
+
else:
|
| 451 |
+
ES2NI[es] = (ni_orig, pid)
|
| 452 |
+
|
| 453 |
+
# NI→ES (determinista + bloqueo de ambigüedad)
|
| 454 |
+
if ni in NI2ES:
|
| 455 |
+
dup_ni += 1
|
| 456 |
+
# registra ambigüedad
|
| 457 |
+
s = AMBIG_NI.get(ni, set())
|
| 458 |
+
s.add(NI2ES[ni][0]); s.add(es_orig)
|
| 459 |
+
AMBIG_NI[ni] = s
|
| 460 |
+
if STRICT_BI_ENFORCE:
|
| 461 |
+
NI2ES.pop(ni, None) # invalida la superficie NI conflictiva
|
| 462 |
+
else:
|
| 463 |
+
if STRICT_BI_ENFORCE and ni in AMBIG_NI:
|
| 464 |
+
# ya marcada ambigua: no insertar
|
| 465 |
+
pass
|
| 466 |
+
else:
|
| 467 |
+
NI2ES[ni] = (es_orig, pid)
|
| 468 |
+
|
| 469 |
+
base_rows.append((es_orig, ni_orig, pid, flags))
|
| 470 |
rows += 1
|
| 471 |
|
| 472 |
+
# Expansiones deterministas (solo añaden ES2NI; NO tocan NI2ES)
|
| 473 |
if EXPANSION_ENABLE:
|
| 474 |
+
for es_orig, ni_orig, pid, flags in base_rows:
|
| 475 |
if not flags: continue
|
| 476 |
if _has_flag(flags, FLAG_PLURAL):
|
| 477 |
+
pl = _pluralize_es_form(es_orig)
|
| 478 |
pl_key = lower(pl)
|
| 479 |
if pl_key not in ES2NI:
|
| 480 |
ES2NI[pl_key] = (ni_orig, pid)
|
|
|
|
| 481 |
if _has_flag(flags, FLAG_3PL):
|
| 482 |
+
p3 = _present_3pl_from_3sg(es_orig)
|
| 483 |
p3_key = lower(p3)
|
| 484 |
if p3_key not in ES2NI:
|
| 485 |
ES2NI[p3_key] = (ni_orig, pid)
|
|
|
|
| 486 |
|
| 487 |
+
# Diagnóstico asimetrías (no afecta determinismo)
|
| 488 |
for es_low, (ni_surf, _) in ES2NI.items():
|
| 489 |
ni_low = lower(ni_surf)
|
| 490 |
back = NI2ES.get(ni_low)
|
|
|
|
| 504 |
|
| 505 |
print(f"✓ BI-ONLY ESTRICTO cargado: {rows:,} filas.")
|
| 506 |
if dup_es: print(f"[AVISO] {dup_es:,} duplicados ES (se usó la primera).")
|
| 507 |
+
if dup_ni: print(f"[AVISO] {dup_ni:,} duplicados NI (bloqueados en modo estricto).")
|
| 508 |
if empty_pid: print(f"[AVISO] {empty_pid:,} filas sin pair_id.")
|
| 509 |
if mismatch_backmap:
|
| 510 |
print(f"[ALERTA] {mismatch_backmap:,} asimetrías ES↔NI (misma NI apunta a otro ES).")
|
|
|
|
| 517 |
)
|
| 518 |
sam_html = f"<details><summary>Muestras</summary><ul>{sam_rows}</ul></details>"
|
| 519 |
|
| 520 |
+
ambN = sum(len(v) > 1 for v in AMBIG_NI.values())
|
| 521 |
+
ambList = ", ".join(f"{k}→{sorted(list(v))[:3]}" for k,v in list(AMBIG_NI.items())[:5])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 522 |
|
| 523 |
BI_DIAG_HTML = f"""
|
| 524 |
<div style="font-family:Georgia,serif">
|
|
|
|
| 526 |
Archivo: <b>{escape(CSV_BI)}</b><br>
|
| 527 |
Filas base (CSV): <b>{rows:,}</b><br>
|
| 528 |
ES únicas (tras expansiones): <b>{es_unique:,}</b> | NI únicas: <b>{ni_unique:,}</b> | pair_id únicos: <b>{pid_unique:,}</b><br>
|
| 529 |
+
Duplicados ES: <b>{dup_es:,}</b> | Duplicados NI: <b>{dup_ni:,}</b> (bloqueados en estricto) | Sin pair_id: <b>{empty_pid:,}</b><br>
|
| 530 |
+
Asimetrías ES↔NI: <b>{mismatch_backmap:,}</b>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 531 |
{sam_html}
|
| 532 |
+
<hr style="border:0;border-top:1px solid #caa">
|
| 533 |
+
<small>NI ambiguas bloqueadas: <b>{ambN:,}</b>{(' · ej.: ' + escape(ambList)) if ambN else ''}</small><br>
|
| 534 |
+
<small>Regla: el motor usa <b>sólo</b> tablas 1:1; NI duplicadas se bloquean y se muestran como <code>[AMB-NI:...]</code>.</small>
|
| 535 |
</div>
|
| 536 |
"""
|
| 537 |
return rows > 0
|
|
|
|
| 544 |
"""Devuelve (span, surface) si hay frase que comience en i."""
|
| 545 |
if not phrase_map: return (0, None)
|
| 546 |
max_span = 0; surface = None
|
| 547 |
+
# determinista: prioriza el span más largo
|
| 548 |
for span in range(1, MAX_NGRAM+1):
|
| 549 |
if i+span > len(tokens): break
|
| 550 |
cand = " ".join(lower(t) for t in tokens[i:i+span])
|
|
|
|
| 586 |
|
| 587 |
return "".join(out)
|
| 588 |
|
| 589 |
+
# ✦ FIX: no re-espaciar horas/decimales y no añadir espacios tras “:”/“,”
|
| 590 |
# Two alternatives, tried in order at each position:
#   1) a dot sitting between two digits (decimal/thousands separator) — matched
#      so it is consumed and left untouched (group 1 stays None);
#   2) a sentence mark (group 1) glued to a following character that is neither
#      whitespace nor another closing mark/quote — this one gets a space.
_ES_GLUED_MARK_RE = re.compile(
    r"(?<=\d)\.(?=\d)"
    r"|([?.!;])(?=[^\s.,;:!?…)\]}»”’\"'])"
)


def postprocess_spanish(s: str) -> str:
    """Normalize spacing around punctuation in a Spanish output string.

    Steps, in order:
      1. Compact times and decimals (``18 : 30`` -> ``18:30``,
         ``12 , 75`` -> ``12,75``).
      2. Remove whitespace before ``, . ; : ! ?``.
      3. Insert one space after ``. ! ? ;`` when a word is glued to it, but
         never inside a number (``3.1415``) and never between consecutive
         marks (``...``, ``?!``) or before a closing bracket/quote.
      4. Remove whitespace after ``¿`` / ``¡``.
      5. Collapse repeated whitespace and strip the ends.
      6. Hand the result to ``sentence_case_spanish`` (defined elsewhere in
         this module) for sentence-initial capitalization.

    Args:
        s: detokenized Spanish text.

    Returns:
        The normalized string as returned by ``sentence_case_spanish``.
    """
    # 1) compact times and decimals
    s = re.sub(r"(\d)\s*:\s*(\d)", r"\1:\2", s)
    s = re.sub(r"(\d)\s*([.,])\s*(\d)", r"\1\2\3", s)
    # 2) nothing before closing marks
    s = re.sub(r"\s+([,.;:!?])", r"\1", s)
    # 3) space after a glued sentence mark; the previous blanket rule also
    #    fired on "3.1415" (-> "3. 1415"), "..." and "?!", undoing step 1.
    s = _ES_GLUED_MARK_RE.sub(
        lambda m: m.group(0) if m.group(1) is None else m.group(1) + " ",
        s,
    )
    # 4) inverted openers hug the next word
    s = re.sub(r"([¿¡])\s+", r"\1", s)
    # 5) collapse whitespace
    s = re.sub(r"\s{2,}", " ", s).strip()
    # 6) sentence-initial capitalization
    return sentence_case_spanish(s)
|
| 604 |
|
| 605 |
# ====== Traducción BI estricta ======
|
| 606 |
def translate_es_to_ni_bi(text:str):
|
|
|
|
| 666 |
if key in NI2ES:
|
| 667 |
es = NI2ES[key][0] or ""
|
| 668 |
out.append(es if es else f"[?:{t}]")
|
| 669 |
+
elif key in AMBIG_NI and STRICT_BI_ENFORCE:
|
| 670 |
+
# ★ determinista: no elegimos al azar superficies NI con colisión
|
| 671 |
+
out.append(f"[AMB-NI:{t}]")
|
| 672 |
elif is_number(key):
|
| 673 |
out.append(t)
|
| 674 |
else:
|
|
|
|
| 688 |
return "<em>Introduce texto para diagnosticar.</em>"
|
| 689 |
|
| 690 |
toks = simple_tokenize(text)
|
| 691 |
+
unknown=set(); asym=set(); amb=set()
|
| 692 |
total_tokens=0; covered=0
|
| 693 |
|
| 694 |
if dir_label.startswith("ES"):
|
|
|
|
| 723 |
if span > 1:
|
| 724 |
covered += 1; i += span; continue
|
| 725 |
k=lower(t)
|
| 726 |
+
if k in AMBIG_NI:
|
| 727 |
+
amb.add(t); i+=1; continue
|
| 728 |
if k not in NI2ES:
|
| 729 |
unknown.add(t); i+=1; continue
|
| 730 |
covered += 1
|
|
|
|
| 738 |
cov_html = f"<div><b>Tokens (sin puntuación/numéricos):</b> {total_tokens} | <b>Cubiertos:</b> {covered} ({cov_pct:.1f}%)</div>"
|
| 739 |
|
| 740 |
unk_html = "".join(f"<li><code>{escape(u)}</code></li>" for u in sorted(unknown, key=lambda x: lower(x))) or "<li><i>—</i></li>"
|
| 741 |
+
amb_html = "".join(f"<li><code>{escape(a)}</code></li>" for a in sorted(amb, key=lambda x: lower(x))) or "<li><i>—</i></li>"
|
| 742 |
asy_html = "".join(f"<li><code>{escape(a)}</code></li>" for a in sorted(asym)) or "<li><i>—</i></li>"
|
| 743 |
|
| 744 |
+
return f"<b>Diagnóstico {head}</b>{cov_html}<b>Ambiguas (NI duplicada):</b><ul>{amb_html}</ul><b>Faltantes:</b><ul>{unk_html}</ul><b>Asimetrías:</b><ul>{asy_html}</ul>"
|
| 745 |
|
| 746 |
# ====== UI (CSS / acordeones / fuentes) ======
|
| 747 |
LABELS={
|
| 748 |
"ES":{
|
| 749 |
"title":"Traductor Español ↔ Neoíbero",
|
| 750 |
+
"subtitle":"CSV estricto (BI-only 1:1; sin heurísticas; .gz) — determinista",
|
| 751 |
"in_label_es":"✏️ Entrada (Español)",
|
| 752 |
"in_label_ni":"✏️ Entrada (Neoíbero)",
|
| 753 |
"in_ph_es":"Escribe aquí. Ej.: Veo a Ana y doy pan a Marta.",
|
|
|
|
| 777 |
},
|
| 778 |
"EN":{
|
| 779 |
"title":"Spanish ↔ Neo-Iberian Translator",
|
| 780 |
+
"subtitle":"Strict BI-only (1:1 surfaces; no heuristics; .gz) — deterministic",
|
| 781 |
"in_label_es":"✏️ Input (Spanish)",
|
| 782 |
"in_label_ni":"✏️ Input (Neo-Iberian)",
|
| 783 |
"in_ph_es":"Type here. E.g., Veo a Ana y doy pan a Marta.",
|
|
|
|
| 795 |
"🎓 Background & design choices",
|
| 796 |
"🏛️ Possible inheritance from ancient Iberian",
|
| 797 |
"🎨 Conlang design (Neo-Iberian)",
|
| 798 |
+
"⚙️ Translator pipeline (strict 1:1)",
|
| 799 |
"🔤 Orthography, Iberian line & keys",
|
| 800 |
"❓/❗ Vascoid modality (-na / -ba)",
|
| 801 |
"🧩 CSV-driven expansions: plurals (S) & 3pl (3/V3)",
|
|
|
|
| 811 |
"**Escritura y datos.** Un **único CSV con `pair_id`** y superficies exactas. La traducción ES↔NI es **1:1** por superficie.",
|
| 812 |
"**Herencia plausible del íbero.** Fonotaxis CV(C); p→b; r/ŕ; casos -k/-te/-ka/-ar/-en/-i.",
|
| 813 |
"**Diseño del neoíbero.** TAM: PRS -ke, PST -bo, FUT -ta, IPFV -ri, IMP -tu, COND/SBJV -ni, FUT_SBJV -ra.",
|
| 814 |
+
"**Pipeline (BI-estricto 1:1).** Tokeniza; sustitución exacta; NI ambigua **se bloquea** y sale como `[AMB-NI:…]`.",
|
| 815 |
"**Ortografía y línea ibérica.** Tokens BA/BE/…; tridots '/'; p→b; codas N/S/Ś/R/Ŕ/L/M/K/T.",
|
| 816 |
+
"**Modalidad (-na/-ba).** ES→NI puede omitir ¿?¡! (si está activo). NI→ES inserta `¿…?`/`¡…!` al final de la oración marcada, **no en comas**.",
|
| 817 |
+
"**Expansiones por CSV (deterministas).** `flags=S` plural regular; `flags=3|V3` 3ª plural del presente. Solo si lo marcas.",
|
| 818 |
"**Gramática mínima.** Visualización; la gramática no se “calcula”.",
|
| 819 |
"**Bibliografía.** Untermann; de Hoz; Ferrer i Jané; Correa…",
|
| 820 |
+
"**Glosario & datasets.** Faltas → `[SIN-LEX:…]` / `[?:…]`. Ambiguas → `[AMB-NI:…]` (limpia tu CSV).",
|
| 821 |
"**Simetría por pair_id.** El diagnóstico avisa si una NI apunta a dos ES distintos."
|
| 822 |
],
|
| 823 |
"EN":[
|
| 824 |
+
"One bilingual CSV with `pair_id` and exact surfaces. ES↔NI is strictly 1:1.",
|
| 825 |
"Possible inheritance (non-palaeographic).",
|
| 826 |
"Neo-Iberian design (phonology & morphology).",
|
| 827 |
+
"Pipeline: tokenise → exact replacement. Ambiguous NI are **blocked** and rendered as `[AMB-NI:…]`.",
|
| 828 |
"Orthography, Iberian line & keys.",
|
| 829 |
+
"Modality (-na/-ba): ES→NI can drop ¿?¡!. NI→ES places them at sentence end, not at commas.",
|
| 830 |
+
"CSV-driven expansions (deterministic): `S` plural; `3|V3` present 3pl.",
|
| 831 |
"Minimal grammar (v1.2).",
|
| 832 |
"Selected references.",
|
| 833 |
"Glossary & datasets.",
|
| 834 |
+
"Pair-id symmetry diagnostics."
|
| 835 |
]
|
| 836 |
}
|
| 837 |
|
|
|
|
| 1022 |
|
| 1023 |
# ====== smoke opcional ======
|
| 1024 |
def _symmetry_smoketest():
|
| 1025 |
+
print("\n[SMOKE] Prueba ES↔NI (BI-estricto, determinista)…")
|
| 1026 |
probes = [
|
| 1027 |
+
"nuker-ke ni etxe-ka ?",
|
| 1028 |
+
"¿Pagaste 12,75 en la cafetería?",
|
| 1029 |
+
"Marta llega a las 18:30.",
|
| 1030 |
+
"[SIN-LEX:Tomás]-na euŕak-ke !"
|
| 1031 |
]
|
| 1032 |
for p in probes:
|
| 1033 |
es_from_ni = translate_ni_to_es_bi(p)
|
|
|
|
| 1044 |
demo.queue().launch()
|
| 1045 |
|
| 1046 |
|
|
|
|
|
|