Update app.py
Browse files
app.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
# app.py — Universal Conlang Translator (Max Compresión Exacta)
|
| 2 |
# Archivos requeridos en la raíz:
|
| 3 |
-
#
|
| 4 |
-
#
|
| 5 |
-
#
|
| 6 |
#
|
| 7 |
# requirements.txt (para HF Spaces):
|
| 8 |
# gradio>=4.36.0
|
|
@@ -15,8 +15,8 @@ from typing import Dict, Optional, List, Any
|
|
| 15 |
import gradio as gr
|
| 16 |
|
| 17 |
# ------------ Archivos esperados ------------
|
| 18 |
-
LEX_MINI
|
| 19 |
-
LEX_KOMI
|
| 20 |
LEX_MASTER = "lexicon_master.json"
|
| 21 |
|
| 22 |
# ------------ Normalización ------------
|
|
@@ -56,13 +56,9 @@ def load_lexicons():
|
|
| 56 |
|
| 57 |
mini2en = {v:k for k,v in en2mini.items()}
|
| 58 |
komi2en = {v:k for k,v in en2komi.items()}
|
| 59 |
-
return (es2mini, es2komi, mini2es, komi2es,
|
| 60 |
-
en2mini, en2komi, mini2en, komi2en,
|
| 61 |
-
es2en_lemma, en2es_lemma, master)
|
| 62 |
|
| 63 |
-
(ES2MINI, ES2KOMI, MINI2ES, KOMI2ES,
|
| 64 |
-
EN2MINI, EN2KOMI, MINI2EN, KOMI2EN,
|
| 65 |
-
ES2EN_LEMMA, EN2ES_LEMMA, MASTER_OBJ) = load_lexicons()
|
| 66 |
|
| 67 |
# ------------ Pronombres ------------
|
| 68 |
PRON_ES = {"yo","tú","vos","usted","él","ella","nosotros","vosotros","ustedes","ellos","ellas","me","te","se","nos","os"}
|
|
@@ -86,17 +82,11 @@ def from_custom_b64(s: str, alphabet: str) -> bytes:
|
|
| 86 |
trans = str.maketrans(alphabet, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/")
|
| 87 |
std = s.translate(trans); pad = "=" * ((4 - len(std) % 4) % 4)
|
| 88 |
return base64.b64decode(std + pad)
|
| 89 |
-
|
| 90 |
-
# --- PATCH: evitar OOV vacío ---
|
| 91 |
-
def enc_oov_minimax(token: str) -> str:
|
| 92 |
-
t = token if token else "_" # evita cadena vacía
|
| 93 |
-
return "~" + to_custom_b64(t.encode("utf-8"), ALPHA_MINI64)
|
| 94 |
def dec_oov_minimax(code: str) -> str:
|
| 95 |
try: return from_custom_b64(code[1:], ALPHA_MINI64).decode("utf-8")
|
| 96 |
except Exception: return code
|
| 97 |
-
def enc_oov_komin(token: str) -> str:
|
| 98 |
-
t = token if token else "_" # evita cadena vacía
|
| 99 |
-
return "「" + to_custom_b64(t.encode("utf-8"), ALPHA_CJK64) + "」"
|
| 100 |
def dec_oov_komin(code: str) -> str:
|
| 101 |
try: return from_custom_b64(code[1:-1], ALPHA_CJK64).decode("utf-8")
|
| 102 |
except Exception: return code
|
|
@@ -114,18 +104,11 @@ try:
|
|
| 114 |
except Exception:
|
| 115 |
nlp_es = nlp_en = None
|
| 116 |
|
| 117 |
-
# --- PATCH: lemma_of siempre devuelve algo no vacío ---
|
| 118 |
def lemma_of(tok, src_lang: str) -> str:
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
return
|
| 123 |
-
# Fallback 1: normalizar el texto original
|
| 124 |
-
txt = norm_es(getattr(tok, "text", "")) if src_lang == "Español" else norm_en(getattr(tok, "text", ""))
|
| 125 |
-
if txt:
|
| 126 |
-
return txt
|
| 127 |
-
# Fallback 2: verbo seguro para no dejar vacío
|
| 128 |
-
return "ser" if src_lang == "Español" else "be"
|
| 129 |
|
| 130 |
# ------------ Detección simple y helpers ------------
|
| 131 |
def detect_polarity(doc) -> bool: return "?" in getattr(doc,"text","")
|
|
@@ -202,14 +185,10 @@ def code_en(lemma: str, target: str) -> str:
|
|
| 202 |
TAM_MINI = {"Pres":"P","Past":"T","Fut":"F","UNK":"P"}
|
| 203 |
TAM_KOMI = {"Pres":"Ⓟ","Past":"Ⓣ","Fut":"Ⓕ","UNK":"Ⓟ"}
|
| 204 |
|
| 205 |
-
def realize_minimax(doc, src_lang: str, drop_articles=True, zero_copula=True,
|
| 206 |
-
semi_lossless=False, person_hint="2s", remove_pronouns=False):
|
| 207 |
root, subs, objs, obls, advs = extract_core(doc)
|
| 208 |
tense = detect_tense(root); is_q, is_neg = detect_polarity(doc), detect_neg(doc)
|
| 209 |
-
vlem
|
| 210 |
-
# --- PATCH: si vlem viene vacío, fallback a verbo seguro ---
|
| 211 |
-
if not vlem:
|
| 212 |
-
vlem = "ser" if src_lang == "Español" else "be"
|
| 213 |
vcode = code_es(vlem, "Minimax-ASCII") if src_lang=="Español" else code_en(vlem, "Minimax-ASCII")
|
| 214 |
tail = TAM_MINI.get(tense, "P")
|
| 215 |
if semi_lossless: tail += (detect_person(root, src_lang) or person_hint)
|
|
@@ -236,14 +215,10 @@ def realize_minimax(doc, src_lang: str, drop_articles=True, zero_copula=True,
|
|
| 236 |
parts = S+O+ADV if (zero_copula and not semi_lossless and vlem in ("ser","estar","be") and tense=="Pres" and not is_neg and not is_q) else [vcode]+S+O+ADV
|
| 237 |
return " ".join(p for p in parts if p)
|
| 238 |
|
| 239 |
-
def realize_komin(doc, src_lang: str, drop_articles=True, zero_copula=True,
|
| 240 |
-
semi_lossless=False, person_hint="2s", remove_pronouns=False):
|
| 241 |
root, subs, objs, obls, advs = extract_core(doc)
|
| 242 |
tense, is_q, is_neg = detect_tense(root), detect_polarity(doc), detect_neg(doc)
|
| 243 |
-
vlem
|
| 244 |
-
# --- PATCH: si vlem viene vacío, fallback a verbo seguro ---
|
| 245 |
-
if not vlem:
|
| 246 |
-
vlem = "ser" if src_lang == "Español" else "be"
|
| 247 |
vcode = code_es(vlem, "Kōmín-CJK") if src_lang=="Español" else code_en(vlem, "Kōmín-CJK")
|
| 248 |
P_SUBJ, P_OBJ = "ᵖ", "ᵒ"; Q_FIN = "?"
|
| 249 |
TAM = TAM_KOMI.get(tense,"Ⓟ")
|
|
@@ -270,7 +245,7 @@ def realize_komin(doc, src_lang: str, drop_articles=True, zero_copula=True,
|
|
| 270 |
return out
|
| 271 |
|
| 272 |
# ------------ Sidecars (compresión exacta) ------------
|
| 273 |
-
SIDECAR_B85_RE = re.compile(r"\s?§\((?P<b85>[A-Za-z0-9!#$%&()*+\-;<=>?@^_
|
| 274 |
def b85_enc_raw(s: str) -> str: return base64.a85encode(zlib.compress(s.encode("utf-8"), 9), adobe=False).decode("ascii")
|
| 275 |
def b85_dec_raw(b85s: str) -> str: return zlib.decompress(base64.a85decode(b85s.encode("ascii"), adobe=False)).decode("utf-8")
|
| 276 |
def attach_sidecar_b85(conlang_text: str, original_text: str) -> str: return f"{conlang_text} §({b85_enc_raw(original_text)})"
|
|
@@ -426,9 +401,7 @@ def _en_conj(lemma, tense, person):
|
|
| 426 |
return lemma
|
| 427 |
|
| 428 |
# ================= Helper de construcción/translate =================
|
| 429 |
-
def _build_with_spacy(text: str, src_lang: str, target: str,
|
| 430 |
-
drop_articles: bool, zero_copula: bool,
|
| 431 |
-
semi_lossless: bool, remove_pronouns: bool) -> str:
|
| 432 |
nlp = nlp_es if src_lang=="Español" else nlp_en
|
| 433 |
doc = nlp(text)
|
| 434 |
if target == "Minimax-ASCII":
|
|
@@ -436,19 +409,15 @@ def _build_with_spacy(text: str, src_lang: str, target: str,
|
|
| 436 |
else:
|
| 437 |
return realize_komin(doc, src_lang, drop_articles, zero_copula, semi_lossless, remove_pronouns=remove_pronouns)
|
| 438 |
|
| 439 |
-
def build_sentence(text: str, src_lang: str, target: str,
|
| 440 |
-
drop_articles: bool, zero_copula: bool,
|
| 441 |
-
mode: str, max_comp_exact: bool = False, remove_pronouns: bool = False) -> str:
|
| 442 |
if not text.strip(): return ""
|
| 443 |
-
semi = True
|
| 444 |
core = _build_with_spacy(text, src_lang, target, drop_articles, zero_copula and not semi, semi, remove_pronouns) if USE_SPACY else encode_simple(text, src_lang, target)
|
| 445 |
if max_comp_exact:
|
| 446 |
return custom_sidecar_enc(core, text)
|
| 447 |
return core
|
| 448 |
|
| 449 |
-
def universal_translate(text: str, src: str, tgt: str,
|
| 450 |
-
drop_articles: bool, zero_copula: bool,
|
| 451 |
-
mode: str, max_comp_exact: bool = False, remove_pronouns: bool = False) -> str:
|
| 452 |
if not text.strip(): return ""
|
| 453 |
if src == tgt: return text
|
| 454 |
|
|
@@ -499,7 +468,7 @@ ALL_LANGS = ["Español","English","Minimax-ASCII","Kōmín-CJK"]
|
|
| 499 |
EXPLAIN_TAB_TRANSLATE_ES = """
|
| 500 |
**¿Qué hace “Traducir”?**
|
| 501 |
Convierte lo que escribes en **Texto** al **Destino** que elijas (ES/EN/Minimax/Kōmín).
|
| 502 |
-
- Con **Máx. Compresión Exacta**, añade un final
|
| 503 |
- Las casillas de **compactación** (artículos, cópula, pronombres) **sólo se aplican si el Destino es conlang**.
|
| 504 |
"""
|
| 505 |
EXPLAIN_TAB_BUILD_ES = """
|
|
@@ -509,7 +478,7 @@ Obliga a que la salida sea **Minimax** o **Kōmín** (desde ES/EN). Aplica el or
|
|
| 509 |
EXPLAIN_TAB_DECODE_ES = """
|
| 510 |
**¿Qué hace “Decodificar (Conlang → ES/EN)”?**
|
| 511 |
Convierte de **Minimax/Kōmín** a **Español/Inglés**.
|
| 512 |
-
- Si el texto trae
|
| 513 |
- Si no, reconstruimos lo más fiel posible con el **diccionario**.
|
| 514 |
"""
|
| 515 |
EXPLAIN_TAB_ROUNDTRIP_ES = """
|
|
@@ -522,28 +491,28 @@ EXPLAIN_CHECKBOX_ES = """
|
|
| 522 |
- **Omitir artículos** (*el/la/los/las*; *a/an/the*): ahorro típico **~10–15%**.
|
| 523 |
- **Cópula cero** (presente afirmativo): oculta *ser/estar/be* → **~5–10%** extra.
|
| 524 |
- **Quitar pronombres**: suprime pronombres obvios → ahorro **variable**.
|
| 525 |
-
- **Máx. Compresión Exacta**: añade
|
| 526 |
**Guía rápida:** sin casillas **0%**; artículos+cópula **~15–20%**.
|
| 527 |
"""
|
| 528 |
# ¿Qué son los lenguajes?
|
| 529 |
EXPLAIN_CONLANGS_ES = """
|
| 530 |
**¿Qué son Minimax-ASCII y Kōmín-CJK?**
|
| 531 |
-
- **Minimax-ASCII**: versión compacta que usa sólo caracteres comunes (ASCII). Añade marcas como
|
| 532 |
-
- **Kōmín-CJK**: versión visual con partículas (ej.: sujeto
|
| 533 |
-
Ambos son “**conlangs**” pensados para **ahorrar espacio** y permitir **decodificación** a ES/EN (exacta si hay
|
| 534 |
"""
|
| 535 |
|
| 536 |
# EN
|
| 537 |
-
EXPLAIN_TAB_TRANSLATE_EN = "Converts **Text → Target** (ES/EN/Minimax/Kōmín). With **Max Exact**, adds
|
| 538 |
EXPLAIN_TAB_BUILD_EN = "Forces **conlang output** (Minimax/Kōmín) from ES/EN, applying phrasing rules and compaction options."
|
| 539 |
-
EXPLAIN_TAB_DECODE_EN = "Converts **Minimax/Kōmín → ES/EN**. If
|
| 540 |
EXPLAIN_TAB_ROUNDTRIP_EN = "Runs **(ES/EN→Conlang)→(Conlang→ES/EN)** to verify reversibility; with exact, it’s bit-for-bit."
|
| 541 |
EXPLAIN_CHECKBOX_EN = "Drop articles ~10–15%, Zero copula ~5–10% extra, Remove pronouns variable, Max Exact 40–60% for >100 chars."
|
| 542 |
EXPLAIN_CONLANGS_EN = """
|
| 543 |
**What are Minimax-ASCII and Kōmín-CJK?**
|
| 544 |
-
- **Minimax-ASCII**: compact ASCII codes with
|
| 545 |
-
- **Kōmín-CJK**: visual style using particles (subject
|
| 546 |
-
Both are conlangs for **space-saving** and **decoding** back to ES/EN (bit-perfect when
|
| 547 |
"""
|
| 548 |
|
| 549 |
# Léxico (amigable)
|
|
@@ -553,9 +522,9 @@ LEXICON_FRIENDLY_ES = """
|
|
| 553 |
- Limpiamos y ordenamos por **frecuencia de uso**.
|
| 554 |
- Asignamos un **código corto** a cada lema para **Minimax** y para **Kōmín**.
|
| 555 |
- Guardamos tres archivos que la app usa al traducir:
|
| 556 |
-
-
|
| 557 |
-
-
|
| 558 |
-
-
|
| 559 |
**Así** podemos convertir tus frases en **códigos compactos** y volver a texto entendible.
|
| 560 |
"""
|
| 561 |
LEXICON_FRIENDLY_EN = "We use **WordNet (OMW)**, pair ES words with EN, sort by frequency, assign short codes (Minimax/Kōmín), and save three JSONs so the app can encode/decode compactly."
|
|
@@ -574,7 +543,7 @@ def compaction_line_es(text, src, tgt, drop, zero, rm, maxc) -> str:
|
|
| 574 |
msg = f"**Base (sin casillas):** {_pct_comp(text, base):.1f}% · **Con tus opciones:** {_pct_comp(text, curr):.1f}%"
|
| 575 |
if maxc:
|
| 576 |
curr_exact = build_sentence(text, src, tgt, drop, zero, "Semi-lossless", True, rm)
|
| 577 |
-
msg += f" · **Con sidecar
|
| 578 |
return msg
|
| 579 |
|
| 580 |
def compaction_line_en(text, src, tgt, drop, zero, rm, maxc) -> str:
|
|
@@ -586,7 +555,7 @@ def compaction_line_en(text, src, tgt, drop, zero, rm, maxc) -> str:
|
|
| 586 |
msg = f"**Base (no options):** {_pct_comp(text, base):.1f}% · **With your options:** {_pct_comp(text, curr):.1f}%"
|
| 587 |
if maxc:
|
| 588 |
curr_exact = build_sentence(text, src, tgt, drop, zero, "Semi-lossless", True, rm)
|
| 589 |
-
msg += f" · **With
|
| 590 |
return msg
|
| 591 |
|
| 592 |
def master_preview(n: int = 20) -> List[List[Any]]:
|
|
@@ -608,14 +577,12 @@ def make_panel_translate(lang="ES"):
|
|
| 608 |
with gr.Row():
|
| 609 |
src = gr.Dropdown(ALL_LANGS, value=("Español" if lang=="ES" else "English"), label=("Fuente" if lang=="ES" else "Source"))
|
| 610 |
tgt = gr.Dropdown(ALL_LANGS, value="Minimax-ASCII", label=("Destino" if lang=="ES" else "Target"))
|
| 611 |
-
text = gr.Textbox(lines=3, label=("Texto" if lang=="ES" else "Text"),
|
| 612 |
-
placeholder=("Ej.: Hola, ¿cómo estás?" if lang=="ES" else "e.g., Hello, how are you?"),
|
| 613 |
-
show_copy_button=True)
|
| 614 |
with gr.Row():
|
| 615 |
-
drop = gr.Checkbox(True,
|
| 616 |
zero = gr.Checkbox(False, label=("Cópula cero (presente afirm.)" if lang=="ES" else "Zero copula (present affirmative)"))
|
| 617 |
rmpr = gr.Checkbox(False, label=("Quitar pronombres" if lang=="ES" else "Remove pronouns"))
|
| 618 |
-
exact = gr.Checkbox(False, label=("Máx. Compresión Exacta (sidecar
|
| 619 |
mode_hidden = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
|
| 620 |
out = gr.Textbox(lines=6, label=("Traducción" if lang=="ES" else "Translation"), show_copy_button=True)
|
| 621 |
comp = gr.Markdown("")
|
|
@@ -660,7 +627,7 @@ def make_panel_decode(lang="ES"):
|
|
| 660 |
with gr.Row():
|
| 661 |
src = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label=("Fuente" if lang=="ES" else "Source"))
|
| 662 |
tgt = gr.Dropdown(["Español","English"], value=("Español" if lang=="ES" else "English"), label=("Destino" if lang=="ES" else "Target"))
|
| 663 |
-
text = gr.Textbox(lines=3, label=("Texto en conlang (puede incluir
|
| 664 |
out = gr.Textbox(lines=6, label=("Salida" if lang=="ES" else "Output"), show_copy_button=True)
|
| 665 |
def run(t, s, d):
|
| 666 |
if not t.strip(): return ""
|
|
@@ -709,14 +676,14 @@ with gr.Blocks(title="Universal Conlang Translator", theme=gr.themes.Soft()) as
|
|
| 709 |
with acc_modes_es: gr.Markdown(
|
| 710 |
"- **🔁 Traducir**: Texto → Destino (ES/EN/Minimax/Kōmín), con opciones de compactación y % mostrado.\n"
|
| 711 |
"- **🛠️ Construir**: Obliga salida en conlang (Minimax/Kōmín) desde ES/EN.\n"
|
| 712 |
-
"- **🗝️ Decodificar**: Conlang → ES/EN (si hay
|
| 713 |
"- **🔄 Prueba ida→vuelta**: Comprueba reversibilidad."
|
| 714 |
)
|
| 715 |
acc_modes_en = gr.Accordion("📖 What does each button / mode do? (EN)", open=False, visible=False)
|
| 716 |
with acc_modes_en: gr.Markdown(
|
| 717 |
"- **🔁 Translate**: Text → Target (ES/EN/Minimax/Kōmín) with compaction and %.\n"
|
| 718 |
"- **🛠️ Build**: Force conlang output from ES/EN.\n"
|
| 719 |
-
"- **🗝️ Decode**: Conlang → ES/EN (if
|
| 720 |
"- **🔄 Round-trip**: Check reversibility."
|
| 721 |
)
|
| 722 |
|
|
@@ -847,3 +814,4 @@ if __name__ == "__main__":
|
|
| 847 |
|
| 848 |
|
| 849 |
|
|
|
|
|
|
| 1 |
# app.py — Universal Conlang Translator (Max Compresión Exacta)
|
| 2 |
# Archivos requeridos en la raíz:
|
| 3 |
+
# - lexicon_minimax.json
|
| 4 |
+
# - lexicon_komin.json
|
| 5 |
+
# - lexicon_master.json
|
| 6 |
#
|
| 7 |
# requirements.txt (para HF Spaces):
|
| 8 |
# gradio>=4.36.0
|
|
|
|
| 15 |
import gradio as gr
|
| 16 |
|
| 17 |
# ------------ Archivos esperados ------------
|
| 18 |
+
LEX_MINI = "lexicon_minimax.json"
|
| 19 |
+
LEX_KOMI = "lexicon_komin.json"
|
| 20 |
LEX_MASTER = "lexicon_master.json"
|
| 21 |
|
| 22 |
# ------------ Normalización ------------
|
|
|
|
| 56 |
|
| 57 |
mini2en = {v:k for k,v in en2mini.items()}
|
| 58 |
komi2en = {v:k for k,v in en2komi.items()}
|
| 59 |
+
return (es2mini, es2komi, mini2es, komi2es, en2mini, en2komi, mini2en, komi2en, es2en_lemma, en2es_lemma, master)
|
|
|
|
|
|
|
| 60 |
|
| 61 |
+
(ES2MINI, ES2KOMI, MINI2ES, KOMI2ES, EN2MINI, EN2KOMI, MINI2EN, KOMI2EN, ES2EN_LEMMA, EN2ES_LEMMA, MASTER_OBJ) = load_lexicons()
|
|
|
|
|
|
|
| 62 |
|
| 63 |
# ------------ Pronombres ------------
|
| 64 |
PRON_ES = {"yo","tú","vos","usted","él","ella","nosotros","vosotros","ustedes","ellos","ellas","me","te","se","nos","os"}
|
|
|
|
| 82 |
trans = str.maketrans(alphabet, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/")
|
| 83 |
std = s.translate(trans); pad = "=" * ((4 - len(std) % 4) % 4)
|
| 84 |
return base64.b64decode(std + pad)
|
| 85 |
+
def enc_oov_minimax(token: str) -> str:
    """Encode an out-of-vocabulary token as '~' plus custom base64 (Minimax alphabet)."""
    payload = token.encode("utf-8")
    return "~" + to_custom_b64(payload, ALPHA_MINI64)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
def dec_oov_minimax(code: str) -> str:
    """Decode a '~'-prefixed OOV code; on any failure return the input unchanged."""
    try:
        decoded = from_custom_b64(code[1:], ALPHA_MINI64)
        return decoded.decode("utf-8")
    except Exception:
        return code
|
| 89 |
+
def enc_oov_komin(token: str) -> str:
    """Encode an out-of-vocabulary token wrapped in 「…」 using the CJK custom base64 alphabet."""
    encoded = to_custom_b64(token.encode("utf-8"), ALPHA_CJK64)
    return "「" + encoded + "」"
|
|
|
|
|
|
|
| 90 |
def dec_oov_komin(code: str) -> str:
    """Decode a 「…」-wrapped OOV code; on any failure return the input unchanged."""
    try:
        decoded = from_custom_b64(code[1:-1], ALPHA_CJK64)
        return decoded.decode("utf-8")
    except Exception:
        return code
|
|
|
|
| 104 |
except Exception:
|
| 105 |
nlp_es = nlp_en = None
|
| 106 |
|
|
|
|
| 107 |
def lemma_of(tok, src_lang: str) -> str:
    """Return the normalized lemma of *tok* for the given source language.

    Falls back to the token's surface text when spaCy provides no lemma.
    """
    raw = getattr(tok, "lemma_", "") or tok.text
    normalize = norm_es if src_lang == "Español" else norm_en
    return normalize(raw)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
# ------------ Detección simple y helpers ------------
|
| 114 |
def detect_polarity(doc) -> bool:
    """Return True when the document's text contains a question mark."""
    text = getattr(doc, "text", "")
    return text.count("?") > 0
|
|
|
|
| 185 |
TAM_MINI = {"Pres":"P","Past":"T","Fut":"F","UNK":"P"}
|
| 186 |
TAM_KOMI = {"Pres":"Ⓟ","Past":"Ⓣ","Fut":"Ⓕ","UNK":"Ⓟ"}
|
| 187 |
|
| 188 |
+
def realize_minimax(doc, src_lang: str, drop_articles=True, zero_copula=True, semi_lossless=False, person_hint="2s", remove_pronouns=False):
|
|
|
|
| 189 |
root, subs, objs, obls, advs = extract_core(doc)
|
| 190 |
tense = detect_tense(root); is_q, is_neg = detect_polarity(doc), detect_neg(doc)
|
| 191 |
+
vlem = lemma_of(root, src_lang) if USE_SPACY else ("ser" if "?" in getattr(doc,"text","") else "estar")
|
|
|
|
|
|
|
|
|
|
| 192 |
vcode = code_es(vlem, "Minimax-ASCII") if src_lang=="Español" else code_en(vlem, "Minimax-ASCII")
|
| 193 |
tail = TAM_MINI.get(tense, "P")
|
| 194 |
if semi_lossless: tail += (detect_person(root, src_lang) or person_hint)
|
|
|
|
| 215 |
parts = S+O+ADV if (zero_copula and not semi_lossless and vlem in ("ser","estar","be") and tense=="Pres" and not is_neg and not is_q) else [vcode]+S+O+ADV
|
| 216 |
return " ".join(p for p in parts if p)
|
| 217 |
|
| 218 |
+
def realize_komin(doc, src_lang: str, drop_articles=True, zero_copula=True, semi_lossless=False, person_hint="2s", remove_pronouns=False):
|
|
|
|
| 219 |
root, subs, objs, obls, advs = extract_core(doc)
|
| 220 |
tense, is_q, is_neg = detect_tense(root), detect_polarity(doc), detect_neg(doc)
|
| 221 |
+
vlem = lemma_of(root, src_lang) if USE_SPACY else ("ser" if "?" in getattr(doc,"text","") else "estar")
|
|
|
|
|
|
|
|
|
|
| 222 |
vcode = code_es(vlem, "Kōmín-CJK") if src_lang=="Español" else code_en(vlem, "Kōmín-CJK")
|
| 223 |
P_SUBJ, P_OBJ = "ᵖ", "ᵒ"; Q_FIN = "?"
|
| 224 |
TAM = TAM_KOMI.get(tense,"Ⓟ")
|
|
|
|
| 245 |
return out
|
| 246 |
|
| 247 |
# ------------ Sidecars (compresión exacta) ------------
|
| 248 |
+
# Matches an optional space then a trailing "§(<payload>)" sidecar; the named
# group 'b85' captures the payload — presumably the Ascii85 text produced by
# b85_enc_raw (NOTE(review): character class looks Ascii85-like; confirm it
# covers the full alphabet emitted by base64.a85encode).
SIDECAR_B85_RE = re.compile(r"\s?§\((?P<b85>[A-Za-z0-9!#$%&()*+\-;<=>?@^_{|}~]+)\)$")
|
| 249 |
def b85_enc_raw(s: str) -> str:
    """Compress *s* with zlib (level 9) and return it Ascii85-encoded (non-Adobe)."""
    compressed = zlib.compress(s.encode("utf-8"), 9)
    return base64.a85encode(compressed, adobe=False).decode("ascii")
|
| 250 |
def b85_dec_raw(b85s: str) -> str:
    """Inverse of b85_enc_raw: Ascii85-decode (non-Adobe) then zlib-decompress to text."""
    raw = base64.a85decode(b85s.encode("ascii"), adobe=False)
    return zlib.decompress(raw).decode("utf-8")
|
| 251 |
def attach_sidecar_b85(conlang_text: str, original_text: str) -> str:
    """Append a " §(...)" sidecar carrying the compressed original text."""
    sidecar = b85_enc_raw(original_text)
    return conlang_text + " §(" + sidecar + ")"
|
|
|
|
| 401 |
return lemma
|
| 402 |
|
| 403 |
# ================= Helper de construcción/translate =================
|
| 404 |
+
def _build_with_spacy(text: str, src_lang: str, target: str, drop_articles: bool, zero_copula: bool, semi_lossless: bool, remove_pronouns: bool) -> str:
|
|
|
|
|
|
|
| 405 |
nlp = nlp_es if src_lang=="Español" else nlp_en
|
| 406 |
doc = nlp(text)
|
| 407 |
if target == "Minimax-ASCII":
|
|
|
|
| 409 |
else:
|
| 410 |
return realize_komin(doc, src_lang, drop_articles, zero_copula, semi_lossless, remove_pronouns=remove_pronouns)
|
| 411 |
|
| 412 |
+
def build_sentence(text: str, src_lang: str, target: str, drop_articles: bool, zero_copula: bool, mode: str, max_comp_exact: bool = False, remove_pronouns: bool = False) -> str:
    """Encode *text* (ES/EN) into the target conlang.

    Returns "" for blank input. When max_comp_exact is set, the result also
    carries an exact-recovery sidecar produced by custom_sidecar_enc.
    """
    if not text.strip():
        return ""
    # Construction always runs in semi-lossless mode.
    semi = True
    if USE_SPACY:
        core = _build_with_spacy(text, src_lang, target, drop_articles, zero_copula and not semi, semi, remove_pronouns)
    else:
        core = encode_simple(text, src_lang, target)
    return custom_sidecar_enc(core, text) if max_comp_exact else core
|
| 419 |
|
| 420 |
+
def universal_translate(text: str, src: str, tgt: str, drop_articles: bool, zero_copula: bool, mode: str, max_comp_exact: bool = False, remove_pronouns: bool = False) -> str:
|
|
|
|
|
|
|
| 421 |
if not text.strip(): return ""
|
| 422 |
if src == tgt: return text
|
| 423 |
|
|
|
|
| 468 |
EXPLAIN_TAB_TRANSLATE_ES = """
|
| 469 |
**¿Qué hace “Traducir”?**
|
| 470 |
Convierte lo que escribes en **Texto** al **Destino** que elijas (ES/EN/Minimax/Kōmín).
|
| 471 |
+
- Con **Máx. Compresión Exacta**, añade un final ~... con el **original comprimido** para recuperarlo tal cual al decodificar.
|
| 472 |
- Las casillas de **compactación** (artículos, cópula, pronombres) **sólo se aplican si el Destino es conlang**.
|
| 473 |
"""
|
| 474 |
EXPLAIN_TAB_BUILD_ES = """
|
|
|
|
| 478 |
EXPLAIN_TAB_DECODE_ES = """
|
| 479 |
**¿Qué hace “Decodificar (Conlang → ES/EN)”?**
|
| 480 |
Convierte de **Minimax/Kōmín** a **Español/Inglés**.
|
| 481 |
+
- Si el texto trae ~..., devolvemos el **original exacto**.
|
| 482 |
- Si no, reconstruimos lo más fiel posible con el **diccionario**.
|
| 483 |
"""
|
| 484 |
EXPLAIN_TAB_ROUNDTRIP_ES = """
|
|
|
|
| 491 |
- **Omitir artículos** (*el/la/los/las*; *a/an/the*): ahorro típico **~10–15%**.
|
| 492 |
- **Cópula cero** (presente afirmativo): oculta *ser/estar/be* → **~5–10%** extra.
|
| 493 |
- **Quitar pronombres**: suprime pronombres obvios → ahorro **variable**.
|
| 494 |
+
- **Máx. Compresión Exacta**: añade ~... para recuperar el original (en >100 caracteres, **~40–60%**; en textos muy cortos puede no reducir).
|
| 495 |
**Guía rápida:** sin casillas **0%**; artículos+cópula **~15–20%**.
|
| 496 |
"""
|
| 497 |
# ¿Qué son los lenguajes?
|
| 498 |
EXPLAIN_CONLANGS_ES = """
|
| 499 |
**¿Qué son Minimax-ASCII y Kōmín-CJK?**
|
| 500 |
+
- **Minimax-ASCII**: versión compacta que usa sólo caracteres comunes (ASCII). Añade marcas como ·P/·T/·F, persona (1s,2p…), negación N y pregunta Q.
|
| 501 |
+
- **Kōmín-CJK**: versión visual con partículas (ej.: sujeto ᵖ, objeto ᵒ) y un circulito de tiempo Ⓟ/Ⓣ/Ⓕ. Puede terminar en ?.
|
| 502 |
+
Ambos son “**conlangs**” pensados para **ahorrar espacio** y permitir **decodificación** a ES/EN (exacta si hay ~...).
|
| 503 |
"""
|
| 504 |
|
| 505 |
# EN
|
| 506 |
+
EXPLAIN_TAB_TRANSLATE_EN = "Converts **Text → Target** (ES/EN/Minimax/Kōmín). With **Max Exact**, adds ~... to recover the **exact original**. Compaction checkboxes apply only when **Target is conlang**."
|
| 507 |
EXPLAIN_TAB_BUILD_EN = "Forces **conlang output** (Minimax/Kōmín) from ES/EN, applying phrasing rules and compaction options."
|
| 508 |
+
EXPLAIN_TAB_DECODE_EN = "Converts **Minimax/Kōmín → ES/EN**. If ~... exists, returns the bit-perfect original; else semi-lossless."
|
| 509 |
EXPLAIN_TAB_ROUNDTRIP_EN = "Runs **(ES/EN→Conlang)→(Conlang→ES/EN)** to verify reversibility; with exact, it’s bit-for-bit."
|
| 510 |
EXPLAIN_CHECKBOX_EN = "Drop articles ~10–15%, Zero copula ~5–10% extra, Remove pronouns variable, Max Exact 40–60% for >100 chars."
|
| 511 |
EXPLAIN_CONLANGS_EN = """
|
| 512 |
**What are Minimax-ASCII and Kōmín-CJK?**
|
| 513 |
+
- **Minimax-ASCII**: compact ASCII codes with ·P/·T/·F, person (1s,2p…), N for negation and Q for question.
|
| 514 |
+
- **Kōmín-CJK**: visual style using particles (subject ᵖ, object ᵒ) and time bubbles Ⓟ/Ⓣ/Ⓕ, may end in ?.
|
| 515 |
+
Both are conlangs for **space-saving** and **decoding** back to ES/EN (bit-perfect when ~... is present).
|
| 516 |
"""
|
| 517 |
|
| 518 |
# Léxico (amigable)
|
|
|
|
| 522 |
- Limpiamos y ordenamos por **frecuencia de uso**.
|
| 523 |
- Asignamos un **código corto** a cada lema para **Minimax** y para **Kōmín**.
|
| 524 |
- Guardamos tres archivos que la app usa al traducir:
|
| 525 |
+
- lexicon_minimax.json (ES → Minimax)
|
| 526 |
+
- lexicon_komin.json (ES → Kōmín)
|
| 527 |
+
- lexicon_master.json (ES + EN + ambos códigos)
|
| 528 |
**Así** podemos convertir tus frases en **códigos compactos** y volver a texto entendible.
|
| 529 |
"""
|
| 530 |
LEXICON_FRIENDLY_EN = "We use **WordNet (OMW)**, pair ES words with EN, sort by frequency, assign short codes (Minimax/Kōmín), and save three JSONs so the app can encode/decode compactly."
|
|
|
|
| 543 |
msg = f"**Base (sin casillas):** {_pct_comp(text, base):.1f}% · **Con tus opciones:** {_pct_comp(text, curr):.1f}%"
|
| 544 |
if maxc:
|
| 545 |
curr_exact = build_sentence(text, src, tgt, drop, zero, "Semi-lossless", True, rm)
|
| 546 |
+
msg += f" · **Con sidecar ~...:** {_pct_comp(text, curr_exact):.1f}%"
|
| 547 |
return msg
|
| 548 |
|
| 549 |
def compaction_line_en(text, src, tgt, drop, zero, rm, maxc) -> str:
|
|
|
|
| 555 |
msg = f"**Base (no options):** {_pct_comp(text, base):.1f}% · **With your options:** {_pct_comp(text, curr):.1f}%"
|
| 556 |
if maxc:
|
| 557 |
curr_exact = build_sentence(text, src, tgt, drop, zero, "Semi-lossless", True, rm)
|
| 558 |
+
msg += f" · **With ~... sidecar:** {_pct_comp(text, curr_exact):.1f}%"
|
| 559 |
return msg
|
| 560 |
|
| 561 |
def master_preview(n: int = 20) -> List[List[Any]]:
|
|
|
|
| 577 |
with gr.Row():
|
| 578 |
src = gr.Dropdown(ALL_LANGS, value=("Español" if lang=="ES" else "English"), label=("Fuente" if lang=="ES" else "Source"))
|
| 579 |
tgt = gr.Dropdown(ALL_LANGS, value="Minimax-ASCII", label=("Destino" if lang=="ES" else "Target"))
|
| 580 |
+
text = gr.Textbox(lines=3, label=("Texto" if lang=="ES" else "Text"), placeholder=("Ej.: Hola, ¿cómo estás?" if lang=="ES" else "e.g., Hello, how are you?"), show_copy_button=True)
|
|
|
|
|
|
|
| 581 |
with gr.Row():
|
| 582 |
+
drop = gr.Checkbox(True, label=("Omitir artículos (ES/EN → conlang)" if lang=="ES" else "Drop articles (ES/EN → conlang)"))
|
| 583 |
zero = gr.Checkbox(False, label=("Cópula cero (presente afirm.)" if lang=="ES" else "Zero copula (present affirmative)"))
|
| 584 |
rmpr = gr.Checkbox(False, label=("Quitar pronombres" if lang=="ES" else "Remove pronouns"))
|
| 585 |
+
exact = gr.Checkbox(False, label=("Máx. Compresión Exacta (sidecar ~...)" if lang=="ES" else "Max Exact Compression (sidecar ~...)"))
|
| 586 |
mode_hidden = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
|
| 587 |
out = gr.Textbox(lines=6, label=("Traducción" if lang=="ES" else "Translation"), show_copy_button=True)
|
| 588 |
comp = gr.Markdown("")
|
|
|
|
| 627 |
with gr.Row():
|
| 628 |
src = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label=("Fuente" if lang=="ES" else "Source"))
|
| 629 |
tgt = gr.Dropdown(["Español","English"], value=("Español" if lang=="ES" else "English"), label=("Destino" if lang=="ES" else "Target"))
|
| 630 |
+
text = gr.Textbox(lines=3, label=("Texto en conlang (puede incluir ~...)" if lang=="ES" else "Conlang text (may include ~...)"), show_copy_button=True)
|
| 631 |
out = gr.Textbox(lines=6, label=("Salida" if lang=="ES" else "Output"), show_copy_button=True)
|
| 632 |
def run(t, s, d):
|
| 633 |
if not t.strip(): return ""
|
|
|
|
| 676 |
with acc_modes_es: gr.Markdown(
|
| 677 |
"- **🔁 Traducir**: Texto → Destino (ES/EN/Minimax/Kōmín), con opciones de compactación y % mostrado.\n"
|
| 678 |
"- **🛠️ Construir**: Obliga salida en conlang (Minimax/Kōmín) desde ES/EN.\n"
|
| 679 |
+
"- **🗝️ Decodificar**: Conlang → ES/EN (si hay ~..., devuelve el original exacto).\n"
|
| 680 |
"- **🔄 Prueba ida→vuelta**: Comprueba reversibilidad."
|
| 681 |
)
|
| 682 |
acc_modes_en = gr.Accordion("📖 What does each button / mode do? (EN)", open=False, visible=False)
|
| 683 |
with acc_modes_en: gr.Markdown(
|
| 684 |
"- **🔁 Translate**: Text → Target (ES/EN/Minimax/Kōmín) with compaction and %.\n"
|
| 685 |
"- **🛠️ Build**: Force conlang output from ES/EN.\n"
|
| 686 |
+
"- **🗝️ Decode**: Conlang → ES/EN (if ~..., exact original).\n"
|
| 687 |
"- **🔄 Round-trip**: Check reversibility."
|
| 688 |
)
|
| 689 |
|
|
|
|
| 814 |
|
| 815 |
|
| 816 |
|
| 817 |
+
|