Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
# app.py — Universal Conlang Translator (Max Compresión Exacta)
|
| 2 |
# Archivos necesarios en la raíz:
|
| 3 |
# - lexicon_minimax.json
|
| 4 |
# - lexicon_komin.json
|
|
@@ -140,7 +140,7 @@ def lemma_of(tok, src_lang: str) -> str:
|
|
| 140 |
else:
|
| 141 |
return norm_en(tok.lemma_ if tok.lemma_ else tok.text)
|
| 142 |
|
| 143 |
-
# ------------
|
| 144 |
def detect_polarity(doc) -> bool:
    """Report whether the text of ``doc`` contains a question mark.

    Despite the name, the only thing inspected is the presence of the
    ASCII ``?`` character anywhere in ``doc.text``.

    Args:
        doc: Any object exposing a ``text`` attribute
            (presumably a spaCy ``Doc`` — confirm against callers).

    Returns:
        True if ``?`` occurs in ``doc.text``, otherwise False.
    """
    text = doc.text
    return "?" in text
|
| 146 |
|
|
@@ -306,7 +306,7 @@ def realize_komin(doc, src_lang: str, drop_articles=True, zero_copula=True, semi
|
|
| 306 |
if is_q: out += " " + Q_FIN
|
| 307 |
return out
|
| 308 |
|
| 309 |
-
# ------------ Sidecars
|
| 310 |
SIDECAR_B85_RE = re.compile(r"\s?§\((?P<b85>[A-Za-z0-9!#$%&()*+\-;<=>?@^_`{|}~]+)\)$")
|
| 311 |
|
| 312 |
def b85_enc_raw(s: str) -> str:
|
|
@@ -386,7 +386,7 @@ def pluralize(word: str, tgt_lang: str) -> str:
|
|
| 386 |
mini_tail_re = re.compile(r"^(?P<stem>.+?)·(?P<tail>[PTFNQ12sp]+)$")
|
| 387 |
|
| 388 |
def decode_simple(text: str, source: str, tgt_lang: str) -> str:
|
| 389 |
-
if not text.strip():
|
| 390 |
return ""
|
| 391 |
code2es = MINI2ES if source=="Minimax-ASCII" else KOMI2ES
|
| 392 |
code2en = MINI2EN if source=="Minimax-ASCII" else KOMI2EN
|
|
@@ -466,7 +466,6 @@ def decode_simple(text: str, source: str, tgt_lang: str) -> str:
|
|
| 466 |
v_conj = ("no " if tgt_lang == "Español" else "not ") + v_conj
|
| 467 |
out_parts.append(v_conj)
|
| 468 |
continue
|
| 469 |
-
# resto
|
| 470 |
out_parts.append(pluralize(lem, tgt_lang) if pl_flags[idx] else lem)
|
| 471 |
|
| 472 |
out_text = " ".join(out_parts)
|
|
@@ -631,163 +630,357 @@ def round_trip(text, src, tgt, mode, max_comp_exact):
|
|
| 631 |
back = universal_translate(conlang, tgt, src, True, False, mode, max_comp_exact)
|
| 632 |
return conlang, back
|
| 633 |
|
| 634 |
-
#
|
|
|
|
|
|
|
| 635 |
|
| 636 |
-
|
| 637 |
-
EXPLAIN_TOP_ES = """
|
| 638 |
-
## ¿Qué hace esta app? (versión fácil)
|
| 639 |
-
Convierte frases entre **Español / Inglés** y dos lenguajes inventados (conlangs):
|
| 640 |
-
- **Minimax-ASCII**: versión compacta con letras normales (ASCII).
|
| 641 |
-
- **Kōmín-CJK**: versión compacta con símbolos al estilo asiático.
|
| 642 |
-
|
| 643 |
-
### ¿Para qué sirve?
|
| 644 |
-
- Para **acortar** mensajes manteniendo el sentido.
|
| 645 |
-
- Para **codificar** y luego **recuperar** el texto original si quieres.
|
| 646 |
-
|
| 647 |
-
---
|
| 648 |
-
|
| 649 |
-
## Los 4 botones (pestañas)
|
| 650 |
-
1) **Traducir** — Cambia de **cualquier** sistema a **cualquier** otro (ES, EN, Minimax o Kōmín).
|
| 651 |
-
2) **Construir (ES/EN → Conlang)** — Toma tu frase natural y crea su versión **compacta** (Minimax/Kōmín) con opciones.
|
| 652 |
-
3) **Decodificar (Conlang → ES/EN)** — Pega Minimax/Kōmín y te devuelve Español o Inglés. Si trae `~...`, recupera el **original exacto**.
|
| 653 |
-
4) **Prueba ida→vuelta** — Hace “ir” al conlang y “volver” a tu idioma para comprobar el resultado.
|
| 654 |
|
| 655 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 656 |
|
| 657 |
-
|
| 658 |
-
|
| 659 |
-
-
|
| 660 |
-
-
|
| 661 |
-
|
|
|
|
| 662 |
|
| 663 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 664 |
|
| 665 |
-
|
| 666 |
-
- Si no sabes qué elegir, usa **Traducir**.
|
| 667 |
-
- Para **compactar**, usa **Construir** y marca lo que quieras.
|
| 668 |
-
- Si te pasan algo en conlang, usa **Decodificar**.
|
| 669 |
-
- ¿Dudas? Prueba **ida→vuelta**.
|
| 670 |
"""
|
| 671 |
|
| 672 |
-
|
| 673 |
-
|
| 674 |
-
|
| 675 |
-
|
| 676 |
-
- **
|
|
|
|
| 677 |
|
| 678 |
-
|
| 679 |
-
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
---
|
| 683 |
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
|
| 687 |
-
|
| 688 |
-
|
|
|
|
| 689 |
|
| 690 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 691 |
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
- **Zero copula (present affirmative)**: hide “ser/estar/be” where natural → +~5–10%.
|
| 695 |
-
- **Max Exact Compression**: appends `~...` with your **compressed** original text. When decoding, if present, you get the **exact original** back.
|
| 696 |
-
Note: for **very short** texts it might not reduce size because of the `~...` header.
|
| 697 |
|
| 698 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 699 |
|
| 700 |
-
|
| 701 |
-
|
| 702 |
-
|
| 703 |
-
|
| 704 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 705 |
"""
|
| 706 |
|
| 707 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 708 |
|
| 709 |
with gr.Blocks(title="Universal Conlang Translator", theme=gr.themes.Soft()) as demo:
|
| 710 |
-
gr.Markdown("
|
|
|
|
|
|
|
|
|
|
| 711 |
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
|
|
|
|
| 715 |
|
| 716 |
-
|
| 717 |
-
return gr.update(value=EXPLAIN_TOP_ES if lang == "ES" else EXPLAIN_TOP_EN)
|
| 718 |
-
|
| 719 |
-
lang_ui.change(_set_explain, inputs=[lang_ui], outputs=[explain_md])
|
| 720 |
-
|
| 721 |
-
# --- Traducir (universal) ---
|
| 722 |
-
with gr.Tab("Traducir / Translate"):
|
| 723 |
-
with gr.Row():
|
| 724 |
-
uni_src = gr.Dropdown(ALL_LANGS, value="Español", label="Fuente / Source")
|
| 725 |
-
uni_tgt = gr.Dropdown(ALL_LANGS, value="Minimax-ASCII", label="Destino / Target")
|
| 726 |
-
uni_text = gr.Textbox(lines=3, label="Texto / Text", value="", show_copy_button=True)
|
| 727 |
-
with gr.Row():
|
| 728 |
-
uni_drop = gr.Checkbox(value=True, label="Omitir artículos / Drop articles (ES/EN→conlang)")
|
| 729 |
-
uni_zero = gr.Checkbox(value=False, label="Cópula cero / Zero copula (present aff.)")
|
| 730 |
-
uni_max_comp = gr.Checkbox(value=False, label="Máx. Compresión Exacta / Max Exact Compression")
|
| 731 |
-
uni_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
|
| 732 |
-
uni_out = gr.Textbox(lines=6, label="Traducción / Translation", show_copy_button=True)
|
| 733 |
-
gr.Button("Traducir / Translate").click(
|
| 734 |
-
universal_translate,
|
| 735 |
-
[uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_mode, uni_max_comp],
|
| 736 |
-
[uni_out]
|
| 737 |
-
)
|
| 738 |
-
|
| 739 |
-
# --- Construir (ES/EN → Conlang) ---
|
| 740 |
-
with gr.Tab("Construir (ES/EN → Conlang) / Build"):
|
| 741 |
-
with gr.Row():
|
| 742 |
-
src_lang = gr.Dropdown(["Español","English"], value="Español", label="Fuente / Source")
|
| 743 |
-
target = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
|
| 744 |
-
text_in = gr.Textbox(lines=3, label="Frase / Sentence", value="", show_copy_button=True)
|
| 745 |
-
with gr.Row():
|
| 746 |
-
drop_articles = gr.Checkbox(value=True, label="Omitir artículos / Drop articles")
|
| 747 |
-
zero_copula = gr.Checkbox(value=False, label="Cópula cero / Zero copula (present aff.)")
|
| 748 |
-
max_comp_build = gr.Checkbox(value=False, label="Máx. Compresión Exacta / Max Exact Compression")
|
| 749 |
-
mode_build = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
|
| 750 |
-
out = gr.Textbox(lines=6, label="Salida / Output", show_copy_button=True)
|
| 751 |
-
gr.Button("Construir / Build").click(
|
| 752 |
-
build_sentence,
|
| 753 |
-
[text_in, src_lang, target, drop_articles, zero_copula, mode_build, max_comp_build],
|
| 754 |
-
[out]
|
| 755 |
-
)
|
| 756 |
-
|
| 757 |
-
# --- Decodificar (Conlang → ES/EN) ---
|
| 758 |
-
with gr.Tab("Decodificar (Conlang → ES/EN) / Decode"):
|
| 759 |
-
with gr.Row():
|
| 760 |
-
src_code = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Fuente / Source")
|
| 761 |
-
tgt_lang = gr.Dropdown(["Español","English"], value="Español", label="Destino / Target")
|
| 762 |
-
code_in = gr.Textbox(lines=3, label="Texto en conlang (puede incluir `~...`) / Conlang text (may include `~...`)", show_copy_button=True)
|
| 763 |
-
out3 = gr.Textbox(lines=6, label="Salida / Output", show_copy_button=True)
|
| 764 |
-
|
| 765 |
-
def decode_lossless_aware(text, src, tgt):
|
| 766 |
-
orig = extract_custom_sidecar(text)
|
| 767 |
-
if orig is not None: return orig
|
| 768 |
-
orig = extract_sidecar_b85(text)
|
| 769 |
-
if orig is not None: return orig
|
| 770 |
-
return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)
|
| 771 |
-
|
| 772 |
-
gr.Button("Decodificar / Decode").click(
|
| 773 |
-
decode_lossless_aware, [code_in, src_code, tgt_lang], [out3]
|
| 774 |
-
)
|
| 775 |
-
|
| 776 |
-
# --- Round-trip ---
|
| 777 |
-
with gr.Tab("Prueba ida→vuelta / Round-trip"):
|
| 778 |
-
with gr.Row():
|
| 779 |
-
rt_src = gr.Dropdown(["Español","English"], value="Español", label="Fuente / Source")
|
| 780 |
-
rt_tgt = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
|
| 781 |
-
rt_text = gr.Textbox(lines=3, label="Frase / Sentence", value="", show_copy_button=True)
|
| 782 |
-
rt_max_comp = gr.Checkbox(value=False, label="Máx. Compresión Exacta / Max Exact Compression")
|
| 783 |
-
rt_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
|
| 784 |
-
rt_out_conlang = gr.Textbox(lines=3, label="Conlang (ida) / Outward", show_copy_button=True)
|
| 785 |
-
rt_out_back = gr.Textbox(lines=3, label="Vuelta / Back", show_copy_button=True)
|
| 786 |
-
gr.Button("Probar / Test").click(
|
| 787 |
-
round_trip,
|
| 788 |
-
[rt_text, rt_src, rt_tgt, rt_mode, rt_max_comp],
|
| 789 |
-
[rt_out_conlang, rt_out_back]
|
| 790 |
-
)
|
| 791 |
|
| 792 |
if __name__ == "__main__":
|
| 793 |
demo.launch()
|
|
|
|
| 1 |
+
# app.py — Universal Conlang Translator (Max Compresión Exacta) — UI bilingüe ES/EN + Explicación de léxico
|
| 2 |
# Archivos necesarios en la raíz:
|
| 3 |
# - lexicon_minimax.json
|
| 4 |
# - lexicon_komin.json
|
|
|
|
| 140 |
else:
|
| 141 |
return norm_en(tok.lemma_ if tok.lemma_ else tok.text)
|
| 142 |
|
| 143 |
+
# ------------ Utilidades de análisis sintáctico ------------
|
| 144 |
def detect_polarity(doc) -> bool:
    """Report whether the text of ``doc`` contains a question mark.

    Despite the name, the only thing inspected is the presence of the
    ASCII ``?`` character anywhere in ``doc.text``.

    Args:
        doc: Any object exposing a ``text`` attribute
            (presumably a spaCy ``Doc`` — confirm against callers).

    Returns:
        True if ``?`` occurs in ``doc.text``, otherwise False.
    """
    text = doc.text
    return "?" in text
|
| 146 |
|
|
|
|
| 306 |
if is_q: out += " " + Q_FIN
|
| 307 |
return out
|
| 308 |
|
| 309 |
+
# ------------ Sidecars para compresión exacta ------------
|
| 310 |
SIDECAR_B85_RE = re.compile(r"\s?§\((?P<b85>[A-Za-z0-9!#$%&()*+\-;<=>?@^_`{|}~]+)\)$")
|
| 311 |
|
| 312 |
def b85_enc_raw(s: str) -> str:
|
|
|
|
| 386 |
mini_tail_re = re.compile(r"^(?P<stem>.+?)·(?P<tail>[PTFNQ12sp]+)$")
|
| 387 |
|
| 388 |
def decode_simple(text: str, source: str, tgt_lang: str) -> str:
|
| 389 |
+
if not text.strip():
|
| 390 |
return ""
|
| 391 |
code2es = MINI2ES if source=="Minimax-ASCII" else KOMI2ES
|
| 392 |
code2en = MINI2EN if source=="Minimax-ASCII" else KOMI2EN
|
|
|
|
| 466 |
v_conj = ("no " if tgt_lang == "Español" else "not ") + v_conj
|
| 467 |
out_parts.append(v_conj)
|
| 468 |
continue
|
|
|
|
| 469 |
out_parts.append(pluralize(lem, tgt_lang) if pl_flags[idx] else lem)
|
| 470 |
|
| 471 |
out_text = " ".join(out_parts)
|
|
|
|
| 630 |
back = universal_translate(conlang, tgt, src, True, False, mode, max_comp_exact)
|
| 631 |
return conlang, back
|
| 632 |
|
| 633 |
+
# =====================================================================================
|
| 634 |
+
# ========================== UI bilingüe con selector global ==========================
|
| 635 |
+
# =====================================================================================
|
| 636 |
|
| 637 |
+
ALL_LANGS = ["Español","English","Minimax-ASCII","Kōmín-CJK"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 638 |
|
| 639 |
+
# Texto ES (explicación general)
|
| 640 |
+
EXPLAIN_ES = """
|
| 641 |
+
## 🌐 ¿Qué hace esta app?
|
| 642 |
+
Traduce entre **Español / Inglés** y dos lenguajes construidos:
|
| 643 |
+
- **Minimax-ASCII** (compacto y solo ASCII)
|
| 644 |
+
- **Kōmín-CJK** (estilo CJK con partículas)
|
| 645 |
+
|
| 646 |
+
También **comprime sin perder información** si activas **Máx. Compresión Exacta** (`~...` guarda el original).
|
| 647 |
+
Al **decodificar**, si existe ese `~...`, recuperas el texto **exacto**.
|
| 648 |
+
|
| 649 |
+
### 🧠 ¿Por qué me sirve?
|
| 650 |
+
- Para **reducir** tamaño de mensajes/notas.
|
| 651 |
+
- Para **codificar/decodificar** de forma legible y reversible.
|
| 652 |
+
- Para jugar con **conlangs** simples.
|
| 653 |
+
|
| 654 |
+
### ⚙️ Opciones (puedes ignorarlas al principio)
|
| 655 |
+
- **Omitir artículos**: quita *el/la/los/las* o *a/an/the*. Ahorra ~10–15%.
|
| 656 |
+
- **Cópula cero** (presente afirmativo): oculta *ser/estar/be* cuando suena natural. +~5–10%.
|
| 657 |
+
- **Máx. Compresión Exacta**: añade `~...` con el original comprimido (mejor en textos medianos/largos).
|
| 658 |
+
"""
|
| 659 |
|
| 660 |
+
FAQ_ES = """
|
| 661 |
+
### ❓ Preguntas rápidas
|
| 662 |
+
- **¿Se pierde info?** No, con **Máx. Compresión Exacta** el `~...` guarda el original.
|
| 663 |
+
- **¿Sin spaCy?** Funciona igual (modo léxico). Con spaCy suena más natural.
|
| 664 |
+
- **Privacidad**: todo corre dentro de este Space.
|
| 665 |
+
"""
|
| 666 |
|
| 667 |
+
TUTORIAL_ES = """
|
| 668 |
+
### 🏁 Empezar (3 pasos)
|
| 669 |
+
1. Elige **Fuente** y **Destino**.
|
| 670 |
+
2. Escribe tu frase.
|
| 671 |
+
3. Pulsa **Traducir**.
|
| 672 |
|
| 673 |
+
> Para recuperar **exactamente** el original más tarde, activa **Máx. Compresión Exacta**.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 674 |
"""
|
| 675 |
|
| 676 |
+
# Texto EN (explicación general)
|
| 677 |
+
EXPLAIN_EN = """
|
| 678 |
+
## 🌐 What does this app do?
|
| 679 |
+
It translates between **Spanish / English** and two constructed languages:
|
| 680 |
+
- **Minimax-ASCII** (compact, ASCII-only)
|
| 681 |
+
- **Kōmín-CJK** (CJK-style with particles)
|
| 682 |
|
| 683 |
+
You can also **compress without losing information** by enabling **Max Exact Compression** (`~...` stores the original).
|
| 684 |
+
When **decoding**, if `~...` exists, you get the **exact original** back.
|
| 685 |
+
"""
|
|
|
|
|
|
|
| 686 |
|
| 687 |
+
FAQ_EN = """
|
| 688 |
+
### ❓ Quick answers
|
| 689 |
+
- **Any loss?** Not with **Max Exact Compression** — the `~...` keeps the original.
|
| 690 |
+
- **No spaCy?** Still works (lexical mode). With spaCy it reads more naturally.
|
| 691 |
+
- **Privacy**: everything runs inside this Space.
|
| 692 |
+
"""
|
| 693 |
|
| 694 |
+
TUTORIAL_EN = """
|
| 695 |
+
### 🏁 Quick start (3 steps)
|
| 696 |
+
1. Pick **Source** and **Target**.
|
| 697 |
+
2. Type your sentence.
|
| 698 |
+
3. Click **Translate**.
|
| 699 |
|
| 700 |
+
> To recover the **exact** original later, enable **Max Exact Compression**.
|
| 701 |
+
"""
|
|
|
|
|
|
|
|
|
|
| 702 |
|
| 703 |
+
# === NUEVO: explicación de cómo se construyó el léxico (OMW → Minimax/Kōmín) ===
|
| 704 |
+
LEXICON_BUILD_ES = """
|
| 705 |
+
### 🧱 Cómo se construyó el léxico (OMW → Minimax/Kōmín)
|
| 706 |
+
Este léxico se genera en un Colab desde **OMW (Open Multilingual WordNet 1.4)**:
|
| 707 |
+
|
| 708 |
+
1) Se recorren los *sinsets* (grupos de significado) y se extraen **lemas ES** y sus **equivalentes EN**.
|
| 709 |
+
2) Se normalizan los lemas y se ordenan por **frecuencia de uso** (palabras comunes primero, con *wordfreq*).
|
| 710 |
+
3) (Opcional) **spaCy** ayuda a lematizar y reducir duplicados; (opcional) **Argos** rellena equivalentes EN.
|
| 711 |
+
4) Se crean **códigos compactos** para cada lema en dos alfabetos:
|
| 712 |
+
- **Minimax-ASCII** (ASCII, muy práctico en texto plano).
|
| 713 |
+
- **Kōmín-CJK** (símbolos densos estilo CJK).
|
| 714 |
+
Los alfabetos se **barajan con una semilla fija (SEED)** y se generan combinaciones hasta una longitud máxima
|
| 715 |
+
(`MAXLEN_MINI`, `MAXLEN_CJK`). Así el mapeo es **reproducible** y prioriza **códigos cortos** para palabras frecuentes.
|
| 716 |
+
5) Se exportan:
|
| 717 |
+
- `lexicon_minimax.json` (ES → código Minimax)
|
| 718 |
+
- `lexicon_komin.json` (ES → código Kōmín)
|
| 719 |
+
- `lexicon_master.json` (ES, EN, Minimax, Kōmín) + `lexicon_master.tsv`
|
| 720 |
+
|
| 721 |
+
**Parámetros útiles**: `SEED`, `MAXLEN_MINI`, `MAXLEN_CJK`, `USE_SPACY`, `USE_ARGOS`, `LIMIT_ES`.
|
| 722 |
+
"""
|
| 723 |
|
| 724 |
+
LEXICON_BUILD_EN = """
|
| 725 |
+
### 🧱 How the lexicon was built (OMW → Minimax/Kōmín)
|
| 726 |
+
The lexicon is generated in Colab from **OMW (Open Multilingual WordNet 1.4)**:
|
| 727 |
+
|
| 728 |
+
1) Iterate synsets (meaning groups) to extract **ES lemmas** and their **EN counterparts**.
|
| 729 |
+
2) Normalize lemmas and sort by **usage frequency** (common words first, via *wordfreq*).
|
| 730 |
+
3) (Optional) **spaCy** helps lemmatization; (optional) **Argos** fills missing EN entries.
|
| 731 |
+
4) Create **compact codes** per lemma in two alphabets:
|
| 732 |
+
- **Minimax-ASCII** (ASCII-only, plain-text friendly).
|
| 733 |
+
- **Kōmín-CJK** (dense CJK-style symbols).
|
| 734 |
+
Alphabets are **shuffled with a fixed SEED** and combinations are generated up to max length
|
| 735 |
+
(`MAXLEN_MINI`, `MAXLEN_CJK`) so mapping is **reproducible** and **short codes** go to frequent words.
|
| 736 |
+
5) Export:
|
| 737 |
+
- `lexicon_minimax.json` (ES → Minimax code)
|
| 738 |
+
- `lexicon_komin.json` (ES → Kōmín code)
|
| 739 |
+
- `lexicon_master.json` (ES, EN, Minimax, Kōmín) + `lexicon_master.tsv`
|
| 740 |
+
|
| 741 |
+
**Handy params**: `SEED`, `MAXLEN_MINI`, `MAXLEN_CJK`, `USE_SPACY`, `USE_ARGOS`, `LIMIT_ES`.
|
| 742 |
"""
|
| 743 |
|
| 744 |
+
def make_group_es():
    """Build the Spanish-language UI panel and return its ``gr.Group``.

    The group is created visible and contains the explanatory accordions
    plus four tabs (translate, build, decode, round-trip), each wired to the
    corresponding module-level function.

    Fixes relative to the previous version:
      * Every "Limpiar" button used ``lambda: ""`` with two or three output
        components. Gradio needs one return value per output, so the handler
        raised instead of clearing. The handlers now return a tuple of empty
        strings sized to their outputs.
      * The example buttons labelled "(con compresión)" did not touch the
        Max Exact Compression checkbox; every example now sets it explicitly.

    NOTE(review): the nesting of components was reconstructed from a
    flattened diff view — confirm the layout against the real app.py.

    Returns:
        The ``gr.Group`` that wraps the whole Spanish panel.
    """

    def _clear(n_outputs):
        # One empty string per output component.
        return lambda: ("",) * n_outputs

    def _example(text, src, tgt, compress):
        # Values for (uni_text, uni_src, uni_tgt, uni_max_comp).
        return lambda: (text, src, tgt, compress)

    with gr.Group(visible=True) as group:
        gr.Markdown("# 🌐 Universal Conlang Translator · Compresión Exacta (ES)")
        with gr.Row():
            with gr.Column(scale=1):
                with gr.Accordion("Resumen (ES)", open=True):
                    gr.Markdown(EXPLAIN_ES)
                with gr.Accordion("FAQ (ES)", open=False):
                    gr.Markdown(FAQ_ES)
                # Accordion explaining how the lexicon was built.
                with gr.Accordion("Cómo se construyó el léxico (ES)", open=False):
                    gr.Markdown(LEXICON_BUILD_ES)
            with gr.Column(scale=1):
                with gr.Accordion("Tutorial (ES)", open=True):
                    gr.Markdown(TUTORIAL_ES)
                gr.Markdown("**Consejo:** Los mensajes muy cortos pueden no reducirse por la cabecera del `~...`.")

        # --- Translate (any system -> any system) ---
        with gr.Tab("🔁 Traducir"):
            with gr.Row():
                uni_src = gr.Dropdown(ALL_LANGS, value="Español", label="Fuente")
                uni_tgt = gr.Dropdown(ALL_LANGS, value="Minimax-ASCII", label="Destino")
            uni_text = gr.Textbox(lines=3, label="Texto", placeholder="Ej.: Hola, ¿cómo estás?", show_copy_button=True)
            with gr.Row():
                uni_drop = gr.Checkbox(value=True, label="Omitir artículos (ES/EN → conlang)")
                uni_zero = gr.Checkbox(value=False, label="Cópula cero (presente afirm.)")
                uni_max_comp = gr.Checkbox(value=False, label="Máx. Compresión Exacta (sidecar `~...`)")
            # Hidden: only one mode exists, but universal_translate expects it.
            uni_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
            with gr.Row():
                btn_translate = gr.Button("🚀 Traducir", variant="primary")
                btn_reset = gr.Button("🧹 Limpiar")
            uni_out = gr.Textbox(lines=6, label="Traducción", show_copy_button=True)

            btn_translate.click(
                universal_translate,
                [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_mode, uni_max_comp],
                [uni_out],
            )
            btn_reset.click(_clear(2), None, [uni_text, uni_out])

            gr.Markdown("### 🔎 Ejemplos (clic para autocompletar)")
            ex1 = gr.Button("ES→Minimax: «Hola, ¿cómo estás?»")
            ex2 = gr.Button("EN→Kōmín: «This system keeps messages compact.»")
            ex3 = gr.Button("ES→Minimax (con compresión): «El clima hoy es excelente para pasear.»")
            ex4 = gr.Button("EN→Kōmín (con compresión): «Please decode this later with the sidecar.»")

            example_outputs = [uni_text, uni_src, uni_tgt, uni_max_comp]
            ex1.click(_example("Hola, ¿cómo estás?", "Español", "Minimax-ASCII", False), None, example_outputs)
            ex2.click(_example("This system keeps messages compact.", "English", "Kōmín-CJK", False), None, example_outputs)
            ex3.click(_example("El clima hoy es excelente para pasear.", "Español", "Minimax-ASCII", True), None, example_outputs)
            ex4.click(_example("Please decode this later with the sidecar.", "English", "Kōmín-CJK", True), None, example_outputs)

        # --- Build (ES/EN -> conlang) ---
        with gr.Tab("🛠️ Construir (ES/EN → Conlang)"):
            with gr.Row():
                src_lang = gr.Dropdown(["Español","English"], value="Español", label="Fuente")
                target = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
            text_in = gr.Textbox(lines=3, label="Frase", show_copy_button=True)
            with gr.Row():
                drop_articles = gr.Checkbox(value=True, label="Omitir artículos")
                zero_copula = gr.Checkbox(value=False, label="Cópula cero (presente afirm.)")
                max_comp_build = gr.Checkbox(value=False, label="Máx. Compresión Exacta")
            mode_build = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
            with gr.Row():
                btn_build = gr.Button("🏗️ Construir", variant="primary")
                btn_build_clear = gr.Button("🧹 Limpiar")
            out = gr.Textbox(lines=6, label="Salida", show_copy_button=True)

            btn_build.click(
                build_sentence,
                [text_in, src_lang, target, drop_articles, zero_copula, mode_build, max_comp_build],
                [out],
            )
            btn_build_clear.click(_clear(2), None, [text_in, out])

        # --- Decode (conlang -> ES/EN) ---
        with gr.Tab("🗝️ Decodificar (Conlang → ES/EN)"):
            with gr.Row():
                src_code = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Fuente")
                tgt_lang = gr.Dropdown(["Español","English"], value="Español", label="Destino")
            code_in = gr.Textbox(lines=3, label="Texto en conlang (puede incluir `~...`)", show_copy_button=True)
            out3 = gr.Textbox(lines=6, label="Salida", show_copy_button=True)

            def decode_lossless_aware(text, src, tgt):
                """Return the sidecar payload if one is found, else decode lexically."""
                # Try the custom sidecar first, then the base85 one.
                orig = extract_custom_sidecar(text)
                if orig is not None:
                    return orig
                orig = extract_sidecar_b85(text)
                if orig is not None:
                    return orig
                # No sidecar payload: strip both sidecar forms and decode the rest.
                return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)

            with gr.Row():
                btn_decode = gr.Button("🔓 Decodificar", variant="primary")
                btn_decode_clear = gr.Button("🧹 Limpiar")

            btn_decode.click(decode_lossless_aware, [code_in, src_code, tgt_lang], [out3])
            btn_decode_clear.click(_clear(2), None, [code_in, out3])

            gr.Markdown("> **Tip:** si ves `~...`, la decodificación será 100% exacta.")

        # --- Round-trip check ---
        with gr.Tab("🔄 Prueba ida→vuelta"):
            with gr.Row():
                rt_src = gr.Dropdown(["Español","English"], value="Español", label="Fuente")
                rt_tgt = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
            rt_text = gr.Textbox(lines=3, label="Frase", show_copy_button=True)
            rt_max_comp = gr.Checkbox(value=False, label="Máx. Compresión Exacta")
            rt_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
            rt_out_conlang = gr.Textbox(lines=3, label="Conlang (ida)", show_copy_button=True)
            rt_out_back = gr.Textbox(lines=3, label="Vuelta", show_copy_button=True)
            with gr.Row():
                btn_rt = gr.Button("▶️ Probar", variant="primary")
                btn_rt_clear = gr.Button("🧹 Limpiar")

            btn_rt.click(round_trip, [rt_text, rt_src, rt_tgt, rt_mode, rt_max_comp], [rt_out_conlang, rt_out_back])
            btn_rt_clear.click(_clear(3), None, [rt_text, rt_out_conlang, rt_out_back])

        gr.Markdown("---")
        gr.Markdown("Hecho con ❤️ · **spaCy** (opcional) · Todo se ejecuta en este Space.")
    return group
|
| 857 |
+
|
| 858 |
+
def make_group_en():
    """Build the English-language UI panel and return its ``gr.Group``.

    The group is created hidden (``visible=False``) and contains the
    explanatory accordions plus four tabs (translate, build, decode,
    round-trip), each wired to the corresponding module-level function.

    Fixes relative to the previous version:
      * Every "Clear" button used ``lambda: ""`` with two or three output
        components. Gradio needs one return value per output, so the handler
        raised instead of clearing. The handlers now return a tuple of empty
        strings sized to their outputs.
      * The example buttons labelled "(compressed)" did not touch the
        Max Exact Compression checkbox; every example now sets it explicitly.

    NOTE(review): the nesting of components was reconstructed from a
    flattened diff view — confirm the layout against the real app.py.

    Returns:
        The ``gr.Group`` that wraps the whole English panel.
    """

    def _clear(n_outputs):
        # One empty string per output component.
        return lambda: ("",) * n_outputs

    def _example(text, src, tgt, compress):
        # Values for (uni_text, uni_src, uni_tgt, uni_max_comp).
        return lambda: (text, src, tgt, compress)

    with gr.Group(visible=False) as group:
        gr.Markdown("# 🌐 Universal Conlang Translator · Max Exact Compression (EN)")
        with gr.Row():
            with gr.Column(scale=1):
                with gr.Accordion("Summary (EN)", open=True):
                    gr.Markdown(EXPLAIN_EN)
                with gr.Accordion("FAQ (EN)", open=False):
                    gr.Markdown(FAQ_EN)
                # Accordion explaining how the lexicon was built.
                with gr.Accordion("How the lexicon was built (EN)", open=False):
                    gr.Markdown(LEXICON_BUILD_EN)
            with gr.Column(scale=1):
                with gr.Accordion("Tutorial (EN)", open=True):
                    gr.Markdown(TUTORIAL_EN)
                gr.Markdown("**Tip:** Very short messages may not shrink due to the `~...` header.")

        # --- Translate (any system -> any system) ---
        with gr.Tab("🔁 Translate"):
            with gr.Row():
                uni_src = gr.Dropdown(ALL_LANGS, value="English", label="Source")
                uni_tgt = gr.Dropdown(ALL_LANGS, value="Minimax-ASCII", label="Target")
            uni_text = gr.Textbox(lines=3, label="Text", placeholder="e.g., Hello, how are you?", show_copy_button=True)
            with gr.Row():
                uni_drop = gr.Checkbox(value=True, label="Drop articles (ES/EN → conlang)")
                uni_zero = gr.Checkbox(value=False, label="Zero copula (present affirmative)")
                uni_max_comp = gr.Checkbox(value=False, label="Max Exact Compression (sidecar `~...`)")
            # Hidden: only one mode exists, but universal_translate expects it.
            uni_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
            with gr.Row():
                btn_translate = gr.Button("🚀 Translate", variant="primary")
                btn_reset = gr.Button("🧹 Clear")
            uni_out = gr.Textbox(lines=6, label="Translation", show_copy_button=True)

            btn_translate.click(
                universal_translate,
                [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_mode, uni_max_comp],
                [uni_out],
            )
            btn_reset.click(_clear(2), None, [uni_text, uni_out])

            gr.Markdown("### 🔎 Examples (click to autofill)")
            ex1 = gr.Button("EN→Minimax: “Hello, how are you?”")
            ex2 = gr.Button("ES→Kōmín: “Este sistema mantiene los mensajes compactos.”")
            ex3 = gr.Button("EN→Minimax (compressed): “The weather today is perfect for a walk.”")
            ex4 = gr.Button("ES→Kōmín (compressed): “Por favor decodifica esto luego con el sidecar.”")

            example_outputs = [uni_text, uni_src, uni_tgt, uni_max_comp]
            ex1.click(_example("Hello, how are you?", "English", "Minimax-ASCII", False), None, example_outputs)
            ex2.click(_example("Este sistema mantiene los mensajes compactos.", "Español", "Kōmín-CJK", False), None, example_outputs)
            ex3.click(_example("The weather today is perfect for a walk.", "English", "Minimax-ASCII", True), None, example_outputs)
            ex4.click(_example("Por favor decodifica esto luego con el sidecar.", "Español", "Kōmín-CJK", True), None, example_outputs)

        # --- Build (ES/EN -> conlang) ---
        with gr.Tab("🛠️ Build (ES/EN → Conlang)"):
            with gr.Row():
                src_lang = gr.Dropdown(["Español","English"], value="English", label="Source")
                target = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
            text_in = gr.Textbox(lines=3, label="Sentence", show_copy_button=True)
            with gr.Row():
                drop_articles = gr.Checkbox(value=True, label="Drop articles")
                zero_copula = gr.Checkbox(value=False, label="Zero copula (present affirmative)")
                max_comp_build = gr.Checkbox(value=False, label="Max Exact Compression")
            mode_build = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
            with gr.Row():
                btn_build = gr.Button("🏗️ Build", variant="primary")
                btn_build_clear = gr.Button("🧹 Clear")
            out = gr.Textbox(lines=6, label="Output", show_copy_button=True)

            btn_build.click(
                build_sentence,
                [text_in, src_lang, target, drop_articles, zero_copula, mode_build, max_comp_build],
                [out],
            )
            btn_build_clear.click(_clear(2), None, [text_in, out])

        # --- Decode (conlang -> ES/EN) ---
        with gr.Tab("🗝️ Decode (Conlang → ES/EN)"):
            with gr.Row():
                src_code = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Source")
                tgt_lang = gr.Dropdown(["Español","English"], value="English", label="Target")
            code_in = gr.Textbox(lines=3, label="Conlang text (may include `~...`)", show_copy_button=True)
            out3 = gr.Textbox(lines=6, label="Output", show_copy_button=True)

            def decode_lossless_aware(text, src, tgt):
                """Return the sidecar payload if one is found, else decode lexically."""
                # Try the custom sidecar first, then the base85 one.
                orig = extract_custom_sidecar(text)
                if orig is not None:
                    return orig
                orig = extract_sidecar_b85(text)
                if orig is not None:
                    return orig
                # No sidecar payload: strip both sidecar forms and decode the rest.
                return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)

            with gr.Row():
                btn_decode = gr.Button("🔓 Decode", variant="primary")
                btn_decode_clear = gr.Button("🧹 Clear")

            btn_decode.click(decode_lossless_aware, [code_in, src_code, tgt_lang], [out3])
            btn_decode_clear.click(_clear(2), None, [code_in, out3])

            gr.Markdown("> **Tip:** if you see `~...`, decoding will be bit-perfect.")

        # --- Round-trip check ---
        with gr.Tab("🔄 Round-trip"):
            with gr.Row():
                rt_src = gr.Dropdown(["Español","English"], value="English", label="Source")
                rt_tgt = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
            rt_text = gr.Textbox(lines=3, label="Sentence", show_copy_button=True)
            rt_max_comp = gr.Checkbox(value=False, label="Max Exact Compression")
            rt_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
            rt_out_conlang = gr.Textbox(lines=3, label="Outward (conlang)", show_copy_button=True)
            rt_out_back = gr.Textbox(lines=3, label="Back", show_copy_button=True)
            with gr.Row():
                btn_rt = gr.Button("▶️ Test", variant="primary")
                btn_rt_clear = gr.Button("🧹 Clear")

            btn_rt.click(round_trip, [rt_text, rt_src, rt_tgt, rt_mode, rt_max_comp], [rt_out_conlang, rt_out_back])
            btn_rt_clear.click(_clear(3), None, [rt_text, rt_out_conlang, rt_out_back])

        gr.Markdown("---")
        gr.Markdown("Made with ❤️ · **spaCy** (optional) · Everything runs inside this Space.")
    return group
|
| 971 |
|
| 972 |
# Top-level app: a language selector that toggles between the two
# pre-built panels (Spanish shown first, English hidden).
with gr.Blocks(title="Universal Conlang Translator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🌍 Idioma / Language")
    lang_select = gr.Radio(choices=["ES","EN"], value="ES", label="Selecciona / Select")
    group_es = make_group_es()
    group_en = make_group_en()

    def switch_lang(code):
        """Return visibility updates for (Spanish group, English group).

        The English group is shown only when ``code`` is exactly "EN";
        any other value shows the Spanish group.
        """
        show_en = code == "EN"
        return gr.update(visible=not show_en), gr.update(visible=show_en)

    lang_select.change(switch_lang, [lang_select], [group_es, group_en])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 984 |
|
| 985 |
if __name__ == "__main__":
|
| 986 |
demo.launch()
|