Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
-
# app.py — Universal Conlang Translator (Max Compresión Exacta)
|
| 2 |
-
#
|
| 3 |
-
# Archivos requeridos en la raíz:
|
| 4 |
# - lexicon_minimax.json
|
| 5 |
# - lexicon_komin.json
|
| 6 |
# - lexicon_master.json
|
|
@@ -16,7 +15,7 @@ import re
|
|
| 16 |
import json
|
| 17 |
import base64
|
| 18 |
import zlib
|
| 19 |
-
from typing import Dict, Optional
|
| 20 |
import gradio as gr
|
| 21 |
|
| 22 |
# ------------ Archivos esperados ------------
|
|
@@ -72,11 +71,11 @@ def load_lexicons():
|
|
| 72 |
|
| 73 |
return (es2mini, es2komi, mini2es, komi2es,
|
| 74 |
en2mini, en2komi, mini2en, komi2en,
|
| 75 |
-
es2en_lemma, en2es_lemma
|
| 76 |
|
| 77 |
(ES2MINI, ES2KOMI, MINI2ES, KOMI2ES,
|
| 78 |
EN2MINI, EN2KOMI, MINI2EN, KOMI2EN,
|
| 79 |
-
ES2EN_LEMMA, EN2ES_LEMMA
|
| 80 |
|
| 81 |
# ------------ OOV reversible (Semi-lossless) ------------
|
| 82 |
ALPHA_MINI64 = "@ptkmnslraeiouy0123456789><=:/!?.+-_*#bcdfghjvqwxzACEGHIJKLMNOPRS"[:64]
|
|
@@ -141,7 +140,7 @@ def lemma_of(tok, src_lang: str) -> str:
|
|
| 141 |
else:
|
| 142 |
return norm_en(tok.lemma_ if tok.lemma_ else tok.text)
|
| 143 |
|
| 144 |
-
# ------------ Utilidades de análisis ------------
|
| 145 |
def detect_polarity(doc) -> bool:
    """Return True when the text of *doc* contains a question mark.

    Args:
        doc: Any object exposing a ``text`` string attribute
            (presumably a spaCy ``Doc`` -- confirm against callers).

    Returns:
        ``True`` if ``"?"`` occurs anywhere in ``doc.text``, else ``False``.

    NOTE(review): despite the name, the only thing checked here is the
    presence of "?"; no negation/polarity analysis is performed.
    """
    return "?" in doc.text
|
| 147 |
|
|
@@ -307,7 +306,7 @@ def realize_komin(doc, src_lang: str, drop_articles=True, zero_copula=True, semi
|
|
| 307 |
if is_q: out += " " + Q_FIN
|
| 308 |
return out
|
| 309 |
|
| 310 |
-
# ------------ Sidecars
|
| 311 |
SIDECAR_B85_RE = re.compile(r"\s?§\((?P<b85>[A-Za-z0-9!#$%&()*+\-;<=>?@^_`{|}~]+)\)$")
|
| 312 |
|
| 313 |
def b85_enc_raw(s: str) -> str:
|
|
@@ -351,7 +350,7 @@ def extract_custom_sidecar(text: str) -> Optional[str]:
|
|
| 351 |
def strip_custom_sidecar(text: str) -> str:
    """Return *text* with everything from the first ``~`` onward removed.

    Args:
        text: Input string, possibly carrying a ``~``-introduced suffix
            (presumably the custom sidecar payload -- confirm against
            ``extract_custom_sidecar``).

    Returns:
        The part of *text* before the first ``~`` with trailing whitespace
        stripped. When *text* contains no ``~`` it is returned unchanged,
        trailing whitespace included.
    """
    head, sep, _ = text.partition('~')
    # Only strip when a '~' was actually found; otherwise keep text verbatim.
    return head.rstrip() if sep else text
|
| 353 |
|
| 354 |
-
# ------------
|
| 355 |
def encode_simple(text: str, src_lang: str, target: str) -> str:
|
| 356 |
if not text.strip(): return ""
|
| 357 |
def repl_es(m):
|
|
@@ -701,7 +700,7 @@ TUTORIAL_EN = """
|
|
| 701 |
> To recover the **exact** original later, enable **Max Exact Compression**.
|
| 702 |
"""
|
| 703 |
|
| 704 |
-
# ===
|
| 705 |
LEXICON_BUILD_ES = """
|
| 706 |
### 🧱 Cómo se construyó el léxico (OMW → Minimax/Kōmín)
|
| 707 |
Este léxico se genera en un Colab desde **OMW (Open Multilingual WordNet 1.4)**:
|
|
@@ -711,7 +710,7 @@ Este léxico se genera en un Colab desde **OMW (Open Multilingual WordNet 1.4)**
|
|
| 711 |
3) (Opcional) **spaCy** ayuda a lematizar y reducir duplicados; (opcional) **Argos** rellena equivalentes EN.
|
| 712 |
4) Se crean **códigos compactos** para cada lema en dos alfabetos:
|
| 713 |
- **Minimax-ASCII** (ASCII, muy práctico en texto plano).
|
| 714 |
-
|
| 715 |
Los alfabetos se **barajan con una semilla fija (SEED)** y se generan combinaciones hasta una longitud máxima
|
| 716 |
(`MAXLEN_MINI`, `MAXLEN_CJK`). Así el mapeo es **reproducible** y prioriza **códigos cortos** para palabras frecuentes.
|
| 717 |
5) Se exportan:
|
|
@@ -742,19 +741,6 @@ The lexicon is generated in Colab from **OMW (Open Multilingual WordNet 1.4)**:
|
|
| 742 |
**Handy params**: `SEED`, `MAXLEN_MINI`, `MAXLEN_CJK`, `USE_SPACY`, `USE_ARGOS`, `LIMIT_ES`.
|
| 743 |
"""
|
| 744 |
|
| 745 |
-
# === Helpers UI para mostrar vista de léxico ===
|
| 746 |
-
def master_preview(n: int = 20) -> List[List[Any]]:
|
| 747 |
-
try:
|
| 748 |
-
entries = (MASTER_OBJ or {}).get("entries", [])
|
| 749 |
-
head = entries[:max(0, int(n))]
|
| 750 |
-
rows = [["lemma_es","lemma_en","minimax","komin"]]
|
| 751 |
-
for e in head:
|
| 752 |
-
rows.append([e.get("lemma_es",""), e.get("lemma_en",""), e.get("minimax",""), e.get("komin","")])
|
| 753 |
-
return rows
|
| 754 |
-
except Exception:
|
| 755 |
-
return [["lemma_es","lemma_en","minimax","komin"], ["(no data)","","",""]]
|
| 756 |
-
|
| 757 |
-
# === Construcción de grupos ES/EN ===
|
| 758 |
def make_group_es():
|
| 759 |
with gr.Group(visible=True) as group:
|
| 760 |
gr.Markdown("# 🌐 Universal Conlang Translator · Compresión Exacta (ES)")
|
|
@@ -764,6 +750,7 @@ def make_group_es():
|
|
| 764 |
gr.Markdown(EXPLAIN_ES)
|
| 765 |
with gr.Accordion("FAQ (ES)", open=False):
|
| 766 |
gr.Markdown(FAQ_ES)
|
|
|
|
| 767 |
with gr.Accordion("Cómo se construyó el léxico (ES)", open=False):
|
| 768 |
gr.Markdown(LEXICON_BUILD_ES)
|
| 769 |
with gr.Column(scale=1):
|
|
@@ -877,6 +864,7 @@ def make_group_en():
|
|
| 877 |
gr.Markdown(EXPLAIN_EN)
|
| 878 |
with gr.Accordion("FAQ (EN)", open=False):
|
| 879 |
gr.Markdown(FAQ_EN)
|
|
|
|
| 880 |
with gr.Accordion("How the lexicon was built (EN)", open=False):
|
| 881 |
gr.Markdown(LEXICON_BUILD_EN)
|
| 882 |
with gr.Column(scale=1):
|
|
@@ -981,30 +969,11 @@ def make_group_en():
|
|
| 981 |
gr.Markdown("Made with ❤️ · **spaCy** (optional) · Everything runs inside this Space.")
|
| 982 |
return group
|
| 983 |
|
| 984 |
-
# ============================== Pestaña global de Léxico ==============================
|
| 985 |
-
def make_lexicon_tab():
|
| 986 |
-
with gr.TabItem("ℹ️ Léxico / Lexicon (OMW → Minimax/Kōmín)"):
|
| 987 |
-
gr.Markdown("## 🧱 Construcción del léxico / Lexicon build")
|
| 988 |
-
with gr.Row():
|
| 989 |
-
with gr.Column():
|
| 990 |
-
with gr.Accordion("Resumen (ES)", open=True):
|
| 991 |
-
gr.Markdown(LEXICON_BUILD_ES)
|
| 992 |
-
with gr.Column():
|
| 993 |
-
with gr.Accordion("Summary (EN)", open=False):
|
| 994 |
-
gr.Markdown(LEXICON_BUILD_EN)
|
| 995 |
-
|
| 996 |
-
gr.Markdown("### 👀 Vista de ejemplo (primeras filas de `lexicon_master.json`)")
|
| 997 |
-
n_rows = gr.Slider(5, 100, value=20, step=5, label="Filas a mostrar / Rows to show")
|
| 998 |
-
table = gr.Dataframe(headers=["lemma_es","lemma_en","minimax","komin"], row_count=1, interactive=False)
|
| 999 |
-
gr.Button("Actualizar vista / Refresh").click(lambda n: master_preview(int(n)), [n_rows], [table])
|
| 1000 |
-
|
| 1001 |
-
# ================================ Lanzador de la app =================================
|
| 1002 |
with gr.Blocks(title="Universal Conlang Translator", theme=gr.themes.Soft()) as demo:
|
| 1003 |
gr.Markdown("## 🌍 Idioma / Language")
|
| 1004 |
lang_select = gr.Radio(choices=["ES","EN"], value="ES", label="Selecciona / Select")
|
| 1005 |
group_es = make_group_es()
|
| 1006 |
group_en = make_group_en()
|
| 1007 |
-
make_lexicon_tab()
|
| 1008 |
|
| 1009 |
def switch_lang(code):
|
| 1010 |
if code == "EN":
|
|
@@ -1018,3 +987,4 @@ if __name__ == "__main__":
|
|
| 1018 |
|
| 1019 |
|
| 1020 |
|
|
|
|
|
|
| 1 |
+
# app.py — Universal Conlang Translator (Max Compresión Exacta) — UI bilingüe ES/EN + Explicación de léxico
|
| 2 |
+
# Archivos necesarios en la raíz:
|
|
|
|
| 3 |
# - lexicon_minimax.json
|
| 4 |
# - lexicon_komin.json
|
| 5 |
# - lexicon_master.json
|
|
|
|
| 15 |
import json
|
| 16 |
import base64
|
| 17 |
import zlib
|
| 18 |
+
from typing import Dict, Optional
|
| 19 |
import gradio as gr
|
| 20 |
|
| 21 |
# ------------ Archivos esperados ------------
|
|
|
|
| 71 |
|
| 72 |
return (es2mini, es2komi, mini2es, komi2es,
|
| 73 |
en2mini, en2komi, mini2en, komi2en,
|
| 74 |
+
es2en_lemma, en2es_lemma)
|
| 75 |
|
| 76 |
(ES2MINI, ES2KOMI, MINI2ES, KOMI2ES,
|
| 77 |
EN2MINI, EN2KOMI, MINI2EN, KOMI2EN,
|
| 78 |
+
ES2EN_LEMMA, EN2ES_LEMMA) = load_lexicons()
|
| 79 |
|
| 80 |
# ------------ OOV reversible (Semi-lossless) ------------
|
| 81 |
ALPHA_MINI64 = "@ptkmnslraeiouy0123456789><=:/!?.+-_*#bcdfghjvqwxzACEGHIJKLMNOPRS"[:64]
|
|
|
|
| 140 |
else:
|
| 141 |
return norm_en(tok.lemma_ if tok.lemma_ else tok.text)
|
| 142 |
|
| 143 |
+
# ------------ Utilidades de análisis sintáctico ------------
|
| 144 |
def detect_polarity(doc) -> bool:
    """Return True when the text of *doc* contains a question mark.

    Args:
        doc: Any object exposing a ``text`` string attribute
            (presumably a spaCy ``Doc`` -- confirm against callers).

    Returns:
        ``True`` if ``"?"`` occurs anywhere in ``doc.text``, else ``False``.

    NOTE(review): despite the name, the only thing checked here is the
    presence of "?"; no negation/polarity analysis is performed.
    """
    return "?" in doc.text
|
| 146 |
|
|
|
|
| 306 |
if is_q: out += " " + Q_FIN
|
| 307 |
return out
|
| 308 |
|
| 309 |
+
# ------------ Sidecars para compresión exacta ------------
|
| 310 |
SIDECAR_B85_RE = re.compile(r"\s?§\((?P<b85>[A-Za-z0-9!#$%&()*+\-;<=>?@^_`{|}~]+)\)$")
|
| 311 |
|
| 312 |
def b85_enc_raw(s: str) -> str:
|
|
|
|
| 350 |
def strip_custom_sidecar(text: str) -> str:
    """Return *text* with everything from the first ``~`` onward removed.

    Args:
        text: Input string, possibly carrying a ``~``-introduced suffix
            (presumably the custom sidecar payload -- confirm against
            ``extract_custom_sidecar``).

    Returns:
        The part of *text* before the first ``~`` with trailing whitespace
        stripped. When *text* contains no ``~`` it is returned unchanged,
        trailing whitespace included.
    """
    head, sep, _ = text.partition('~')
    # Only strip when a '~' was actually found; otherwise keep text verbatim.
    return head.rstrip() if sep else text
|
| 352 |
|
| 353 |
+
# ------------ Codificar/decodificar léxico puro ------------
|
| 354 |
def encode_simple(text: str, src_lang: str, target: str) -> str:
|
| 355 |
if not text.strip(): return ""
|
| 356 |
def repl_es(m):
|
|
|
|
| 700 |
> To recover the **exact** original later, enable **Max Exact Compression**.
|
| 701 |
"""
|
| 702 |
|
| 703 |
+
# === NUEVO: explicación de cómo se construyó el léxico (OMW → Minimax/Kōmín) ===
|
| 704 |
LEXICON_BUILD_ES = """
|
| 705 |
### 🧱 Cómo se construyó el léxico (OMW → Minimax/Kōmín)
|
| 706 |
Este léxico se genera en un Colab desde **OMW (Open Multilingual WordNet 1.4)**:
|
|
|
|
| 710 |
3) (Opcional) **spaCy** ayuda a lematizar y reducir duplicados; (opcional) **Argos** rellena equivalentes EN.
|
| 711 |
4) Se crean **códigos compactos** para cada lema en dos alfabetos:
|
| 712 |
- **Minimax-ASCII** (ASCII, muy práctico en texto plano).
|
| 713 |
+
- **Kōmín-CJK** (símbolos densos estilo CJK).
|
| 714 |
Los alfabetos se **barajan con una semilla fija (SEED)** y se generan combinaciones hasta una longitud máxima
|
| 715 |
(`MAXLEN_MINI`, `MAXLEN_CJK`). Así el mapeo es **reproducible** y prioriza **códigos cortos** para palabras frecuentes.
|
| 716 |
5) Se exportan:
|
|
|
|
| 741 |
**Handy params**: `SEED`, `MAXLEN_MINI`, `MAXLEN_CJK`, `USE_SPACY`, `USE_ARGOS`, `LIMIT_ES`.
|
| 742 |
"""
|
| 743 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 744 |
def make_group_es():
|
| 745 |
with gr.Group(visible=True) as group:
|
| 746 |
gr.Markdown("# 🌐 Universal Conlang Translator · Compresión Exacta (ES)")
|
|
|
|
| 750 |
gr.Markdown(EXPLAIN_ES)
|
| 751 |
with gr.Accordion("FAQ (ES)", open=False):
|
| 752 |
gr.Markdown(FAQ_ES)
|
| 753 |
+
# NUEVO: acordeón con la explicación del léxico
|
| 754 |
with gr.Accordion("Cómo se construyó el léxico (ES)", open=False):
|
| 755 |
gr.Markdown(LEXICON_BUILD_ES)
|
| 756 |
with gr.Column(scale=1):
|
|
|
|
| 864 |
gr.Markdown(EXPLAIN_EN)
|
| 865 |
with gr.Accordion("FAQ (EN)", open=False):
|
| 866 |
gr.Markdown(FAQ_EN)
|
| 867 |
+
# NEW: accordion with lexicon explanation
|
| 868 |
with gr.Accordion("How the lexicon was built (EN)", open=False):
|
| 869 |
gr.Markdown(LEXICON_BUILD_EN)
|
| 870 |
with gr.Column(scale=1):
|
|
|
|
| 969 |
gr.Markdown("Made with ❤️ · **spaCy** (optional) · Everything runs inside this Space.")
|
| 970 |
return group
|
| 971 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 972 |
with gr.Blocks(title="Universal Conlang Translator", theme=gr.themes.Soft()) as demo:
|
| 973 |
gr.Markdown("## 🌍 Idioma / Language")
|
| 974 |
lang_select = gr.Radio(choices=["ES","EN"], value="ES", label="Selecciona / Select")
|
| 975 |
group_es = make_group_es()
|
| 976 |
group_en = make_group_en()
|
|
|
|
| 977 |
|
| 978 |
def switch_lang(code):
|
| 979 |
if code == "EN":
|
|
|
|
| 987 |
|
| 988 |
|
| 989 |
|
| 990 |
+
|