LoloSemper committed (verified) · commit 01292b5 · parent: de90cf7

Update app.py

Files changed (1): app.py (+501, −193)
app.py CHANGED
@@ -1,131 +1,486 @@
  # =====================================================================================
  # ========================= UI bilingüe y explicaciones claras ========================
  # =====================================================================================

  ALL_LANGS = ["Español","English","Minimax-ASCII","Kōmín-CJK"]

- # ---- Bloques de explicación (cortos para TÍTULO + largos para CONTENIDO) ----
  ACC_TITLES_ES = {
-     "translate": "🔁 Traducir — ¿Qué hace? (haz clic para desplegar)",
      "build": "🛠️ Construir (ES/EN → Conlang) — ¿Qué hace?",
      "decode": "🗝️ Decodificar (Conlang → ES/EN) — ¿Qué hace?",
      "roundtrip": "🔄 Prueba ida→vuelta — ¿Qué hace?",
-     "checkbox": "☑️ Opciones y compactación (artículos, cópula, pronombres, exacta)",
      "lexicon": "ℹ️ Léxico (OMW → Minimax/Kōmín) — explicación y vista previa"
  }
-
  ACC_TITLES_EN = {
-     "translate": "🔁 Translate — What does it do? (click to expand)",
      "build": "🛠️ Build (ES/EN → Conlang) — What does it do?",
      "decode": "🗝️ Decode (Conlang → ES/EN) — What does it do?",
      "roundtrip": "🔄 Round-trip — What does it do?",
-     "checkbox": "☑️ Options & compaction (articles, copula, pronouns, exact)",
      "lexicon": "ℹ️ Lexicon (OMW → Minimax/Kōmín) — explainer & preview"
  }

- # Contenidos (Markdown) — ya limpios (se verán dentro del Accordion)
  EXPLAIN_TAB_TRANSLATE_ES = """
- Convierte el **Texto** al **Destino**. Funciona para cualquier combinación: Español, English, Minimax-ASCII y Kōmín-CJK.
-
- - Si activas **Máx. Compresión Exacta**, añade un remolque `~...` con el **original comprimido** para recuperarlo **exactamente** al decodificar.
- - Los **checkbox** (Omitir artículos / Cópula cero / Quitar pronombres) **solo aplican** cuando el **Destino es un conlang** (Minimax o Kōmín).
  """
-
  EXPLAIN_TAB_BUILD_ES = """
- Fuerza la salida **en conlang** (Minimax o Kōmín) desde Español o Inglés.
- Aplica reglas de fraseo (orden, partículas/TAM) y las opciones de **compactación**.
  """
-
  EXPLAIN_TAB_DECODE_ES = """
- Convierte **Minimax/Kōmín** a **Español o Inglés**.
-
- - Si el texto trae `~...`, devuelve el **original exacto**.
- - Si no hay `~...`, la reconstrucción es **semi-lossless** con léxico y pistas simples.
  """
-
  EXPLAIN_TAB_ROUNDTRIP_ES = """
- Ejecuta **(ES/EN → Conlang) → (Conlang → ES/EN)** para comprobar **reversibilidad**.
- Con **Máx. Compresión Exacta**, la vuelta coincide **bit a bit**.
  """
-
  EXPLAIN_CHECKBOX_ES = """
- **Qué hace cada opción:**
-
- - **Omitir artículos** (el/la/los/las; a/an/the): ahorro típico **~10–15%**.
- - **Cópula cero (presente afirm.)**: omite *ser/estar/be* cuando suena natural → **~5–10%** extra.
- - **Quitar pronombres**: elimina pronombres de sujeto/objeto evidentes → ahorro **variable**.
- - **Máx. Compresión Exacta**: añade `~...` (zlib) para recuperación exacta. En >100 caracteres, **~40–60%**; en textos cortos puede no reducir.
-
- **Referencia orientativa:**
- - Sin casillas: **0%**
- - Solo artículos: **~10–15%**
- - Solo cópula: **~5–10%**
- - Artículos + cópula: **~15–20%**
- - Con exacta: **~40–60%** (si el texto es suficientemente largo)
  """

  EXPLAIN_TAB_TRANSLATE_EN = """
- Converts **Text** to **Target**. Works for any pair: Spanish, English, Minimax-ASCII, Kōmín-CJK.
-
- - **Max Exact Compression** appends `~...` with the **exact original** for perfect recovery.
- - Checkboxes (Drop articles / Zero copula / Remove pronouns) apply **only when the Target is a conlang**.
- """
-
- EXPLAIN_TAB_BUILD_EN = """
- Forces **conlang output** (Minimax or Kōmín) from Spanish/English.
- Applies phrasing rules (order, particles/TAM) and **compaction** options.
- """
-
- EXPLAIN_TAB_DECODE_EN = """
- Converts **Minimax/Kōmín** to **Spanish/English**.
-
- - If `~...` is present, returns the **bit-perfect original**.
- - Otherwise, reconstructs **semi-losslessly** using the lexicon.
  """
-
- EXPLAIN_TAB_ROUNDTRIP_EN = """
- Runs **(ES/EN → Conlang) → (Conlang → ES/EN)** to verify **reversibility**.
- With **Max Exact Compression**, the return matches bit-for-bit.
- """
-
  EXPLAIN_CHECKBOX_EN = """
- **What each option does:**
-
- - **Drop articles**: **~10–15%**.
- - **Zero copula (present affirmative)**: **~5–10%** extra.
- - **Remove pronouns**: variable savings.
- - **Max Exact Compression**: `~...` (zlib) for exact recovery. For >100 chars, **~40–60%**; very short texts may not shrink.
-
- **Reference (approx):**
- - No options: **0%**
- - Articles only: **~10–15%**
- - Copula only: **~5–10%**
- - Articles + Copula: **~15–20%**
- - With exact: **~40–60%** (if text is long enough)
  """

  LEXICON_BUILD_ES = """
- Se construyó así:
-
- 1. De **OMW/WordNet 1.4** se extraen **lemas ES** y sus **equivalentes EN** por synset.
- 2. Normalización y orden por **frecuencia** (*wordfreq*).
- 3. Opcional: **spaCy** refina lemas; **Argos** puede rellenar EN faltantes.
- 4. Asignación de **códigos compactos** con alfabetos barajados por **SEED** hasta `MAXLEN_MINI`/`MAXLEN_CJK`.
- 5. Exporta: `lexicon_minimax.json`, `lexicon_komin.json`, `lexicon_master.json` (+ TSV).
-
- **Vista previa** de `lexicon_master.json` (elige cuántas filas ver) aquí abajo.
  """
-
  LEXICON_BUILD_EN = """
- Built as follows:
-
- 1. From **OMW/WordNet 1.4**, gather **ES lemmas** and **EN counterparts** by synset.
- 2. Normalize and sort by **frequency** (*wordfreq*).
- 3. Optional: **spaCy** refines lemmas; **Argos** may fill missing EN.
- 4. Assign **compact codes** with **SEED-shuffled** alphabets up to `MAXLEN_MINI`/`MAXLEN_CJK`.
- 5. Exports: `lexicon_minimax.json`, `lexicon_komin.json`, `lexicon_master.json` (+ TSV).
-
- **Preview** of `lexicon_master.json` below.
  """

  # ---------- Utilidad: cálculo de compactación ----------
@@ -137,16 +492,12 @@ def compaction_report_es(text, src, tgt, drop, zero, rm, maxc) -> str:
      if not text.strip(): return "—"
      if tgt not in ("Minimax-ASCII","Kōmín-CJK"):
          return "La compactación aplica cuando el **Destino** es Minimax/Kōmín."
-     # Base (sin casillas, sin sidecar)
      base = build_sentence(text, src, tgt, False, False, "Semi-lossless", False, False)
-     # Actual (con opciones, sin sidecar)
      curr = build_sentence(text, src, tgt, drop, zero, "Semi-lossless", False, rm)
-     # Si el usuario marcó exacta, también medimos con sidecar
-     curr_exact = build_sentence(text, src, tgt, drop, zero, "Semi-lossless", True, rm) if maxc else None
-     p_base = _pct_comp(text, base)
-     p_curr = _pct_comp(text, curr)
      msg = f"**Base (sin casillas):** {p_base:.1f}% · **Con tus opciones:** {p_curr:.1f}%"
-     if curr_exact is not None:
          p_exact = _pct_comp(text, curr_exact)
          msg += f" · **Con sidecar `~...`:** {p_exact:.1f}%"
      return msg
@@ -157,11 +508,10 @@ def compaction_report_en(text, src, tgt, drop, zero, rm, maxc) -> str:
          return "Compaction applies when **Target** is Minimax/Kōmín."
      base = build_sentence(text, src, tgt, False, False, "Semi-lossless", False, False)
      curr = build_sentence(text, src, tgt, drop, zero, "Semi-lossless", False, rm)
-     curr_exact = build_sentence(text, src, tgt, drop, zero, "Semi-lossless", True, rm) if maxc else None
-     p_base = _pct_comp(text, base)
-     p_curr = _pct_comp(text, curr)
      msg = f"**Base (no options):** {p_base:.1f}% · **With your options:** {p_curr:.1f}%"
-     if curr_exact is not None:
          p_exact = _pct_comp(text, curr_exact)
          msg += f" · **With `~...` sidecar:** {p_exact:.1f}%"
      return msg
@@ -181,17 +531,13 @@ def master_preview(n: int = 20) -> List[List[Any]]:
  def make_group_es():
      with gr.Group(visible=True) as g:
          gr.Markdown("# 🌐 Universal Conlang Translator · Compresión Exacta (ES)")
-         # Acordeones de explicación — MISMO nivel y con contenido Markdown dentro
          with gr.Row():
              with gr.Column():
-                 with gr.Accordion(ACC_TITLES_ES["translate"], open=False):
-                     gr.Markdown(EXPLAIN_TAB_TRANSLATE_ES)
-                 with gr.Accordion(ACC_TITLES_ES["build"], open=False):
-                     gr.Markdown(EXPLAIN_TAB_BUILD_ES)
-                 with gr.Accordion(ACC_TITLES_ES["decode"], open=False):
-                     gr.Markdown(EXPLAIN_TAB_DECODE_ES)
-                 with gr.Accordion(ACC_TITLES_ES["roundtrip"], open=False):
-                     gr.Markdown(EXPLAIN_TAB_ROUNDTRIP_ES)
              with gr.Column():
                  with gr.Accordion(ACC_TITLES_ES["checkbox"], open=False):
                      gr.Markdown(EXPLAIN_CHECKBOX_ES)
@@ -201,7 +547,7 @@ def make_group_es():
                      table = gr.Dataframe(headers=["lemma_es","lemma_en","minimax","komin"], row_count=1, interactive=False)
                      gr.Button("Actualizar vista").click(lambda n: master_preview(int(n)), [n_rows], [table])

-         # ==== Tabs funcionales ====
          with gr.Tab("🔁 Traducir"):
              with gr.Row():
                  uni_src = gr.Dropdown(ALL_LANGS, value="Español", label="Fuente")
@@ -212,26 +558,20 @@ def make_group_es():
                  uni_zero = gr.Checkbox(False, label="Cópula cero (presente afirm.)")
                  uni_rmpr = gr.Checkbox(False, label="Quitar pronombres")
                  uni_maxc = gr.Checkbox(False, label="Máx. Compresión Exacta (sidecar `~...`)")
-
              uni_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
-             with gr.Row():
-                 btn_tr = gr.Button("🚀 Traducir", variant="primary")
-                 btn_tr_cl = gr.Button("🧹 Limpiar")
              uni_out = gr.Textbox(lines=6, label="Traducción", show_copy_button=True)
-             comp_out = gr.Markdown("")  # indicador de compactación

              def do_translate(text, src, tgt, drop, zero, mode, maxc, rm):
                  res = universal_translate(text, src, tgt, drop, zero, mode, maxc, rm)
                  rep = compaction_report_es(text, src, tgt, drop, zero, rm, maxc)
                  return res, rep

-             btn_tr.click(do_translate,
                  [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_mode, uni_maxc, uni_rmpr],
                  [uni_out, comp_out])
-             btn_tr_cl.click(lambda: ("",""), None, [uni_text, uni_out])
-
-             with gr.Accordion("Ayuda rápida", open=False):
-                 gr.Markdown(EXPLAIN_TAB_TRANSLATE_ES)

          with gr.Tab("🛠️ Construir (ES/EN → Conlang)"):
              with gr.Row():
@@ -244,24 +584,19 @@ def make_group_es():
                  rm_pron_build = gr.Checkbox(False, label="Quitar pronombres")
                  max_comp_build = gr.Checkbox(False, label="Máx. Compresión Exacta")
              mode_build = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
-             with gr.Row():
-                 btn_b = gr.Button("🏗️ Construir", variant="primary")
-                 btn_b_cl = gr.Button("🧹 Limpiar")
              out = gr.Textbox(lines=6, label="Salida", show_copy_button=True)
              comp_out_b = gr.Markdown("")

              def do_build(text, src, tgt, drop, zero, mode, maxc, rm):
                  res = build_sentence(text, src, tgt, drop, zero, mode, maxc, rm)
                  rep = compaction_report_es(text, src, tgt, drop, zero, rm, maxc)
                  return res, rep

-             btn_b.click(do_build,
-                 [text_in, src_lang, target, drop_articles, zero_copula, mode_build, max_comp_build, rm_pron_build],
-                 [out, comp_out_b])
-             btn_b_cl.click(lambda: ("",""), None, [text_in, out])
-
-             with gr.Accordion("Ayuda rápida", open=False):
-                 gr.Markdown(EXPLAIN_TAB_BUILD_ES)

          with gr.Tab("🗝️ Decodificar (Conlang → ES/EN)"):
              with gr.Row():
@@ -271,21 +606,15 @@ def make_group_es():
              out3 = gr.Textbox(lines=6, label="Salida", show_copy_button=True)

              def decode_lossless_aware(text, src, tgt):
                  orig = extract_custom_sidecar(text)
                  if orig is not None: return orig
                  orig = extract_sidecar_b85(text)
                  if orig is not None: return orig
                  return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)

-             with gr.Row():
-                 btn_d = gr.Button("🔓 Decodificar", variant="primary")
-                 btn_d_cl = gr.Button("🧹 Limpiar")
-
-             btn_d.click(decode_lossless_aware, [code_in, src_code, tgt_lang], [out3])
-             btn_d_cl.click(lambda: ("",""), None, [code_in, out3])
-
-             with gr.Accordion("Ayuda rápida", open=False):
-                 gr.Markdown(EXPLAIN_TAB_DECODE_ES)

          with gr.Tab("🔄 Prueba ida→vuelta"):
              with gr.Row():
@@ -296,15 +625,15 @@ def make_group_es():
              rt_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
              rt_out_conlang = gr.Textbox(lines=3, label="Conlang (ida)", show_copy_button=True)
              rt_out_back = gr.Textbox(lines=3, label="Vuelta", show_copy_button=True)
-             with gr.Row():
-                 btn_rt = gr.Button("▶️ Probar", variant="primary")
-                 btn_rt_cl = gr.Button("🧹 Limpiar")

-             btn_rt.click(round_trip, [rt_text, rt_src, rt_tgt, rt_mode, rt_max_comp], [rt_out_conlang, rt_out_back])
-             btn_rt_cl.click(lambda: ("","",""), None, [rt_text, rt_out_conlang, rt_out_back])

-             with gr.Accordion("Ayuda rápida", open=False):
-                 gr.Markdown(EXPLAIN_TAB_ROUNDTRIP_ES)
      return g

  def make_group_en():
@@ -312,18 +641,13 @@ def make_group_en():
          gr.Markdown("# 🌐 Universal Conlang Translator · Max Exact Compression (EN)")
          with gr.Row():
              with gr.Column():
-                 with gr.Accordion(ACC_TITLES_EN["translate"], open=False):
-                     gr.Markdown(EXPLAIN_TAB_TRANSLATE_EN)
-                 with gr.Accordion(ACC_TITLES_EN["build"], open=False):
-                     gr.Markdown(EXPLAIN_TAB_BUILD_EN)
-                 with gr.Accordion(ACC_TITLES_EN["decode"], open=False):
-                     gr.Markdown(EXPLAIN_TAB_DECODE_EN)
-                 with gr.Accordion(ACC_TITLES_EN["roundtrip"], open=False):
-                     gr.Markdown(EXPLAIN_TAB_ROUNDTRIP_EN)
              with gr.Column():
-                 with gr.Accordion(ACC_TITLES_EN["checkbox"], open=False):
-                     gr.Markdown(EXPLAIN_CHECKBOX_EN)
-                 with gr.Accordion(ACC_TITLES_EN["lexicon"], open=False):
                      gr.Markdown(LEXICON_BUILD_EN)
                      n_rows = gr.Slider(5, 100, value=20, step=5, label="Rows to show")
                      table = gr.Dataframe(headers=["lemma_es","lemma_en","minimax","komin"], row_count=1, interactive=False)
@@ -339,26 +663,20 @@ def make_group_en():
                  uni_zero = gr.Checkbox(False, label="Zero copula (present affirm.)")
                  uni_rmpr = gr.Checkbox(False, label="Remove pronouns")
                  uni_maxc = gr.Checkbox(False, label="Max Exact Compression (sidecar `~...`)")
-
              uni_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
-             with gr.Row():
-                 btn_tr = gr.Button("🚀 Translate", variant="primary")
-                 btn_tr_cl = gr.Button("🧹 Clear")
              uni_out = gr.Textbox(lines=6, label="Translation", show_copy_button=True)
              comp_out = gr.Markdown("")

              def do_translate_en(text, src, tgt, drop, zero, mode, maxc, rm):
                  res = universal_translate(text, src, tgt, drop, zero, mode, maxc, rm)
                  rep = compaction_report_en(text, src, tgt, drop, zero, rm, maxc)
                  return res, rep

-             btn_tr.click(do_translate_en,
                  [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_mode, uni_maxc, uni_rmpr],
                  [uni_out, comp_out])
-             btn_tr_cl.click(lambda: ("",""), None, [uni_text, uni_out])
-
-             with gr.Accordion("Quick help", open=False):
-                 gr.Markdown(EXPLAIN_TAB_TRANSLATE_EN)

          with gr.Tab("🛠️ Build (ES/EN → Conlang)"):
              with gr.Row():
@@ -371,24 +689,19 @@ def make_group_en():
                  rm_pron_build = gr.Checkbox(False, label="Remove pronouns")
                  max_comp_build = gr.Checkbox(False, label="Max Exact Compression")
              mode_build = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
-             with gr.Row():
-                 btn_b = gr.Button("🏗️ Build", variant="primary")
-                 btn_b_cl = gr.Button("🧹 Clear")
              out = gr.Textbox(lines=6, label="Output", show_copy_button=True)
              comp_out_b = gr.Markdown("")

              def do_build_en(text, src, tgt, drop, zero, mode, maxc, rm):
                  res = build_sentence(text, src, tgt, drop, zero, mode, maxc, rm)
                  rep = compaction_report_en(text, src, tgt, drop, zero, rm, maxc)
                  return res, rep

-             btn_b.click(do_build_en,
-                 [text_in, src_lang, target, drop_articles, zero_copula, mode_build, max_comp_build, rm_pron_build],
-                 [out, comp_out_b])
-             btn_b_cl.click(lambda: ("",""), None, [text_in, out])
-
-             with gr.Accordion("Quick help", open=False):
-                 gr.Markdown(EXPLAIN_TAB_BUILD_EN)

          with gr.Tab("🗝️ Decode (Conlang → ES/EN)"):
              with gr.Row():
@@ -398,21 +711,15 @@ def make_group_en():
              out3 = gr.Textbox(lines=6, label="Output", show_copy_button=True)

              def decode_lossless_aware_en(text, src, tgt):
                  orig = extract_custom_sidecar(text)
                  if orig is not None: return orig
                  orig = extract_sidecar_b85(text)
                  if orig is not None: return orig
                  return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)

-             with gr.Row():
-                 btn_d = gr.Button("🔓 Decode", variant="primary")
-                 btn_d_cl = gr.Button("🧹 Clear")
-
-             btn_d.click(decode_lossless_aware_en, [code_in, src_code, tgt_lang], [out3])
-             btn_d_cl.click(lambda: ("",""), None, [code_in, out3])
-
-             with gr.Accordion("Quick help", open=False):
-                 gr.Markdown(EXPLAIN_TAB_DECODE_EN)

          with gr.Tab("🔄 Round-trip"):
              with gr.Row():
@@ -423,15 +730,15 @@ def make_group_en():
              rt_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
              rt_out_conlang = gr.Textbox(lines=3, label="Outward (conlang)", show_copy_button=True)
              rt_out_back = gr.Textbox(lines=3, label="Back", show_copy_button=True)
-             with gr.Row():
-                 btn_rt = gr.Button("▶️ Test", variant="primary")
-                 btn_rt_cl = gr.Button("🧹 Clear")

-             btn_rt.click(round_trip, [rt_text, rt_src, rt_tgt, rt_mode, rt_max_comp], [rt_out_conlang, rt_out_back])
-             btn_rt_cl.click(lambda: ("","",""), None, [rt_text, rt_out_conlang, rt_out_back])

-             with gr.Accordion("Quick help", open=False):
-                 gr.Markdown(EXPLAIN_TAB_ROUNDTRIP_EN)
      return g

  # ================================ App ================================
@@ -456,3 +763,4 @@ if __name__ == "__main__":
+ # app.py — Universal Conlang Translator (Max Compresión Exacta)
+ # Archivos requeridos en la raíz:
+ #   - lexicon_minimax.json
+ #   - lexicon_komin.json
+ #   - lexicon_master.json
+ #
+ # requirements.txt (para HF Spaces):
+ #   gradio>=4.36.0
+ #   spacy>=3.7.4
+ #   es_core_news_sm @ https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl
+ #   en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
+
+ import os, re, json, base64, zlib
+ from typing import Dict, Optional, List, Any  # <- FIX: List/Any importados
+ import gradio as gr
+
+ # ------------ Archivos esperados ------------
+ LEX_MINI = "lexicon_minimax.json"
+ LEX_KOMI = "lexicon_komin.json"
+ LEX_MASTER = "lexicon_master.json"
+
+ # ------------ Normalización ------------
+ WORD_RE = re.compile(r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ]+", re.UNICODE)
+ STRIP = str.maketrans("ÁÉÍÓÚÜÑáéíóúüñ", "AEIOUUNaeiouun")
+ def norm_es(w: str) -> str: return re.sub(r"[^a-záéíóúüñ]", "", (w or "").lower()).translate(STRIP)
+ def norm_en(w: str) -> str: return re.sub(r"[^a-z]", "", (w or "").lower())
+
+ # ------------ Carga de léxicos ------------
+ def load_json(path: str):
+     if not os.path.exists(path): return None
+     with open(path, "r", encoding="utf-8") as f: return json.load(f)
+
+ def load_lexicons():
+     mm = load_json(LEX_MINI) or {}
+     kk = load_json(LEX_KOMI) or {}
+     master = load_json(LEX_MASTER) or {}
+
+     es2mini = mm.get("mapping", {})
+     es2komi = kk.get("mapping", {})
+     mini2es = {v:k for k,v in es2mini.items()}
+     komi2es = {v:k for k,v in es2komi.items()}
+
+     es2en_lemma: Dict[str,str] = {}
+     en2es_lemma: Dict[str,str] = {}
+     en2mini, en2komi = {}, {}
+     mini2en, komi2en = {}, {}
+
+     if isinstance(master, dict) and "entries" in master:
+         for e in master["entries"]:
+             es = norm_es(str(e.get("lemma_es",""))); en = norm_en(str(e.get("lemma_en","")))
+             mi = str(e.get("minimax","")); ko = str(e.get("komin",""))
+             if es and en:
+                 es2en_lemma.setdefault(es, en); en2es_lemma.setdefault(en, es)
+             if en and mi: en2mini.setdefault(en, mi)
+             if en and ko: en2komi.setdefault(en, ko)
+
+     mini2en = {v:k for k,v in en2mini.items()}
+     komi2en = {v:k for k,v in en2komi.items()}
+     return (es2mini, es2komi, mini2es, komi2es,
+             en2mini, en2komi, mini2en, komi2en,
+             es2en_lemma, en2es_lemma, master)
+
+ (ES2MINI, ES2KOMI, MINI2ES, KOMI2ES,
+  EN2MINI, EN2KOMI, MINI2EN, KOMI2EN,
+  ES2EN_LEMMA, EN2ES_LEMMA, MASTER_OBJ) = load_lexicons()
+
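For reference, the file shapes `load_lexicons` expects, inferred from the accessors above (`mapping` in the two per-conlang files, `entries` in the master file). Every lemma and code below is a made-up placeholder, not real lexicon data:

```python
# lexicon_minimax.json / lexicon_komin.json  (hypothetical contents)
{"mapping": {"gato": "tka", "casa": "m@s"}}

# lexicon_master.json  (hypothetical contents)
{"entries": [{"lemma_es": "gato", "lemma_en": "cat", "minimax": "tka", "komin": "猫"}]}
```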
+ # ------------ Pronombres (para “Quitar pronombres”) ------------
+ PRON_ES = {"yo","tú","vos","usted","él","ella","nosotros","vosotros","ustedes","ellos","ellas","me","te","se","nos","os"}
+ PRON_EN = {"i","you","he","she","it","we","they","me","him","her","us","them"}
+
+ # ------------ OOV reversible (Semi-lossless) ------------
+ ALPHA_MINI64 = "@ptkmnslraeiouy0123456789><=:/!?.+-_*#bcdfghjvqwxzACEGHIJKLMNOPRS"[:64]
+ CJK_BASE = (
+     "天地人日月山川雨風星火水木土金石光影花草鳥犬猫魚"
+     "東西南北中外上下午夜明暗手口目耳心言書家道路門"
+     "大小長短早晚高低新古青紅白黒金銀銅玉米茶酒米"
+     "文学楽音画体気電海空森林雪雲砂島橋城村国自由静"
+ )
+ ALPHA_CJK64 = (CJK_BASE * 2)[:64]
+
+ def to_custom_b64(b: bytes, alphabet: str) -> str:
+     std = base64.b64encode(b).decode("ascii")
+     trans = str.maketrans("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/", alphabet)
+     return std.translate(trans).rstrip("=")
+ def from_custom_b64(s: str, alphabet: str) -> bytes:
+     trans = str.maketrans(alphabet, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/")
+     std = s.translate(trans); pad = "=" * ((4 - len(std) % 4) % 4)
+     return base64.b64decode(std + pad)
+ def enc_oov_minimax(token: str) -> str: return "~" + to_custom_b64(token.encode("utf-8"), ALPHA_MINI64)
+ def dec_oov_minimax(code: str) -> str:
+     try: return from_custom_b64(code[1:], ALPHA_MINI64).decode("utf-8")
+     except Exception: return code
+ def enc_oov_komin(token: str) -> str: return "「" + to_custom_b64(token.encode("utf-8"), ALPHA_CJK64) + "」"
+ def dec_oov_komin(code: str) -> str:
+     try: return from_custom_b64(code[1:-1], ALPHA_CJK64).decode("utf-8")
+     except Exception: return code
+ def is_oov_minimax(code: str) -> bool: return code.startswith("~") and len(code) > 1
+ def is_oov_komin(code: str) -> bool: return len(code) >= 2 and code.startswith("「") and code.endswith("」")
+
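The OOV scheme is plain base64 re-skinned through `str.translate`. A minimal, self-contained sanity check of that trick, using a standalone copy of the two helpers with the same Minimax alphabet:

```python
import base64

STD = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
ALPHA = "@ptkmnslraeiouy0123456789><=:/!?.+-_*#bcdfghjvqwxzACEGHIJKLMNOPRS"[:64]

def enc(b: bytes) -> str:
    # Drop base64 padding *before* re-mapping: "=" also appears inside ALPHA.
    std = base64.b64encode(b).decode("ascii").rstrip("=")
    return std.translate(str.maketrans(STD, ALPHA))

def dec(s: str) -> bytes:
    std = s.translate(str.maketrans(ALPHA, STD))
    return base64.b64decode(std + "=" * ((4 - len(std) % 4) % 4))

word = "murciélago"                      # an out-of-lexicon token
assert dec(enc(word.encode())).decode() == word
```

Two caveats worth flagging against the committed helpers: `to_custom_b64` strips `=` *after* translating, which can eat a final symbol that happens to map to `=`; and the trick assumes all 64 alphabet symbols are distinct, which is worth double-checking for `ALPHA_CJK64`, since the doubled `CJK_BASE` can repeat a glyph inside the first 64 characters.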
+ # ------------ spaCy opcional ------------
+ USE_SPACY = False
+ try:
+     import spacy
+     try:
+         nlp_es = spacy.load("es_core_news_sm"); nlp_en = spacy.load("en_core_web_sm"); USE_SPACY = True
+     except Exception:
+         nlp_es = nlp_en = None
+ except Exception:
+     nlp_es = nlp_en = None
+
+ def lemma_of(tok, src_lang: str) -> str:
+     if src_lang == "Español":
+         return norm_es(tok.lemma_ if getattr(tok,"lemma_","") else tok.text)
+     else:
+         return norm_en(tok.lemma_ if getattr(tok,"lemma_","") else tok.text)
+
+ # ------------ Detección simple y helpers ------------
118
+ def detect_polarity(doc) -> bool: return "?" in getattr(doc,"text","")
119
+ def detect_neg(doc) -> bool:
120
+ for t in doc:
121
+ if getattr(t,"dep_","")=="neg" or getattr(t,"lower_","").lower() in ("no","not","n't"):
122
+ return True
123
+ return False
124
+ def detect_tense(root):
125
+ m = str(getattr(root,"morph",""))
126
+ if "Tense=Past" in m: return "Past"
127
+ if "Tense=Fut" in m: return "Fut"
128
+ if "Tense=Pres" in m: return "Pres"
129
+ for c in getattr(root,"children",[]):
130
+ if getattr(c,"pos_","")=="AUX":
131
+ cm = str(getattr(c,"morph",""))
132
+ if "Tense=Past" in cm: return "Past"
133
+ if getattr(c,"lower_","").lower()=="will": return "Fut"
134
+ return "Pres"
135
+ def extract_core(doc):
136
+ tokens = list(doc)
137
+ root = next((t for t in tokens if getattr(t,"dep_","")=="ROOT" and getattr(t,"pos_","") in ("VERB","AUX")), tokens[0] if tokens else doc)
138
+ subs, objs, obls, advs = [], [], [], []
139
+ for t in getattr(root,"children",[]):
140
+ dep = getattr(t,"dep_",""); pos = getattr(t,"pos_","")
141
+ if dep in ("nsubj","nsubj:pass","csubj"): subs.append(t)
142
+ elif dep in ("obj","dobj","iobj"): objs.append(t)
143
+ elif dep in ("obl","pobj"): obls.append(t)
144
+ elif dep in ("advmod","advcl") and pos=="ADV": advs.append(t)
145
+ for arr in (subs,objs,obls,advs): arr.sort(key=lambda x: getattr(x,"i",0))
146
+ return root, subs, objs, obls, advs
147
+ def _person_of_doc(doc, src_lang: str) -> Optional[str]:
148
+ try:
149
+ tokens = list(doc)
150
+ root = next((t for t in tokens if getattr(t,"dep_","")=="ROOT"), tokens[0])
151
+ subj = next((t for t in getattr(root,"children",[]) if getattr(t,"dep_","").startswith("nsubj")), None)
152
+ if subj is None: return None
153
+ plur = ("Number=Plur" in str(getattr(subj,"morph",""))) if src_lang=="Español" else (getattr(subj,"tag_","") in ("NNS","NNPS"))
154
+ low = getattr(subj,"lower_","").lower()
155
+ if src_lang=="Español":
156
+ if low in ("yo",): return "1p" if plur else "1s"
157
+ if low in ("tú","vos"): return "2p" if plur else "2s"
158
+ if low in ("usted","él","ella"): return "3p" if plur else "3s"
159
+ lem = lemma_of(subj, "Español")
160
+ if lem in ("yo","nosotros"): return "1p" if plur else "1s"
161
+ if lem in ("tú","vosotros"): return "2p" if plur else "2s"
162
+ return "3p" if plur else "3s"
163
+ else:
164
+ if low in ("i",): return "1p" if plur else "1s"
165
+ if low in ("you",): return "2p" if plur else "2s"
166
+ if low in ("he","she","it"): return "3p" if plur else "3s"
167
+ return "3p" if plur else "3s"
168
+ except Exception:
169
+ return None
170
+ def detect_person(root, src_lang: str) -> Optional[str]:
171
+ m = str(getattr(root,"morph","")); person_str, number_str = "3","s"
172
+ if "Person=" in m:
173
+ for feat in m.split("|"):
174
+ if feat.startswith("Person="): person_str = feat.split("=")[1]
175
+ elif feat.startswith("Number="): number_str = "p" if feat.split("=")[1]=="Plur" else "s"
176
+ return person_str + number_str
177
+ return _person_of_doc(root.doc, src_lang)
178
+
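How these detectors compose, assuming the spaCy models loaded (`USE_SPACY` is True) — a quick interactive probe; the commented output is what one would expect, though parser results can vary by model version:

```python
doc = nlp_es("ella comió la manzana")
root, subs, objs, obls, advs = extract_core(doc)
print(root.lemma_, detect_tense(root), detect_person(root, "Español"))
# expected along the lines of:  comer Past 3s
```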
+ # ------------ Mapeo y fraseadores ------------
+ def code_es(lemma: str, target: str) -> str:
+     lemma = norm_es(lemma)
+     if target=="Minimax-ASCII": return ES2MINI.get(lemma) or enc_oov_minimax(lemma)
+     return ES2KOMI.get(lemma) or enc_oov_komin(lemma)
+ def code_en(lemma: str, target: str) -> str:
+     lemma = norm_en(lemma)
+     if target=="Minimax-ASCII": return (EN2MINI.get(lemma) if EN2MINI else None) or enc_oov_minimax(lemma)
+     return (EN2KOMI.get(lemma) if EN2KOMI else None) or enc_oov_komin(lemma)
+
+ TAM_MINI = {"Pres":"P","Past":"T","Fut":"F","UNK":"P"}
+ TAM_KOMI = {"Pres":"Ⓟ","Past":"Ⓣ","Fut":"Ⓕ","UNK":"Ⓟ"}
+
+ def realize_minimax(doc, src_lang: str, drop_articles=True, zero_copula=True,
+                     semi_lossless=False, person_hint="2s", remove_pronouns=False):
+     root, subs, objs, obls, advs = extract_core(doc)
+     tense = detect_tense(root); is_q, is_neg = detect_polarity(doc), detect_neg(doc)
+     vlem = lemma_of(root, src_lang) if USE_SPACY else ("ser" if "?" in getattr(doc,"text","") else "estar")
+     vcode = code_es(vlem, "Minimax-ASCII") if src_lang=="Español" else code_en(vlem, "Minimax-ASCII")
+     tail = TAM_MINI.get(tense, "P")
+     if semi_lossless: tail += (detect_person(root, src_lang) or person_hint)
+     if is_neg: tail += "N"
+     if is_q: tail += "Q"
+     if tail: vcode = f"{vcode}·{tail}"
+
+     def realize_np(tokens):
+         outs=[]
+         for t in tokens:
+             if remove_pronouns:
+                 txt = (getattr(t,"text","") or "").lower()
+                 if (src_lang=="Español" and txt in PRON_ES) or (src_lang=="English" and txt in PRON_EN): continue
+             lem = lemma_of(t, src_lang) if USE_SPACY else getattr(t,"text","")
+             outs.append(code_es(lem,"Minimax-ASCII") if src_lang=="Español" else code_en(lem,"Minimax-ASCII"))
+         return outs
+
+     S = realize_np(subs); O = realize_np(objs)+realize_np(obls)
+     ADV=[]
+     for a in advs:
+         lem = lemma_of(a, src_lang) if USE_SPACY else getattr(a,"text","")
+         ADV.append(code_es(lem,"Minimax-ASCII") if src_lang=="Español" else code_en(lem,"Minimax-ASCII"))
+
+     parts = S+O+ADV if (zero_copula and not semi_lossless and vlem in ("ser","estar","be") and tense=="Pres" and not is_neg and not is_q) else [vcode]+S+O+ADV
+     return " ".join(p for p in parts if p)
+
+ def realize_komin(doc, src_lang: str, drop_articles=True, zero_copula=True,
+                   semi_lossless=False, person_hint="2s", remove_pronouns=False):
+     root, subs, objs, obls, advs = extract_core(doc)
+     tense, is_q, is_neg = detect_tense(root), detect_polarity(doc), detect_neg(doc)
+     vlem = lemma_of(root, src_lang) if USE_SPACY else ("ser" if "?" in getattr(doc,"text","") else "estar")
+     vcode = code_es(vlem, "Kōmín-CJK") if src_lang=="Español" else code_en(vlem, "Kōmín-CJK")
+     P_SUBJ, P_OBJ = "ᵖ", "ᵒ"; Q_FIN = "?"
+     TAM = TAM_KOMI.get(tense,"Ⓟ")
+     if semi_lossless: TAM = TAM + f"[{detect_person(root, src_lang) or person_hint}]"
+
+     def realize_np(tokens, particle):
+         outs=[]
+         for t in tokens:
+             if remove_pronouns:
+                 txt = (getattr(t,"text","") or "").lower()
+                 if (src_lang=="Español" and txt in PRON_ES) or (src_lang=="English" and txt in PRON_EN): continue
+             lem = lemma_of(t, src_lang) if USE_SPACY else getattr(t,"text","")
+             outs.append((code_es(lem,"Kōmín-CJK") if src_lang=="Español" else code_en(lem,"Kōmín-CJK")) + particle)
+         return outs
+
+     S = realize_np(subs, P_SUBJ); O = realize_np(objs+obls, P_OBJ)
+     ADV=[]
+     for a in advs:
+         lem = lemma_of(a, src_lang) if USE_SPACY else getattr(a,"text","")
+         ADV.append(code_es(lem,"Kōmín-CJK") if src_lang=="Español" else code_en(lem,"Kōmín-CJK"))
+     parts = S+O+ADV+[vcode+TAM]
+     out = " ".join(parts)
+     if is_q: out += " " + Q_FIN
+     return out
+
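To make the two realizers concrete: for "ella comió la manzana", with hypothetical lexicon codes (*ella* → `le`, *comer* → `ka` / `食`, *manzana* → `so` / `果` — placeholders, not real lexicon entries), the semi-lossless outputs would come out along these lines:

```
Minimax-ASCII:  ka·T3s le so       (verb first; tail = tense + person + N/Q flags)
Kōmín-CJK:      leᵖ 果ᵒ 食Ⓣ[3s]    (verb last; ᵖ/ᵒ mark subject and object)
```

The article "la" disappears implicitly: `extract_core` only collects the root's subject/object/oblique/adverb children, so determiners never make it into the output.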
+ # ------------ Sidecars (compresión exacta) ------------
+ SIDECAR_B85_RE = re.compile(r"\s?§\((?P<b85>[A-Za-z0-9!#$%&()*+\-;<=>?@^_`{|}~]+)\)$")
+ def b85_enc_raw(s: str) -> str: return base64.a85encode(zlib.compress(s.encode("utf-8"), 9), adobe=False).decode("ascii")
+ def b85_dec_raw(b85s: str) -> str: return zlib.decompress(base64.a85decode(b85s.encode("ascii"), adobe=False)).decode("utf-8")
+ def attach_sidecar_b85(conlang_text: str, original_text: str) -> str: return f"{conlang_text} §({b85_enc_raw(original_text)})"
+ def extract_sidecar_b85(text: str) -> Optional[str]:
+     m = SIDECAR_B85_RE.search(text)
+     if not m: return None
+     try: return b85_dec_raw(m.group("b85"))
+     except Exception: return None
+ def strip_sidecar_b85(text: str) -> str: return SIDECAR_B85_RE.sub("", text).rstrip()
+ def custom_sidecar_enc(conlang_text: str, original_text: str) -> str:
+     blob = to_custom_b64(zlib.compress(original_text.encode("utf-8"), 9), ALPHA_MINI64)
+     return f"{conlang_text} ~{blob}"
+ def extract_custom_sidecar(text: str) -> Optional[str]:
+     if '~' in text:
+         _, blob = text.rsplit('~', 1)
+         try: return zlib.decompress(from_custom_b64(blob, ALPHA_MINI64)).decode("utf-8")
+         except Exception: return None
+     return None
+ def strip_custom_sidecar(text: str) -> str: return text.split('~')[0].rstrip() if '~' in text else text
+
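There are two sidecar formats here: `§(...)` (zlib + Ascii85) and `~...` (zlib + the custom base64 alphabet). A standalone sketch of the `§(...)` round trip, simplified under the assumption the sidecar sits at the end of the string, which `attach_sidecar_b85` guarantees (the conlang text `BIBLIO·PQ` is a hypothetical placeholder):

```python
import base64, zlib

def attach(conlang: str, original: str) -> str:
    blob = base64.a85encode(zlib.compress(original.encode("utf-8"), 9), adobe=False).decode("ascii")
    return f"{conlang} §({blob})"

def extract(text: str) -> str:
    blob = text.rsplit("§(", 1)[1][:-1]   # everything between the final "§(" and the closing ")"
    return zlib.decompress(base64.a85decode(blob.encode("ascii"), adobe=False)).decode("utf-8")

original = "¿Dónde está la biblioteca?"
assert extract(attach("BIBLIO·PQ", original)) == original
```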
+ # ------------ Codificación / decodificación simple ------------
+ def encode_simple(text: str, src_lang: str, target: str) -> str:
+     if not text.strip(): return ""
+     def repl_es(m):
+         key = norm_es(m.group(0))
+         code = ES2MINI.get(key) if target=="Minimax-ASCII" else ES2KOMI.get(key)
+         return code or (enc_oov_minimax(m.group(0)) if target=="Minimax-ASCII" else enc_oov_komin(m.group(0)))
+     def repl_en(m):
+         key = norm_en(m.group(0)); table = EN2MINI if target=="Minimax-ASCII" else EN2KOMI
+         if table and key in table: return table[key]
+         return enc_oov_minimax(m.group(0)) if target=="Minimax-ASCII" else enc_oov_komin(m.group(0))
+     repl = repl_es if src_lang=="Español" else repl_en
+     return WORD_RE.sub(repl, text)
+
+ def pluralize_es(word: str) -> str:
+     exceptions = {"uno":"unos","buen":"buenos","hombre":"hombres"}
+     if word in exceptions: return exceptions[word]
+     if word.endswith("z"): return word[:-1]+"ces"
+     if word.endswith(("a","e","i","o")): return word+"s"
+     return word+"es"
+ def pluralize_en(word: str) -> str:
+     exceptions = {"man":"men","woman":"women","child":"children"}
+     if word in exceptions: return exceptions[word]
+     if word.endswith("y") and len(word)>1 and word[-2] not in "aeiou": return word[:-1]+"ies"
+     if word.endswith(("s","sh","ch","x","z")): return word+"es"
+     return word+"s"
+ def pluralize(word: str, tgt_lang: str) -> str: return pluralize_es(word) if tgt_lang=="Español" else pluralize_en(word)
+
+ mini_tail_re = re.compile(r"^(?P<stem>.+?)·(?P<tail>[PTFNQ123sp]+)$")  # FIX: include "3" — detect_person can emit 3s/3p
+
+ def decode_simple(text: str, source: str, tgt_lang: str) -> str:
+     if not text.strip(): return ""
+     code2es = MINI2ES if source=="Minimax-ASCII" else KOMI2ES
+     code2en = MINI2EN if source=="Minimax-ASCII" else KOMI2EN
+     if source=="Kōmín-CJK":
+         text = text.replace("？","?").replace("　"," ")  # FIX(assumed): full-width "？"/"　" were garbled in the diff
+         return " ".join([code2es.get(w,w) for w in text.split() if w!="?"])
+     tokens = text.split()
+     if not tokens: return ""
+     lemma_tokens, pl_flags = [], []
+     verb_idx=-1; verb_lemma=None; verb_tense="Pres"; verb_person="3s"; has_q=False; is_neg=False
+     for part in tokens:
+         look = part.replace("[PL]",""); had_pl = "[PL]" in part  # FIX: append pl_flags once per token (was double-appended)
+         m = mini_tail_re.match(look)
+         if m:
+             verb_idx = len(lemma_tokens); stem=m.group("stem"); tail=m.group("tail")
+             vlem_es = code2es.get(stem); vlem_en = code2en.get(stem) if code2en else None
+             vlem = vlem_es if tgt_lang=="Español" else (vlem_en or vlem_es or stem)
+             if not vlem: vlem = dec_oov_minimax(stem) if is_oov_minimax(stem) else stem
+             lemma_tokens.append(vlem); pl_flags.append(False)
+             if tail:
+                 if tail[0] in "PTF":
+                     verb_tense = {"P":"Pres","T":"Past","F":"Fut"}[tail[0]]; pos=1
+                     if len(tail)>pos and tail[pos] in "123":
+                         pos+=1; verb_person = tail[pos-1] + (tail[pos] if len(tail)>pos and tail[pos] in "sp" else "s")
+                         if len(tail)>pos and tail[pos] in "sp": pos+=1
+                     is_neg = "N" in tail[pos:]; has_q = "Q" in tail[pos:]
+             verb_lemma = vlem; continue
+         w_es = code2es.get(look); w_en = code2en.get(look) if code2en else None
+         w = w_es if tgt_lang=="Español" else (w_en or w_es or look)
+         if not w: w = dec_oov_minimax(look) if is_oov_minimax(look) else look
+         lemma_tokens.append(w); pl_flags.append(had_pl)
+     out_parts=[]
+     for idx, lem in enumerate(lemma_tokens):
+         if idx==verb_idx:
+             v = _es_conj(verb_lemma, verb_tense, verb_person) if tgt_lang=="Español" else _en_conj(verb_lemma, verb_tense, verb_person)
+             if is_neg: v = ("no " if tgt_lang=="Español" else "not ") + v
+             out_parts.append(v)
+         else:
+             out_parts.append(pluralize(lem, tgt_lang) if pl_flags[idx] else lem)
+     out_text = " ".join(out_parts)
+     if has_q:
+         start_q = "¿" if tgt_lang=="Español" else ""
+         out_text = f"{start_q}{out_text.capitalize()}?"
+     return out_text
+
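The tail grammar that `decode_simple` unpacks — tense letter, optional person digit + number, then `N`/`Q` flags — in one regex probe (the verb stem `ka` is hypothetical):

```python
import re
mini_tail_re = re.compile(r"^(?P<stem>.+?)·(?P<tail>[PTFNQ123sp]+)$")

m = mini_tail_re.match("ka·T3sNQ")
print(m.group("stem"), m.group("tail"))   # ka T3sNQ -> Past, 3rd singular, negated, question
```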
+ # ------------ Conjugadores mínimos ------------
+ def _es_conj_regular(lemma, tense, person):
+     if not lemma.endswith(("ar","er","ir")): return lemma
+     stem, vtype = lemma[:-2], lemma[-2:]
+     pres={"ar":{"1s":"o","2s":"as","3s":"a","1p":"amos","2p":"áis","3p":"an"},
+           "er":{"1s":"o","2s":"es","3s":"e","1p":"emos","2p":"éis","3p":"en"},
+           "ir":{"1s":"o","2s":"es","3s":"e","1p":"imos","2p":"ís","3p":"en"}}
+     pret={"ar":{"1s":"é","2s":"aste","3s":"ó","1p":"amos","2p":"asteis","3p":"aron"},
+           "er":{"1s":"í","2s":"iste","3s":"ió","1p":"imos","2p":"isteis","3p":"ieron"},
+           "ir":{"1s":"í","2s":"iste","3s":"ió","1p":"imos","2p":"isteis","3p":"ieron"}}
+     fut={"1s":"é","2s":"ás","3s":"á","1p":"emos","2p":"éis","3p":"án"}
+     if tense=="Pres": return stem + pres[vtype].get(person, pres[vtype]["3s"])
+     if tense=="Past": return stem + pret[vtype].get(person, pret[vtype]["3s"])
+     return lemma + fut.get(person, fut["3s"])
+ def _es_conj(lemma, tense, person):
+     if lemma=="ser":
+         tab={"Pres":{"1s":"soy","2s":"eres","3s":"es","1p":"somos","2p":"sois","3p":"son"},
+              "Past":{"1s":"fui","2s":"fuiste","3s":"fue","1p":"fuimos","2p":"fuisteis","3p":"fueron"},
+              "Fut":{"1s":"seré","2s":"serás","3s":"será","1p":"seremos","2p":"seréis","3p":"serán"}}
+         return tab[tense].get(person, tab[tense]["3s"])
+     if lemma=="estar":
+         tab={"Pres":{"1s":"estoy","2s":"estás","3s":"está","1p":"estamos","2p":"estáis","3p":"están"},
+              "Past":{"1s":"estuve","2s":"estuviste","3s":"estuvo","1p":"estuvimos","2p":"estuvisteis","3p":"estuvieron"},
+              "Fut":{"1s":"estaré","2s":"estarás","3s":"estará","1p":"estaremos","2p":"estaréis","3p":"estarán"}}
+         return tab[tense].get(person, tab[tense]["3s"])
+     if lemma=="ir":
+         tab={"Pres":{"1s":"voy","2s":"vas","3s":"va","1p":"vamos","2p":"vais","3p":"van"},
+              "Past":{"1s":"fui","2s":"fuiste","3s":"fue","1p":"fuimos","2p":"fuisteis","3p":"fueron"},
+              "Fut":{"1s":"iré","2s":"irás","3s":"irá","1p":"iremos","2p":"iréis","3p":"irán"}}
+         return tab[tense].get(person, tab[tense]["3s"])
+     return _es_conj_regular(lemma, tense, person)
+ def _en_conj(lemma, tense, person):
+     if lemma=="be":
+         if tense=="Pres": return {"1s":"am","2s":"are","3s":"is","1p":"are","2p":"are","3p":"are"}.get(person,"is")
+         if tense=="Past": return {"1s":"was","2s":"were","3s":"was","1p":"were","2p":"were","3p":"were"}.get(person,"was")
+         return "be"
+     if lemma=="have":
+         if tense=="Pres": return "has" if person=="3s" else "have"
+         if tense=="Past": return "had"
+         return "have"
+     if lemma=="go":
+         if tense=="Past": return "went"
+         return "goes" if (tense=="Pres" and person=="3s") else "go"
+     if lemma=="do":
+         if tense=="Past": return "did"
+         return "does" if (tense=="Pres" and person=="3s") else "do"
+     if tense=="Pres":
+         if person=="3s":
+             if lemma.endswith("y") and (len(lemma)<2 or lemma[-2] not in "aeiou"): return lemma[:-1]+"ies"
+             if lemma.endswith(("s","sh","ch","x","z","o")): return lemma+"es"
+             return lemma+"s"
+         return lemma
+     if tense=="Past":
+         if lemma.endswith("e"): return lemma+"d"
+         if lemma.endswith("y") and (len(lemma)<2 or lemma[-2] not in "aeiou"): return lemma[:-1]+"ied"
+         return lemma+"ed"
+     return lemma
+
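Quick spot-checks of the conjugators (run inside app.py's scope; each follows directly from the tables and suffix rules above):

```python
assert _es_conj("hablar", "Past", "1s") == "hablé"
assert _es_conj("ser", "Pres", "1p") == "somos"
assert _en_conj("study", "Past", "3s") == "studied"
assert _en_conj("be", "Pres", "1s") == "am"
```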
  # =====================================================================================
  # ========================= UI bilingüe y explicaciones claras ========================
  # =====================================================================================

  ALL_LANGS = ["Español","English","Minimax-ASCII","Kōmín-CJK"]

+ # ---- Títulos y contenidos de acordeones (resumen) ----
  ACC_TITLES_ES = {
+     "translate": "🔁 Traducir — ¿Qué hace? (despliega)",
      "build": "🛠️ Construir (ES/EN → Conlang) — ¿Qué hace?",
      "decode": "🗝️ Decodificar (Conlang → ES/EN) — ¿Qué hace?",
      "roundtrip": "🔄 Prueba ida→vuelta — ¿Qué hace?",
+     "checkbox": "☑️ Opciones y compactación",
      "lexicon": "ℹ️ Léxico (OMW → Minimax/Kōmín) — explicación y vista previa"
  }
  ACC_TITLES_EN = {
+     "translate": "🔁 Translate — What does it do? (expand)",
      "build": "🛠️ Build (ES/EN → Conlang) — What does it do?",
      "decode": "🗝️ Decode (Conlang → ES/EN) — What does it do?",
      "roundtrip": "🔄 Round-trip — What does it do?",
+     "checkbox": "☑️ Options & compaction",
      "lexicon": "ℹ️ Lexicon (OMW → Minimax/Kōmín) — explainer & preview"
  }

  EXPLAIN_TAB_TRANSLATE_ES = """
+ Convierte el **Texto** al **Destino** (ES/EN/Minimax/Kōmín).
+ - **Máx. Compresión Exacta** añade `~...` para recuperar el **original exacto**.
+ - Los **checkbox** (artículos/cópula/pronombres) **solo aplican** cuando el **Destino es conlang**.
  """
  EXPLAIN_TAB_BUILD_ES = """
+ Fuerza salida **en conlang** (Minimax/Kōmín) desde Español o Inglés, con reglas de fraseo y opciones de compactación.
  """
  EXPLAIN_TAB_DECODE_ES = """
+ Convierte **Minimax/Kōmín → ES/EN**. Si hay `~...`, devuelve el **original exacto**; si no, hace reconstrucción **semi-lossless**.
  """
  EXPLAIN_TAB_ROUNDTRIP_ES = """
+ Ejecuta **(ES/EN→Conlang) → (Conlang→ES/EN)** para comprobar **reversibilidad**. Con exacta, la vuelta es **bit a bit**.
  """
  EXPLAIN_CHECKBOX_ES = """
+ - **Omitir artículos**: **~10–15%**
+ - **Cópula cero (presente afirm.)**: **~5–10%** extra
+ - **Quitar pronombres**: ahorro variable
+ - **Máx. Compresión Exacta**: **~40–60%** en >100 caracteres (sidecar `~...`)
+
+ **Referencia:** sin casillas **0%**; artículos+cópula **~15–20%**.
  """

  EXPLAIN_TAB_TRANSLATE_EN = """
+ Converts **Text → Target** (ES/EN/Minimax/Kōmín). **Max Exact Compression** adds `~...` for bit-perfect recovery. Checkboxes apply when **Target is a conlang**.
  """
+ EXPLAIN_TAB_BUILD_EN = """Forces **conlang output** (Minimax/Kōmín) with phrasing rules and compaction options."""
+ EXPLAIN_TAB_DECODE_EN = """Converts **Minimax/Kōmín → ES/EN**. If `~...` exists, returns the exact original; otherwise semi-lossless."""
+ EXPLAIN_TAB_ROUNDTRIP_EN = """Runs **(ES/EN→Conlang) → (Conlang→ES/EN)** to verify reversibility."""

  EXPLAIN_CHECKBOX_EN = """
+ - **Drop articles**: **~10–15%**
+ - **Zero copula (present affirm.)**: **~5–10%** extra
+ - **Remove pronouns**: variable
+ - **Max Exact Compression**: **~40–60%** for >100 chars (`~...`)
+
+ Reference: no options **0%**; articles+copula **~15–20%**.
  """

  LEXICON_BUILD_ES = """
+ **Cómo se construyó el léxico**
+ 1) OMW/WordNet: lemas **ES** y equivalentes **EN** por synset.
+ 2) Normaliza y ordena por **frecuencia** (*wordfreq*).
+ 3) (Opcional) **spaCy** refina; **Argos** puede rellenar EN.
+ 4) Asigna **códigos** con alfabetos barajados por **SEED** hasta `MAXLEN`.
+ 5) Exporta: `lexicon_minimax.json`, `lexicon_komin.json`, `lexicon_master.json` (+TSV).
  """
  LEXICON_BUILD_EN = """
+ **How the lexicon was built**
+ 1) OMW/WordNet ES lemmas + EN counterparts per synset.
+ 2) Normalize & sort by **frequency** (*wordfreq*).
+ 3) (Optional) **spaCy** refine; **Argos** may fill EN.
+ 4) Assign **codes** with **SEED-shuffled** alphabets up to `MAXLEN`.
+ 5) Exports: `lexicon_minimax.json`, `lexicon_komin.json`, `lexicon_master.json` (+TSV).
  """
  # ---------- Utilidad: cálculo de compactación ----------

      if not text.strip(): return "—"
      if tgt not in ("Minimax-ASCII","Kōmín-CJK"):
          return "La compactación aplica cuando el **Destino** es Minimax/Kōmín."
      base = build_sentence(text, src, tgt, False, False, "Semi-lossless", False, False)
      curr = build_sentence(text, src, tgt, drop, zero, "Semi-lossless", False, rm)
+     p_base = _pct_comp(text, base); p_curr = _pct_comp(text, curr)
      msg = f"**Base (sin casillas):** {p_base:.1f}% · **Con tus opciones:** {p_curr:.1f}%"
+     if maxc:
+         curr_exact = build_sentence(text, src, tgt, drop, zero, "Semi-lossless", True, rm)
          p_exact = _pct_comp(text, curr_exact)
          msg += f" · **Con sidecar `~...`:** {p_exact:.1f}%"
      return msg

          return "Compaction applies when **Target** is Minimax/Kōmín."
      base = build_sentence(text, src, tgt, False, False, "Semi-lossless", False, False)
      curr = build_sentence(text, src, tgt, drop, zero, "Semi-lossless", False, rm)
+     p_base = _pct_comp(text, base); p_curr = _pct_comp(text, curr)
      msg = f"**Base (no options):** {p_base:.1f}% · **With your options:** {p_curr:.1f}%"
+     if maxc:
+         curr_exact = build_sentence(text, src, tgt, drop, zero, "Semi-lossless", True, rm)
          p_exact = _pct_comp(text, curr_exact)
          msg += f" · **With `~...` sidecar:** {p_exact:.1f}%"
      return msg
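`_pct_comp` itself lives outside the hunks shown here. A definition consistent with how both reports use it — an assumption, not the committed code — would be percent saved relative to the input length:

```python
def _pct_comp(original: str, result: str) -> float:
    # Percent saved: 0% means no shrinkage; 60% means the result is 40% of the original.
    return (1 - len(result) / len(original)) * 100.0 if original else 0.0
```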
 
  def make_group_es():
      with gr.Group(visible=True) as g:
          gr.Markdown("# 🌐 Universal Conlang Translator · Compresión Exacta (ES)")
+         # Acordeones de explicación — MISMO nivel
          with gr.Row():
              with gr.Column():
+                 with gr.Accordion(ACC_TITLES_ES["translate"], open=False): gr.Markdown(EXPLAIN_TAB_TRANSLATE_ES)
+                 with gr.Accordion(ACC_TITLES_ES["build"], open=False): gr.Markdown(EXPLAIN_TAB_BUILD_ES)
+                 with gr.Accordion(ACC_TITLES_ES["decode"], open=False): gr.Markdown(EXPLAIN_TAB_DECODE_ES)
+                 with gr.Accordion(ACC_TITLES_ES["roundtrip"], open=False): gr.Markdown(EXPLAIN_TAB_ROUNDTRIP_ES)
              with gr.Column():
                  with gr.Accordion(ACC_TITLES_ES["checkbox"], open=False):
                      gr.Markdown(EXPLAIN_CHECKBOX_ES)

                      table = gr.Dataframe(headers=["lemma_es","lemma_en","minimax","komin"], row_count=1, interactive=False)
                      gr.Button("Actualizar vista").click(lambda n: master_preview(int(n)), [n_rows], [table])

+         # ==== Tabs reactivas (sin botones) ====
          with gr.Tab("🔁 Traducir"):
              with gr.Row():
                  uni_src = gr.Dropdown(ALL_LANGS, value="Español", label="Fuente")

                  uni_zero = gr.Checkbox(False, label="Cópula cero (presente afirm.)")
                  uni_rmpr = gr.Checkbox(False, label="Quitar pronombres")
                  uni_maxc = gr.Checkbox(False, label="Máx. Compresión Exacta (sidecar `~...`)")
              uni_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
              uni_out = gr.Textbox(lines=6, label="Traducción", show_copy_button=True)
+             comp_out = gr.Markdown("")

              def do_translate(text, src, tgt, drop, zero, mode, maxc, rm):
+                 if not text.strip(): return "", ""
                  res = universal_translate(text, src, tgt, drop, zero, mode, maxc, rm)
                  rep = compaction_report_es(text, src, tgt, drop, zero, rm, maxc)
                  return res, rep

+             for c in [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_rmpr, uni_maxc]:
+                 c.change(do_translate,
                      [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_mode, uni_maxc, uni_rmpr],
                      [uni_out, comp_out])

          with gr.Tab("🛠️ Construir (ES/EN → Conlang)"):
              with gr.Row():

                  rm_pron_build = gr.Checkbox(False, label="Quitar pronombres")
                  max_comp_build = gr.Checkbox(False, label="Máx. Compresión Exacta")
              mode_build = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
              out = gr.Textbox(lines=6, label="Salida", show_copy_button=True)
              comp_out_b = gr.Markdown("")

              def do_build(text, src, tgt, drop, zero, mode, maxc, rm):
+                 if not text.strip(): return "", ""
                  res = build_sentence(text, src, tgt, drop, zero, mode, maxc, rm)
                  rep = compaction_report_es(text, src, tgt, drop, zero, rm, maxc)
                  return res, rep

+             for c in [text_in, src_lang, target, drop_articles, zero_copula, rm_pron_build, max_comp_build]:
+                 c.change(do_build,
+                     [text_in, src_lang, target, drop_articles, zero_copula, mode_build, max_comp_build, rm_pron_build],
+                     [out, comp_out_b])

          with gr.Tab("🗝️ Decodificar (Conlang → ES/EN)"):
              with gr.Row():

              out3 = gr.Textbox(lines=6, label="Salida", show_copy_button=True)

              def decode_lossless_aware(text, src, tgt):
+                 if not text.strip(): return ""
                  orig = extract_custom_sidecar(text)
                  if orig is not None: return orig
                  orig = extract_sidecar_b85(text)
                  if orig is not None: return orig
                  return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)

+             for c in [code_in, src_code, tgt_lang]:
+                 c.change(decode_lossless_aware, [code_in, src_code, tgt_lang], [out3])

          with gr.Tab("🔄 Prueba ida→vuelta"):
              with gr.Row():

              rt_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
              rt_out_conlang = gr.Textbox(lines=3, label="Conlang (ida)", show_copy_button=True)
              rt_out_back = gr.Textbox(lines=3, label="Vuelta", show_copy_button=True)

+             def do_roundtrip(text, src, tgt, mode, maxc):
+                 if not text.strip(): return "", ""
+                 conlang = universal_translate(text, src, tgt, True, False, mode, maxc, False)
+                 back = universal_translate(conlang, tgt, src, True, False, mode, maxc, False)
+                 return conlang, back

+             for c in [rt_text, rt_src, rt_tgt, rt_max_comp]:
+                 c.change(do_roundtrip, [rt_text, rt_src, rt_tgt, rt_mode, rt_max_comp], [rt_out_conlang, rt_out_back])
      return g
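Design note on the reactive rewrite: every visible control now re-runs the handler via `.change(...)`, which means `uni_text.change` fires on each keystroke. If that proves too chatty, Gradio event listeners accept a `trigger_mode` argument that can coalesce rapid events — worth verifying against the pinned gradio>=4.36.0 before relying on it. A hypothetical tweak:

```python
# Hypothetical: coalesce per-keystroke events (verify trigger_mode support first).
uni_text.change(do_translate,
    [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_mode, uni_maxc, uni_rmpr],
    [uni_out, comp_out],
    trigger_mode="always_last")
```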

  def make_group_en():

          gr.Markdown("# 🌐 Universal Conlang Translator · Max Exact Compression (EN)")
          with gr.Row():
              with gr.Column():
+                 with gr.Accordion(ACC_TITLES_EN["translate"], open=False): gr.Markdown(EXPLAIN_TAB_TRANSLATE_EN)
+                 with gr.Accordion(ACC_TITLES_EN["build"], open=False): gr.Markdown(EXPLAIN_TAB_BUILD_EN)
+                 with gr.Accordion(ACC_TITLES_EN["decode"], open=False): gr.Markdown(EXPLAIN_TAB_DECODE_EN)
+                 with gr.Accordion(ACC_TITLES_EN["roundtrip"], open=False): gr.Markdown(EXPLAIN_TAB_ROUNDTRIP_EN)
              with gr.Column():
+                 with gr.Accordion(ACC_TITLES_EN["checkbox"], open=False): gr.Markdown(EXPLAIN_CHECKBOX_EN)
+                 with gr.Accordion(ACC_TITLES_EN["lexicon"], open=False):
                      gr.Markdown(LEXICON_BUILD_EN)
                      n_rows = gr.Slider(5, 100, value=20, step=5, label="Rows to show")
                      table = gr.Dataframe(headers=["lemma_es","lemma_en","minimax","komin"], row_count=1, interactive=False)

                  uni_zero = gr.Checkbox(False, label="Zero copula (present affirm.)")
                  uni_rmpr = gr.Checkbox(False, label="Remove pronouns")
                  uni_maxc = gr.Checkbox(False, label="Max Exact Compression (sidecar `~...`)")
              uni_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
              uni_out = gr.Textbox(lines=6, label="Translation", show_copy_button=True)
              comp_out = gr.Markdown("")

              def do_translate_en(text, src, tgt, drop, zero, mode, maxc, rm):
+                 if not text.strip(): return "", ""
                  res = universal_translate(text, src, tgt, drop, zero, mode, maxc, rm)
                  rep = compaction_report_en(text, src, tgt, drop, zero, rm, maxc)
                  return res, rep

+             for c in [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_rmpr, uni_maxc]:
+                 c.change(do_translate_en,
                      [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_mode, uni_maxc, uni_rmpr],
                      [uni_out, comp_out])

          with gr.Tab("🛠️ Build (ES/EN → Conlang)"):
              with gr.Row():

                  rm_pron_build = gr.Checkbox(False, label="Remove pronouns")
                  max_comp_build = gr.Checkbox(False, label="Max Exact Compression")
              mode_build = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
              out = gr.Textbox(lines=6, label="Output", show_copy_button=True)
              comp_out_b = gr.Markdown("")

              def do_build_en(text, src, tgt, drop, zero, mode, maxc, rm):
+                 if not text.strip(): return "", ""
                  res = build_sentence(text, src, tgt, drop, zero, mode, maxc, rm)
                  rep = compaction_report_en(text, src, tgt, drop, zero, rm, maxc)
                  return res, rep

+             for c in [text_in, src_lang, target, drop_articles, zero_copula, rm_pron_build, max_comp_build]:
+                 c.change(do_build_en,
+                     [text_in, src_lang, target, drop_articles, zero_copula, mode_build, max_comp_build, rm_pron_build],
+                     [out, comp_out_b])

          with gr.Tab("🗝️ Decode (Conlang → ES/EN)"):
              with gr.Row():

              out3 = gr.Textbox(lines=6, label="Output", show_copy_button=True)

              def decode_lossless_aware_en(text, src, tgt):
+                 if not text.strip(): return ""
                  orig = extract_custom_sidecar(text)
                  if orig is not None: return orig
                  orig = extract_sidecar_b85(text)
                  if orig is not None: return orig
                  return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)

+             for c in [code_in, src_code, tgt_lang]:
+                 c.change(decode_lossless_aware_en, [code_in, src_code, tgt_lang], [out3])

          with gr.Tab("🔄 Round-trip"):
              with gr.Row():

              rt_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
              rt_out_conlang = gr.Textbox(lines=3, label="Outward (conlang)", show_copy_button=True)
              rt_out_back = gr.Textbox(lines=3, label="Back", show_copy_button=True)

+             def do_roundtrip_en(text, src, tgt, mode, maxc):
+                 if not text.strip(): return "", ""
+                 conlang = universal_translate(text, src, tgt, True, False, mode, maxc, False)
+                 back = universal_translate(conlang, tgt, src, True, False, mode, maxc, False)
+                 return conlang, back

+             for c in [rt_text, rt_src, rt_tgt, rt_max_comp]:
+                 c.change(do_roundtrip_en, [rt_text, rt_src, rt_tgt, rt_mode, rt_max_comp], [rt_out_conlang, rt_out_back])
      return g

  # ================================ App ================================

+