LoloSemper commited on
Commit
1a22fab
·
verified ·
1 Parent(s): 2b3d83b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +340 -214
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # app.py — Universal Conlang Translator (Max Compresión Exacta)
2
  # Archivos necesarios en la raíz:
3
  # - lexicon_minimax.json
4
  # - lexicon_komin.json
@@ -15,8 +15,7 @@ import re
15
  import json
16
  import base64
17
  import zlib
18
- import hashlib
19
- from typing import Dict, Tuple, Optional
20
  import gradio as gr
21
 
22
  # ------------ Archivos esperados ------------
@@ -78,7 +77,7 @@ def load_lexicons():
78
  EN2MINI, EN2KOMI, MINI2EN, KOMI2EN,
79
  ES2EN_LEMMA, EN2ES_LEMMA) = load_lexicons()
80
 
81
- # ------------ OOV reversible (modo Semi-lossless) ------------
82
  ALPHA_MINI64 = "@ptkmnslraeiouy0123456789><=:/!?.+-_*#bcdfghjvqwxzACEGHIJKLMNOPRS"[:64]
83
  CJK_BASE = (
84
  "天地人日月山川雨風星火水木土金石光影花草鳥犬猫魚"
@@ -141,47 +140,7 @@ def lemma_of(tok, src_lang: str) -> str:
141
  else:
142
  return norm_en(tok.lemma_ if tok.lemma_ else tok.text)
143
 
144
- # ------------ Selección de oración predicativa ------------
145
- def pick_predicative_sentence(doc):
146
- sents = list(doc.sents) if doc.has_annotation("SENT_START") else [doc]
147
- candidates = []
148
- for s in sents:
149
- roots = [t for t in s if t.dep_ == "ROOT" and t.pos_ in ("VERB","AUX")]
150
- if not roots:
151
- continue
152
- root = roots[0]
153
- has_q = "?" in s.text
154
- has_subj = any(t.dep_.startswith("nsubj") for t in root.children)
155
- score = (1 if has_q else 0) + (1 if has_subj else 0) + (len(s) / 1000.0)
156
- candidates.append((score, s))
157
- if not candidates:
158
- return doc
159
- return sorted(candidates, key=lambda x: x[0], reverse=True)[0][1].as_doc()
160
-
161
- def is_content_token(t) -> bool:
162
- return True # No filtra para exactitud
163
-
164
- # ------------ Mapeo lema→código ------------
165
- def code_es(lemma: str, target: str) -> str:
166
- lemma = norm_es(lemma)
167
- if target == "Minimax-ASCII":
168
- return ES2MINI.get(lemma) or enc_oov_minimax(lemma)
169
- else:
170
- return ES2KOMI.get(lemma) or enc_oov_komin(lemma)
171
-
172
- def code_en(lemma: str, target: str) -> str:
173
- lemma = norm_en(lemma)
174
- if target == "Minimax-ASCII":
175
- if EN2MINI: return EN2MINI.get(lemma) or enc_oov_minimax(lemma)
176
- return enc_oov_minimax(lemma)
177
- else:
178
- if EN2KOMI: return EN2KOMI.get(lemma) or enc_oov_komin(lemma)
179
- return enc_oov_komin(lemma)
180
-
181
- # ------------ Fraseador compacto ------------
182
- TAM_MINI = {"Pres":"P", "Past":"T", "Fut":"F", "UNK":"P"}
183
- TAM_KOMI = {"Pres":"Ⓟ", "Past":"Ⓣ", "Fut":"Ⓕ", "UNK":"Ⓟ"}
184
-
185
  def detect_polarity(doc) -> bool:
186
  return "?" in doc.text
187
 
@@ -255,6 +214,26 @@ def _person_of_doc(doc, src_lang: str) -> Optional[str]:
255
  except Exception:
256
  return None
257
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  def realize_minimax(doc, src_lang: str, drop_articles=True, zero_copula=True, semi_lossless=False, person_hint="2s"):
259
  root, subs, objs, obls, advs = extract_core(doc)
260
  tense = detect_tense(root)
@@ -275,21 +254,14 @@ def realize_minimax(doc, src_lang: str, drop_articles=True, zero_copula=True, se
275
  def realize_np(tokens):
276
  outs=[]
277
  for t in tokens:
278
- if not USE_SPACY or is_content_token(t):
279
- lem = lemma_of(t, src_lang) if USE_SPACY else (t.text)
280
- code = code_es(lem, "Minimax-ASCII") if src_lang=="Español" else code_en(lem, "Minimax-ASCII")
281
- if semi_lossless and USE_SPACY and (t.tag_ in ("NNS","NNPS") or "Number=Plur" in str(t.morph)):
282
- code = f"{code}[PL]"
283
- outs.append(code)
284
  return outs
285
 
286
  S = realize_np(subs)
287
  O = realize_np(objs) + realize_np(obls)
288
- ADV=[]
289
- for a in advs:
290
- if not USE_SPACY or is_content_token(a):
291
- lem = lemma_of(a, src_lang) if USE_SPACY else a.text
292
- ADV.append(code_es(lem, "Minimax-ASCII") if src_lang=="Español" else code_en(lem, "Minimax-ASCII"))
293
 
294
  if zero_copula and not semi_lossless and vlem in ("ser","estar","be") and tense=="Pres" and not is_neg and not is_q:
295
  parts = S + O + ADV
@@ -315,21 +287,14 @@ def realize_komin(doc, src_lang: str, drop_articles=True, zero_copula=True, semi
315
  def realize_np(tokens, particle):
316
  outs=[]
317
  for t in tokens:
318
- if not USE_SPACY or is_content_token(t):
319
- lem = lemma_of(t, src_lang) if USE_SPACY else t.text
320
- code = code_es(lem, "Kōmín-CJK") if src_lang=="Español" else code_en(lem, "Kōmín-CJK")
321
- if semi_lossless and USE_SPACY and (t.tag_ in ("NNS","NNPS") or "Number=Plur" in str(t.morph)):
322
- code = f"{code}[PL]"
323
- outs.append(code + particle)
324
  return outs
325
 
326
  S = realize_np(subs, P_SUBJ)
327
  O = realize_np(objs + obls, P_OBJ)
328
- ADV=[]
329
- for a in advs:
330
- if not USE_SPACY or is_content_token(a):
331
- lem = lemma_of(a, src_lang) if USE_SPACY else a.text
332
- ADV.append(code_es(lem, "Kōmín-CJK") if src_lang=="Español" else code_en(lem, "Kōmín-CJK"))
333
 
334
  v_form = vcode + TAM + (NEG_M if is_neg else "")
335
 
@@ -341,7 +306,7 @@ def realize_komin(doc, src_lang: str, drop_articles=True, zero_copula=True, semi
341
  if is_q: out += " " + Q_FIN
342
  return out
343
 
344
- # ------------ Lossless (Base85 comprimido) ------------
345
  SIDECAR_B85_RE = re.compile(r"\s?§\((?P<b85>[A-Za-z0-9!#$%&()*+\-;<=>?@^_`{|}~]+)\)$")
346
 
347
  def b85_enc_raw(s: str) -> str:
@@ -367,7 +332,6 @@ def extract_sidecar_b85(text: str) -> Optional[str]:
367
  def strip_sidecar_b85(text: str) -> str:
368
  return SIDECAR_B85_RE.sub("", text).rstrip()
369
 
370
- # ------------ Custom sidecar para max compresión exacta ------------
371
  def custom_sidecar_enc(conlang_text: str, original_text: str) -> str:
372
  comp = zlib.compress(original_text.encode("utf-8"), 9)
373
  blob = to_custom_b64(comp, ALPHA_MINI64)
@@ -386,7 +350,7 @@ def extract_custom_sidecar(text: str) -> Optional[str]:
386
  def strip_custom_sidecar(text: str) -> str:
387
  return text.split('~')[0].rstrip() if '~' in text else text
388
 
389
- # ------------ Codificar / Decodificar léxisco puro ------------
390
  def encode_simple(text: str, src_lang: str, target: str) -> str:
391
  if not text.strip(): return ""
392
  def repl_es(m):
@@ -419,9 +383,6 @@ def pluralize_en(word: str) -> str:
419
  def pluralize(word: str, tgt_lang: str) -> str:
420
  return pluralize_es(word) if tgt_lang == "Español" else pluralize_en(word)
421
 
422
- PRON_ES = {"yo", "tú", "él", "ella", "nosotros", "vosotros", "ellos", "ellas", "usted", "ustedes"}
423
- PRON_EN = {"i", "you", "he", "she", "it", "we", "they"}
424
-
425
  mini_tail_re = re.compile(r"^(?P<stem>.+?)·(?P<tail>[PTFNQ12sp]+)$")
426
 
427
  def decode_simple(text: str, source: str, tgt_lang: str) -> str:
@@ -429,7 +390,6 @@ def decode_simple(text: str, source: str, tgt_lang: str) -> str:
429
  return ""
430
  code2es = MINI2ES if source=="Minimax-ASCII" else KOMI2ES
431
  code2en = MINI2EN if source=="Minimax-ASCII" else KOMI2EN
432
- pron_set = PRON_ES if tgt_lang == "Español" else PRON_EN
433
 
434
  if source == "Kōmín-CJK":
435
  text = text.replace("?", "?").replace(" ", " ")
@@ -447,7 +407,7 @@ def decode_simple(text: str, source: str, tgt_lang: str) -> str:
447
  has_q = False
448
  is_neg = False
449
 
450
- for i, part in enumerate(tokens):
451
  look = part.replace("[PL]", "")
452
  had_pl = "[PL]" in part
453
  pl_flags.append(had_pl)
@@ -468,7 +428,6 @@ def decode_simple(text: str, source: str, tgt_lang: str) -> str:
468
  lemma_tokens.append(vlem)
469
  pl_flags.append(False)
470
 
471
- # Parse tail
472
  if tail:
473
  if len(tail) > 0 and tail[0] in "PTF":
474
  verb_tense = {"P": "Pres", "T": "Past", "F": "Fut"}.get(tail[0], "Pres")
@@ -487,7 +446,6 @@ def decode_simple(text: str, source: str, tgt_lang: str) -> str:
487
  verb_lemma = vlem
488
  continue
489
 
490
- # No verbo
491
  w_es = code2es.get(look)
492
  w_en = code2en.get(look) if code2en else None
493
  w = w_es if tgt_lang == "Español" else (w_en or w_es or look)
@@ -500,50 +458,23 @@ def decode_simple(text: str, source: str, tgt_lang: str) -> str:
500
  pl_flags.append(had_pl)
501
 
502
  out_parts = []
503
- greeting = None
504
- wh = None
505
  for idx, lem in enumerate(lemma_tokens):
506
  if idx == verb_idx:
507
  conj_func = _es_conj if tgt_lang == "Español" else _en_conj
508
  v_conj = conj_func(verb_lemma, verb_tense, verb_person)
509
  if is_neg:
510
- neg_prefix = "no " if tgt_lang == "Español" else "not "
511
- v_conj = neg_prefix + v_conj
512
  out_parts.append(v_conj)
513
  continue
 
514
 
515
- w = pluralize(lem, tgt_lang) if pl_flags[idx] else lem
516
- if w.lower() in {"hola", "hello", "hi", "hey"}:
517
- greeting = w
518
- elif w.lower() in {"como", "cómo", "what", "how"} and has_q:
519
- wh = w
520
- if tgt_lang == "Español" and w.lower() == "como":
521
- wh = "cómo"
522
- else:
523
- out_parts.append(w)
524
-
525
- # Reorden: Greeting + wh + S V O ADV
526
- final_out = []
527
- if greeting:
528
- final_out.append(greeting.capitalize())
529
- if wh:
530
- final_out.append(wh)
531
- final_out += out_parts
532
-
533
- out_text = " ".join(final_out)
534
-
535
- # Pregunta
536
  if has_q:
537
  start_q = "¿" if tgt_lang == "Español" else ""
538
- end_q = "?" if tgt_lang == "Español" else "?"
539
- out_text = f"{start_q}{out_text.capitalize()}{end_q}"
540
-
541
  return out_text
542
 
543
  # ------------ Conjugadores mínimos ------------
544
- _ES_SUBJ = {"1s":"yo","2s":"tú","3s":"él/ella","1p":"nosotros","2p":"vosotros","3p":"ellos"}
545
- _EN_SUBJ = {"1s":"I","2s":"you","3s":"he","1p":"we","2p":"you","3p":"they"}
546
-
547
  def _es_conj_regular(lemma, tense, person):
548
  if not lemma.endswith(("ar","er","ir")): return lemma
549
  stem = lemma[:-2]; vtype = lemma[-2:]
@@ -572,7 +503,7 @@ def _es_conj(lemma, tense, person):
572
  if lemma == "estar":
573
  tab = {
574
  "Pres":{"1s":"estoy","2s":"estás","3s":"está","1p":"estamos","2p":"estáis","3p":"están"},
575
- "Past":{"1s":"estuve","2s":"estuviste","3s":"estuvo","1p":"estuvimos","2p":"estuvisteis","3p":"estuvieron"},
576
  "Fut":{"1s":"estaré","2s":"estarás","3s":"estará","1p":"estaremos","2p":"estaréis","3p":"estarán"},
577
  }; return tab[tense].get(person, tab[tense]["3s"])
578
  if lemma == "ir":
@@ -616,7 +547,7 @@ def _en_conj(lemma, tense, person):
616
  else:
617
  return lemma
618
 
619
- # ------------ Semi-lossless (rutas) ------------
620
  def _build_with_spacy(text: str, src_lang: str, target: str,
621
  drop_articles: bool, zero_copula: bool, semi_lossless: bool) -> str:
622
  nlp = nlp_es if src_lang=="Español" else nlp_en
@@ -629,7 +560,7 @@ def _build_with_spacy(text: str, src_lang: str, target: str,
629
  def build_sentence(text: str, src_lang: str, target: str,
630
  drop_articles: bool, zero_copula: bool, mode: str, max_comp_exact: bool = False) -> str:
631
  if not text.strip(): return ""
632
- semi = True # Siempre semi-lossless
633
  core = _build_with_spacy(text, src_lang, target, drop_articles, zero_copula and not semi, semi_lossless=semi) if USE_SPACY else encode_simple(text, src_lang, target)
634
  if max_comp_exact:
635
  return custom_sidecar_enc(core, text)
@@ -640,31 +571,19 @@ def universal_translate(text: str, src: str, tgt: str,
640
  mode: str, max_comp_exact: bool = False) -> str:
641
  if not text.strip(): return ""
642
  if src == tgt: return text
643
-
644
- # Natural → Conlang
645
  if src in ("Español","English") and tgt in ("Minimax-ASCII","Kōmín-CJK"):
646
  return build_sentence(text, src, tgt, drop_articles, zero_copula, mode, max_comp_exact)
647
-
648
- # Conlang → Natural (considera sidecars)
649
  if src in ("Minimax-ASCII","Kōmín-CJK") and tgt in ("Español","English"):
650
- # Custom sidecar para exact
651
  orig = extract_custom_sidecar(text)
652
  if orig is not None: return orig
653
- # Fallback b85 si hay
654
  orig = extract_sidecar_b85(text)
655
  if orig is not None: return orig
656
- # Semi-lossless
657
  return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)
658
-
659
- # Natural ↔ Natural (lemas)
660
  if src in ("Español","English") and tgt in ("Español","English"):
661
  return translate_natural(text, src, tgt)
662
-
663
- # Conlang ↔ Conlang (simplificado)
664
  if src in ("Minimax-ASCII","Kōmín-CJK") and tgt in ("Minimax-ASCII","Kōmín-CJK"):
665
  orig = extract_custom_sidecar(text)
666
  if orig is not None:
667
- # Preserva sidecar
668
  core = strip_custom_sidecar(text)
669
  es_lemmas = decode_simple(core, src, "Español")
670
  words = re.findall(r"\w+|[^\w\s]+", es_lemmas)
@@ -672,28 +591,20 @@ def universal_translate(text: str, src: str, tgt: str,
672
  for w in words:
673
  if re.fullmatch(r"\w+", w):
674
  code = ES2MINI.get(norm_es(w)) if tgt=="Minimax-ASCII" else ES2KOMI.get(norm_es(w))
675
- if not code:
676
- code = enc_oov_minimax(w) if tgt=="Minimax-ASCII" else enc_oov_komin(w)
677
- out.append(code)
678
  else:
679
  out.append(w)
680
- out_text = " ".join(out)
681
- return custom_sidecar_enc(out_text, orig)
682
- # Sin sidecar, normal
683
- core = text
684
- es_lemmas = decode_simple(core, src, "Español")
685
  words = re.findall(r"\w+|[^\w\s]+", es_lemmas)
686
  out=[]
687
  for w in words:
688
  if re.fullmatch(r"\w+", w):
689
  code = ES2MINI.get(norm_es(w)) if tgt=="Minimax-ASCII" else ES2KOMI.get(norm_es(w))
690
- if not code:
691
- code = enc_oov_minimax(w) if tgt=="Minimax-ASCII" else enc_oov_komin(w)
692
- out.append(code)
693
  else:
694
  out.append(w)
695
  return " ".join(out)
696
-
697
  return "[No soportado]"
698
 
699
  def translate_natural(text: str, src_lang: str, tgt_lang: str) -> str:
@@ -719,91 +630,306 @@ def round_trip(text, src, tgt, mode, max_comp_exact):
719
  back = universal_translate(conlang, tgt, src, True, False, mode, max_comp_exact)
720
  return conlang, back
721
 
722
- # ------------ UI y explicaciones ------------
 
 
 
 
 
 
723
  EXPLAIN_ES = """
724
- **Modo único:Sin ninguna casilla marcada: 0% de compactación (el conlang es similar al original, sin omisiones ni compresión extra).</br>
725
- Omitir artículos marcada: Aproximadamente 10-15% de compactación (elimina artículos como "el", "la", que representan alrededor del 10% de las palabras en textos españoles típicos).
726
- </br>Cópula cero marcada: Aproximadamente 5-10% de compactación (omita verbos copulativos como "ser" o "estar" en oraciones afirmativas presentes, común en ~5-10% de las oraciones).
727
- Ambas (Omitir artículos + Cópula cero): Aproximadamente 15-20% de compactación (combinación aditiva de omisiones).
728
- </br>Max Compresión Exacta marcada: Aproximadamente 40-60% de compactación en textos medianos/largos (>100 caracteres, gracias a zlib que comprime texto español en ratios de 2:1 a 5:1). Para textos cortos (<30 caracteres), ~0% o incluso aumento por overhead de compresión. Esto asegura traducción inversa 100% exacta.: Minimax (VSO, ·TAMpersonNQ), Kōmín (SOV, ᵖ/ᵒ Ⓟ[2s]̆?).
 
 
 
 
 
 
 
 
 
 
 
 
729
  """
 
 
 
 
 
 
 
 
 
 
 
730
 
731
- ALL_LANGS = ["Español","English","Minimax-ASCII","Kōmín-CJK"]
 
732
 
733
- with gr.Blocks(title="Universal Conlang Translator") as demo:
734
- gr.Markdown("# Universal Conlang Translator · Max Compresión Exacta")
735
- gr.Markdown(EXPLAIN_ES)
 
 
 
736
 
737
- # --- Traducir (universal) ---
738
- with gr.Tab("Traducir"):
739
- with gr.Row():
740
- uni_src = gr.Dropdown(ALL_LANGS, value="Español", label="Fuente")
741
- uni_tgt = gr.Dropdown(ALL_LANGS, value="Minimax-ASCII", label="Destino")
742
- uni_text = gr.Textbox(lines=3, label="Texto", value="")
743
- with gr.Row():
744
- uni_drop = gr.Checkbox(value=True, label="Omitir artículos (ES/EN→conlang)")
745
- uni_zero = gr.Checkbox(value=False, label="Cópula cero (presente afirm.) (ES/EN→conlang)")
746
- uni_max_comp = gr.Checkbox(value=False, label="Max Compresión Exacta (sidecar oculto)")
747
- uni_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
748
- uni_out = gr.Textbox(lines=6, label="Traducción")
749
- gr.Button("Traducir").click(
750
- universal_translate,
751
- [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_mode, uni_max_comp],
752
- [uni_out]
753
- )
754
-
755
- # --- Construir frase (ES/EN → Conlang) ---
756
- with gr.Tab("Construir frase (ES/EN → Conlang)"):
757
- with gr.Row():
758
- src_lang = gr.Dropdown(["Español","English"], value="Español", label="Fuente")
759
- target = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
760
- text_in = gr.Textbox(lines=3, label="Frase", value="")
761
- with gr.Row():
762
- drop_articles = gr.Checkbox(value=True, label="Omitir artículos")
763
- zero_copula = gr.Checkbox(value=False, label="Cópula cero (presente afirm.)")
764
- max_comp_build = gr.Checkbox(value=False, label="Max Compresión Exacta (sidecar oculto)")
765
- mode_build = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
766
- out = gr.Textbox(lines=6, label="Salida")
767
- gr.Button("Construir").click(
768
- build_sentence,
769
- [text_in, src_lang, target, drop_articles, zero_copula, mode_build, max_comp_build],
770
- [out]
771
- )
772
-
773
- # --- Decodificar (Conlang → ES/EN) ---
774
- with gr.Tab("Decodificar (Conlang → ES/EN)"):
775
  with gr.Row():
776
- src_code = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Fuente")
777
- tgt_lang = gr.Dropdown(["Español","English"], value="Español", label="Destino")
778
- code_in = gr.Textbox(lines=3, label="Texto en conlang (incluye ~blob si procede)")
779
- out3 = gr.Textbox(lines=6, label="Salida")
780
-
781
- def decode_lossless_aware(text, src, tgt):
782
- orig = extract_custom_sidecar(text)
783
- if orig is not None: return orig
784
- orig = extract_sidecar_b85(text)
785
- if orig is not None: return orig
786
- return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)
787
-
788
- gr.Button("Decodificar").click(
789
- decode_lossless_aware, [code_in, src_code, tgt_lang], [out3]
790
- )
791
-
792
- # --- Round-trip ---
793
- with gr.Tab("Prueba ida→vuelta"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
794
  with gr.Row():
795
- rt_src = gr.Dropdown(["Español","English"], value="Español", label="Fuente")
796
- rt_tgt = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
797
- rt_text = gr.Textbox(lines=3, label="Frase", value="")
798
- rt_max_comp = gr.Checkbox(value=False, label="Max Compresión Exacta")
799
- rt_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
800
- rt_out_conlang = gr.Textbox(lines=3, label="Conlang (ida)")
801
- rt_out_back = gr.Textbox(lines=3, label="Vuelta")
802
- gr.Button("Probar").click(
803
- round_trip,
804
- [rt_text, rt_src, rt_tgt, rt_mode, rt_max_comp],
805
- [rt_out_conlang, rt_out_back]
806
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
807
 
808
  if __name__ == "__main__":
809
- demo.launch()
 
1
+ # app.py — Universal Conlang Translator (Max Compresión Exacta) — UI bilingüe ES/EN
2
  # Archivos necesarios en la raíz:
3
  # - lexicon_minimax.json
4
  # - lexicon_komin.json
 
15
  import json
16
  import base64
17
  import zlib
18
+ from typing import Dict, Optional
 
19
  import gradio as gr
20
 
21
  # ------------ Archivos esperados ------------
 
77
  EN2MINI, EN2KOMI, MINI2EN, KOMI2EN,
78
  ES2EN_LEMMA, EN2ES_LEMMA) = load_lexicons()
79
 
80
+ # ------------ OOV reversible (Semi-lossless) ------------
81
  ALPHA_MINI64 = "@ptkmnslraeiouy0123456789><=:/!?.+-_*#bcdfghjvqwxzACEGHIJKLMNOPRS"[:64]
82
  CJK_BASE = (
83
  "天地人日月山川雨風星火水木土金石光影花草鳥犬猫魚"
 
140
  else:
141
  return norm_en(tok.lemma_ if tok.lemma_ else tok.text)
142
 
143
+ # ------------ Utilidades de análisis sintáctico ------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
144
  def detect_polarity(doc) -> bool:
145
  return "?" in doc.text
146
 
 
214
  except Exception:
215
  return None
216
 
217
+ # ------------ Mapeo lema→código y fraseadores ------------
218
+ def code_es(lemma: str, target: str) -> str:
219
+ lemma = norm_es(lemma)
220
+ if target == "Minimax-ASCII":
221
+ return ES2MINI.get(lemma) or enc_oov_minimax(lemma)
222
+ else:
223
+ return ES2KOMI.get(lemma) or enc_oov_komin(lemma)
224
+
225
+ def code_en(lemma: str, target: str) -> str:
226
+ lemma = norm_en(lemma)
227
+ if target == "Minimax-ASCII":
228
+ if EN2MINI: return EN2MINI.get(lemma) or enc_oov_minimax(lemma)
229
+ return enc_oov_minimax(lemma)
230
+ else:
231
+ if EN2KOMI: return EN2KOMI.get(lemma) or enc_oov_komin(lemma)
232
+ return enc_oov_komin(lemma)
233
+
234
+ TAM_MINI = {"Pres":"P", "Past":"T", "Fut":"F", "UNK":"P"}
235
+ TAM_KOMI = {"Pres":"Ⓟ", "Past":"Ⓣ", "Fut":"Ⓕ", "UNK":"Ⓟ"}
236
+
237
  def realize_minimax(doc, src_lang: str, drop_articles=True, zero_copula=True, semi_lossless=False, person_hint="2s"):
238
  root, subs, objs, obls, advs = extract_core(doc)
239
  tense = detect_tense(root)
 
254
  def realize_np(tokens):
255
  outs=[]
256
  for t in tokens:
257
+ lem = lemma_of(t, src_lang) if USE_SPACY else (t.text)
258
+ code = code_es(lem, "Minimax-ASCII") if src_lang=="Español" else code_en(lem, "Minimax-ASCII")
259
+ outs.append(code)
 
 
 
260
  return outs
261
 
262
  S = realize_np(subs)
263
  O = realize_np(objs) + realize_np(obls)
264
+ ADV=[code_es(lemma_of(a, src_lang), "Minimax-ASCII") if src_lang=="Español" else code_en(lemma_of(a, src_lang), "Minimax-ASCII") for a in advs] if USE_SPACY else []
 
 
 
 
265
 
266
  if zero_copula and not semi_lossless and vlem in ("ser","estar","be") and tense=="Pres" and not is_neg and not is_q:
267
  parts = S + O + ADV
 
287
  def realize_np(tokens, particle):
288
  outs=[]
289
  for t in tokens:
290
+ lem = lemma_of(t, src_lang) if USE_SPACY else t.text
291
+ code = code_es(lem, "Kōmín-CJK") if src_lang=="Español" else code_en(lem, "Kōmín-CJK")
292
+ outs.append(code + particle)
 
 
 
293
  return outs
294
 
295
  S = realize_np(subs, P_SUBJ)
296
  O = realize_np(objs + obls, P_OBJ)
297
+ ADV=[code_es(lemma_of(a, src_lang), "Kōmín-CJK") if src_lang=="Español" else code_en(lemma_of(a, src_lang), "Kōmín-CJK") for a in advs] if USE_SPACY else []
 
 
 
 
298
 
299
  v_form = vcode + TAM + (NEG_M if is_neg else "")
300
 
 
306
  if is_q: out += " " + Q_FIN
307
  return out
308
 
309
+ # ------------ Sidecars para compresión exacta ------------
310
  SIDECAR_B85_RE = re.compile(r"\s?§\((?P<b85>[A-Za-z0-9!#$%&()*+\-;<=>?@^_`{|}~]+)\)$")
311
 
312
  def b85_enc_raw(s: str) -> str:
 
332
  def strip_sidecar_b85(text: str) -> str:
333
  return SIDECAR_B85_RE.sub("", text).rstrip()
334
 
 
335
  def custom_sidecar_enc(conlang_text: str, original_text: str) -> str:
336
  comp = zlib.compress(original_text.encode("utf-8"), 9)
337
  blob = to_custom_b64(comp, ALPHA_MINI64)
 
350
  def strip_custom_sidecar(text: str) -> str:
351
  return text.split('~')[0].rstrip() if '~' in text else text
352
 
353
+ # ------------ Codificar/decodificar léxico puro ------------
354
  def encode_simple(text: str, src_lang: str, target: str) -> str:
355
  if not text.strip(): return ""
356
  def repl_es(m):
 
383
  def pluralize(word: str, tgt_lang: str) -> str:
384
  return pluralize_es(word) if tgt_lang == "Español" else pluralize_en(word)
385
 
 
 
 
386
  mini_tail_re = re.compile(r"^(?P<stem>.+?)·(?P<tail>[PTFNQ12sp]+)$")
387
 
388
  def decode_simple(text: str, source: str, tgt_lang: str) -> str:
 
390
  return ""
391
  code2es = MINI2ES if source=="Minimax-ASCII" else KOMI2ES
392
  code2en = MINI2EN if source=="Minimax-ASCII" else KOMI2EN
 
393
 
394
  if source == "Kōmín-CJK":
395
  text = text.replace("?", "?").replace(" ", " ")
 
407
  has_q = False
408
  is_neg = False
409
 
410
+ for part in tokens:
411
  look = part.replace("[PL]", "")
412
  had_pl = "[PL]" in part
413
  pl_flags.append(had_pl)
 
428
  lemma_tokens.append(vlem)
429
  pl_flags.append(False)
430
 
 
431
  if tail:
432
  if len(tail) > 0 and tail[0] in "PTF":
433
  verb_tense = {"P": "Pres", "T": "Past", "F": "Fut"}.get(tail[0], "Pres")
 
446
  verb_lemma = vlem
447
  continue
448
 
 
449
  w_es = code2es.get(look)
450
  w_en = code2en.get(look) if code2en else None
451
  w = w_es if tgt_lang == "Español" else (w_en or w_es or look)
 
458
  pl_flags.append(had_pl)
459
 
460
  out_parts = []
 
 
461
  for idx, lem in enumerate(lemma_tokens):
462
  if idx == verb_idx:
463
  conj_func = _es_conj if tgt_lang == "Español" else _en_conj
464
  v_conj = conj_func(verb_lemma, verb_tense, verb_person)
465
  if is_neg:
466
+ v_conj = ("no " if tgt_lang == "Español" else "not ") + v_conj
 
467
  out_parts.append(v_conj)
468
  continue
469
+ out_parts.append(pluralize(lem, tgt_lang) if pl_flags[idx] else lem)
470
 
471
+ out_text = " ".join(out_parts)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
472
  if has_q:
473
  start_q = "¿" if tgt_lang == "Español" else ""
474
+ out_text = f"{start_q}{out_text.capitalize()}?"
 
 
475
  return out_text
476
 
477
  # ------------ Conjugadores mínimos ------------
 
 
 
478
  def _es_conj_regular(lemma, tense, person):
479
  if not lemma.endswith(("ar","er","ir")): return lemma
480
  stem = lemma[:-2]; vtype = lemma[-2:]
 
503
  if lemma == "estar":
504
  tab = {
505
  "Pres":{"1s":"estoy","2s":"estás","3s":"está","1p":"estamos","2p":"estáis","3p":"están"},
506
+ "Past":{"1s":"estuviste","2s":"estuviste","3s":"estuvo","1p":"estuvimos","2p":"estuvisteis","3p":"estuvieron"},
507
  "Fut":{"1s":"estaré","2s":"estarás","3s":"estará","1p":"estaremos","2p":"estaréis","3p":"estarán"},
508
  }; return tab[tense].get(person, tab[tense]["3s"])
509
  if lemma == "ir":
 
547
  else:
548
  return lemma
549
 
550
+ # ------------ Rutas principales ------------
551
  def _build_with_spacy(text: str, src_lang: str, target: str,
552
  drop_articles: bool, zero_copula: bool, semi_lossless: bool) -> str:
553
  nlp = nlp_es if src_lang=="Español" else nlp_en
 
560
  def build_sentence(text: str, src_lang: str, target: str,
561
  drop_articles: bool, zero_copula: bool, mode: str, max_comp_exact: bool = False) -> str:
562
  if not text.strip(): return ""
563
+ semi = True # siempre semi-lossless
564
  core = _build_with_spacy(text, src_lang, target, drop_articles, zero_copula and not semi, semi_lossless=semi) if USE_SPACY else encode_simple(text, src_lang, target)
565
  if max_comp_exact:
566
  return custom_sidecar_enc(core, text)
 
571
  mode: str, max_comp_exact: bool = False) -> str:
572
  if not text.strip(): return ""
573
  if src == tgt: return text
 
 
574
  if src in ("Español","English") and tgt in ("Minimax-ASCII","Kōmín-CJK"):
575
  return build_sentence(text, src, tgt, drop_articles, zero_copula, mode, max_comp_exact)
 
 
576
  if src in ("Minimax-ASCII","Kōmín-CJK") and tgt in ("Español","English"):
 
577
  orig = extract_custom_sidecar(text)
578
  if orig is not None: return orig
 
579
  orig = extract_sidecar_b85(text)
580
  if orig is not None: return orig
 
581
  return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)
 
 
582
  if src in ("Español","English") and tgt in ("Español","English"):
583
  return translate_natural(text, src, tgt)
 
 
584
  if src in ("Minimax-ASCII","Kōmín-CJK") and tgt in ("Minimax-ASCII","Kōmín-CJK"):
585
  orig = extract_custom_sidecar(text)
586
  if orig is not None:
 
587
  core = strip_custom_sidecar(text)
588
  es_lemmas = decode_simple(core, src, "Español")
589
  words = re.findall(r"\w+|[^\w\s]+", es_lemmas)
 
591
  for w in words:
592
  if re.fullmatch(r"\w+", w):
593
  code = ES2MINI.get(norm_es(w)) if tgt=="Minimax-ASCII" else ES2KOMI.get(norm_es(w))
594
+ out.append(code or (enc_oov_minimax(w) if tgt=="Minimax-ASCII" else enc_oov_komin(w)))
 
 
595
  else:
596
  out.append(w)
597
+ return custom_sidecar_enc(" ".join(out), orig)
598
+ es_lemmas = decode_simple(text, src, "Español")
 
 
 
599
  words = re.findall(r"\w+|[^\w\s]+", es_lemmas)
600
  out=[]
601
  for w in words:
602
  if re.fullmatch(r"\w+", w):
603
  code = ES2MINI.get(norm_es(w)) if tgt=="Minimax-ASCII" else ES2KOMI.get(norm_es(w))
604
+ out.append(code or (enc_oov_minimax(w) if tgt=="Minimax-ASCII" else enc_oov_komin(w)))
 
 
605
  else:
606
  out.append(w)
607
  return " ".join(out)
 
608
  return "[No soportado]"
609
 
610
  def translate_natural(text: str, src_lang: str, tgt_lang: str) -> str:
 
630
  back = universal_translate(conlang, tgt, src, True, False, mode, max_comp_exact)
631
  return conlang, back
632
 
633
+ # =====================================================================================
634
+ # ========================== UI bilingüe con selector global ==========================
635
+ # =====================================================================================
636
+
637
+ ALL_LANGS = ["Español","English","Minimax-ASCII","Kōmín-CJK"]
638
+
639
+ # Texto ES
640
  EXPLAIN_ES = """
641
+ ## 🌐 ¿Qué hace esta app?
642
+ Traduce entre **Español / Inglés** y dos lenguajes construidos:
643
+ - **Minimax-ASCII** (compacto y solo ASCII)
644
+ - **Kōmín-CJK** (estilo CJK con partículas)
645
+
646
+ También **comprime sin perder información** si activas **Máx. Compresión Exacta** (`~...` guarda el original).
647
+ Al **decodificar**, si existe ese `~...`, recuperas el texto **exacto**.
648
+
649
+ ### 🧠 ¿Por qué me sirve?
650
+ - Para **reducir** tamaño de mensajes/notas.
651
+ - Para **codificar/decodificar** de forma legible y reversible.
652
+ - Para jugar con **conlangs** simples.
653
+
654
+ ### ⚙️ Opciones (puedes ignorarlas al principio)
655
+ - **Omitir artículos**: quita *el/la/los/las* o *a/an/the*. Ahorra ~10–15%.
656
+ - **Cópula cero** (presente afirmativo): oculta *ser/estar/be* cuando suena natural. +~5–10%.
657
+ - **Máx. Compresión Exacta**: añade `~...` con el original comprimido (mejor en textos medianos/largos).
658
  """
659
+ FAQ_ES = """
660
+ ### ❓ Preguntas rápidas
661
+ - **¿Se pierde info?** No, con **Máx. Compresión Exacta** el `~...` guarda el original.
662
+ - **¿Sin spaCy?** Funciona igual (modo léxico). Con spaCy suena más natural.
663
+ - **Privacidad**: todo corre dentro de este Space.
664
+ """
665
+ TUTORIAL_ES = """
666
+ ### 🏁 Empezar (3 pasos)
667
+ 1. Elige **Fuente** y **Destino**.
668
+ 2. Escribe tu frase.
669
+ 3. Pulsa **Traducir**.
670
 
671
+ > Para recuperar **exactamente** el original más tarde, activa **Máx. Compresión Exacta**.
672
+ """
673
 
674
+ # Texto EN
675
+ EXPLAIN_EN = """
676
+ ## 🌐 What does this app do?
677
+ It translates between **Spanish / English** and two constructed languages:
678
+ - **Minimax-ASCII** (compact, ASCII-only)
679
+ - **Kōmín-CJK** (CJK-style with particles)
680
 
681
+ You can also **compress without losing information** by enabling **Max Exact Compression** (`~...` stores the original).
682
+ When **decoding**, if `~...` exists, you get the **exact original** back.
683
+ """
684
+ FAQ_EN = """
685
+ ### Quick answers
686
+ - **Any loss?** Not with **Max Exact Compression** — the `~...` keeps the original.
687
+ - **No spaCy?** Still works (lexical mode). With spaCy it reads more naturally.
688
+ - **Privacy**: everything runs inside this Space.
689
+ """
690
+ TUTORIAL_EN = """
691
+ ### 🏁 Quick start (3 steps)
692
+ 1. Pick **Source** and **Target**.
693
+ 2. Type your sentence.
694
+ 3. Click **Translate**.
695
+
696
+ > To recover the **exact** original later, enable **Max Exact Compression**.
697
+ """
698
+
699
def make_group_es():
    """Build the Spanish UI group (visible by default).

    Returns the gr.Group containing four tabs: Translate, Build,
    Decode and Round-trip, wired to the module-level handlers
    (universal_translate, build_sentence, decode_simple, round_trip).
    """
    with gr.Group(visible=True) as group:
        gr.Markdown("# 🌐 Universal Conlang Translator · Compresión Exacta (ES)")
        with gr.Row():
            with gr.Column(scale=1):
                with gr.Accordion("Resumen (ES)", open=True):
                    gr.Markdown(EXPLAIN_ES)
                with gr.Accordion("FAQ (ES)", open=False):
                    gr.Markdown(FAQ_ES)
            with gr.Column(scale=1):
                with gr.Accordion("Tutorial (ES)", open=True):
                    gr.Markdown(TUTORIAL_ES)
                gr.Markdown("**Consejo:** Los mensajes muy cortos pueden no reducirse por la cabecera del `~...`.")

        with gr.Tab("🔁 Traducir"):
            with gr.Row():
                uni_src = gr.Dropdown(ALL_LANGS, value="Español", label="Fuente")
                uni_tgt = gr.Dropdown(ALL_LANGS, value="Minimax-ASCII", label="Destino")
            uni_text = gr.Textbox(lines=3, label="Texto", placeholder="Ej.: Hola, ¿cómo estás?", show_copy_button=True)
            with gr.Row():
                uni_drop = gr.Checkbox(value=True, label="Omitir artículos (ES/EN → conlang)")
                uni_zero = gr.Checkbox(value=False, label="Cópula cero (presente afirm.)")
                uni_max_comp = gr.Checkbox(value=False, label="Máx. Compresión Exacta (sidecar `~...`)")
            # Hidden: only one mode is currently supported, kept for handler signature.
            uni_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
            with gr.Row():
                btn_translate = gr.Button("🚀 Traducir", variant="primary")
                btn_reset = gr.Button("🧹 Limpiar")
            uni_out = gr.Textbox(lines=6, label="Traducción", show_copy_button=True)

            btn_translate.click(
                universal_translate,
                [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_mode, uni_max_comp],
                [uni_out]
            )
            # FIX: the callback must return one value per output component;
            # `lambda: ""` with two outputs raised at click time.
            btn_reset.click(lambda: ("", ""), None, [uni_text, uni_out])

            gr.Markdown("### 🔎 Ejemplos (clic para autocompletar)")
            ex1 = gr.Button("ES→Minimax: «Hola, ¿cómo estás?»")
            ex2 = gr.Button("EN→Kōmín: «This system keeps messages compact.»")
            ex3 = gr.Button("ES→Minimax (con compresión): «El clima hoy es excelente para pasear.»")
            ex4 = gr.Button("EN→Kōmín (con compresión): «Please decode this later with the sidecar.»")

            ex1.click(lambda: ("Hola, ¿cómo estás?", "Español", "Minimax-ASCII"), None, [uni_text, uni_src, uni_tgt])
            ex2.click(lambda: ("This system keeps messages compact.", "English", "Kōmín-CJK"), None, [uni_text, uni_src, uni_tgt])
            ex3.click(lambda: ("El clima hoy es excelente para pasear.", "Español", "Minimax-ASCII"), None, [uni_text, uni_src, uni_tgt])
            ex4.click(lambda: ("Please decode this later with the sidecar.", "English", "Kōmín-CJK"), None, [uni_text, uni_src, uni_tgt])

        with gr.Tab("🛠️ Construir (ES/EN → Conlang)"):
            with gr.Row():
                src_lang = gr.Dropdown(["Español","English"], value="Español", label="Fuente")
                target = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
            text_in = gr.Textbox(lines=3, label="Frase", show_copy_button=True)
            with gr.Row():
                drop_articles = gr.Checkbox(value=True, label="Omitir artículos")
                zero_copula = gr.Checkbox(value=False, label="Cópula cero (presente afirm.)")
                max_comp_build = gr.Checkbox(value=False, label="Máx. Compresión Exacta")
            mode_build = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
            with gr.Row():
                btn_build = gr.Button("🏗️ Construir", variant="primary")
                btn_build_clear = gr.Button("🧹 Limpiar")
            out = gr.Textbox(lines=6, label="Salida", show_copy_button=True)

            btn_build.click(
                build_sentence,
                [text_in, src_lang, target, drop_articles, zero_copula, mode_build, max_comp_build],
                [out]
            )
            # FIX: two outputs need two return values (see btn_reset above).
            btn_build_clear.click(lambda: ("", ""), None, [text_in, out])

        with gr.Tab("🗝️ Decodificar (Conlang → ES/EN)"):
            with gr.Row():
                src_code = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Fuente")
                tgt_lang = gr.Dropdown(["Español","English"], value="Español", label="Destino")
            code_in = gr.Textbox(lines=3, label="Texto en conlang (puede incluir `~...`)", show_copy_button=True)
            out3 = gr.Textbox(lines=6, label="Salida", show_copy_button=True)

            def decode_lossless_aware(text, src, tgt):
                """Prefer the exact sidecar payloads; otherwise lexical decode."""
                orig = extract_custom_sidecar(text)
                if orig is not None: return orig
                orig = extract_sidecar_b85(text)
                if orig is not None: return orig
                return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)

            with gr.Row():
                btn_decode = gr.Button("🔓 Decodificar", variant="primary")
                btn_decode_clear = gr.Button("🧹 Limpiar")

            btn_decode.click(decode_lossless_aware, [code_in, src_code, tgt_lang], [out3])
            # FIX: two outputs need two return values.
            btn_decode_clear.click(lambda: ("", ""), None, [code_in, out3])

            gr.Markdown("> **Tip:** si ves `~...`, la decodificación será 100% exacta.")

        with gr.Tab("🔄 Prueba ida→vuelta"):
            with gr.Row():
                rt_src = gr.Dropdown(["Español","English"], value="Español", label="Fuente")
                rt_tgt = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
            rt_text = gr.Textbox(lines=3, label="Frase", show_copy_button=True)
            rt_max_comp = gr.Checkbox(value=False, label="Máx. Compresión Exacta")
            rt_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
            rt_out_conlang = gr.Textbox(lines=3, label="Conlang (ida)", show_copy_button=True)
            rt_out_back = gr.Textbox(lines=3, label="Vuelta", show_copy_button=True)
            with gr.Row():
                btn_rt = gr.Button("▶️ Probar", variant="primary")
                btn_rt_clear = gr.Button("🧹 Limpiar")

            btn_rt.click(round_trip, [rt_text, rt_src, rt_tgt, rt_mode, rt_max_comp], [rt_out_conlang, rt_out_back])
            # FIX: three outputs need three return values.
            btn_rt_clear.click(lambda: ("", "", ""), None, [rt_text, rt_out_conlang, rt_out_back])

        gr.Markdown("---")
        gr.Markdown("Hecho con ❤️ · **spaCy** (opcional) · Todo se ejecuta en este Space.")
    return group
809
+
810
def make_group_en():
    """Build the English UI group (hidden by default).

    Mirrors make_group_es: four tabs (Translate, Build, Decode,
    Round-trip) wired to the same module-level handlers.
    """
    with gr.Group(visible=False) as group:
        gr.Markdown("# 🌐 Universal Conlang Translator · Max Exact Compression (EN)")
        with gr.Row():
            with gr.Column(scale=1):
                with gr.Accordion("Summary (EN)", open=True):
                    gr.Markdown(EXPLAIN_EN)
                with gr.Accordion("FAQ (EN)", open=False):
                    gr.Markdown(FAQ_EN)
            with gr.Column(scale=1):
                with gr.Accordion("Tutorial (EN)", open=True):
                    gr.Markdown(TUTORIAL_EN)
                gr.Markdown("**Tip:** Very short messages may not shrink due to the `~...` header.")

        with gr.Tab("🔁 Translate"):
            with gr.Row():
                uni_src = gr.Dropdown(ALL_LANGS, value="English", label="Source")
                uni_tgt = gr.Dropdown(ALL_LANGS, value="Minimax-ASCII", label="Target")
            uni_text = gr.Textbox(lines=3, label="Text", placeholder="e.g., Hello, how are you?", show_copy_button=True)
            with gr.Row():
                uni_drop = gr.Checkbox(value=True, label="Drop articles (ES/EN → conlang)")
                uni_zero = gr.Checkbox(value=False, label="Zero copula (present affirmative)")
                uni_max_comp = gr.Checkbox(value=False, label="Max Exact Compression (sidecar `~...`)")
            # Hidden: only one mode is currently supported, kept for handler signature.
            uni_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
            with gr.Row():
                btn_translate = gr.Button("🚀 Translate", variant="primary")
                btn_reset = gr.Button("🧹 Clear")
            uni_out = gr.Textbox(lines=6, label="Translation", show_copy_button=True)

            btn_translate.click(
                universal_translate,
                [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_mode, uni_max_comp],
                [uni_out]
            )
            # FIX: the callback must return one value per output component;
            # `lambda: ""` with two outputs raised at click time.
            btn_reset.click(lambda: ("", ""), None, [uni_text, uni_out])

            gr.Markdown("### 🔎 Examples (click to autofill)")
            ex1 = gr.Button("EN→Minimax: “Hello, how are you?”")
            ex2 = gr.Button("ES→Kōmín: “Este sistema mantiene los mensajes compactos.”")
            ex3 = gr.Button("EN→Minimax (compressed): “The weather today is perfect for a walk.”")
            ex4 = gr.Button("ES→Kōmín (compressed): “Por favor decodifica esto luego con el sidecar.”")

            ex1.click(lambda: ("Hello, how are you?", "English", "Minimax-ASCII"), None, [uni_text, uni_src, uni_tgt])
            ex2.click(lambda: ("Este sistema mantiene los mensajes compactos.", "Español", "Kōmín-CJK"), None, [uni_text, uni_src, uni_tgt])
            ex3.click(lambda: ("The weather today is perfect for a walk.", "English", "Minimax-ASCII"), None, [uni_text, uni_src, uni_tgt])
            ex4.click(lambda: ("Por favor decodifica esto luego con el sidecar.", "Español", "Kōmín-CJK"), None, [uni_text, uni_src, uni_tgt])

        with gr.Tab("🛠️ Build (ES/EN → Conlang)"):
            with gr.Row():
                src_lang = gr.Dropdown(["Español","English"], value="English", label="Source")
                target = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
            text_in = gr.Textbox(lines=3, label="Sentence", show_copy_button=True)
            with gr.Row():
                drop_articles = gr.Checkbox(value=True, label="Drop articles")
                zero_copula = gr.Checkbox(value=False, label="Zero copula (present affirmative)")
                max_comp_build = gr.Checkbox(value=False, label="Max Exact Compression")
            mode_build = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
            with gr.Row():
                btn_build = gr.Button("🏗️ Build", variant="primary")
                btn_build_clear = gr.Button("🧹 Clear")
            out = gr.Textbox(lines=6, label="Output", show_copy_button=True)

            btn_build.click(
                build_sentence,
                [text_in, src_lang, target, drop_articles, zero_copula, mode_build, max_comp_build],
                [out]
            )
            # FIX: two outputs need two return values (see btn_reset above).
            btn_build_clear.click(lambda: ("", ""), None, [text_in, out])

        # FIX: tab label was mojibake ("????️ Decode ..."); restored the 🗝️
        # emoji to match the Spanish group's Decode tab.
        with gr.Tab("🗝️ Decode (Conlang → ES/EN)"):
            with gr.Row():
                src_code = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Source")
                tgt_lang = gr.Dropdown(["Español","English"], value="English", label="Target")
            code_in = gr.Textbox(lines=3, label="Conlang text (may include `~...`)", show_copy_button=True)
            out3 = gr.Textbox(lines=6, label="Output", show_copy_button=True)

            def decode_lossless_aware(text, src, tgt):
                """Prefer the exact sidecar payloads; otherwise lexical decode."""
                orig = extract_custom_sidecar(text)
                if orig is not None: return orig
                orig = extract_sidecar_b85(text)
                if orig is not None: return orig
                return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)

            with gr.Row():
                btn_decode = gr.Button("🔓 Decode", variant="primary")
                btn_decode_clear = gr.Button("🧹 Clear")

            btn_decode.click(decode_lossless_aware, [code_in, src_code, tgt_lang], [out3])
            # FIX: two outputs need two return values.
            btn_decode_clear.click(lambda: ("", ""), None, [code_in, out3])

            gr.Markdown("> **Tip:** if you see `~...`, decoding will be bit-perfect.")

        with gr.Tab("🔄 Round-trip"):
            with gr.Row():
                rt_src = gr.Dropdown(["Español","English"], value="English", label="Source")
                rt_tgt = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
            rt_text = gr.Textbox(lines=3, label="Sentence", show_copy_button=True)
            rt_max_comp = gr.Checkbox(value=False, label="Max Exact Compression")
            rt_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
            rt_out_conlang = gr.Textbox(lines=3, label="Outward (conlang)", show_copy_button=True)
            rt_out_back = gr.Textbox(lines=3, label="Back", show_copy_button=True)
            with gr.Row():
                btn_rt = gr.Button("▶️ Test", variant="primary")
                btn_rt_clear = gr.Button("🧹 Clear")

            btn_rt.click(round_trip, [rt_text, rt_src, rt_tgt, rt_mode, rt_max_comp], [rt_out_conlang, rt_out_back])
            # FIX: three outputs need three return values.
            btn_rt_clear.click(lambda: ("", "", ""), None, [rt_text, rt_out_conlang, rt_out_back])

        gr.Markdown("---")
        gr.Markdown("Made with ❤️ · **spaCy** (optional) · Everything runs inside this Space.")
    return group
920
+
921
# Top-level app: one language selector toggling between the two UI groups.
with gr.Blocks(title="Universal Conlang Translator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🌍 Idioma / Language")
    lang_select = gr.Radio(choices=["ES","EN"], value="ES", label="Selecciona / Select")
    group_es = make_group_es()
    group_en = make_group_en()

    def switch_lang(code):
        """Show exactly one language group: EN selects the English UI."""
        show_en = code == "EN"
        return gr.update(visible=not show_en), gr.update(visible=show_en)

    lang_select.change(switch_lang, [lang_select], [group_es, group_en])
933
 
934
# Script entrypoint: start the Gradio server (how Spaces runs this app).
if __name__ == "__main__":
    demo.launch()