LoloSemper committed on
Commit
eff6688
·
verified ·
1 Parent(s): 1054168

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +267 -360
app.py CHANGED
@@ -1,6 +1,5 @@
1
  # app.py — Universal Conlang Translator (Max Compresión Exacta)
2
- # UI bilingüe ES/EN + Botón de explicación de léxico + Acordeones plegables
3
- # Archivos requeridos:
4
  # - lexicon_minimax.json
5
  # - lexicon_komin.json
6
  # - lexicon_master.json
@@ -23,18 +22,13 @@ LEX_MASTER = "lexicon_master.json"
23
  # ------------ Normalización ------------
24
  WORD_RE = re.compile(r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ]+", re.UNICODE)
25
  STRIP = str.maketrans("ÁÉÍÓÚÜÑáéíóúüñ", "AEIOUUNaeiouun")
26
-
27
- def norm_es(w: str) -> str:
28
- return re.sub(r"[^a-záéíóúüñ]", "", (w or "").lower()).translate(STRIP)
29
-
30
- def norm_en(w: str) -> str:
31
- return re.sub(r"[^a-z]", "", (w or "").lower())
32
 
33
  # ------------ Carga de léxicos ------------
34
  def load_json(path: str):
35
  if not os.path.exists(path): return None
36
- with open(path, "r", encoding="utf-8") as f:
37
- return json.load(f)
38
 
39
  def load_lexicons():
40
  mm = load_json(LEX_MINI) or {}
@@ -53,19 +47,15 @@ def load_lexicons():
53
 
54
  if isinstance(master, dict) and "entries" in master:
55
  for e in master["entries"]:
56
- es = norm_es(str(e.get("lemma_es","")))
57
- en = norm_en(str(e.get("lemma_en","")))
58
- mi = str(e.get("minimax",""))
59
- ko = str(e.get("komin",""))
60
  if es and en:
61
- es2en_lemma.setdefault(es, en)
62
- en2es_lemma.setdefault(en, es)
63
  if en and mi: en2mini.setdefault(en, mi)
64
  if en and ko: en2komi.setdefault(en, ko)
65
 
66
  mini2en = {v:k for k,v in en2mini.items()}
67
  komi2en = {v:k for k,v in en2komi.items()}
68
-
69
  return (es2mini, es2komi, mini2es, komi2es,
70
  en2mini, en2komi, mini2en, komi2en,
71
  es2en_lemma, en2es_lemma, master)
@@ -92,11 +82,9 @@ def to_custom_b64(b: bytes, alphabet: str) -> str:
92
  std = base64.b64encode(b).decode("ascii")
93
  trans = str.maketrans("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/", alphabet)
94
  return std.translate(trans).rstrip("=")
95
-
96
  def from_custom_b64(s: str, alphabet: str) -> bytes:
97
  trans = str.maketrans(alphabet, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/")
98
- std = s.translate(trans)
99
- pad = "=" * ((4 - len(std) % 4) % 4)
100
  return base64.b64decode(std + pad)
101
 
102
  def enc_oov_minimax(token: str) -> str: return "~" + to_custom_b64(token.encode("utf-8"), ALPHA_MINI64)
@@ -115,9 +103,7 @@ USE_SPACY = False
115
  try:
116
  import spacy
117
  try:
118
- nlp_es = spacy.load("es_core_news_sm")
119
- nlp_en = spacy.load("en_core_web_sm")
120
- USE_SPACY = True
121
  except Exception:
122
  nlp_es = nlp_en = None
123
  except Exception:
@@ -125,54 +111,48 @@ except Exception:
125
 
126
  def lemma_of(tok, src_lang: str) -> str:
127
  if src_lang == "Español":
128
- return norm_es(tok.lemma_ if getattr(tok, "lemma_", "") else tok.text)
129
  else:
130
- return norm_en(tok.lemma_ if getattr(tok, "lemma_", "") else tok.text)
131
-
132
- # ------------ Herramientas análisis simple ------------
133
- def detect_polarity(doc) -> bool: return "?" in doc.text
134
 
 
 
135
  def detect_neg(doc) -> bool:
136
  for t in doc:
137
- if getattr(t, "dep_", "") == "neg" or getattr(t, "lower_", "").lower() in ("no","not","n't"):
138
  return True
139
  return False
140
-
141
  def detect_tense(root):
142
- m = str(getattr(root, "morph", ""))
143
  if "Tense=Past" in m: return "Past"
144
  if "Tense=Fut" in m: return "Fut"
145
  if "Tense=Pres" in m: return "Pres"
146
- for c in getattr(root, "children", []):
147
- if getattr(c, "pos_", "") == "AUX":
148
- cm = str(getattr(c, "morph", ""))
149
  if "Tense=Past" in cm: return "Past"
150
- if getattr(c, "lower_", "").lower() == "will": return "Fut"
151
  return "Pres"
152
-
153
  def extract_core(doc):
154
  tokens = list(doc)
155
- root = next((t for t in tokens if getattr(t, "dep_", "")=="ROOT" and getattr(t, "pos_", "") in ("VERB","AUX")), tokens[0] if tokens else doc)
156
  subs, objs, obls, advs = [], [], [], []
157
- for t in getattr(root, "children", []):
158
- dep = getattr(t, "dep_", "")
159
- pos = getattr(t, "pos_", "")
160
  if dep in ("nsubj","nsubj:pass","csubj"): subs.append(t)
161
  elif dep in ("obj","dobj","iobj"): objs.append(t)
162
  elif dep in ("obl","pobj"): obls.append(t)
163
  elif dep in ("advmod","advcl") and pos=="ADV": advs.append(t)
164
- sortkey = lambda x: getattr(x, "i", 0)
165
- for arr in (subs,objs,obls,advs): arr.sort(key=sortkey)
166
  return root, subs, objs, obls, advs
167
-
168
  def _person_of_doc(doc, src_lang: str) -> Optional[str]:
169
  try:
170
  tokens = list(doc)
171
- root = next((t for t in tokens if getattr(t, "dep_", "")=="ROOT"), tokens[0])
172
- subj = next((t for t in getattr(root, "children", []) if getattr(t, "dep_", "").startswith("nsubj")), None)
173
  if subj is None: return None
174
- plur = ("Number=Plur" in str(getattr(subj, "morph",""))) if src_lang=="Español" else (getattr(subj, "tag_", "") in ("NNS","NNPS"))
175
- low = getattr(subj, "lower_", "").lower()
176
  if src_lang=="Español":
177
  if low in ("yo",): return "1p" if plur else "1s"
178
  if low in ("tú","vos"): return "2p" if plur else "2s"
@@ -188,44 +168,39 @@ def _person_of_doc(doc, src_lang: str) -> Optional[str]:
188
  return "3p" if plur else "3s"
189
  except Exception:
190
  return None
191
-
192
  def detect_person(root, src_lang: str) -> Optional[str]:
193
- m = str(getattr(root, "morph", ""))
194
- person_str = "3"; number_str = "s"
195
  if "Person=" in m:
196
  for feat in m.split("|"):
197
  if feat.startswith("Person="): person_str = feat.split("=")[1]
198
- elif feat.startswith("Number="): number_str = "p" if feat.split("=")[1] == "Plur" else "s"
199
  return person_str + number_str
200
  return _person_of_doc(root.doc, src_lang)
201
 
202
- # ------------ Mapeo lema→código y fraseadores ------------
203
  def code_es(lemma: str, target: str) -> str:
204
  lemma = norm_es(lemma)
205
- return ES2MINI.get(lemma) if target=="Minimax-ASCII" else ES2KOMI.get(lemma) or (enc_oov_komin(lemma) if target!="Minimax-ASCII" else enc_oov_minimax(lemma))
206
-
 
207
  def code_en(lemma: str, target: str) -> str:
208
  lemma = norm_en(lemma)
209
- if target == "Minimax-ASCII":
210
  return (EN2MINI.get(lemma) if EN2MINI else None) or enc_oov_minimax(lemma)
211
- else:
212
- return (EN2KOMI.get(lemma) if EN2KOMI else None) or enc_oov_komin(lemma)
213
 
214
- TAM_MINI = {"Pres":"P", "Past":"T", "Fut":"F", "UNK":"P"}
215
- TAM_KOMI = {"Pres":"Ⓟ", "Past":"Ⓣ", "Fut":"Ⓕ", "UNK":"Ⓟ"}
216
 
217
  def realize_minimax(doc, src_lang: str, drop_articles=True, zero_copula=True,
218
  semi_lossless=False, person_hint="2s", remove_pronouns=False):
219
  root, subs, objs, obls, advs = extract_core(doc)
220
- tense = detect_tense(root)
221
- is_q, is_neg = detect_polarity(doc), detect_neg(doc)
222
  vlem = lemma_of(root, src_lang) if USE_SPACY else ("ser" if "?" in getattr(doc,"text","") else "estar")
223
  vcode = code_es(vlem, "Minimax-ASCII") if src_lang=="Español" else code_en(vlem, "Minimax-ASCII")
224
  tail = TAM_MINI.get(tense, "P")
225
- if semi_lossless:
226
- pi = detect_person(root, src_lang) or person_hint
227
- tail += pi
228
- if is_neg: tail += "N"
229
  if is_q: tail += "Q"
230
  if tail: vcode = f"{vcode}·{tail}"
231
 
@@ -234,24 +209,18 @@ def realize_minimax(doc, src_lang: str, drop_articles=True, zero_copula=True,
234
  for t in tokens:
235
  if remove_pronouns:
236
  txt = (getattr(t,"text","") or "").lower()
237
- if (src_lang=="Español" and txt in PRON_ES) or (src_lang=="English" and txt in PRON_EN):
238
- continue
239
- lem = lemma_of(t, src_lang) if USE_SPACY else (getattr(t,"text",""))
240
- code = code_es(lem, "Minimax-ASCII") if src_lang=="Español" else code_en(lem, "Minimax-ASCII")
241
- outs.append(code)
242
  return outs
243
 
244
- S = realize_np(subs)
245
- O = realize_np(objs) + realize_np(obls)
246
  ADV=[]
247
  for a in advs:
248
- lem = lemma_of(a, src_lang) if USE_SPACY else getattr(a,"text","")
249
- ADV.append(code_es(lem, "Minimax-ASCII") if src_lang=="Español" else code_en(lem, "Minimax-ASCII"))
250
 
251
- if zero_copula and not semi_lossless and vlem in ("ser","estar","be") and tense=="Pres" and not is_neg and not is_q:
252
- parts = S + O + ADV
253
- else:
254
- parts = [vcode] + S + O + ADV
255
  return " ".join(p for p in parts if p)
256
 
257
  def realize_komin(doc, src_lang: str, drop_articles=True, zero_copula=True,
@@ -260,38 +229,27 @@ def realize_komin(doc, src_lang: str, drop_articles=True, zero_copula=True,
260
  tense, is_q, is_neg = detect_tense(root), detect_polarity(doc), detect_neg(doc)
261
  vlem = lemma_of(root, src_lang) if USE_SPACY else ("ser" if "?" in getattr(doc,"text","") else "estar")
262
  vcode = code_es(vlem, "Kōmín-CJK") if src_lang=="Español" else code_en(vlem, "Kōmín-CJK")
263
-
264
- P_SUBJ, P_OBJ = "ᵖ", ""
265
- NEG_M, Q_FIN = "̆", "?"
266
- TAM = TAM_KOMI.get(tense, "Ⓟ")
267
- if semi_lossless:
268
- pi = detect_person(root, src_lang) or person_hint
269
- TAM = TAM + f"[{pi}]"
270
 
271
  def realize_np(tokens, particle):
272
  outs=[]
273
  for t in tokens:
274
  if remove_pronouns:
275
  txt = (getattr(t,"text","") or "").lower()
276
- if (src_lang=="Español" and txt in PRON_ES) or (src_lang=="English" and txt in PRON_EN):
277
- continue
278
- lem = lemma_of(t, src_lang) if USE_SPACY else getattr(t,"text","")
279
- code = code_es(lem, "Kōmín-CJK") if src_lang=="Español" else code_en(lem, "Kōmín-CJK")
280
- outs.append(code + particle)
281
  return outs
282
 
283
- S = realize_np(subs, P_SUBJ)
284
- O = realize_np(objs + obls, P_OBJ)
285
  ADV=[]
286
  for a in advs:
287
- lem = lemma_of(a, src_lang) if USE_SPACY else getattr(a,"text","")
288
- ADV.append(code_es(lem, "Kōmín-CJK") if src_lang=="Español" else code_en(lem, "Kōmín-CJK"))
289
- v_form = vcode + TAM + (NEG_M if is_neg else "")
290
 
291
- if zero_copula and not semi_lossless and vlem in ("ser","estar","be") and tense=="Pres" and not is_neg and not is_q:
292
- parts = S + O + ADV
293
- else:
294
- parts = S + O + ADV + [v_form]
295
  out = " ".join(parts)
296
  if is_q: out += " " + Q_FIN
297
  return out
@@ -326,29 +284,25 @@ def encode_simple(text: str, src_lang: str, target: str) -> str:
326
  code = ES2MINI.get(key) if target=="Minimax-ASCII" else ES2KOMI.get(key)
327
  return code or (enc_oov_minimax(m.group(0)) if target=="Minimax-ASCII" else enc_oov_komin(m.group(0)))
328
  def repl_en(m):
329
- key = norm_en(m.group(0))
330
- table = EN2MINI if target=="Minimax-ASCII" else EN2KOMI
331
  if table and key in table: return table[key]
332
  return enc_oov_minimax(m.group(0)) if target=="Minimax-ASCII" else enc_oov_komin(m.group(0))
333
  repl = repl_es if src_lang=="Español" else repl_en
334
  return WORD_RE.sub(repl, text)
335
 
336
  def pluralize_es(word: str) -> str:
337
- exceptions = {"uno": "unos", "buen": "buenos", "hombre": "hombres"}
338
  if word in exceptions: return exceptions[word]
339
- if word.endswith("z"): return word[:-1] + "ces"
340
- if word.endswith(("a","e","i","o")): return word + "s"
341
- return word + "es"
342
-
343
  def pluralize_en(word: str) -> str:
344
  exceptions = {"man":"men","woman":"women","child":"children"}
345
  if word in exceptions: return exceptions[word]
346
- if word.endswith("y") and len(word)>1 and word[-2] not in "aeiou": return word[:-1] + "ies"
347
- if word.endswith(("s","sh","ch","x","z")): return word + "es"
348
- return word + "s"
349
-
350
- def pluralize(word: str, tgt_lang: str) -> str:
351
- return pluralize_es(word) if tgt_lang=="Español" else pluralize_en(word)
352
 
353
  mini_tail_re = re.compile(r"^(?P<stem>.+?)·(?P<tail>[PTFNQ12sp]+)$")
354
 
@@ -356,59 +310,41 @@ def decode_simple(text: str, source: str, tgt_lang: str) -> str:
356
  if not text.strip(): return ""
357
  code2es = MINI2ES if source=="Minimax-ASCII" else KOMI2ES
358
  code2en = MINI2EN if source=="Minimax-ASCII" else KOMI2EN
359
-
360
- if source == "Kōmín-CJK":
361
- text = text.replace("?","?").replace(" ", " ")
362
- return " ".join([code2es.get(w, w) for w in text.split() if w != "?"])
363
-
364
- tokens = text.split()
365
  if not tokens: return ""
366
-
367
- lemma_tokens, pl_flags = [], []
368
- verb_idx, verb_lemma, verb_tense, verb_person = -1, None, "Pres", "3s"
369
- has_q, is_neg = False, False
370
-
371
  for part in tokens:
372
- look = part.replace("[PL]","")
373
- had_pl = "[PL]" in part
374
- pl_flags.append(had_pl)
375
-
376
  m = mini_tail_re.match(look)
377
  if m:
378
- verb_idx = len(lemma_tokens)
379
- stem, tail = m.group("stem"), m.group("tail")
380
  vlem_es = code2es.get(stem); vlem_en = code2en.get(stem) if code2en else None
381
  vlem = vlem_es if tgt_lang=="Español" else (vlem_en or vlem_es or stem)
382
- if not vlem:
383
- vlem = dec_oov_minimax(stem) if is_oov_minimax(stem) else stem
384
  lemma_tokens.append(vlem); pl_flags.append(False)
385
  if tail:
386
  if tail[0] in "PTF":
387
- verb_tense = {"P":"Pres","T":"Past","F":"Fut"}[tail[0]]
388
- pos=1
389
  if len(tail)>pos and tail[pos] in "123":
390
- pos+=1
391
- verb_person = tail[pos-1] + (tail[pos] if len(tail)>pos and tail[pos] in "sp" else "s")
392
  if len(tail)>pos and tail[pos] in "sp": pos+=1
393
- is_neg = "N" in tail[pos:]
394
- has_q = "Q" in tail[pos:]
395
- verb_lemma = vlem
396
- continue
397
-
398
  w_es = code2es.get(look); w_en = code2en.get(look) if code2en else None
399
  w = w_es if tgt_lang=="Español" else (w_en or w_es or look)
400
  if not w: w = dec_oov_minimax(look) if is_oov_minimax(look) else look
401
  lemma_tokens.append(w); pl_flags.append(had_pl)
402
-
403
  out_parts=[]
404
  for idx, lem in enumerate(lemma_tokens):
405
- if idx == verb_idx:
406
  v_conj = _es_conj(verb_lemma, verb_tense, verb_person) if tgt_lang=="Español" else _en_conj(verb_lemma, verb_tense, verb_person)
407
  if is_neg: v_conj = ("no " if tgt_lang=="Español" else "not ") + v_conj
408
  out_parts.append(v_conj)
409
  else:
410
  out_parts.append(pluralize(lem, tgt_lang) if pl_flags[idx] else lem)
411
-
412
  out_text = " ".join(out_parts)
413
  if has_q:
414
  start_q = "¿" if tgt_lang=="Español" else ""
@@ -419,17 +355,16 @@ def decode_simple(text: str, source: str, tgt_lang: str) -> str:
419
  def _es_conj_regular(lemma, tense, person):
420
  if not lemma.endswith(("ar","er","ir")): return lemma
421
  stem, vtype = lemma[:-2], lemma[-2:]
422
- pres = {"ar":{"1s":"o","2s":"as","3s":"a","1p":"amos","2p":"áis","3p":"an"},
423
- "er":{"1s":"o","2s":"es","3s":"e","1p":"emos","2p":"éis","3p":"en"},
424
- "ir":{"1s":"o","2s":"es","3s":"e","1p":"imos","2p":"ís","3p":"en"}}
425
- pret = {"ar":{"1s":"é","2s":"aste","3s":"ó","1p":"amos","2p":"asteis","3p":"aron"},
426
- "er":{"1s":"í","2s":"iste","3s":"ió","1p":"imos","2p":"isteis","3p":"ieron"},
427
- "ir":{"1s":"í","2s":"iste","3s":"ió","1p":"imos","2p":"isteis","3p":"ieron"}}
428
- fut = {"1s":"é","2s":"ás","3s":"á","1p":"emos","2p":"éis","3p":"án"}
429
  if tense=="Pres": return stem + pres[vtype].get(person, pres[vtype]["3s"])
430
  if tense=="Past": return stem + pret[vtype].get(person, pret[vtype]["3s"])
431
  return lemma + fut.get(person, fut["3s"])
432
-
433
  def _es_conj(lemma, tense, person):
434
  if lemma=="ser":
435
  tab={"Pres":{"1s":"soy","2s":"eres","3s":"es","1p":"somos","2p":"sois","3p":"son"},
@@ -447,7 +382,6 @@ def _es_conj(lemma, tense, person):
447
  "Fut":{"1s":"iré","2s":"irás","3s":"irá","1p":"iremos","2p":"iréis","3p":"irán"}}
448
  return tab[tense].get(person, tab[tense]["3s"])
449
  return _es_conj_regular(lemma, tense, person)
450
-
451
  def _en_conj(lemma, tense, person):
452
  if lemma=="be":
453
  if tense=="Pres": return {"1s":"am","2s":"are","3s":"is","1p":"are","2p":"are","3p":"are"}.get(person,"is")
@@ -481,23 +415,18 @@ def _build_with_spacy(text: str, src_lang: str, target: str,
481
  remove_pronouns: bool) -> str:
482
  nlp = nlp_es if src_lang=="Español" else nlp_en
483
  doc = nlp(text)
484
- if target == "Minimax-ASCII":
485
- return realize_minimax(doc, src_lang, drop_articles, zero_copula, semi_lossless=semi_lossless,
486
- remove_pronouns=remove_pronouns)
487
  else:
488
- return realize_komin(doc, src_lang, drop_articles, zero_copula, semi_lossless=semi_lossless,
489
- remove_pronouns=remove_pronouns)
490
 
491
  def build_sentence(text: str, src_lang: str, target: str,
492
  drop_articles: bool, zero_copula: bool, mode: str,
493
  max_comp_exact: bool = False, remove_pronouns: bool = False) -> str:
494
  if not text.strip(): return ""
495
- semi = True # siempre semi-lossless
496
  if USE_SPACY:
497
- core = _build_with_spacy(text, src_lang, target, drop_articles, zero_copula and not semi,
498
- semi_lossless=semi, remove_pronouns=remove_pronouns)
499
  else:
500
- # Modo léxico simple: quitar pronombres por forma si procede
501
  if remove_pronouns:
502
  pron = PRON_ES if src_lang=="Español" else PRON_EN
503
  tokens = re.findall(r"\w+|[^\w\s]+", text)
@@ -526,24 +455,20 @@ def universal_translate(text: str, src: str, tgt: str,
526
  if orig is not None:
527
  core = strip_custom_sidecar(text)
528
  es_lemmas = decode_simple(core, src, "Español")
529
- words = re.findall(r"\w+|[^\w\s]+", es_lemmas)
530
- out=[]
531
  for w in words:
532
  if re.fullmatch(r"\w+", w):
533
  code = ES2MINI.get(norm_es(w)) if tgt=="Minimax-ASCII" else ES2KOMI.get(norm_es(w))
534
  out.append(code or (enc_oov_minimax(w) if tgt=="Minimax-ASCII" else enc_oov_komin(w)))
535
- else:
536
- out.append(w)
537
  return custom_sidecar_enc(" ".join(out), orig)
538
  es_lemmas = decode_simple(text, src, "Español")
539
- words = re.findall(r"\w+|[^\w\s]+", es_lemmas)
540
- out=[]
541
  for w in words:
542
  if re.fullmatch(r"\w+", w):
543
  code = ES2MINI.get(norm_es(w)) if tgt=="Minimax-ASCII" else ES2KOMI.get(norm_es(w))
544
  out.append(code or (enc_oov_minimax(w) if tgt=="Minimax-ASCII" else enc_oov_komin(w)))
545
- else:
546
- out.append(w)
547
  return " ".join(out)
548
  return "[No soportado]"
549
 
@@ -551,11 +476,9 @@ def translate_natural(text: str, src_lang: str, tgt_lang: str) -> str:
551
  if not text.strip(): return ""
552
  if not USE_SPACY: return text
553
  nlp = nlp_es if src_lang=="Español" else nlp_en
554
- doc = nlp(text)
555
- out=[]
556
  for t in doc:
557
- if not getattr(t, "is_alpha", False):
558
- out.append(getattr(t,"text","")); continue
559
  lem = lemma_of(t, src_lang)
560
  if src_lang=="Español":
561
  tr = ES2EN_LEMMA.get(lem); out.append(tr if tr else lem)
@@ -569,49 +492,94 @@ def round_trip(text, src, tgt, mode, max_comp_exact):
569
  return conlang, back
570
 
571
  # =====================================================================================
572
- # ========================== UI bilingüe con selector global ==========================
573
  # =====================================================================================
574
 
575
  ALL_LANGS = ["Español","English","Minimax-ASCII","Kōmín-CJK"]
576
 
577
- # Secciones de ayuda (plegables)
578
  COMPACT_ES = """
579
- **📏 Compactación orientativa**
580
- - Sin casillas: 0%
581
- - Omitir artículos: **~10–15%**
582
- - Cópula cero: **~5–10%**
583
- - Ambas: **~15–20%**
584
- - Máx. Compresión Exacta: **~40–60%** en >100 caracteres (zlib). En textos muy cortos puede no reducir.
585
  """
586
  COMPACT_EN = """
587
- **📏 Typical compaction**
588
- - No options: 0%
589
- - Drop articles: **~10–15%**
590
- - Zero copula: **~5–10%**
591
- - Both: **~15–20%**
592
- - Max Exact Compression: **~40–60%** for >100 chars (zlib). Very short texts may not shrink.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
593
  """
594
 
595
  LEXICON_BUILD_ES = """
596
- ### 🧱 Cómo se construyó el léxico (OMW → Minimax/Kōmín)
597
- 1) OMW/WordNet extrae **lemas ES** y sus **equivalentes EN** por sinset.
598
- 2) Normaliza y ordena por **frecuencia** (wordfreq).
599
  3) Opcional: **spaCy** refina lemas; **Argos** puede rellenar EN faltantes.
600
- 4) Asigna códigos compactos con alfabetos **barajados por SEED** hasta `MAXLEN_MINI`/`MAXLEN_CJK`.
601
- 5) Exporta: `lexicon_minimax.json`, `lexicon_komin.json`, `lexicon_master.json` (+TSV).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
602
  """
603
  LEXICON_BUILD_EN = """
604
- ### 🧱 How the lexicon was built (OMW Minimax/Kōmín)
605
- 1) From OMW/WordNet → extract **ES lemmas** and **EN counterparts** by synset.
606
- 2) Normalize and sort by **frequency** (wordfreq).
607
- 3) Optional: **spaCy** refines lemmas; **Argos** may fill missing EN.
608
- 4) Assign compact codes using alphabets **shuffled by SEED** up to `MAXLEN_MINI`/`MAXLEN_CJK`.
609
- 5) Exports: `lexicon_minimax.json`, `lexicon_komin.json`, `lexicon_master.json` (+TSV).
610
  """
611
 
612
- EXPLAIN_TOP_ES = "Traduce entre **Español / Inglés** y dos conlangs: **Minimax-ASCII** y **Kōmín-CJK**. Con **Máx. Compresión Exacta** puedes recuperar el original exacto (trailer `~...`)."
613
- EXPLAIN_TOP_EN = "Translate between **Spanish / English** and **Minimax-ASCII / Kōmín-CJK**. With **Max Exact Compression**, you can recover the exact original (trailer `~...`)."
614
-
615
  def master_preview(n: int = 20) -> List[List[Any]]:
616
  try:
617
  entries = (MASTER_OBJ or {}).get("entries", [])
@@ -623,68 +591,49 @@ def master_preview(n: int = 20) -> List[List[Any]]:
623
  except Exception:
624
  return [["lemma_es","lemma_en","minimax","komin"], ["(no data)","","",""]]
625
 
626
- # === ES Group ===
627
  def make_group_es():
628
- with gr.Group(visible=True) as group:
629
  gr.Markdown("# 🌐 Universal Conlang Translator · Compresión Exacta (ES)")
630
- # Botón grande para mostrar/ocultar explicación del léxico
631
- show_lex_state = gr.State(False)
632
- with gr.Row():
633
- btn_lex = gr.Button("ℹ️ **Ver explicación del léxico (OMW → Minimax/Kōmín)**", variant="primary", size="lg")
634
- lex_group = gr.Group(visible=False)
635
- with lex_group:
636
- with gr.Accordion("🧱 Léxico: ¿cómo se construyó? (ES)", open=True):
637
- gr.Markdown(LEXICON_BUILD_ES)
638
- gr.Markdown("**Vista previa de `lexicon_master.json` (primeras filas):**")
639
- n_rows = gr.Slider(5, 100, value=20, step=5, label="Filas a mostrar")
640
- df_prev = gr.Dataframe(headers=["lemma_es","lemma_en","minimax","komin"], row_count=1, interactive=False)
641
- gr.Button("Actualizar vista").click(lambda n: master_preview(int(n)), [n_rows], [df_prev])
642
-
643
- def toggle_lex(show):
644
- show = not bool(show)
645
- return show, (gr.update(visible=show), gr.update(value="ℹ️ **Ocultar explicación del léxico**" if show else "ℹ️ **Ver explicación del léxico (OMW → Minimax/Kōmín)**"))
646
- btn_lex.click(toggle_lex, [show_lex_state], [show_lex_state, lex_group, btn_lex])
647
-
648
- # Ayuda plegable por apartados
649
  with gr.Row():
650
  with gr.Column():
651
- with gr.Accordion("Resumen (¿qué hace?)", open=True):
652
- gr.Markdown(EXPLAIN_TOP_ES)
653
- with gr.Accordion("Opciones y compactación", open=False):
654
- gr.Markdown(COMPACT_ES)
655
- with gr.Accordion("FAQ", open=False):
656
- gr.Markdown("- **¿Se pierde info?** No con Máx. Compresión Exacta (`~...`).\n- **¿Sin spaCy?** Funciona en modo léxico.\n- **Privacidad**: todo corre dentro del Space.")
657
  with gr.Column():
658
- with gr.Accordion("Tutorial rápido", open=True):
659
- gr.Markdown("1) Elige **Fuente/Destino**.\n2) Escribe.\n3) Pulsa **Traducir**.\n\n> Activa **Máx. Compresión Exacta** para poder recuperar el original exacto luego.")
 
 
 
660
 
661
- # Tabs
662
  with gr.Tab("🔁 Traducir"):
663
  with gr.Row():
664
  uni_src = gr.Dropdown(ALL_LANGS, value="Español", label="Fuente")
665
  uni_tgt = gr.Dropdown(ALL_LANGS, value="Minimax-ASCII", label="Destino")
666
  uni_text = gr.Textbox(lines=3, label="Texto", placeholder="Ej.: Hola, ¿cómo estás?", show_copy_button=True)
667
  with gr.Row():
668
- uni_drop = gr.Checkbox(value=True, label="Omitir artículos (ES/EN → conlang)")
669
- uni_zero = gr.Checkbox(value=False, label="Cópula cero (presente afirm.)")
670
- uni_rmpr = gr.Checkbox(value=False, label="Quitar pronombres (sujeto/objeto evidentes)")
671
- uni_maxc = gr.Checkbox(value=False, label="Máx. Compresión Exacta (sidecar `~...`)")
672
 
673
  uni_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
674
  with gr.Row():
675
- btn_translate = gr.Button("🚀 Traducir", variant="primary")
676
- btn_reset = gr.Button("🧹 Limpiar")
677
  uni_out = gr.Textbox(lines=6, label="Traducción", show_copy_button=True)
678
 
679
- btn_translate.click(
680
- universal_translate,
681
- [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_mode, uni_maxc, uni_rmpr],
682
- [uni_out]
683
- )
684
- btn_reset.click(lambda: ("",""), None, [uni_text, uni_out])
685
 
686
- with gr.Accordion("¿Qué hace esta pestaña?", open=False):
687
- gr.Markdown("Traduce **entre cualquier par** (ES/EN/Minimax/Kōmín). Si marcas **Máx. Compresión Exacta**, añade `~...` con el original comprimido para recuperar luego *bit a bit*.")
688
 
689
  with gr.Tab("🛠️ Construir (ES/EN → Conlang)"):
690
  with gr.Row():
@@ -692,25 +641,23 @@ def make_group_es():
692
  target = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
693
  text_in = gr.Textbox(lines=3, label="Frase", show_copy_button=True)
694
  with gr.Row():
695
- drop_articles = gr.Checkbox(value=True, label="Omitir artículos")
696
- zero_copula = gr.Checkbox(value=False, label="Cópula cero (presente afirm.)")
697
- rm_pron_build = gr.Checkbox(value=False, label="Quitar pronombres")
698
- max_comp_build = gr.Checkbox(value=False, label="Máx. Compresión Exacta")
699
  mode_build = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
700
  with gr.Row():
701
- btn_build = gr.Button("🏗️ Construir", variant="primary")
702
- btn_build_clear = gr.Button("🧹 Limpiar")
703
  out = gr.Textbox(lines=6, label="Salida", show_copy_button=True)
704
 
705
- btn_build.click(
706
- build_sentence,
707
- [text_in, src_lang, target, drop_articles, zero_copula, mode_build, max_comp_build, rm_pron_build],
708
- [out]
709
- )
710
- btn_build_clear.click(lambda: ("",""), None, [text_in, out])
711
 
712
- with gr.Accordion("¿Qué hace esta pestaña?", open=False):
713
- gr.Markdown("Fuerza salida **en conlang** desde ES/EN, aplicando reglas de fraseo (orden, partículas, TAM) y tus opciones de compactación.")
714
 
715
  with gr.Tab("🗝️ Decodificar (Conlang → ES/EN)"):
716
  with gr.Row():
@@ -727,69 +674,50 @@ def make_group_es():
727
  return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)
728
 
729
  with gr.Row():
730
- btn_decode = gr.Button("🔓 Decodificar", variant="primary")
731
- btn_decode_clear = gr.Button("🧹 Limpiar")
732
 
733
- btn_decode.click(decode_lossless_aware, [code_in, src_code, tgt_lang], [out3])
734
- btn_decode_clear.click(lambda: ("",""), None, [code_in, out3])
735
 
736
- with gr.Accordion("¿Qué hace esta pestaña?", open=False):
737
- gr.Markdown("Convierte **Minimax/Kōmín → ES/EN**. Si hay `~...`, la recuperación es **exacta**.")
738
 
739
  with gr.Tab("🔄 Prueba ida→vuelta"):
740
  with gr.Row():
741
  rt_src = gr.Dropdown(["Español","English"], value="Español", label="Fuente")
742
  rt_tgt = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
743
  rt_text = gr.Textbox(lines=3, label="Frase", show_copy_button=True)
744
- rt_max_comp = gr.Checkbox(value=False, label="Máx. Compresión Exacta")
745
  rt_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
746
  rt_out_conlang = gr.Textbox(lines=3, label="Conlang (ida)", show_copy_button=True)
747
  rt_out_back = gr.Textbox(lines=3, label="Vuelta", show_copy_button=True)
748
  with gr.Row():
749
  btn_rt = gr.Button("▶️ Probar", variant="primary")
750
- btn_rt_clear = gr.Button("🧹 Limpiar")
751
 
752
  btn_rt.click(round_trip, [rt_text, rt_src, rt_tgt, rt_mode, rt_max_comp], [rt_out_conlang, rt_out_back])
753
- btn_rt_clear.click(lambda: ("","",""), None, [rt_text, rt_out_conlang, rt_out_back])
754
 
755
- with gr.Accordion("¿Qué hace esta pestaña?", open=False):
756
- gr.Markdown("Hace **ES/EN → Conlang → ES/EN** para comprobar la **reversibilidad**. Con **Máx. Compresión Exacta** la vuelta coincide bit a bit.")
 
757
 
758
- gr.Markdown("---")
759
- gr.Markdown("Hecho con ❤️ · **spaCy** (opcional) · Todo se ejecuta en este Space.")
760
- return group
761
-
762
- # === EN Group ===
763
  def make_group_en():
764
- with gr.Group(visible=False) as group:
765
  gr.Markdown("# 🌐 Universal Conlang Translator · Max Exact Compression (EN)")
766
- show_lex_state = gr.State(False)
767
- with gr.Row():
768
- btn_lex = gr.Button("ℹ️ **Show lexicon build (OMW → Minimax/Kōmín)**", variant="primary", size="lg")
769
- lex_group = gr.Group(visible=False)
770
- with lex_group:
771
- with gr.Accordion("🧱 Lexicon: how it was built (EN)", open=True):
772
- gr.Markdown(LEXICON_BUILD_EN)
773
- gr.Markdown("**Preview of `lexicon_master.json` (first rows):**")
774
- n_rows = gr.Slider(5, 100, value=20, step=5, label="Rows to show")
775
- df_prev = gr.Dataframe(headers=["lemma_es","lemma_en","minimax","komin"], row_count=1, interactive=False)
776
- gr.Button("Refresh").click(lambda n: master_preview(int(n)), [n_rows], [df_prev])
777
- def toggle_lex(show):
778
- show = not bool(show)
779
- return show, (gr.update(visible=show), gr.update(value="ℹ️ **Hide lexicon build**" if show else "ℹ️ **Show lexicon build (OMW → Minimax/Kōmín)**"))
780
- btn_lex.click(toggle_lex, [show_lex_state], [show_lex_state, lex_group, btn_lex])
781
-
782
  with gr.Row():
783
  with gr.Column():
784
- with gr.Accordion("Summary (what it does)", open=True):
785
- gr.Markdown(EXPLAIN_TOP_EN)
786
- with gr.Accordion("Options & compaction", open=False):
787
- gr.Markdown(COMPACT_EN)
788
- with gr.Accordion("FAQ", open=False):
789
- gr.Markdown("- **Any loss?** Not with Max Exact Compression (`~...`).\n- **No spaCy?** Works in lexical mode.\n- **Privacy**: runs inside this Space.")
790
  with gr.Column():
791
- with gr.Accordion("Quick start", open=True):
792
- gr.Markdown("1) Pick **Source/Target**.\n2) Type.\n3) Click **Translate**.\n\n> Enable **Max Exact Compression** to recover the exact original later.")
 
 
 
793
 
794
  with gr.Tab("🔁 Translate"):
795
  with gr.Row():
@@ -797,26 +725,24 @@ def make_group_en():
797
  uni_tgt = gr.Dropdown(ALL_LANGS, value="Minimax-ASCII", label="Target")
798
  uni_text = gr.Textbox(lines=3, label="Text", placeholder="e.g., Hello, how are you?", show_copy_button=True)
799
  with gr.Row():
800
- uni_drop = gr.Checkbox(value=True, label="Drop articles (ES/EN → conlang)")
801
- uni_zero = gr.Checkbox(value=False, label="Zero copula (present affirmative)")
802
- uni_rmpr = gr.Checkbox(value=False, label="Remove pronouns (obvious subject/object)")
803
- uni_maxc = gr.Checkbox(value=False, label="Max Exact Compression (sidecar `~...`)")
804
 
805
  uni_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
806
  with gr.Row():
807
- btn_translate = gr.Button("🚀 Translate", variant="primary")
808
- btn_reset = gr.Button("🧹 Clear")
809
  uni_out = gr.Textbox(lines=6, label="Translation", show_copy_button=True)
810
 
811
- btn_translate.click(
812
- universal_translate,
813
- [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_mode, uni_maxc, uni_rmpr],
814
- [uni_out]
815
- )
816
- btn_reset.click(lambda: ("",""), None, [uni_text, uni_out])
817
 
818
- with gr.Accordion("What does this tab do?", open=False):
819
- gr.Markdown("Translate **between any pair** (ES/EN/Minimax/Kōmín). With **Max Exact Compression**, a `~...` trailer stores the original for bit-perfect recovery.")
820
 
821
  with gr.Tab("🛠️ Build (ES/EN → Conlang)"):
822
  with gr.Row():
@@ -824,25 +750,23 @@ def make_group_en():
824
  target = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
825
  text_in = gr.Textbox(lines=3, label="Sentence", show_copy_button=True)
826
  with gr.Row():
827
- drop_articles = gr.Checkbox(value=True, label="Drop articles")
828
- zero_copula = gr.Checkbox(value=False, label="Zero copula (present affirmative)")
829
- rm_pron_build = gr.Checkbox(value=False, label="Remove pronouns")
830
- max_comp_build = gr.Checkbox(value=False, label="Max Exact Compression")
831
  mode_build = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
832
  with gr.Row():
833
- btn_build = gr.Button("🏗️ Build", variant="primary")
834
- btn_build_clear = gr.Button("🧹 Clear")
835
  out = gr.Textbox(lines=6, label="Output", show_copy_button=True)
836
 
837
- btn_build.click(
838
- build_sentence,
839
- [text_in, src_lang, target, drop_articles, zero_copula, mode_build, max_comp_build, rm_pron_build],
840
- [out]
841
- )
842
- btn_build_clear.click(lambda: ("",""), None, [text_in, out])
843
 
844
- with gr.Accordion("What does this tab do?", open=False):
845
- gr.Markdown("Forces **conlang output** from ES/EN, applying phrasing rules (order, particles, TAM) and your compacting options.")
846
 
847
  with gr.Tab("🗝️ Decode (Conlang → ES/EN)"):
848
  with gr.Row():
@@ -859,59 +783,41 @@ def make_group_en():
859
  return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)
860
 
861
  with gr.Row():
862
- btn_decode = gr.Button("🔓 Decode", variant="primary")
863
- btn_decode_clear = gr.Button("🧹 Clear")
864
 
865
- btn_decode.click(decode_lossless_aware, [code_in, src_code, tgt_lang], [out3])
866
- btn_decode_clear.click(lambda: ("",""), None, [code_in, out3])
867
 
868
- with gr.Accordion("What does this tab do?", open=False):
869
- gr.Markdown("Converts **Minimax/Kōmín → ES/EN**. If a `~...` trailer is present, recovery is **bit-perfect**.")
870
 
871
  with gr.Tab("🔄 Round-trip"):
872
  with gr.Row():
873
  rt_src = gr.Dropdown(["Español","English"], value="English", label="Source")
874
  rt_tgt = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
875
  rt_text = gr.Textbox(lines=3, label="Sentence", show_copy_button=True)
876
- rt_max_comp = gr.Checkbox(value=False, label="Max Exact Compression")
877
  rt_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
878
  rt_out_conlang = gr.Textbox(lines=3, label="Outward (conlang)", show_copy_button=True)
879
  rt_out_back = gr.Textbox(lines=3, label="Back", show_copy_button=True)
880
  with gr.Row():
881
  btn_rt = gr.Button("▶️ Test", variant="primary")
882
- btn_rt_clear = gr.Button("🧹 Clear")
883
 
884
  btn_rt.click(round_trip, [rt_text, rt_src, rt_tgt, rt_mode, rt_max_comp], [rt_out_conlang, rt_out_back])
885
- btn_rt_clear.click(lambda: ("","",""), None, [rt_text, rt_out_conlang, rt_out_back])
886
 
887
- with gr.Accordion("What does this tab do?", open=False):
888
- gr.Markdown("Performs **ES/EN → Conlang → ES/EN** to verify **reversibility**. With **Max Exact Compression**, the return matches the input bit-for-bit.")
 
889
 
890
- gr.Markdown("---")
891
- gr.Markdown("Made with ❤️ · **spaCy** (optional) · Everything runs inside this Space.")
892
- return group
893
-
894
- # ============================== Pestaña global de Léxico ==============================
895
- def make_lexicon_tab():
896
- with gr.TabItem("ℹ️ Léxico / Lexicon (Global)"):
897
- gr.Markdown("## 🧱 Construcción del léxico / Lexicon build")
898
- with gr.Row():
899
- with gr.Column():
900
- with gr.Accordion("Resumen (ES)", open=True): gr.Markdown(LEXICON_BUILD_ES)
901
- with gr.Column():
902
- with gr.Accordion("Summary (EN)", open=False): gr.Markdown(LEXICON_BUILD_EN)
903
- gr.Markdown("### 👀 Vista de ejemplo (primeras filas de `lexicon_master.json`)")
904
- n_rows = gr.Slider(5, 100, value=20, step=5, label="Filas/Rows")
905
- table = gr.Dataframe(headers=["lemma_es","lemma_en","minimax","komin"], row_count=1, interactive=False)
906
- gr.Button("Actualizar / Refresh").click(lambda n: master_preview(int(n)), [n_rows], [table])
907
-
908
- # ================================ Lanzador de la app =================================
909
  with gr.Blocks(title="Universal Conlang Translator", theme=gr.themes.Soft()) as demo:
910
  gr.Markdown("## 🌍 Idioma / Language")
911
- lang_select = gr.Radio(choices=["ES","EN"], value="ES", label="Selecciona / Select")
912
  group_es = make_group_es()
913
  group_en = make_group_en()
914
- make_lexicon_tab()
915
 
916
  def switch_lang(code):
917
  if code == "EN":
@@ -927,3 +833,4 @@ if __name__ == "__main__":
927
 
928
 
929
 
 
 
1
  # app.py — Universal Conlang Translator (Max Compresión Exacta)
2
+ # Archivos requeridos en la raíz:
 
3
  # - lexicon_minimax.json
4
  # - lexicon_komin.json
5
  # - lexicon_master.json
 
22
  # ------------ Normalización ------------
23
  WORD_RE = re.compile(r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ]+", re.UNICODE)
24
  STRIP = str.maketrans("ÁÉÍÓÚÜÑáéíóúüñ", "AEIOUUNaeiouun")
25
+ def norm_es(w: str) -> str: return re.sub(r"[^a-záéíóúüñ]", "", (w or "").lower()).translate(STRIP)
26
+ def norm_en(w: str) -> str: return re.sub(r"[^a-z]", "", (w or "").lower())
 
 
 
 
27
 
28
  # ------------ Carga de léxicos ------------
29
  def load_json(path: str):
30
  if not os.path.exists(path): return None
31
+ with open(path, "r", encoding="utf-8") as f: return json.load(f)
 
32
 
33
  def load_lexicons():
34
  mm = load_json(LEX_MINI) or {}
 
47
 
48
  if isinstance(master, dict) and "entries" in master:
49
  for e in master["entries"]:
50
+ es = norm_es(str(e.get("lemma_es",""))); en = norm_en(str(e.get("lemma_en","")))
51
+ mi = str(e.get("minimax","")); ko = str(e.get("komin",""))
 
 
52
  if es and en:
53
+ es2en_lemma.setdefault(es, en); en2es_lemma.setdefault(en, es)
 
54
  if en and mi: en2mini.setdefault(en, mi)
55
  if en and ko: en2komi.setdefault(en, ko)
56
 
57
  mini2en = {v:k for k,v in en2mini.items()}
58
  komi2en = {v:k for k,v in en2komi.items()}
 
59
  return (es2mini, es2komi, mini2es, komi2es,
60
  en2mini, en2komi, mini2en, komi2en,
61
  es2en_lemma, en2es_lemma, master)
 
82
  std = base64.b64encode(b).decode("ascii")
83
  trans = str.maketrans("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/", alphabet)
84
  return std.translate(trans).rstrip("=")
 
85
  def from_custom_b64(s: str, alphabet: str) -> bytes:
86
  trans = str.maketrans(alphabet, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/")
87
+ std = s.translate(trans); pad = "=" * ((4 - len(std) % 4) % 4)
 
88
  return base64.b64decode(std + pad)
89
 
90
  def enc_oov_minimax(token: str) -> str: return "~" + to_custom_b64(token.encode("utf-8"), ALPHA_MINI64)
 
103
  try:
104
  import spacy
105
  try:
106
+ nlp_es = spacy.load("es_core_news_sm"); nlp_en = spacy.load("en_core_web_sm"); USE_SPACY = True
 
 
107
  except Exception:
108
  nlp_es = nlp_en = None
109
  except Exception:
 
111
 
112
  def lemma_of(tok, src_lang: str) -> str:
113
  if src_lang == "Español":
114
+ return norm_es(tok.lemma_ if getattr(tok,"lemma_","") else tok.text)
115
  else:
116
+ return norm_en(tok.lemma_ if getattr(tok,"lemma_","") else tok.text)
 
 
 
117
 
118
+ # ------------ Detección simple ------------
119
+ def detect_polarity(doc) -> bool: return "?" in getattr(doc,"text","")
120
  def detect_neg(doc) -> bool:
121
  for t in doc:
122
+ if getattr(t,"dep_","")=="neg" or getattr(t,"lower_","").lower() in ("no","not","n't"):
123
  return True
124
  return False
 
125
  def detect_tense(root):
126
+ m = str(getattr(root,"morph",""))
127
  if "Tense=Past" in m: return "Past"
128
  if "Tense=Fut" in m: return "Fut"
129
  if "Tense=Pres" in m: return "Pres"
130
+ for c in getattr(root,"children",[]):
131
+ if getattr(c,"pos_","")=="AUX":
132
+ cm = str(getattr(c,"morph",""))
133
  if "Tense=Past" in cm: return "Past"
134
+ if getattr(c,"lower_","").lower()=="will": return "Fut"
135
  return "Pres"
 
136
  def extract_core(doc):
137
  tokens = list(doc)
138
+ root = next((t for t in tokens if getattr(t,"dep_","")=="ROOT" and getattr(t,"pos_","") in ("VERB","AUX")), tokens[0] if tokens else doc)
139
  subs, objs, obls, advs = [], [], [], []
140
+ for t in getattr(root,"children",[]):
141
+ dep = getattr(t,"dep_",""); pos = getattr(t,"pos_","")
 
142
  if dep in ("nsubj","nsubj:pass","csubj"): subs.append(t)
143
  elif dep in ("obj","dobj","iobj"): objs.append(t)
144
  elif dep in ("obl","pobj"): obls.append(t)
145
  elif dep in ("advmod","advcl") and pos=="ADV": advs.append(t)
146
+ for arr in (subs,objs,obls,advs): arr.sort(key=lambda x: getattr(x,"i",0))
 
147
  return root, subs, objs, obls, advs
 
148
  def _person_of_doc(doc, src_lang: str) -> Optional[str]:
149
  try:
150
  tokens = list(doc)
151
+ root = next((t for t in tokens if getattr(t,"dep_","")=="ROOT"), tokens[0])
152
+ subj = next((t for t in getattr(root,"children",[]) if getattr(t,"dep_","").startswith("nsubj")), None)
153
  if subj is None: return None
154
+ plur = ("Number=Plur" in str(getattr(subj,"morph",""))) if src_lang=="Español" else (getattr(subj,"tag_","") in ("NNS","NNPS"))
155
+ low = getattr(subj,"lower_","").lower()
156
  if src_lang=="Español":
157
  if low in ("yo",): return "1p" if plur else "1s"
158
  if low in ("tú","vos"): return "2p" if plur else "2s"
 
168
  return "3p" if plur else "3s"
169
  except Exception:
170
  return None
 
171
  def detect_person(root, src_lang: str) -> Optional[str]:
172
+ m = str(getattr(root,"morph","")); person_str, number_str = "3","s"
 
173
  if "Person=" in m:
174
  for feat in m.split("|"):
175
  if feat.startswith("Person="): person_str = feat.split("=")[1]
176
+ elif feat.startswith("Number="): number_str = "p" if feat.split("=")[1]=="Plur" else "s"
177
  return person_str + number_str
178
  return _person_of_doc(root.doc, src_lang)
179
 
180
+ # ------------ Mapeo y fraseadores ------------
181
  def code_es(lemma: str, target: str) -> str:
182
  lemma = norm_es(lemma)
183
+ if target=="Minimax-ASCII":
184
+ return ES2MINI.get(lemma) or enc_oov_minimax(lemma)
185
+ return ES2KOMI.get(lemma) or enc_oov_komin(lemma)
186
  def code_en(lemma: str, target: str) -> str:
187
  lemma = norm_en(lemma)
188
+ if target=="Minimax-ASCII":
189
  return (EN2MINI.get(lemma) if EN2MINI else None) or enc_oov_minimax(lemma)
190
+ return (EN2KOMI.get(lemma) if EN2KOMI else None) or enc_oov_komin(lemma)
 
191
 
192
+ TAM_MINI = {"Pres":"P","Past":"T","Fut":"F","UNK":"P"}
193
+ TAM_KOMI = {"Pres":"Ⓟ","Past":"Ⓣ","Fut":"Ⓕ","UNK":"Ⓟ"}
194
 
195
  def realize_minimax(doc, src_lang: str, drop_articles=True, zero_copula=True,
196
  semi_lossless=False, person_hint="2s", remove_pronouns=False):
197
  root, subs, objs, obls, advs = extract_core(doc)
198
+ tense = detect_tense(root); is_q, is_neg = detect_polarity(doc), detect_neg(doc)
 
199
  vlem = lemma_of(root, src_lang) if USE_SPACY else ("ser" if "?" in getattr(doc,"text","") else "estar")
200
  vcode = code_es(vlem, "Minimax-ASCII") if src_lang=="Español" else code_en(vlem, "Minimax-ASCII")
201
  tail = TAM_MINI.get(tense, "P")
202
+ if semi_lossless: tail += (detect_person(root, src_lang) or person_hint)
203
+ if is_neg: tail += "N";
 
 
204
  if is_q: tail += "Q"
205
  if tail: vcode = f"{vcode}·{tail}"
206
 
 
209
  for t in tokens:
210
  if remove_pronouns:
211
  txt = (getattr(t,"text","") or "").lower()
212
+ if (src_lang=="Español" and txt in PRON_ES) or (src_lang=="English" and txt in PRON_EN): continue
213
+ lem = lemma_of(t, src_lang) if USE_SPACY else getattr(t,"text","")
214
+ outs.append(code_es(lem,"Minimax-ASCII") if src_lang=="Español" else code_en(lem,"Minimax-ASCII"))
 
 
215
  return outs
216
 
217
+ S = realize_np(subs); O = realize_np(objs)+realize_np(obls)
 
218
  ADV=[]
219
  for a in advs:
220
+ lem = lemma_of(a, src_lang) if USE_SPACY else getattr(a,"text","")
221
+ ADV.append(code_es(lem,"Minimax-ASCII") if src_lang=="Español" else code_en(lem,"Minimax-ASCII"))
222
 
223
+ parts = S+O+ADV if (zero_copula and not semi_lossless and vlem in ("ser","estar","be") and tense=="Pres" and not is_neg and not is_q) else [vcode]+S+O+ADV
 
 
 
224
  return " ".join(p for p in parts if p)
225
 
226
  def realize_komin(doc, src_lang: str, drop_articles=True, zero_copula=True,
 
229
  tense, is_q, is_neg = detect_tense(root), detect_polarity(doc), detect_neg(doc)
230
  vlem = lemma_of(root, src_lang) if USE_SPACY else ("ser" if "?" in getattr(doc,"text","") else "estar")
231
  vcode = code_es(vlem, "Kōmín-CJK") if src_lang=="Español" else code_en(vlem, "Kōmín-CJK")
232
+ P_SUBJ, P_OBJ = "ᵖ", "ᵒ"; NEG_M, Q_FIN = "̆", "?"
233
+ TAM = TAM_KOMI.get(tense,"")
234
+ if semi_lossless: TAM = TAM + f"[{detect_person(root, src_lang) or person_hint}]"
 
 
 
 
235
 
236
  def realize_np(tokens, particle):
237
  outs=[]
238
  for t in tokens:
239
  if remove_pronouns:
240
  txt = (getattr(t,"text","") or "").lower()
241
+ if (src_lang=="Español" and txt in PRON_ES) or (src_lang=="English" and txt in PRON_EN): continue
242
+ lem = lemma_of(t, src_lang) if USE_SPACY else getattr(t,"text","")
243
+ outs.append((code_es(lem,"Kōmín-CJK") if src_lang=="Español" else code_en(lem,"Kōmín-CJK")) + particle)
 
 
244
  return outs
245
 
246
+ S = realize_np(subs, P_SUBJ); O = realize_np(objs+obls, P_OBJ)
 
247
  ADV=[]
248
  for a in advs:
249
+ lem = lemma_of(a, src_lang) if USE_SPACY else getattr(a,"text","")
250
+ ADV.append(code_es(lem,"Kōmín-CJK") if src_lang=="Español" else code_en(lem,"Kōmín-CJK"))
 
251
 
252
+ parts = S+O+ADV if (zero_copula and not semi_lossless and vlem in ("ser","estar","be") and tense=="Pres" and not is_neg and not is_q) else S+O+ADV+[vcode+TAM+("̆" if is_neg else "")]
 
 
 
253
  out = " ".join(parts)
254
  if is_q: out += " " + Q_FIN
255
  return out
 
284
  code = ES2MINI.get(key) if target=="Minimax-ASCII" else ES2KOMI.get(key)
285
  return code or (enc_oov_minimax(m.group(0)) if target=="Minimax-ASCII" else enc_oov_komin(m.group(0)))
286
  def repl_en(m):
287
+ key = norm_en(m.group(0)); table = EN2MINI if target=="Minimax-ASCII" else EN2KOMI
 
288
  if table and key in table: return table[key]
289
  return enc_oov_minimax(m.group(0)) if target=="Minimax-ASCII" else enc_oov_komin(m.group(0))
290
  repl = repl_es if src_lang=="Español" else repl_en
291
  return WORD_RE.sub(repl, text)
292
 
293
  def pluralize_es(word: str) -> str:
294
+ exceptions = {"uno":"unos","buen":"buenos","hombre":"hombres"}
295
  if word in exceptions: return exceptions[word]
296
+ if word.endswith("z"): return word[:-1]+"ces"
297
+ if word.endswith(("a","e","i","o")): return word+"s"
298
+ return word+"es"
 
299
  def pluralize_en(word: str) -> str:
300
  exceptions = {"man":"men","woman":"women","child":"children"}
301
  if word in exceptions: return exceptions[word]
302
+ if word.endswith("y") and len(word)>1 and word[-2] not in "aeiou": return word[:-1]+"ies"
303
+ if word.endswith(("s","sh","ch","x","z")): return word+"es"
304
+ return word+"s"
305
+ def pluralize(word: str, tgt_lang: str) -> str: return pluralize_es(word) if tgt_lang=="Español" else pluralize_en(word)
 
 
306
 
307
  mini_tail_re = re.compile(r"^(?P<stem>.+?)·(?P<tail>[PTFNQ12sp]+)$")
308
 
 
310
  if not text.strip(): return ""
311
  code2es = MINI2ES if source=="Minimax-ASCII" else KOMI2ES
312
  code2en = MINI2EN if source=="Minimax-ASCII" else KOMI2EN
313
+ if source=="Kōmín-CJK":
314
+ text = text.replace("?","?").replace(" "," ")
315
+ return " ".join([code2es.get(w,w) for w in text.split() if w!="?"])
316
+ tokens = text.split();
 
 
317
  if not tokens: return ""
318
+ lemma_tokens, pl_flags = [], []; verb_idx=-1; verb_lemma=None; verb_tense="Pres"; verb_person="3s"; has_q=False; is_neg=False
 
 
 
 
319
  for part in tokens:
320
+ look = part.replace("[PL]",""); had_pl = "[PL]" in part; pl_flags.append(had_pl)
 
 
 
321
  m = mini_tail_re.match(look)
322
  if m:
323
+ verb_idx = len(lemma_tokens); stem=m.group("stem"); tail=m.group("tail")
 
324
  vlem_es = code2es.get(stem); vlem_en = code2en.get(stem) if code2en else None
325
  vlem = vlem_es if tgt_lang=="Español" else (vlem_en or vlem_es or stem)
326
+ if not vlem: vlem = dec_oov_minimax(stem) if is_oov_minimax(stem) else stem
 
327
  lemma_tokens.append(vlem); pl_flags.append(False)
328
  if tail:
329
  if tail[0] in "PTF":
330
+ verb_tense = {"P":"Pres","T":"Past","F":"Fut"}[tail[0]]; pos=1
 
331
  if len(tail)>pos and tail[pos] in "123":
332
+ pos+=1; verb_person = tail[pos-1] + (tail[pos] if len(tail)>pos and tail[pos] in "sp" else "s")
 
333
  if len(tail)>pos and tail[pos] in "sp": pos+=1
334
+ is_neg = "N" in tail[pos:]; has_q = "Q" in tail[pos:]
335
+ verb_lemma = vlem; continue
 
 
 
336
  w_es = code2es.get(look); w_en = code2en.get(look) if code2en else None
337
  w = w_es if tgt_lang=="Español" else (w_en or w_es or look)
338
  if not w: w = dec_oov_minimax(look) if is_oov_minimax(look) else look
339
  lemma_tokens.append(w); pl_flags.append(had_pl)
 
340
  out_parts=[]
341
  for idx, lem in enumerate(lemma_tokens):
342
+ if idx==verb_idx:
343
  v_conj = _es_conj(verb_lemma, verb_tense, verb_person) if tgt_lang=="Español" else _en_conj(verb_lemma, verb_tense, verb_person)
344
  if is_neg: v_conj = ("no " if tgt_lang=="Español" else "not ") + v_conj
345
  out_parts.append(v_conj)
346
  else:
347
  out_parts.append(pluralize(lem, tgt_lang) if pl_flags[idx] else lem)
 
348
  out_text = " ".join(out_parts)
349
  if has_q:
350
  start_q = "¿" if tgt_lang=="Español" else ""
 
355
  def _es_conj_regular(lemma, tense, person):
356
  if not lemma.endswith(("ar","er","ir")): return lemma
357
  stem, vtype = lemma[:-2], lemma[-2:]
358
+ pres={"ar":{"1s":"o","2s":"as","3s":"a","1p":"amos","2p":"áis","3p":"an"},
359
+ "er":{"1s":"o","2s":"es","3s":"e","1p":"emos","2p":"éis","3p":"en"},
360
+ "ir":{"1s":"o","2s":"es","3s":"e","1p":"imos","2p":"ís","3p":"en"}}
361
+ pret={"ar":{"1s":"é","2s":"aste","3s":"ó","1p":"amos","2p":"asteis","3p":"aron"},
362
+ "er":{"1s":"í","2s":"iste","3s":"ió","1p":"imos","2p":"isteis","3p":"ieron"},
363
+ "ir":{"1s":"í","2s":"iste","3s":"ió","1p":"imos","2p":"isteis","3p":"ieron"}}
364
+ fut={"1s":"é","2s":"ás","3s":"á","1p":"emos","2p":"éis","3p":"án"}
365
  if tense=="Pres": return stem + pres[vtype].get(person, pres[vtype]["3s"])
366
  if tense=="Past": return stem + pret[vtype].get(person, pret[vtype]["3s"])
367
  return lemma + fut.get(person, fut["3s"])
 
368
  def _es_conj(lemma, tense, person):
369
  if lemma=="ser":
370
  tab={"Pres":{"1s":"soy","2s":"eres","3s":"es","1p":"somos","2p":"sois","3p":"son"},
 
382
  "Fut":{"1s":"iré","2s":"irás","3s":"irá","1p":"iremos","2p":"iréis","3p":"irán"}}
383
  return tab[tense].get(person, tab[tense]["3s"])
384
  return _es_conj_regular(lemma, tense, person)
 
385
  def _en_conj(lemma, tense, person):
386
  if lemma=="be":
387
  if tense=="Pres": return {"1s":"am","2s":"are","3s":"is","1p":"are","2p":"are","3p":"are"}.get(person,"is")
 
415
  remove_pronouns: bool) -> str:
416
  nlp = nlp_es if src_lang=="Español" else nlp_en
417
  doc = nlp(text)
418
+ if target=="Minimax-ASCII":
419
+ return realize_minimax(doc, src_lang, drop_articles, zero_copula, semi_lossless=True, remove_pronouns=remove_pronouns)
 
420
  else:
421
+ return realize_komin(doc, src_lang, drop_articles, zero_copula, semi_lossless=True, remove_pronouns=remove_pronouns)
 
422
 
423
  def build_sentence(text: str, src_lang: str, target: str,
424
  drop_articles: bool, zero_copula: bool, mode: str,
425
  max_comp_exact: bool = False, remove_pronouns: bool = False) -> str:
426
  if not text.strip(): return ""
 
427
  if USE_SPACY:
428
+ core = _build_with_spacy(text, src_lang, target, drop_articles, zero_copula, True, remove_pronouns)
 
429
  else:
 
430
  if remove_pronouns:
431
  pron = PRON_ES if src_lang=="Español" else PRON_EN
432
  tokens = re.findall(r"\w+|[^\w\s]+", text)
 
455
  if orig is not None:
456
  core = strip_custom_sidecar(text)
457
  es_lemmas = decode_simple(core, src, "Español")
458
+ words = re.findall(r"\w+|[^\w\s]+", es_lemmas); out=[]
 
459
  for w in words:
460
  if re.fullmatch(r"\w+", w):
461
  code = ES2MINI.get(norm_es(w)) if tgt=="Minimax-ASCII" else ES2KOMI.get(norm_es(w))
462
  out.append(code or (enc_oov_minimax(w) if tgt=="Minimax-ASCII" else enc_oov_komin(w)))
463
+ else: out.append(w)
 
464
  return custom_sidecar_enc(" ".join(out), orig)
465
  es_lemmas = decode_simple(text, src, "Español")
466
+ words = re.findall(r"\w+|[^\w\s]+", es_lemmas); out=[]
 
467
  for w in words:
468
  if re.fullmatch(r"\w+", w):
469
  code = ES2MINI.get(norm_es(w)) if tgt=="Minimax-ASCII" else ES2KOMI.get(norm_es(w))
470
  out.append(code or (enc_oov_minimax(w) if tgt=="Minimax-ASCII" else enc_oov_komin(w)))
471
+ else: out.append(w)
 
472
  return " ".join(out)
473
  return "[No soportado]"
474
 
 
476
  if not text.strip(): return ""
477
  if not USE_SPACY: return text
478
  nlp = nlp_es if src_lang=="Español" else nlp_en
479
+ doc = nlp(text); out=[]
 
480
  for t in doc:
481
+ if not getattr(t,"is_alpha",False): out.append(getattr(t,"text","")); continue
 
482
  lem = lemma_of(t, src_lang)
483
  if src_lang=="Español":
484
  tr = ES2EN_LEMMA.get(lem); out.append(tr if tr else lem)
 
492
  return conlang, back
493
 
494
  # =====================================================================================
495
+ # ========================= UI bilingüe y explicaciones claras ========================
496
  # =====================================================================================
497
 
498
  ALL_LANGS = ["Español","English","Minimax-ASCII","Kōmín-CJK"]
499
 
500
+ # Secciones de ayuda (ES/EN) — todas en el MISMO nivel, como acordeones
501
  COMPACT_ES = """
502
+ **📏 Compactación orientativa (haz clic para desplegar)**
503
+ - Sin casillas: **0%**
504
+ - Omitir artículos: **~10–15%**
505
+ - Cópula cero (presente afirm.): **~5–10%**
506
+ - Ambas (artículos + cópula): **~15–20%**
507
+ - Máx. Compresión Exacta: **~40–60%** en textos >100 caracteres (con `~...`). En textos muy cortos puede no reducir.
508
  """
509
  COMPACT_EN = """
510
+ **📏 Typical compaction (click to expand)**
511
+ - No options: **0%**
512
+ - Drop articles: **~10–15%**
513
+ - Zero copula (present affirmative): **~5–10%**
514
+ - Both (articles + copula): **~15–20%**
515
+ - Max Exact Compression: **~40–60%** for >100 chars (`~...`). Very short texts may not shrink.
516
+ """
517
+
518
+ EXPLAIN_TAB_TRANSLATE_ES = """
519
+ **🔁 Traducir (haz clic para desplegar)**
520
+ Convierte el *Texto* al *Destino*. Funciona para **cualquier combinación**: Español, English, Minimax-ASCII, Kōmín-CJK.
521
+ - **Máx. Compresión Exacta** añade `~...` con el original comprimido para poder **recuperarlo exactamente** al decodificar.
522
+ - **Omitir artículos / Cópula cero / Quitar pronombres** se aplican **solo cuando el destino es conlang** (Minimax/Kōmín).
523
+ """
524
+ EXPLAIN_TAB_BUILD_ES = """
525
+ **🛠️ Construir (ES/EN → Conlang) (haz clic para desplegar)**
526
+ Fuerza la salida **en conlang** desde Español o Inglés aplicando reglas de fraseo (orden, partículas/TAM) y tus **checkbox**.
527
+ Útil para ver cómo quedaría la frase **directamente en Minimax/Kōmín** sin ambigüedad de direcciones.
528
+ """
529
+ EXPLAIN_TAB_DECODE_ES = """
530
+ **🗝️ Decodificar (Conlang → ES/EN) (haz clic para desplegar)**
531
+ Convierte **Minimax/Kōmín** a **Español o Inglés**.
532
+ - Si hay `~...`, devuelve el **original exacto**.
533
+ - Sin `~...`, la vuelta es **semi-lossless** usando el léxico y pistas simples.
534
+ """
535
+ EXPLAIN_TAB_ROUNDTRIP_ES = """
536
+ **🔄 Prueba ida→vuelta (haz clic para desplegar)**
537
+ Ejecuta **(ES/EN → Conlang) → (Conlang → ES/EN)** para comprobar **reversibilidad**.
538
+ Con **Máx. Compresión Exacta**, la vuelta coincide **bit a bit** con la entrada.
539
+ """
540
+ EXPLAIN_CHECKBOX_ES = """
541
+ **☑️ ¿Qué hace cada checkbox? (haz clic para desplegar)**
542
+ - **Omitir artículos**: quita *el/la/los/las* (ES) y *a/an/the* (EN) → **~10–15%**.
543
+ - **Cópula cero (presente afirm.)**: esconde *ser/estar/be* cuando suena natural → **~5–10%** extra.
544
+ - **Quitar pronombres**: elimina pronombres de sujeto/objeto **evidentes** (ahorro variable).
545
+ - **Máx. Compresión Exacta**: añade `~...` con zlib para recuperación exacta (**~40–60%** en >100 caracteres).
546
  """
547
 
548
  LEXICON_BUILD_ES = """
549
+ **ℹ️ Léxico (OMW → Minimax/Kōmín) (haz clic para desplegar)**
550
+ 1) Desde **OMW/WordNet 1.4** se extraen **lemas ES** y sus **equivalentes EN** por sinset.
551
+ 2) Se normalizan y ordenan por **frecuencia** (wordfreq).
552
  3) Opcional: **spaCy** refina lemas; **Argos** puede rellenar EN faltantes.
553
+ 4) Se asignan **códigos compactos** con alfabetos barajados por **SEED** hasta `MAXLEN_MINI`/`MAXLEN_CJK`.
554
+ 5) Se exportan: `lexicon_minimax.json`, `lexicon_komin.json`, `lexicon_master.json` (+TSV).
555
+ **Vista previa** de `lexicon_master.json` abajo.
556
+ """
557
+
558
+ # (EN) versiones cortas
559
+ EXPLAIN_TAB_TRANSLATE_EN = """
560
+ **🔁 Translate (click to expand)** — Converts *Text* to *Target* (any pair: Spanish/English/Minimax/Kōmín).
561
+ With **Max Exact Compression**, appends `~...` to recover the **exact original**. Checkboxes apply when **target is conlang**.
562
+ """
563
+ EXPLAIN_TAB_BUILD_EN = """
564
+ **🛠️ Build (ES/EN → Conlang) (click to expand)** — Forces conlang output (Minimax/Kōmín) with phrasing rules and your checkboxes.
565
+ """
566
+ EXPLAIN_TAB_DECODE_EN = """
567
+ **🗝️ Decode (Conlang → ES/EN) (click to expand)** — If `~...` is present, returns the **bit-perfect original**; otherwise semi-lossless.
568
+ """
569
+ EXPLAIN_TAB_ROUNDTRIP_EN = """
570
+ **🔄 Round-trip (click to expand)** — Runs (ES/EN → Conlang) → (Conlang → ES/EN) to verify reversibility.
571
+ """
572
+ EXPLAIN_CHECKBOX_EN = """
573
+ **☑️ Checkboxes (click to expand)**
574
+ - **Drop articles**: ~10–15%
575
+ - **Zero copula (present affirm.)**: ~5–10% extra
576
+ - **Remove pronouns**: variable
577
+ - **Max Exact Compression**: ~40–60% for >100 chars (`~...`), exact recovery.
578
  """
579
  LEXICON_BUILD_EN = """
580
+ **ℹ️ Lexicon (OMW Minimax/Kōmín) (click to expand)** — OMW/WordNet ES lemmas + EN counterparts, normalized & frequency-sorted; optional spaCy/Argos; codes assigned with SEED-shuffled alphabets up to MAXLEN; exports JSON/TSV. Preview below.
 
 
 
 
 
581
  """
582
 
 
 
 
583
  def master_preview(n: int = 20) -> List[List[Any]]:
584
  try:
585
  entries = (MASTER_OBJ or {}).get("entries", [])
 
591
  except Exception:
592
  return [["lemma_es","lemma_en","minimax","komin"], ["(no data)","","",""]]
593
 
594
+ # ========================= Grupos ES / EN =========================
595
  def make_group_es():
596
+ with gr.Group(visible=True) as g:
597
  gr.Markdown("# 🌐 Universal Conlang Translator · Compresión Exacta (ES)")
598
+ # Acordeones de EXPLICACIÓN todos al MISMO nivel
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
599
  with gr.Row():
600
  with gr.Column():
601
+ with gr.Accordion(EXPLAIN_TAB_TRANSLATE_ES, open=False): pass
602
+ with gr.Accordion(EXPLAIN_TAB_BUILD_ES, open=False): pass
603
+ with gr.Accordion(EXPLAIN_TAB_DECODE_ES, open=False): pass
604
+ with gr.Accordion(EXPLAIN_TAB_ROUNDTRIP_ES, open=False): pass
 
 
605
  with gr.Column():
606
+ with gr.Accordion(EXPLAIN_CHECKBOX_ES, open=False): gr.Markdown(COMPACT_ES)
607
+ with gr.Accordion(LEXICON_BUILD_ES, open=False):
608
+ n_rows = gr.Slider(5, 100, value=20, step=5, label="Filas a mostrar")
609
+ table = gr.Dataframe(headers=["lemma_es","lemma_en","minimax","komin"], row_count=1, interactive=False)
610
+ gr.Button("Actualizar vista").click(lambda n: master_preview(int(n)), [n_rows], [table])
611
 
612
+ # ==== Tabs funcionales ====
613
  with gr.Tab("🔁 Traducir"):
614
  with gr.Row():
615
  uni_src = gr.Dropdown(ALL_LANGS, value="Español", label="Fuente")
616
  uni_tgt = gr.Dropdown(ALL_LANGS, value="Minimax-ASCII", label="Destino")
617
  uni_text = gr.Textbox(lines=3, label="Texto", placeholder="Ej.: Hola, ¿cómo estás?", show_copy_button=True)
618
  with gr.Row():
619
+ uni_drop = gr.Checkbox(True, label="Omitir artículos (ES/EN → conlang)")
620
+ uni_zero = gr.Checkbox(False, label="Cópula cero (presente afirm.)")
621
+ uni_rmpr = gr.Checkbox(False, label="Quitar pronombres")
622
+ uni_maxc = gr.Checkbox(False, label="Máx. Compresión Exacta (sidecar `~...`)")
623
 
624
  uni_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
625
  with gr.Row():
626
+ btn_tr = gr.Button("🚀 Traducir", variant="primary")
627
+ btn_tr_cl = gr.Button("🧹 Limpiar")
628
  uni_out = gr.Textbox(lines=6, label="Traducción", show_copy_button=True)
629
 
630
+ btn_tr.click(universal_translate,
631
+ [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_mode, uni_maxc, uni_rmpr],
632
+ [uni_out])
633
+ btn_tr_cl.click(lambda: ("",""), None, [uni_text, uni_out])
 
 
634
 
635
+ with gr.Accordion("Ayuda rápida (¿qué hace este botón?)", open=False):
636
+ gr.Markdown(EXPLAIN_TAB_TRANSLATE_ES + "\n\n" + COMPACT_ES)
637
 
638
  with gr.Tab("🛠️ Construir (ES/EN → Conlang)"):
639
  with gr.Row():
 
641
  target = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
642
  text_in = gr.Textbox(lines=3, label="Frase", show_copy_button=True)
643
  with gr.Row():
644
+ drop_articles = gr.Checkbox(True, label="Omitir artículos")
645
+ zero_copula = gr.Checkbox(False, label="Cópula cero (presente afirm.)")
646
+ rm_pron_build = gr.Checkbox(False, label="Quitar pronombres")
647
+ max_comp_build = gr.Checkbox(False, label="Máx. Compresión Exacta")
648
  mode_build = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
649
  with gr.Row():
650
+ btn_b = gr.Button("🏗️ Construir", variant="primary")
651
+ btn_b_cl = gr.Button("🧹 Limpiar")
652
  out = gr.Textbox(lines=6, label="Salida", show_copy_button=True)
653
 
654
+ btn_b.click(build_sentence,
655
+ [text_in, src_lang, target, drop_articles, zero_copula, mode_build, max_comp_build, rm_pron_build],
656
+ [out])
657
+ btn_b_cl.click(lambda: ("",""), None, [text_in, out])
 
 
658
 
659
+ with gr.Accordion("Ayuda rápida (¿qué hace este botón?)", open=False):
660
+ gr.Markdown(EXPLAIN_TAB_BUILD_ES + "\n\n" + COMPACT_ES)
661
 
662
  with gr.Tab("🗝️ Decodificar (Conlang → ES/EN)"):
663
  with gr.Row():
 
674
  return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)
675
 
676
  with gr.Row():
677
+ btn_d = gr.Button("🔓 Decodificar", variant="primary")
678
+ btn_d_cl = gr.Button("🧹 Limpiar")
679
 
680
+ btn_d.click(decode_lossless_aware, [code_in, src_code, tgt_lang], [out3])
681
+ btn_d_cl.click(lambda: ("",""), None, [code_in, out3])
682
 
683
+ with gr.Accordion("Ayuda rápida (¿qué hace este botón?)", open=False):
684
+ gr.Markdown(EXPLAIN_TAB_DECODE_ES)
685
 
686
  with gr.Tab("🔄 Prueba ida→vuelta"):
687
  with gr.Row():
688
  rt_src = gr.Dropdown(["Español","English"], value="Español", label="Fuente")
689
  rt_tgt = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
690
  rt_text = gr.Textbox(lines=3, label="Frase", show_copy_button=True)
691
+ rt_max_comp = gr.Checkbox(False, label="Máx. Compresión Exacta")
692
  rt_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
693
  rt_out_conlang = gr.Textbox(lines=3, label="Conlang (ida)", show_copy_button=True)
694
  rt_out_back = gr.Textbox(lines=3, label="Vuelta", show_copy_button=True)
695
  with gr.Row():
696
  btn_rt = gr.Button("▶️ Probar", variant="primary")
697
+ btn_rt_cl = gr.Button("🧹 Limpiar")
698
 
699
  btn_rt.click(round_trip, [rt_text, rt_src, rt_tgt, rt_mode, rt_max_comp], [rt_out_conlang, rt_out_back])
700
+ btn_rt_cl.click(lambda: ("","",""), None, [rt_text, rt_out_conlang, rt_out_back])
701
 
702
+ with gr.Accordion("Ayuda rápida (¿qué hace este botón?)", open=False):
703
+ gr.Markdown(EXPLAIN_TAB_ROUNDTRIP_ES)
704
+ return g
705
 
 
 
 
 
 
706
  def make_group_en():
707
+ with gr.Group(visible=False) as g:
708
  gr.Markdown("# 🌐 Universal Conlang Translator · Max Exact Compression (EN)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
709
  with gr.Row():
710
  with gr.Column():
711
+ with gr.Accordion(EXPLAIN_TAB_TRANSLATE_EN, open=False): pass
712
+ with gr.Accordion(EXPLAIN_TAB_BUILD_EN, open=False): pass
713
+ with gr.Accordion(EXPLAIN_TAB_DECODE_EN, open=False): pass
714
+ with gr.Accordion(EXPLAIN_TAB_ROUNDTRIP_EN, open=False): pass
 
 
715
  with gr.Column():
716
+ with gr.Accordion(EXPLAIN_CHECKBOX_EN, open=False): gr.Markdown(COMPACT_EN)
717
+ with gr.Accordion(LEXICON_BUILD_EN, open=False):
718
+ n_rows = gr.Slider(5, 100, value=20, step=5, label="Rows to show")
719
+ table = gr.Dataframe(headers=["lemma_es","lemma_en","minimax","komin"], row_count=1, interactive=False)
720
+ gr.Button("Refresh").click(lambda n: master_preview(int(n)), [n_rows], [table])
721
 
722
  with gr.Tab("🔁 Translate"):
723
  with gr.Row():
 
725
  uni_tgt = gr.Dropdown(ALL_LANGS, value="Minimax-ASCII", label="Target")
726
  uni_text = gr.Textbox(lines=3, label="Text", placeholder="e.g., Hello, how are you?", show_copy_button=True)
727
  with gr.Row():
728
+ uni_drop = gr.Checkbox(True, label="Drop articles (ES/EN → conlang)")
729
+ uni_zero = gr.Checkbox(False, label="Zero copula (present affirm.)")
730
+ uni_rmpr = gr.Checkbox(False, label="Remove pronouns")
731
+ uni_maxc = gr.Checkbox(False, label="Max Exact Compression (sidecar `~...`)")
732
 
733
  uni_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
734
  with gr.Row():
735
+ btn_tr = gr.Button("🚀 Translate", variant="primary")
736
+ btn_tr_cl = gr.Button("🧹 Clear")
737
  uni_out = gr.Textbox(lines=6, label="Translation", show_copy_button=True)
738
 
739
+ btn_tr.click(universal_translate,
740
+ [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_mode, uni_maxc, uni_rmpr],
741
+ [uni_out])
742
+ btn_tr_cl.click(lambda: ("",""), None, [uni_text, uni_out])
 
 
743
 
744
+ with gr.Accordion("Quick help (what does this button do?)", open=False):
745
+ gr.Markdown(EXPLAIN_TAB_TRANSLATE_EN + "\n\n" + COMPACT_EN)
746
 
747
  with gr.Tab("🛠️ Build (ES/EN → Conlang)"):
748
  with gr.Row():
 
750
  target = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
751
  text_in = gr.Textbox(lines=3, label="Sentence", show_copy_button=True)
752
  with gr.Row():
753
+ drop_articles = gr.Checkbox(True, label="Drop articles")
754
+ zero_copula = gr.Checkbox(False, label="Zero copula (present affirm.)")
755
+ rm_pron_build = gr.Checkbox(False, label="Remove pronouns")
756
+ max_comp_build = gr.Checkbox(False, label="Max Exact Compression")
757
  mode_build = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
758
  with gr.Row():
759
+ btn_b = gr.Button("🏗️ Build", variant="primary")
760
+ btn_b_cl = gr.Button("🧹 Clear")
761
  out = gr.Textbox(lines=6, label="Output", show_copy_button=True)
762
 
763
+ btn_b.click(build_sentence,
764
+ [text_in, src_lang, target, drop_articles, zero_copula, mode_build, max_comp_build, rm_pron_build],
765
+ [out])
766
+ btn_b_cl.click(lambda: ("",""), None, [text_in, out])
 
 
767
 
768
+ with gr.Accordion("Quick help (what does this button do?)", open=False):
769
+ gr.Markdown(EXPLAIN_TAB_BUILD_EN + "\n\n" + COMPACT_EN)
770
 
771
  with gr.Tab("🗝️ Decode (Conlang → ES/EN)"):
772
  with gr.Row():
 
783
  return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)
784
 
785
  with gr.Row():
786
+ btn_d = gr.Button("🔓 Decode", variant="primary")
787
+ btn_d_cl = gr.Button("🧹 Clear")
788
 
789
+ btn_d.click(decode_lossless_aware, [code_in, src_code, tgt_lang], [out3])
790
+ btn_d_cl.click(lambda: ("",""), None, [code_in, out3])
791
 
792
+ with gr.Accordion("Quick help (what does this button do?)", open=False):
793
+ gr.Markdown(EXPLAIN_TAB_DECODE_EN)
794
 
795
  with gr.Tab("🔄 Round-trip"):
796
  with gr.Row():
797
  rt_src = gr.Dropdown(["Español","English"], value="English", label="Source")
798
  rt_tgt = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
799
  rt_text = gr.Textbox(lines=3, label="Sentence", show_copy_button=True)
800
+ rt_max_comp = gr.Checkbox(False, label="Max Exact Compression")
801
  rt_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
802
  rt_out_conlang = gr.Textbox(lines=3, label="Outward (conlang)", show_copy_button=True)
803
  rt_out_back = gr.Textbox(lines=3, label="Back", show_copy_button=True)
804
  with gr.Row():
805
  btn_rt = gr.Button("▶️ Test", variant="primary")
806
+ btn_rt_cl = gr.Button("🧹 Clear")
807
 
808
  btn_rt.click(round_trip, [rt_text, rt_src, rt_tgt, rt_mode, rt_max_comp], [rt_out_conlang, rt_out_back])
809
+ btn_rt_cl.click(lambda: ("","",""), None, [rt_text, rt_out_conlang, rt_out_back])
810
 
811
+ with gr.Accordion("Quick help (what does this button do?)", open=False):
812
+ gr.Markdown(EXPLAIN_TAB_ROUNDTRIP_EN)
813
+ return g
814
 
815
+ # ================================ App ================================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
816
  with gr.Blocks(title="Universal Conlang Translator", theme=gr.themes.Soft()) as demo:
817
  gr.Markdown("## 🌍 Idioma / Language")
818
+ lang_select = gr.Radio(["ES","EN"], value="ES", label="Selecciona / Select")
819
  group_es = make_group_es()
820
  group_en = make_group_en()
 
821
 
822
  def switch_lang(code):
823
  if code == "EN":
 
833
 
834
 
835
 
836
+