LoloSemper committed on
Commit
6094523
·
verified ·
1 Parent(s): 41297dc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +770 -55
app.py CHANGED
@@ -1,11 +1,373 @@
1
  # app.py — Universal Conlang Translator (Max Compresión Exacta)
2
- # ... (imports iguales)
 
 
 
 
 
 
 
 
 
3
 
4
- # ... (load_lexicons, norm_es, etc. iguales)
 
 
 
 
 
 
 
5
 
6
- # OOV y custom_b64 iguales
 
 
 
7
 
8
- # Actualiza b85 a custom_sidecar
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  def custom_sidecar_enc(conlang_text: str, original_text: str) -> str:
10
  comp = zlib.compress(original_text.encode("utf-8"), 9)
11
  blob = to_custom_b64(comp, ALPHA_MINI64)
@@ -24,78 +386,431 @@ def extract_custom_sidecar(text: str) -> Optional[str]:
24
  def strip_custom_sidecar(text: str) -> str:
25
  return text.split('~')[0].rstrip() if '~' in text else text
26
 
27
- # Actualiza is_content_token: permite TODO para exactitud
28
- def is_content_token(t) -> bool:
29
- return True # No filtra nada; todo se codifica
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
- # Actualiza realize_minimax: incluye todos los tokens (saludos, wh, etc.)
32
- def realize_minimax(doc, src_lang: str, drop_articles=True, zero_copula=True, semi_lossless=False, person_hint="2s"):
33
- # Split full text into tokens (incluye punct)
34
- tokens = re.findall(r"\S+", doc) # No filtra; todo
35
- if not tokens: return ""
36
- # Asume primer verbo-ish para hints (simple)
37
- v_idx = next((i for i, t in enumerate(tokens) if t.lower() in ["estás", "eres", "soy", "estar", "ser"]), 0)
38
- parts = []
39
- for i, t in enumerate(tokens):
40
- lem = t.lower().rstrip('?¿!¡.,;') # Limpia punct para code, añade después
41
- punct = t[len(lem):] if len(t) > len(lem) else ""
42
- code = code_es(lem, "Minimax-ASCII") if src_lang=="Español" else code_en(lem, "Minimax-ASCII")
43
- if i == v_idx and semi_lossless:
44
- tense = "P" # Detect simple
45
- pi = "2s" # Asume
46
- tail = f"{tense}{pi}Q" if "?" in doc else f"{tense}{pi}"
47
- code = f"{code}·{tail}"
48
- parts.append(code + punct)
49
- return " ".join(parts)
50
-
51
- # Decode: simple reverse para semi, pero sidecar para exact
52
  def decode_simple(text: str, source: str, tgt_lang: str) -> str:
53
- # Para semi: reverse tokens, conjuga si ·tail
 
 
 
 
 
 
 
 
 
54
  tokens = text.split()
55
- out = []
56
- for part in tokens:
57
- m = mini_tail_re.match(part.rstrip('?¿!¡.,;'))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  if m:
 
59
  stem = m.group("stem")
60
  tail = m.group("tail")
61
- vlem = MINI2ES.get(stem, dec_oov_minimax(stem)) if tgt_lang == "Español" else MINI2EN.get(stem, stem)
62
- # Conjuga simple
63
- v_conj = _es_conj(vlem, "Pres", "2s") if tgt_lang == "Español" else _en_conj(vlem, "Pres", "2s")
64
- out.append(v_conj)
65
- if "Q" in tail:
66
- out[-1] += "?"
67
- else:
68
- w = MINI2ES.get(part.rstrip('?¿!¡.,;'), dec_oov_minimax(part)) if tgt_lang == "Español" else part
69
- out.append(w + (part[-1] if part[-1] in '?¿!¡.,;' else ''))
70
- out_text = " ".join(out)
71
- if "?" in text:
72
- out_text = f"¿{out_text}?"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  return out_text
74
 
75
- # Actualiza build_sentence y universal_translate
76
- def build_sentence(text: str, src_lang: str, target: str, drop_articles: bool, zero_copula: bool, mode: str, max_comp_exact: bool = False) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  if not text.strip(): return ""
78
- semi = True
79
- core = realize_minimax(text, src_lang, drop_articles, zero_copula, semi) if USE_SPACY else encode_simple(text, src_lang, target) # Usa realize para full include
80
  if max_comp_exact:
81
  return custom_sidecar_enc(core, text)
82
  return core
83
 
84
- def universal_translate(text: str, src: str, tgt: str, drop_articles: bool, zero_copula: bool, mode: str, max_comp_exact: bool = False) -> str:
 
 
85
  if not text.strip(): return ""
86
  if src == tgt: return text
 
 
87
  if src in ("Español","English") and tgt in ("Minimax-ASCII","Kōmín-CJK"):
88
  return build_sentence(text, src, tgt, drop_articles, zero_copula, mode, max_comp_exact)
 
 
89
  if src in ("Minimax-ASCII","Kōmín-CJK") and tgt in ("Español","English"):
 
90
  orig = extract_custom_sidecar(text)
91
  if orig is not None: return orig
92
- return decode_simple(strip_custom_sidecar(text), src, tgt)
93
- # Resto igual...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
 
95
- # UI: cambia checkbox a "Max Compresión Exacta (sidecar oculto)"
96
- # En tabs: uni_max_comp = gr.Checkbox(value=False, label="Max Compresión Exacta")
97
- # Click: universal_translate(..., uni_max_comp)
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
- # Resto del código (conjugadores, UI) igual al anterior
 
100
 
101
 
 
1
  # app.py — Universal Conlang Translator (Max Compresión Exacta)
2
+ # Archivos necesarios en la raíz:
3
+ # - lexicon_minimax.json
4
+ # - lexicon_komin.json
5
+ # - lexicon_master.json
6
+ #
7
+ # requirements.txt (para HF Spaces):
8
+ # gradio>=4.36.0
9
+ # spacy>=3.7.4
10
+ # es_core_news_sm @ https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl
11
+ # en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
12
 
13
+ import os
14
+ import re
15
+ import json
16
+ import base64
17
+ import zlib
18
+ import hashlib
19
+ from typing import Dict, Tuple, Optional
20
+ import gradio as gr
21
 
22
+ # ------------ Archivos esperados ------------
23
+ LEX_MINI = "lexicon_minimax.json"
24
+ LEX_KOMI = "lexicon_komin.json"
25
+ LEX_MASTER = "lexicon_master.json"
26
 
27
# ------------ Normalization ------------
WORD_RE = re.compile(r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ]+", re.UNICODE)
STRIP = str.maketrans("ÁÉÍÓÚÜÑáéíóúüñ", "AEIOUUNaeiouun")

def norm_es(w: str) -> str:
    """Normalize a Spanish word: lowercase, keep Spanish letters only, strip accents."""
    lowered = (w or "").lower()
    letters_only = re.sub(r"[^a-záéíóúüñ]", "", lowered)
    return letters_only.translate(STRIP)

def norm_en(w: str) -> str:
    """Normalize an English word: lowercase and keep ASCII letters only."""
    lowered = (w or "").lower()
    return re.sub(r"[^a-z]", "", lowered)
36
+
37
+ # ------------ Carga de léxicos ------------
38
def load_json(path: str):
    """Read a UTF-8 JSON file; return None when the file does not exist."""
    try:
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    except FileNotFoundError:
        return None
42
+
43
+ def load_lexicons():
44
+ mm = load_json(LEX_MINI) or {}
45
+ kk = load_json(LEX_KOMI) or {}
46
+ master = load_json(LEX_MASTER) or {}
47
+
48
+ es2mini = mm.get("mapping", {})
49
+ es2komi = kk.get("mapping", {})
50
+ mini2es = {v:k for k,v in es2mini.items()}
51
+ komi2es = {v:k for k,v in es2komi.items()}
52
+
53
+ es2en_lemma: Dict[str,str] = {}
54
+ en2es_lemma: Dict[str,str] = {}
55
+ en2mini, en2komi = {}, {}
56
+ mini2en, komi2en = {}, {}
57
+
58
+ if isinstance(master, dict) and "entries" in master:
59
+ for e in master["entries"]:
60
+ es = norm_es(str(e.get("lemma_es","")))
61
+ en = norm_en(str(e.get("lemma_en","")))
62
+ mi = str(e.get("minimax",""))
63
+ ko = str(e.get("komin",""))
64
+ if es and en:
65
+ es2en_lemma.setdefault(es, en)
66
+ en2es_lemma.setdefault(en, es)
67
+ if en and mi: en2mini.setdefault(en, mi)
68
+ if en and ko: en2komi.setdefault(en, ko)
69
+
70
+ mini2en = {v:k for k,v in en2mini.items()}
71
+ komi2en = {v:k for k,v in en2komi.items()}
72
+
73
+ return (es2mini, es2komi, mini2es, komi2es,
74
+ en2mini, en2komi, mini2en, komi2en,
75
+ es2en_lemma, en2es_lemma)
76
+
77
+ (ES2MINI, ES2KOMI, MINI2ES, KOMI2ES,
78
+ EN2MINI, EN2KOMI, MINI2EN, KOMI2EN,
79
+ ES2EN_LEMMA, EN2ES_LEMMA) = load_lexicons()
80
+
81
+ # ------------ OOV reversible (modo Semi-lossless) ------------
82
+ ALPHA_MINI64 = "@ptkmnslraeiouy0123456789><=:/!?.+-_*#bcdfghjvqwxzACEGHIJKLMNOPRS"[:64]
83
+ CJK_BASE = (
84
+ "天地人日月山川雨風星火水木土金石光影花草鳥犬猫魚"
85
+ "東西南北中外上下午夜明暗手口目耳心言書家道路門"
86
+ "大小長短早晚高低新古青紅白黒金銀銅玉米茶酒米"
87
+ "文学楽音画体気電海空森林雪雲砂島橋城村国自由静"
88
+ )
89
+ ALPHA_CJK64 = (CJK_BASE * 2)[:64]
90
+
91
# Standard Base64 alphabet, used as the source/target of the remapping.
_B64_STD = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"

def to_custom_b64(b: bytes, alphabet: str) -> str:
    """Base64-encode *b*, remap onto *alphabet* (64 chars), drop '=' padding."""
    encoded = base64.b64encode(b).decode("ascii")
    remap = str.maketrans(_B64_STD, alphabet)
    return encoded.translate(remap).rstrip("=")

def from_custom_b64(s: str, alphabet: str) -> bytes:
    """Invert to_custom_b64: map *alphabet* back to standard Base64 and decode."""
    remap = str.maketrans(alphabet, _B64_STD)
    encoded = s.translate(remap)
    padding = "=" * ((4 - len(encoded) % 4) % 4)
    return base64.b64decode(encoded + padding)
107
+
108
# Reversible out-of-vocabulary encodings: unknown words are round-tripped
# through the custom Base64 alphabets instead of being dropped.
def enc_oov_minimax(token: str) -> str:
    # '~' prefix marks a Minimax OOV blob.
    return "~" + to_custom_b64(token.encode("utf-8"), ALPHA_MINI64)
def dec_oov_minimax(code: str) -> str:
    # Best-effort: return the code unchanged if it does not decode cleanly.
    try: return from_custom_b64(code[1:], ALPHA_MINI64).decode("utf-8")
    except Exception: return code

def enc_oov_komin(token: str) -> str:
    # CJK corner brackets delimit a Kōmín OOV blob.
    return "「" + to_custom_b64(token.encode("utf-8"), ALPHA_CJK64) + "」"
def dec_oov_komin(code: str) -> str:
    # Strips the bracket pair before decoding; falls back to the raw code.
    try: return from_custom_b64(code[1:-1], ALPHA_CJK64).decode("utf-8")
    except Exception: return code

def is_oov_minimax(code: str) -> bool:
    # A lone '~' is not a valid OOV blob.
    return code.startswith("~") and len(code) > 1
def is_oov_komin(code: str) -> bool:
    return len(code) >= 2 and code.startswith("「") and code.endswith("」")
124
+
125
+ # ------------ spaCy opcional ------------
126
+ USE_SPACY = False
127
+ try:
128
+ import spacy
129
+ try:
130
+ nlp_es = spacy.load("es_core_news_sm")
131
+ nlp_en = spacy.load("en_core_web_sm")
132
+ USE_SPACY = True
133
+ except Exception:
134
+ nlp_es = nlp_en = None
135
+ except Exception:
136
+ nlp_es = nlp_en = None
137
+
138
+ def lemma_of(tok, src_lang: str) -> str:
139
+ if src_lang == "Español":
140
+ return norm_es(tok.lemma_ if tok.lemma_ else tok.text)
141
+ else:
142
+ return norm_en(tok.lemma_ if tok.lemma_ else tok.text)
143
+
144
+ # ------------ Selección de oración predicativa ------------
145
+ def pick_predicative_sentence(doc):
146
+ sents = list(doc.sents) if doc.has_annotation("SENT_START") else [doc]
147
+ candidates = []
148
+ for s in sents:
149
+ roots = [t for t in s if t.dep_ == "ROOT" and t.pos_ in ("VERB","AUX")]
150
+ if not roots:
151
+ continue
152
+ root = roots[0]
153
+ has_q = "?" in s.text
154
+ has_subj = any(t.dep_.startswith("nsubj") for t in root.children)
155
+ score = (1 if has_q else 0) + (1 if has_subj else 0) + (len(s) / 1000.0)
156
+ candidates.append((score, s))
157
+ if not candidates:
158
+ return doc
159
+ return sorted(candidates, key=lambda x: x[0], reverse=True)[0][1].as_doc()
160
+
161
def is_content_token(t) -> bool:
    """Accept every token: filtering is disabled so encoding stays exhaustive."""
    return True
163
+
164
+ # ------------ Mapeo lema→código ------------
165
def code_es(lemma: str, target: str) -> str:
    # Map a normalized Spanish lemma to its conlang code; unknown lemmas get
    # a reversible OOV encoding instead of being dropped.
    lemma = norm_es(lemma)
    if target == "Minimax-ASCII":
        return ES2MINI.get(lemma) or enc_oov_minimax(lemma)
    else:
        return ES2KOMI.get(lemma) or enc_oov_komin(lemma)

def code_en(lemma: str, target: str) -> str:
    # Map a normalized English lemma to its conlang code.  Falls back to the
    # OOV encoding when the English tables were not loaded or miss the lemma.
    lemma = norm_en(lemma)
    if target == "Minimax-ASCII":
        if EN2MINI: return EN2MINI.get(lemma) or enc_oov_minimax(lemma)
        return enc_oov_minimax(lemma)
    else:
        if EN2KOMI: return EN2KOMI.get(lemma) or enc_oov_komin(lemma)
        return enc_oov_komin(lemma)
180
+
181
+ # ------------ Fraseador compacto ------------
182
+ TAM_MINI = {"Pres":"P", "Past":"T", "Fut":"F", "UNK":"P"}
183
+ TAM_KOMI = {"Pres":"Ⓟ", "Past":"Ⓣ", "Fut":"Ⓕ", "UNK":"Ⓟ"}
184
+
185
+ def detect_polarity(doc) -> bool:
186
+ return "?" in doc.text
187
+
188
+ def detect_neg(doc) -> bool:
189
+ for t in doc:
190
+ if t.dep_ == "neg" or t.lower_ in ("no","not","n't"):
191
+ return True
192
+ return False
193
+
194
+ def detect_tense(root):
195
+ m = str(root.morph)
196
+ if "Tense=Past" in m: return "Past"
197
+ if "Tense=Fut" in m: return "Fut"
198
+ if "Tense=Pres" in m: return "Pres"
199
+ for c in root.children:
200
+ if c.pos_ == "AUX":
201
+ cm = str(c.morph)
202
+ if "Tense=Past" in cm: return "Past"
203
+ if c.lower_ == "will": return "Fut"
204
+ return "Pres"
205
+
206
+ def detect_person(root, src_lang: str) -> Optional[str]:
207
+ m = str(root.morph)
208
+ person_str = "3"
209
+ number_str = "s"
210
+ if "Person=" in m:
211
+ for feat in m.split("|"):
212
+ if feat.startswith("Person="):
213
+ person_str = feat.split("=")[1]
214
+ elif feat.startswith("Number="):
215
+ number_str = "p" if feat.split("=")[1] == "Plur" else "s"
216
+ return person_str + number_str
217
+ return _person_of_doc(root.doc, src_lang)
218
+
219
+ def extract_core(doc):
220
+ root = next((t for t in doc if t.dep_=="ROOT" and t.pos_ in ("VERB","AUX")), doc[0])
221
+ subs, objs, obls, advs = [], [], [], []
222
+ for t in root.children:
223
+ if t.dep_ in ("nsubj","nsubj:pass","csubj"):
224
+ subs.append(t)
225
+ elif t.dep_ in ("obj","dobj","iobj"):
226
+ objs.append(t)
227
+ elif t.dep_ in ("obl","pobj"):
228
+ obls.append(t)
229
+ elif t.dep_ in ("advmod","advcl") and t.pos_ == "ADV":
230
+ advs.append(t)
231
+ subs.sort(key=lambda x: x.i); objs.sort(key=lambda x: x.i)
232
+ obls.sort(key=lambda x: x.i); advs.sort(key=lambda x: x.i)
233
+ return root, subs, objs, obls, advs
234
+
235
+ def _person_of_doc(doc, src_lang: str) -> Optional[str]:
236
+ try:
237
+ root = next((t for t in doc if t.dep_=="ROOT"), doc[0])
238
+ subj = next((t for t in root.children if t.dep_.startswith("nsubj")), None)
239
+ if subj is None: return None
240
+ plur = ("Number=Plur" in str(subj.morph)) if src_lang=="Español" else (subj.tag_ in ("NNS","NNPS"))
241
+ low = subj.lower_
242
+ if src_lang=="Español":
243
+ if low in ("yo",): return "1p" if plur else "1s"
244
+ if low in ("tú","vos"): return "2p" if plur else "2s"
245
+ if low in ("usted","él","ella"): return "3p" if plur else "3s"
246
+ lem = lemma_of(subj, "Español")
247
+ if lem in ("yo","nosotros"): return "1p" if plur else "1s"
248
+ if lem in ("tú","vosotros"): return "2p" if plur else "2s"
249
+ return "3p" if plur else "3s"
250
+ else:
251
+ if low in ("i",): return "1p" if plur else "1s"
252
+ if low in ("you",): return "2p" if plur else "2s"
253
+ if low in ("he","she","it"): return "3p" if plur else "3s"
254
+ return "3p" if plur else "3s"
255
+ except Exception:
256
+ return None
257
+
258
+ def realize_minimax(doc, src_lang: str, drop_articles=True, zero_copula=True, semi_lossless=False, person_hint="2s"):
259
+ root, subs, objs, obls, advs = extract_core(doc)
260
+ tense = detect_tense(root)
261
+ is_q, is_neg = detect_polarity(doc), detect_neg(doc)
262
+
263
+ vlem = lemma_of(root, src_lang) if USE_SPACY else ("ser" if "?" in doc.text else "estar")
264
+ vcode = code_es(vlem, "Minimax-ASCII") if src_lang=="Español" else code_en(vlem, "Minimax-ASCII")
265
+
266
+ tail = TAM_MINI.get(tense, "P")
267
+ if semi_lossless:
268
+ pi = detect_person(root, src_lang) or person_hint
269
+ tail += pi
270
+ if is_neg: tail += "N"
271
+ if is_q: tail += "Q"
272
+ if tail:
273
+ vcode = f"{vcode}·{tail}"
274
+
275
+ def realize_np(tokens):
276
+ outs=[]
277
+ for t in tokens:
278
+ if not USE_SPACY or is_content_token(t):
279
+ lem = lemma_of(t, src_lang) if USE_SPACY else (t.text)
280
+ code = code_es(lem, "Minimax-ASCII") if src_lang=="Español" else code_en(lem, "Minimax-ASCII")
281
+ if semi_lossless and USE_SPACY and (t.tag_ in ("NNS","NNPS") or "Number=Plur" in str(t.morph)):
282
+ code = f"{code}[PL]"
283
+ outs.append(code)
284
+ return outs
285
+
286
+ S = realize_np(subs)
287
+ O = realize_np(objs) + realize_np(obls)
288
+ ADV=[]
289
+ for a in advs:
290
+ if not USE_SPACY or is_content_token(a):
291
+ lem = lemma_of(a, src_lang) if USE_SPACY else a.text
292
+ ADV.append(code_es(lem, "Minimax-ASCII") if src_lang=="Español" else code_en(lem, "Minimax-ASCII"))
293
+
294
+ if zero_copula and not semi_lossless and vlem in ("ser","estar","be") and tense=="Pres" and not is_neg and not is_q:
295
+ parts = S + O + ADV
296
+ else:
297
+ parts = [vcode] + S + O + ADV
298
+ return " ".join(p for p in parts if p)
299
+
300
+ def realize_komin(doc, src_lang: str, drop_articles=True, zero_copula=True, semi_lossless=False, person_hint="2s"):
301
+ root, subs, objs, obls, advs = extract_core(doc)
302
+ tense, is_q, is_neg = detect_tense(root), detect_polarity(doc), detect_neg(doc)
303
+
304
+ vlem = lemma_of(root, src_lang) if USE_SPACY else ("ser" if "?" in doc.text else "estar")
305
+ vcode = code_es(vlem, "Kōmín-CJK") if src_lang=="Español" else code_en(vlem, "Kōmín-CJK")
306
+
307
+ P_SUBJ, P_OBJ = "ᵖ", "ᵒ"
308
+ NEG_M, Q_FIN = "̆", "?"
309
+ TAM = TAM_KOMI.get(tense, "Ⓟ")
310
+
311
+ if semi_lossless:
312
+ pi = detect_person(root, src_lang) or person_hint
313
+ TAM = TAM + f"[{pi}]"
314
+
315
+ def realize_np(tokens, particle):
316
+ outs=[]
317
+ for t in tokens:
318
+ if not USE_SPACY or is_content_token(t):
319
+ lem = lemma_of(t, src_lang) if USE_SPACY else t.text
320
+ code = code_es(lem, "Kōmín-CJK") if src_lang=="Español" else code_en(lem, "Kōmín-CJK")
321
+ if semi_lossless and USE_SPACY and (t.tag_ in ("NNS","NNPS") or "Number=Plur" in str(t.morph)):
322
+ code = f"{code}[PL]"
323
+ outs.append(code + particle)
324
+ return outs
325
+
326
+ S = realize_np(subs, P_SUBJ)
327
+ O = realize_np(objs + obls, P_OBJ)
328
+ ADV=[]
329
+ for a in advs:
330
+ if not USE_SPACY or is_content_token(a):
331
+ lem = lemma_of(a, src_lang) if USE_SPACY else a.text
332
+ ADV.append(code_es(lem, "Kōmín-CJK") if src_lang=="Español" else code_en(lem, "Kōmín-CJK"))
333
+
334
+ v_form = vcode + TAM + (NEG_M if is_neg else "")
335
+
336
+ if zero_copula and not semi_lossless and vlem in ("ser","estar","be") and tense=="Pres" and not is_neg and not is_q:
337
+ parts = S + O + ADV
338
+ else:
339
+ parts = S + O + ADV + [v_form]
340
+ out = " ".join(parts)
341
+ if is_q: out += " " + Q_FIN
342
+ return out
343
+
344
+ # ------------ Lossless (Base85 comprimido) ------------
345
# Sidecar format: " §(<ascii85 blob>)" appended to the conlang text.
# FIX: the previous character class omitted several characters that Ascii85
# output can contain ('"', "'", ",", ".", "/", ":", "[", "\\", "]", the
# lowercase range a-u, and the zero-group shorthand "z"), so extraction
# silently failed for many payloads.  [!-u] covers the full Ascii85 range
# and "z" covers base64.a85encode's all-zero-group shorthand.
SIDECAR_B85_RE = re.compile(r"\s?§\((?P<b85>[!-uz]+)\)$")

def b85_enc_raw(s: str) -> str:
    """zlib-compress *s* (level 9) and render the bytes as Ascii85 text."""
    comp = zlib.compress(s.encode("utf-8"), 9)
    return base64.a85encode(comp, adobe=False).decode("ascii")

def b85_dec_raw(b85s: str) -> str:
    """Inverse of b85_enc_raw: Ascii85-decode, then zlib-decompress to text."""
    comp = base64.a85decode(b85s.encode("ascii"), adobe=False)
    return zlib.decompress(comp).decode("utf-8")

def attach_sidecar_b85(conlang_text: str, original_text: str) -> str:
    """Append the compressed original text as a trailing '§(...)' sidecar."""
    blob = b85_enc_raw(original_text)
    return f"{conlang_text} §({blob})"

def extract_sidecar_b85(text: str) -> Optional[str]:
    """Return the original text hidden in a trailing '§(...)' sidecar, or None."""
    m = SIDECAR_B85_RE.search(text)
    if not m: return None
    try:
        return b85_dec_raw(m.group("b85"))
    except Exception:
        # Corrupt or truncated blob: treat as "no sidecar".
        return None

def strip_sidecar_b85(text: str) -> str:
    """Remove a trailing '§(...)' sidecar, if present."""
    return SIDECAR_B85_RE.sub("", text).rstrip()
369
+
370
+ # ------------ Custom sidecar para max compresión exacta ------------
371
  def custom_sidecar_enc(conlang_text: str, original_text: str) -> str:
372
  comp = zlib.compress(original_text.encode("utf-8"), 9)
373
  blob = to_custom_b64(comp, ALPHA_MINI64)
 
386
def strip_custom_sidecar(text: str) -> str:
    """Drop a '~...' custom sidecar: everything from the first '~' onward."""
    marker = text.find('~')
    if marker == -1:
        return text
    return text[:marker].rstrip()
388
 
389
+ # ------------ Codificar / Decodificar léxico puro ------------
390
def encode_simple(text: str, src_lang: str, target: str) -> str:
    # Lexicon-only fallback encoder (used when spaCy is unavailable):
    # replaces each word in place via WORD_RE, preserving punctuation/spacing.
    if not text.strip(): return ""
    def repl_es(m):
        key = norm_es(m.group(0))
        code = ES2MINI.get(key) if target=="Minimax-ASCII" else ES2KOMI.get(key)
        # Unknown words get a reversible OOV encoding of the *surface* form.
        return code or (enc_oov_minimax(m.group(0)) if target=="Minimax-ASCII" else enc_oov_komin(m.group(0)))
    def repl_en(m):
        key = norm_en(m.group(0))
        table = EN2MINI if target=="Minimax-ASCII" else EN2KOMI
        if table and key in table:
            return table[key]
        return enc_oov_minimax(m.group(0)) if target=="Minimax-ASCII" else enc_oov_komin(m.group(0))
    repl = repl_es if src_lang=="Español" else repl_en
    return WORD_RE.sub(repl, text)
404
+
405
def pluralize_es(word: str) -> str:
    """Pluralize a Spanish word using the common regular rules.

    FIX: "u" was missing from the vowel tuple, so words like "tribu"
    pluralized to "tribues" instead of "tribus".
    """
    exceptions = {"uno": "unos", "buen": "buenos", "hombre": "hombres"}
    if word in exceptions:
        return exceptions[word]
    if word.endswith("z"):                            # luz -> luces
        return word[:-1] + "ces"
    if word.endswith(("a", "e", "i", "o", "u")):      # unstressed vowel -> +s
        return word + "s"
    return word + "es"                                # consonant -> +es
411
+
412
def pluralize_en(word: str) -> str:
    """Pluralize an English noun using the common regular rules."""
    irregular = {"man": "men", "woman": "women", "child": "children"}
    if word in irregular:
        return irregular[word]
    ends_consonant_y = word.endswith("y") and len(word) > 1 and word[-2] not in "aeiou"
    if ends_consonant_y:                              # city -> cities
        return word[:-1] + "ies"
    if word.endswith(("s", "sh", "ch", "x", "z")):    # box -> boxes
        return word + "es"
    return word + "s"
418
+
419
def pluralize(word: str, tgt_lang: str) -> str:
    # Dispatch to the language-specific pluralizer.
    return pluralize_es(word) if tgt_lang == "Español" else pluralize_en(word)
421
+
422
+ PRON_ES = {"yo", "tú", "él", "ella", "nosotros", "vosotros", "ellos", "ellas", "usted", "ustedes"}
423
+ PRON_EN = {"i", "you", "he", "she", "it", "we", "they"}
424
+
425
+ mini_tail_re = re.compile(r"^(?P<stem>.+?)·(?P<tail>[PTFNQ12sp]+)$")
426
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
427
def decode_simple(text: str, source: str, tgt_lang: str) -> str:
    """Decode a conlang string back into Spanish or English.

    Recognizes verb codes of the form  stem·TAIL  (tense / person / NEG / Q
    markers), maps remaining codes through the lexicon tables with a
    reversible-OOV fallback, then rebuilds a rough S-V-O sentence,
    pluralizing tokens that carried a "[PL]" tag.

    FIX: pl_flags was appended once unconditionally at the top of the token
    loop AND once again inside each branch, so it grew two entries per token
    and fell out of alignment with lemma_tokens; the plural flags consulted
    after the verb were therefore wrong.  The unconditional append is removed
    so pl_flags stays strictly parallel to lemma_tokens.
    """
    if not text.strip():
        return ""
    code2es = MINI2ES if source == "Minimax-ASCII" else KOMI2ES
    code2en = MINI2EN if source == "Minimax-ASCII" else KOMI2EN
    pron_set = PRON_ES if tgt_lang == "Español" else PRON_EN

    if source == "Kōmín-CJK":
        # NOTE(review): these replace() calls look like fullwidth→ASCII
        # normalization, but the fullwidth characters appear to have been
        # lost in transit; as written they are no-ops — confirm upstream.
        text = text.replace("?", "?").replace(" ", " ")
        return " ".join([code2es.get(w, w) for w in text.split() if w != "?"])

    tokens = text.split()
    if not tokens:
        return ""

    lemma_tokens = []          # decoded words, one entry per input token
    pl_flags = []              # strictly parallel to lemma_tokens
    verb_idx = -1              # index of the detected verb in lemma_tokens
    verb_lemma = None
    verb_tense = "Pres"
    verb_person = "3s"
    has_q = False
    is_neg = False

    for part in tokens:
        look = part.replace("[PL]", "")
        had_pl = "[PL]" in part

        m = mini_tail_re.match(look)
        if m:
            # Verb code: stem·tail
            verb_idx = len(lemma_tokens)
            stem = m.group("stem")
            tail = m.group("tail")
            vlem_es = code2es.get(stem)
            vlem_en = code2en.get(stem) if code2en else None
            vlem = vlem_es if tgt_lang == "Español" else (vlem_en or vlem_es or stem)
            if not vlem:
                vlem = dec_oov_minimax(stem) if is_oov_minimax(stem) else stem
            lemma_tokens.append(vlem)
            pl_flags.append(False)   # the verb itself is never pluralized

            # Tail layout: [PTF tense][123 person][sp number][N][Q]
            if tail:
                if tail[0] in "PTF":
                    verb_tense = {"P": "Pres", "T": "Past", "F": "Fut"}.get(tail[0], "Pres")
                pos = 1
                person = "3s"
                if len(tail) > pos and tail[pos] in "123":
                    pos += 1
                    if len(tail) > pos and tail[pos] in "sp":
                        person = tail[pos - 1] + tail[pos]
                        pos += 1
                    else:
                        person = tail[pos - 1] + "s"
                verb_person = person
                is_neg = "N" in tail[pos:]
                has_q = "Q" in tail[pos:]
            verb_lemma = vlem
            continue

        # Non-verb token: lexicon lookup, then reversible-OOV fallback.
        w_es = code2es.get(look)
        w_en = code2en.get(look) if code2en else None
        w = w_es if tgt_lang == "Español" else (w_en or w_es or look)
        if not w:
            w = dec_oov_minimax(look) if is_oov_minimax(look) else look
        lemma_tokens.append(w)
        pl_flags.append(had_pl)

    if verb_idx == -1:
        # Zero-copula fallback: inject a conjugated "ser"/"be" after the
        # first token (or at the front of an empty list).
        verb_lemma = "ser" if tgt_lang == "Español" else "be"
        verb_tense = "Pres"
        verb_person = "3s"
        v_conj = _es_conj(verb_lemma, verb_tense, verb_person) if tgt_lang == "Español" else _en_conj(verb_lemma, verb_tense, verb_person)
        lemma_tokens.insert(1 if lemma_tokens else 0, v_conj)
        out_text = " ".join(lemma_tokens)
    else:
        # Conjugate the detected verb.
        conj_func = _es_conj if tgt_lang == "Español" else _en_conj
        v_conj = conj_func(verb_lemma, verb_tense, verb_person)
        if is_neg:
            neg_prefix = "no " if tgt_lang == "Español" else "not "
            v_conj = neg_prefix + v_conj

        # Reorder into Subject-Verb-Object using the tokens after the verb.
        post_v = lemma_tokens[verb_idx + 1:]
        pl_post = pl_flags[verb_idx + 1:]
        s_idx = next((j for j, w in enumerate(post_v) if w.lower() in pron_set), None)
        S = post_v[s_idx] if s_idx is not None else None
        if S:
            if pl_post[s_idx]:
                S = pluralize(S, tgt_lang)
            del post_v[s_idx]
            del pl_post[s_idx]

        O_ADV = []
        if post_v:
            O = pluralize(post_v[0], tgt_lang) if pl_post[0] else post_v[0]
            O_ADV.append(O)
            O_ADV.extend([pluralize(post_v[k], tgt_lang) if pl_post[k] else post_v[k] for k in range(1, len(post_v))])

        parts = [p for p in [S, v_conj] + O_ADV if p]
        out_text = " ".join(parts)

    # Fronted wh-word in questions.
    if has_q and lemma_tokens and lemma_tokens[0].lower() in {"como", "cómo", "what", "how"}:
        wh = lemma_tokens.pop(0)
        out_text = f"{wh} {out_text}"

    # Question wrapping (inverted opening mark only for Spanish).
    if has_q:
        start_q = "¿" if tgt_lang == "Español" else ""
        end_q = "?" if tgt_lang == "Español" else "?"
        out_text = f"{start_q}{out_text.capitalize()}{end_q}"

    return out_text
550
 
551
+ # ------------ Conjugadores mínimos ------------
552
+ _ES_SUBJ = {"1s":"yo","2s":"tú","3s":"él/ella","1p":"nosotros","2p":"vosotros","3p":"ellos"}
553
+ _EN_SUBJ = {"1s":"I","2s":"you","3s":"he","1p":"we","2p":"you","3p":"they"}
554
+
555
+ def _es_conj_regular(lemma, tense, person):
556
+ if not lemma.endswith(("ar","er","ir")): return lemma
557
+ stem = lemma[:-2]; vtype = lemma[-2:]
558
+ pres = {
559
+ "ar": {"1s":"o","2s":"as","3s":"a","1p":"amos","2p":"áis","3p":"an"},
560
+ "er": {"1s":"o","2s":"es","3s":"e","1p":"emos","2p":"éis","3p":"en"},
561
+ "ir": {"1s":"o","2s":"es","3s":"e","1p":"imos","2p":"ís","3p":"en"},
562
+ }
563
+ pret = {
564
+ "ar": {"1s":"é","2s":"aste","3s":"ó","1p":"amos","2p":"asteis","3p":"aron"},
565
+ "er": {"1s":"í","2s":"iste","3s":"ió","1p":"imos","2p":"isteis","3p":"ieron"},
566
+ "ir": {"1s":"í","2s":"iste","3s":"ió","1p":"imos","2p":"isteis","3p":"ieron"},
567
+ }
568
+ fut = {"1s":"é","2s":"ás","3s":"á","1p":"emos","2p":"éis","3p":"án"}
569
+ if tense == "Pres": return stem + pres[vtype].get(person, pres[vtype]["3s"])
570
+ if tense == "Past": return stem + pret[vtype].get(person, pret[vtype]["3s"])
571
+ return lemma + fut.get(person, fut["3s"])
572
+
573
def _es_conj(lemma, tense, person):
    # Conjugate a Spanish verb: explicit tables for the irregulars
    # "ser"/"estar"/"ir", everything else via the regular-verb rules.
    # Unknown persons fall back to 3s.
    if lemma == "ser":
        tab = {
            "Pres":{"1s":"soy","2s":"eres","3s":"es","1p":"somos","2p":"sois","3p":"son"},
            "Past":{"1s":"fui","2s":"fuiste","3s":"fue","1p":"fuimos","2p":"fuisteis","3p":"fueron"},
            "Fut":{"1s":"seré","2s":"serás","3s":"será","1p":"seremos","2p":"seréis","3p":"serán"},
        }; return tab[tense].get(person, tab[tense]["3s"])
    if lemma == "estar":
        tab = {
            "Pres":{"1s":"estoy","2s":"estás","3s":"está","1p":"estamos","2p":"estáis","3p":"están"},
            "Past":{"1s":"estuve","2s":"estuviste","3s":"estuvo","1p":"estuvimos","2p":"estuvisteis","3p":"estuvieron"},
            "Fut":{"1s":"estaré","2s":"estarás","3s":"estará","1p":"estaremos","2p":"estaréis","3p":"estarán"},
        }; return tab[tense].get(person, tab[tense]["3s"])
    if lemma == "ir":
        tab = {
            "Pres":{"1s":"voy","2s":"vas","3s":"va","1p":"vamos","2p":"vais","3p":"van"},
            "Past":{"1s":"fui","2s":"fuiste","3s":"fue","1p":"fuimos","2p":"fuisteis","3p":"fueron"},
            "Fut":{"1s":"iré","2s":"irás","3s":"irá","1p":"iremos","2p":"iréis","3p":"irán"},
        }; return tab[tense].get(person, tab[tense]["3s"])
    return _es_conj_regular(lemma, tense, person)
593
+
594
+ def _en_conj(lemma, tense, person):
595
+ if lemma == "be":
596
+ if tense == "Pres":
597
+ return {"1s":"am","2s":"are","3s":"is","1p":"are","2p":"are","3p":"are"}.get(person, "is")
598
+ if tense == "Past":
599
+ return {"1s":"was","2s":"were","3s":"was","1p":"were","2p":"were","3p":"were"}.get(person, "was")
600
+ return "be"
601
+ if lemma == "have":
602
+ if tense == "Pres": return "has" if person=="3s" else "have"
603
+ if tense == "Past": return "had"
604
+ return "have"
605
+ if lemma == "go":
606
+ if tense == "Past": return "went"
607
+ return "goes" if (tense=="Pres" and person=="3s") else "go"
608
+ if lemma == "do":
609
+ if tense == "Past": return "did"
610
+ return "does" if (tense=="Pres" and person=="3s") else "do"
611
+
612
+ if tense == "Pres":
613
+ if person == "3s":
614
+ if lemma.endswith("y") and (len(lemma)<2 or lemma[-2] not in "aeiou"):
615
+ return lemma[:-1] + "ies"
616
+ if lemma.endswith(("s","sh","ch","x","z","o")):
617
+ return lemma + "es"
618
+ return lemma + "s"
619
+ return lemma
620
+ elif tense == "Past":
621
+ if lemma.endswith("e"): return lemma + "d"
622
+ if lemma.endswith("y") and (len(lemma)<2 or lemma[-2] not in "aeiou"): return lemma[:-1] + "ied"
623
+ return lemma + "ed"
624
+ else:
625
+ return lemma
626
+
627
+ # ------------ Semi-lossless (rutas) ------------
628
def _build_with_spacy(text: str, src_lang: str, target: str,
                      drop_articles: bool, zero_copula: bool, semi_lossless: bool) -> str:
    # Parse with the language-appropriate spaCy pipeline, then hand the doc
    # off to the realizer for the requested target conlang.
    nlp = nlp_es if src_lang=="Español" else nlp_en
    doc = nlp(text)
    if target == "Minimax-ASCII":
        return realize_minimax(doc, src_lang, drop_articles, zero_copula, semi_lossless=semi_lossless)
    else:
        return realize_komin(doc, src_lang, drop_articles, zero_copula, semi_lossless=semi_lossless)
636
+
637
def build_sentence(text: str, src_lang: str, target: str,
                   drop_articles: bool, zero_copula: bool, mode: str, max_comp_exact: bool = False) -> str:
    # Encode one natural-language sentence into the target conlang.
    # With max_comp_exact the lossless original is hidden in a custom sidecar.
    if not text.strip(): return ""
    semi = True  # Siempre semi-lossless
    # NOTE(review): with semi hard-coded True, "zero_copula and not semi" is
    # always False, so the zero-copula option is effectively disabled here.
    core = _build_with_spacy(text, src_lang, target, drop_articles, zero_copula and not semi, semi_lossless=semi) if USE_SPACY else encode_simple(text, src_lang, target)
    if max_comp_exact:
        return custom_sidecar_enc(core, text)
    return core
645
 
646
def _recode_es_to_conlang(es_lemmas: str, tgt: str) -> str:
    """Re-encode a Spanish-lemma string into the target conlang, word by word.

    Non-word runs (punctuation) pass through untouched; unknown lemmas fall
    back to the reversible OOV encodings.
    """
    out = []
    for w in re.findall(r"\w+|[^\w\s]+", es_lemmas):
        if re.fullmatch(r"\w+", w):
            code = ES2MINI.get(norm_es(w)) if tgt == "Minimax-ASCII" else ES2KOMI.get(norm_es(w))
            if not code:
                code = enc_oov_minimax(w) if tgt == "Minimax-ASCII" else enc_oov_komin(w)
            out.append(code)
        else:
            out.append(w)
    return " ".join(out)

def universal_translate(text: str, src: str, tgt: str,
                        drop_articles: bool, zero_copula: bool,
                        mode: str, max_comp_exact: bool = False) -> str:
    """Route a translation between any supported (src, tgt) language pair.

    On the conlang→natural path, sidecars (custom first, then Base85) are
    honored so the exact original text is recovered when available;
    otherwise the semi-lossless decoder is used.  Unsupported pairs return
    "[No soportado]".
    """
    if not text.strip(): return ""
    if src == tgt: return text

    # Natural → Conlang
    if src in ("Español","English") and tgt in ("Minimax-ASCII","Kōmín-CJK"):
        return build_sentence(text, src, tgt, drop_articles, zero_copula, mode, max_comp_exact)

    # Conlang → Natural (sidecars take precedence over decoding)
    if src in ("Minimax-ASCII","Kōmín-CJK") and tgt in ("Español","English"):
        orig = extract_custom_sidecar(text)
        if orig is not None: return orig
        orig = extract_sidecar_b85(text)
        if orig is not None: return orig
        return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)

    # Natural ↔ Natural (lemma-level)
    if src in ("Español","English") and tgt in ("Español","English"):
        return translate_natural(text, src, tgt)

    # Conlang ↔ Conlang: decode to Spanish lemmas, re-encode into the target.
    # (The re-encode loop used to be duplicated verbatim for the with- and
    # without-sidecar paths; it now lives once in _recode_es_to_conlang.)
    if src in ("Minimax-ASCII","Kōmín-CJK") and tgt in ("Minimax-ASCII","Kōmín-CJK"):
        orig = extract_custom_sidecar(text)
        core = strip_custom_sidecar(text) if orig is not None else text
        es_lemmas = decode_simple(core, src, "Español")
        out_text = _recode_es_to_conlang(es_lemmas, tgt)
        if orig is not None:
            # Preserve the exact-original sidecar across the re-encode.
            return custom_sidecar_enc(out_text, orig)
        return out_text

    return "[No soportado]"
706
+
707
def translate_natural(text: str, src_lang: str, tgt_lang: str) -> str:
    """Word-by-word lemma translation between Spanish and English.

    Non-alphabetic tokens pass through unchanged; words without a lexicon
    entry fall back to their lemma.  Returns *text* untouched when spaCy is
    unavailable.
    """
    if not text.strip():
        return ""
    if not USE_SPACY:
        return text
    pipeline = nlp_es if src_lang == "Español" else nlp_en
    lexicon = ES2EN_LEMMA if src_lang == "Español" else EN2ES_LEMMA
    pieces = []
    for tok in pipeline(text):
        if not tok.is_alpha:
            pieces.append(tok.text)
            continue
        lem = lemma_of(tok, src_lang)
        translated = lexicon.get(lem)
        pieces.append(translated if translated else lem)
    return " ".join(pieces)
+
725
def round_trip(text, src, tgt, mode, max_comp_exact):
    """Encode *text* into the conlang, decode it back, and return both halves
    (forward translation, recovered text) for side-by-side comparison."""
    forward = universal_translate(text, src, tgt, True, False, mode, max_comp_exact)
    recovered = universal_translate(forward, tgt, src, True, False, mode, max_comp_exact)
    return forward, recovered
+
730
# ------------ UI and explanations ------------
# User-facing help text (Spanish markdown shown in the app header — runtime
# string, wording left as authored).
EXPLAIN_ES = """
**Modo único: Semi-lossless** — Compacto con hints para reconstruir orden/morfología. Round-trip fiable (~90%). Activa "Max Compresión Exacta" para 100% exacto con sidecar oculto (~40% ahorro avg).
**Conlangs**: Minimax (VSO, ·TAMpersonNQ), Kōmín (SOV, ᵖ/ᵒ Ⓟ[2s]̆?).
"""

# All languages selectable in the UI: two natural, two constructed.
ALL_LANGS = ["Español","English","Minimax-ASCII","Kōmín-CJK"]

with gr.Blocks(title="Universal Conlang Translator") as demo:
    gr.Markdown("# Universal Conlang Translator · Max Compresión Exacta")
    gr.Markdown(EXPLAIN_ES)

    # --- Translate (universal: any source → any target) ---
    with gr.Tab("Traducir"):
        with gr.Row():
            uni_src = gr.Dropdown(ALL_LANGS, value="Español", label="Fuente")
            uni_tgt = gr.Dropdown(ALL_LANGS, value="Minimax-ASCII", label="Destino")
        uni_text = gr.Textbox(lines=3, label="Texto", value="Hola, ¿cómo estás?")
        with gr.Row():
            uni_drop = gr.Checkbox(value=True, label="Omitir artículos (ES/EN→conlang)")
            uni_zero = gr.Checkbox(value=False, label="Cópula cero (presente afirm.) (ES/EN→conlang)")
            uni_max_comp = gr.Checkbox(value=False, label="Max Compresión Exacta (sidecar oculto)")
        # Hidden dropdown: only one mode exists, kept so the callback
        # signature of universal_translate stays satisfied.
        uni_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
        uni_out = gr.Textbox(lines=6, label="Traducción")
        gr.Button("Traducir").click(
            universal_translate,
            [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_mode, uni_max_comp],
            [uni_out]
        )

    # --- Build sentence (ES/EN → Conlang) ---
    with gr.Tab("Construir frase (ES/EN → Conlang)"):
        with gr.Row():
            src_lang = gr.Dropdown(["Español","English"], value="Español", label="Fuente")
            target = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
        text_in = gr.Textbox(lines=3, label="Frase", value="Hola, ¿cómo estás?")
        with gr.Row():
            drop_articles = gr.Checkbox(value=True, label="Omitir artículos")
            zero_copula = gr.Checkbox(value=False, label="Cópula cero (presente afirm.)")
            max_comp_build = gr.Checkbox(value=False, label="Max Compresión Exacta (sidecar oculto)")
        # Hidden single-option mode selector, as in the "Traducir" tab.
        mode_build = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
        out = gr.Textbox(lines=6, label="Salida")
        gr.Button("Construir").click(
            build_sentence,
            [text_in, src_lang, target, drop_articles, zero_copula, mode_build, max_comp_build],
            [out]
        )

    # --- Decode (Conlang → ES/EN) ---
    with gr.Tab("Decodificar (Conlang → ES/EN)"):
        with gr.Row():
            src_code = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Fuente")
            tgt_lang = gr.Dropdown(["Español","English"], value="Español", label="Destino")
        code_in = gr.Textbox(lines=3, label="Texto en conlang (incluye ~blob si procede)")
        out3 = gr.Textbox(lines=6, label="Salida")

        def decode_lossless_aware(text, src, tgt):
            # Exact sidecars win: try the custom sidecar first, then the
            # b85 fallback, and only then decode semi-lossless.
            orig = extract_custom_sidecar(text)
            if orig is not None: return orig
            orig = extract_sidecar_b85(text)
            if orig is not None: return orig
            return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)

        gr.Button("Decodificar").click(
            decode_lossless_aware, [code_in, src_code, tgt_lang], [out3]
        )

    # --- Round-trip check (encode then decode, show both) ---
    with gr.Tab("Prueba ida→vuelta"):
        with gr.Row():
            rt_src = gr.Dropdown(["Español","English"], value="Español", label="Fuente")
            rt_tgt = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
        rt_text = gr.Textbox(lines=3, label="Frase", value="Hola, ¿cómo estás?")
        rt_max_comp = gr.Checkbox(value=False, label="Max Compresión Exacta")
        rt_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
        rt_out_conlang = gr.Textbox(lines=3, label="Conlang (ida)")
        rt_out_back = gr.Textbox(lines=3, label="Vuelta")
        gr.Button("Probar").click(
            round_trip,
            [rt_text, rt_src, rt_tgt, rt_mode, rt_max_comp],
            [rt_out_conlang, rt_out_back]
        )
 
813
# Launch the Gradio app only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()
815
 
816