LoloSemper commited on
Commit
de90cf7
·
verified ·
1 Parent(s): eff6688

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +208 -586
app.py CHANGED
@@ -1,585 +1,171 @@
1
- # app.py — Universal Conlang Translator (Max Compresión Exacta)
2
- # Archivos requeridos en la raíz:
3
- # - lexicon_minimax.json
4
- # - lexicon_komin.json
5
- # - lexicon_master.json
6
- #
7
- # requirements.txt (para HF Spaces):
8
- # gradio>=4.36.0
9
- # spacy>=3.7.4
10
- # es_core_news_sm @ https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.7.0/es_core_news_sm-3.7.0-py3-none-any.whl
11
- # en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl
12
-
13
- import os, re, json, base64, zlib
14
- from typing import Dict, Optional, List, Any
15
- import gradio as gr
16
-
17
# ------------ Expected lexicon files (repo root) ------------
# JSON files produced by the lexicon build pipeline; all are optional at
# runtime (missing files yield empty mappings in load_lexicons()).
LEX_MINI = "lexicon_minimax.json"
LEX_KOMI = "lexicon_komin.json"
LEX_MASTER = "lexicon_master.json"
21
-
22
# ------------ Normalization ------------
# Runs of Latin letters, including Spanish accented characters.
WORD_RE = re.compile(r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ]+", re.UNICODE)
# Translation table that strips diacritics (á -> a, ñ -> n, ...).
STRIP = str.maketrans("ÁÉÍÓÚÜÑáéíóúüñ", "AEIOUUNaeiouun")


def norm_es(w: str) -> str:
    """Lowercase *w*, keep only Spanish letters, then strip diacritics."""
    lowered = (w or "").lower()
    letters_only = re.sub(r"[^a-záéíóúüñ]", "", lowered)
    return letters_only.translate(STRIP)


def norm_en(w: str) -> str:
    """Lowercase *w* and keep only ASCII letters a-z."""
    return re.sub(r"[^a-z]", "", (w or "").lower())
27
-
28
# ------------ Lexicon loading ------------
def load_json(path: str):
    """Read a UTF-8 JSON file; return None when the file does not exist."""
    if not os.path.exists(path):
        return None
    with open(path, "r", encoding="utf-8") as fh:
        return json.load(fh)
32
-
33
def load_lexicons():
    """Load the three lexicon JSON files and build every lookup table.

    Returns an 11-tuple:
      (es2mini, es2komi, mini2es, komi2es,
       en2mini, en2komi, mini2en, komi2en,
       es2en_lemma, en2es_lemma, master)
    Missing files simply yield empty mappings (load_json returns None).
    """
    mm = load_json(LEX_MINI) or {}
    kk = load_json(LEX_KOMI) or {}
    master = load_json(LEX_MASTER) or {}

    # Direct Spanish-lemma -> conlang-code mappings, plus their inverses.
    es2mini = mm.get("mapping", {})
    es2komi = kk.get("mapping", {})
    mini2es = {v:k for k,v in es2mini.items()}
    komi2es = {v:k for k,v in es2komi.items()}

    es2en_lemma: Dict[str,str] = {}
    en2es_lemma: Dict[str,str] = {}
    en2mini, en2komi = {}, {}
    mini2en, komi2en = {}, {}

    # The master lexicon links ES/EN lemma pairs to both conlang codes.
    if isinstance(master, dict) and "entries" in master:
        for e in master["entries"]:
            es = norm_es(str(e.get("lemma_es",""))); en = norm_en(str(e.get("lemma_en","")))
            mi = str(e.get("minimax","")); ko = str(e.get("komin",""))
            if es and en:
                # setdefault keeps the FIRST entry when a lemma repeats.
                es2en_lemma.setdefault(es, en); en2es_lemma.setdefault(en, es)
            if en and mi: en2mini.setdefault(en, mi)
            if en and ko: en2komi.setdefault(en, ko)

    # Inverses built after the loop so duplicates resolve consistently.
    mini2en = {v:k for k,v in en2mini.items()}
    komi2en = {v:k for k,v in en2komi.items()}
    return (es2mini, es2komi, mini2es, komi2es,
            en2mini, en2komi, mini2en, komi2en,
            es2en_lemma, en2es_lemma, master)
62
-
63
# Module-level lookup tables, built once at import time.
(ES2MINI, ES2KOMI, MINI2ES, KOMI2ES,
 EN2MINI, EN2KOMI, MINI2EN, KOMI2EN,
 ES2EN_LEMMA, EN2ES_LEMMA, MASTER_OBJ) = load_lexicons()
66
-
67
# ------------ Pronouns (for the "remove pronouns" option) ------------
# Compared against the raw lowercased token text (accents kept, no
# normalization) -- see realize_*() and build_sentence().
PRON_ES = {"yo","tú","vos","usted","él","ella","nosotros","vosotros","ustedes","ellos","ellas","me","te","se","nos","os"}
PRON_EN = {"i","you","he","she","it","we","they","me","him","her","us","them"}
70
-
71
# ------------ Reversible OOV encoding (semi-lossless) ------------
# 64-symbol alphabet used as a custom Base64 alphabet for Minimax; the
# source string is longer than 64 characters, so it is truncated to 64.
ALPHA_MINI64 = "@ptkmnslraeiouy0123456789><=:/!?.+-_*#bcdfghjvqwxzACEGHIJKLMNOPRS"[:64]
# Pool of CJK characters used to build the Kōmín 64-symbol alphabet.
CJK_BASE = (
    "天地人日月山川雨風星火水木土金石光影花草鳥犬猫魚"
    "東西南北中外上下午夜明暗手口目耳心言書家道路門"
    "大小長短早晚高低新古青紅白黒金銀銅玉米茶酒米"
    "文学楽音画体気電海空森林雪雲砂島橋城村国自由静"
)
# NOTE(review): CJK_BASE contains repeated characters (e.g. 米, 金), so the
# first 64 of (CJK_BASE * 2) may hold duplicates, which would make the
# str.maketrans decoding table ambiguous -- TODO confirm uniqueness.
ALPHA_CJK64 = (CJK_BASE * 2)[:64]
80
-
81
def to_custom_b64(b: bytes, alphabet: str) -> str:
    """Base64-encode *b*, remap onto *alphabet*, and drop '=' padding."""
    standard = base64.b64encode(b).decode("ascii")
    table = str.maketrans(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/",
        alphabet,
    )
    return standard.translate(table).rstrip("=")


def from_custom_b64(s: str, alphabet: str) -> bytes:
    """Inverse of to_custom_b64: remap back and restore '=' padding."""
    table = str.maketrans(
        alphabet,
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/",
    )
    standard = s.translate(table)
    padding = "=" * (-len(standard) % 4)
    return base64.b64decode(standard + padding)
89
-
90
def enc_oov_minimax(token: str) -> str:
    """Wrap an out-of-vocabulary token as '~<custom-b64>' for Minimax."""
    return "~" + to_custom_b64(token.encode("utf-8"), ALPHA_MINI64)


def dec_oov_minimax(code: str) -> str:
    """Decode a '~...' OOV code; on any failure return the code unchanged."""
    try:
        return from_custom_b64(code[1:], ALPHA_MINI64).decode("utf-8")
    except Exception:
        return code


def enc_oov_komin(token: str) -> str:
    """Wrap an out-of-vocabulary token in 「...」 brackets for Kōmín."""
    return "「" + to_custom_b64(token.encode("utf-8"), ALPHA_CJK64) + "」"


def dec_oov_komin(code: str) -> str:
    """Decode a 「...」 OOV code; on any failure return the code unchanged."""
    try:
        return from_custom_b64(code[1:-1], ALPHA_CJK64).decode("utf-8")
    except Exception:
        return code


def is_oov_minimax(code: str) -> bool:
    """True when *code* looks like a Minimax OOV blob ('~' plus payload)."""
    return code.startswith("~") and len(code) > 1


def is_oov_komin(code: str) -> bool:
    """True when *code* is bracketed with 「 and 」."""
    return len(code) >= 2 and code.startswith("「") and code.endswith("」")
100
-
101
# ------------ Optional spaCy ------------
# spaCy and its ES/EN models are optional; when unavailable the app falls
# back to the purely lexicon-based word-by-word paths.
USE_SPACY = False
try:
    import spacy
    try:
        nlp_es = spacy.load("es_core_news_sm"); nlp_en = spacy.load("en_core_web_sm"); USE_SPACY = True
    except Exception:
        # spaCy is importable but the language models are missing.
        nlp_es = nlp_en = None
except Exception:
    # spaCy itself is not installed.
    nlp_es = nlp_en = None
111
-
112
def lemma_of(tok, src_lang: str) -> str:
    """Return the normalized lemma of a spaCy token, falling back to its text."""
    raw = tok.lemma_ if getattr(tok, "lemma_", "") else tok.text
    normalize = norm_es if src_lang == "Español" else norm_en
    return normalize(raw)
117
-
118
# ------------ Simple detection ------------
def detect_polarity(doc) -> bool:
    """True when the sentence text contains a question mark."""
    return "?" in getattr(doc, "text", "")


def detect_neg(doc) -> bool:
    """True when any token is a 'neg' dependency or a negation word."""
    negation_words = ("no", "not", "n't")
    for token in doc:
        if getattr(token, "dep_", "") == "neg":
            return True
        if getattr(token, "lower_", "").lower() in negation_words:
            return True
    return False
125
def detect_tense(root):
    """Infer Pres/Past/Fut from the root's morphology, else its AUX children."""
    morph = str(getattr(root, "morph", ""))
    for feature, label in (("Tense=Past", "Past"),
                           ("Tense=Fut", "Fut"),
                           ("Tense=Pres", "Pres")):
        if feature in morph:
            return label
    # No tense on the root: look at auxiliary children (e.g. "will", "was").
    for child in getattr(root, "children", []):
        if getattr(child, "pos_", "") != "AUX":
            continue
        if "Tense=Past" in str(getattr(child, "morph", "")):
            return "Past"
        if getattr(child, "lower_", "").lower() == "will":
            return "Fut"
    return "Pres"
136
def extract_core(doc):
    """Split a parsed sentence into (root, subjects, objects, obliques, adverbs).

    The root is the first ROOT token whose POS is VERB/AUX, falling back to
    the first token (or the doc itself when empty).  Each argument list is
    sorted back into surface order by token index.
    """
    tokens = list(doc)
    root = next((t for t in tokens if getattr(t,"dep_","")=="ROOT" and getattr(t,"pos_","") in ("VERB","AUX")), tokens[0] if tokens else doc)
    subs, objs, obls, advs = [], [], [], []
    for t in getattr(root,"children",[]):
        dep = getattr(t,"dep_",""); pos = getattr(t,"pos_","")
        if dep in ("nsubj","nsubj:pass","csubj"): subs.append(t)
        elif dep in ("obj","dobj","iobj"): objs.append(t)
        elif dep in ("obl","pobj"): obls.append(t)
        elif dep in ("advmod","advcl") and pos=="ADV": advs.append(t)
    # Restore surface word order within each argument slot.
    for arr in (subs,objs,obls,advs): arr.sort(key=lambda x: getattr(x,"i",0))
    return root, subs, objs, obls, advs
148
def _person_of_doc(doc, src_lang: str) -> Optional[str]:
    """Guess grammatical person/number ('1s'..'3p') from the root's subject.

    Returns None when no subject is found or anything goes wrong; callers
    fall back to a morphology-based guess or a fixed hint.
    """
    try:
        tokens = list(doc)
        root = next((t for t in tokens if getattr(t,"dep_","")=="ROOT"), tokens[0])
        subj = next((t for t in getattr(root,"children",[]) if getattr(t,"dep_","").startswith("nsubj")), None)
        if subj is None: return None
        # Plurality: Spanish via morphology, English via plural POS tags.
        plur = ("Number=Plur" in str(getattr(subj,"morph",""))) if src_lang=="Español" else (getattr(subj,"tag_","") in ("NNS","NNPS"))
        low = getattr(subj,"lower_","").lower()
        if src_lang=="Español":
            if low in ("yo",): return "1p" if plur else "1s"
            if low in ("tú","vos"): return "2p" if plur else "2s"
            if low in ("usted","él","ella"): return "3p" if plur else "3s"
            # Surface form not a known pronoun: fall back to the lemma.
            lem = lemma_of(subj, "Español")
            if lem in ("yo","nosotros"): return "1p" if plur else "1s"
            if lem in ("tú","vosotros"): return "2p" if plur else "2s"
            return "3p" if plur else "3s"
        else:
            if low in ("i",): return "1p" if plur else "1s"
            if low in ("you",): return "2p" if plur else "2s"
            if low in ("he","she","it"): return "3p" if plur else "3s"
            return "3p" if plur else "3s"
    except Exception:
        # Best-effort helper: any parsing surprise just means "unknown".
        return None
171
def detect_person(root, src_lang: str) -> Optional[str]:
    """Person/number code ('1s'..'3p') from the verb's morphology, else its subject."""
    m = str(getattr(root,"morph","")); person_str, number_str = "3","s"
    if "Person=" in m:
        # Morphological features look like "Person=2|Number=Plur|...".
        for feat in m.split("|"):
            if feat.startswith("Person="): person_str = feat.split("=")[1]
            elif feat.startswith("Number="): number_str = "p" if feat.split("=")[1]=="Plur" else "s"
        return person_str + number_str
    # No explicit person on the verb: inspect the subject instead.
    return _person_of_doc(root.doc, src_lang)
179
-
180
# ------------ Mapping and phrasing helpers ------------
def code_es(lemma: str, target: str) -> str:
    """Spanish lemma -> conlang code, with a reversible OOV fallback."""
    key = norm_es(lemma)
    if target == "Minimax-ASCII":
        hit = ES2MINI.get(key)
        return hit if hit else enc_oov_minimax(key)
    hit = ES2KOMI.get(key)
    return hit if hit else enc_oov_komin(key)


def code_en(lemma: str, target: str) -> str:
    """English lemma -> conlang code, with a reversible OOV fallback."""
    key = norm_en(lemma)
    if target == "Minimax-ASCII":
        hit = EN2MINI.get(key) if EN2MINI else None
        return hit if hit else enc_oov_minimax(key)
    hit = EN2KOMI.get(key) if EN2KOMI else None
    return hit if hit else enc_oov_komin(key)
191
-
192
# Tense markers appended to the verb code, per conlang; unknown tense falls
# back to present.
TAM_MINI = {"Pres":"P","Past":"T","Fut":"F","UNK":"P"}
TAM_KOMI = {"Pres":"Ⓟ","Past":"Ⓣ","Fut":"Ⓕ","UNK":"Ⓟ"}
194
-
195
def realize_minimax(doc, src_lang: str, drop_articles=True, zero_copula=True,
                    semi_lossless=False, person_hint="2s", remove_pronouns=False):
    """Render a parsed ES/EN sentence as a Minimax-ASCII clause.

    Order is V S O ADV; the verb carries a '·' tail encoding tense (P/T/F),
    optionally person (semi-lossless), negation (N) and question (Q).
    NOTE(review): drop_articles is accepted but never read here -- articles
    are dropped implicitly because only core arguments are realized.
    """
    root, subs, objs, obls, advs = extract_core(doc)
    tense = detect_tense(root); is_q, is_neg = detect_polarity(doc), detect_neg(doc)
    # Without spaCy there is no parse; fall back to a copula guess.
    vlem = lemma_of(root, src_lang) if USE_SPACY else ("ser" if "?" in getattr(doc,"text","") else "estar")
    vcode = code_es(vlem, "Minimax-ASCII") if src_lang=="Español" else code_en(vlem, "Minimax-ASCII")
    tail = TAM_MINI.get(tense, "P")
    if semi_lossless: tail += (detect_person(root, src_lang) or person_hint)
    if is_neg: tail += "N";
    if is_q: tail += "Q"
    if tail: vcode = f"{vcode}·{tail}"

    def realize_np(tokens):
        # Map each argument token to its conlang code, optionally skipping pronouns.
        outs=[]
        for t in tokens:
            if remove_pronouns:
                txt = (getattr(t,"text","") or "").lower()
                if (src_lang=="Español" and txt in PRON_ES) or (src_lang=="English" and txt in PRON_EN): continue
            lem = lemma_of(t, src_lang) if USE_SPACY else getattr(t,"text","")
            outs.append(code_es(lem,"Minimax-ASCII") if src_lang=="Español" else code_en(lem,"Minimax-ASCII"))
        return outs

    S = realize_np(subs); O = realize_np(objs)+realize_np(obls)
    ADV=[]
    for a in advs:
        lem = lemma_of(a, src_lang) if USE_SPACY else getattr(a,"text","")
        ADV.append(code_es(lem,"Minimax-ASCII") if src_lang=="Español" else code_en(lem,"Minimax-ASCII"))

    # Zero copula: omit ser/estar/be entirely in a plain affirmative present.
    parts = S+O+ADV if (zero_copula and not semi_lossless and vlem in ("ser","estar","be") and tense=="Pres" and not is_neg and not is_q) else [vcode]+S+O+ADV
    return " ".join(p for p in parts if p)
225
-
226
def realize_komin(doc, src_lang: str, drop_articles=True, zero_copula=True,
                  semi_lossless=False, person_hint="2s", remove_pronouns=False):
    """Render a parsed ES/EN sentence as a Kōmín-CJK clause.

    Order is S O ADV V; arguments carry superscript role particles (ᵖ subject,
    ᵒ object), the verb carries a circled TAM mark plus an optional [person]
    block and a combining-breve negation mark; questions end with '?'.
    NOTE(review): drop_articles is accepted but never read here; NEG_M is
    bound but the breve literal is repeated inline below.
    """
    root, subs, objs, obls, advs = extract_core(doc)
    tense, is_q, is_neg = detect_tense(root), detect_polarity(doc), detect_neg(doc)
    # Without spaCy there is no parse; fall back to a copula guess.
    vlem = lemma_of(root, src_lang) if USE_SPACY else ("ser" if "?" in getattr(doc,"text","") else "estar")
    vcode = code_es(vlem, "Kōmín-CJK") if src_lang=="Español" else code_en(vlem, "Kōmín-CJK")
    P_SUBJ, P_OBJ = "ᵖ", "ᵒ"; NEG_M, Q_FIN = "̆", "?"
    TAM = TAM_KOMI.get(tense,"Ⓟ")
    if semi_lossless: TAM = TAM + f"[{detect_person(root, src_lang) or person_hint}]"

    def realize_np(tokens, particle):
        # Map argument tokens to conlang codes with their role particle.
        outs=[]
        for t in tokens:
            if remove_pronouns:
                txt = (getattr(t,"text","") or "").lower()
                if (src_lang=="Español" and txt in PRON_ES) or (src_lang=="English" and txt in PRON_EN): continue
            lem = lemma_of(t, src_lang) if USE_SPACY else getattr(t,"text","")
            outs.append((code_es(lem,"Kōmín-CJK") if src_lang=="Español" else code_en(lem,"Kōmín-CJK")) + particle)
        return outs

    S = realize_np(subs, P_SUBJ); O = realize_np(objs+obls, P_OBJ)
    ADV=[]
    for a in advs:
        lem = lemma_of(a, src_lang) if USE_SPACY else getattr(a,"text","")
        ADV.append(code_es(lem,"Kōmín-CJK") if src_lang=="Español" else code_en(lem,"Kōmín-CJK"))

    # Zero copula as in realize_minimax; otherwise the marked verb goes last.
    parts = S+O+ADV if (zero_copula and not semi_lossless and vlem in ("ser","estar","be") and tense=="Pres" and not is_neg and not is_q) else S+O+ADV+[vcode+TAM+("̆" if is_neg else "")]
    out = " ".join(parts)
    if is_q: out += " " + Q_FIN
    return out
256
-
257
# ------------ Sidecars (exact compression) ------------
# A sidecar is ' §(<ascii85>)' appended to the conlang text; it stores the
# zlib-compressed original so decoding can recover it bit-for-bit.
# BUGFIX: the previous character class listed only a subset of the Ascii85
# alphabet (it missed ", ', comma, period, /, :, [, \, ], etc.), so valid
# sidecars could silently fail to match.  base64.a85encode(adobe=False)
# emits characters in the range '!'..'u', i.e. [!-u].
SIDECAR_B85_RE = re.compile(r"\s?§\((?P<b85>[!-u]+)\)$")


def b85_enc_raw(s: str) -> str:
    """zlib-compress *s* (level 9) and Ascii85-encode the result."""
    return base64.a85encode(zlib.compress(s.encode("utf-8"), 9), adobe=False).decode("ascii")


def b85_dec_raw(b85s: str) -> str:
    """Inverse of b85_enc_raw: Ascii85-decode then zlib-decompress."""
    return zlib.decompress(base64.a85decode(b85s.encode("ascii"), adobe=False)).decode("utf-8")


def attach_sidecar_b85(conlang_text: str, original_text: str) -> str:
    """Append the compressed original as a trailing ' §(...)' sidecar."""
    return f"{conlang_text} §({b85_enc_raw(original_text)})"


def extract_sidecar_b85(text: str) -> Optional[str]:
    """Return the exact original stored in a trailing sidecar, else None."""
    m = SIDECAR_B85_RE.search(text)
    if not m:
        return None
    try:
        return b85_dec_raw(m.group("b85"))
    except Exception:
        # Corrupt blob: treat as "no sidecar" rather than crash.
        return None


def strip_sidecar_b85(text: str) -> str:
    """Remove a trailing ' §(...)' sidecar, if present."""
    return SIDECAR_B85_RE.sub("", text).rstrip()
268
def custom_sidecar_enc(conlang_text: str, original_text: str) -> str:
    """Append ' ~<blob>' holding the zlib-compressed original (exact mode)."""
    blob = to_custom_b64(zlib.compress(original_text.encode("utf-8"), 9), ALPHA_MINI64)
    return f"{conlang_text} ~{blob}"


def extract_custom_sidecar(text: str) -> Optional[str]:
    """Decode the blob after the LAST '~'; None when absent or invalid.

    '~' is also the OOV-token prefix, so only the final '~' is treated as a
    sidecar candidate; a blob that fails to decompress yields None.
    """
    if '~' in text:
        core, blob = text.rsplit('~', 1)
        try:
            return zlib.decompress(from_custom_b64(blob, ALPHA_MINI64)).decode("utf-8")
        except Exception:
            return None
    return None


def strip_custom_sidecar(text: str) -> str:
    """Drop a trailing '~' sidecar, keeping everything before the LAST '~'.

    BUGFIX: this used text.split('~')[0], which cut at the FIRST '~' and
    therefore truncated any '~'-prefixed OOV token in the conlang text;
    rsplit matches extract_custom_sidecar's interpretation.
    """
    return text.rsplit('~', 1)[0].rstrip() if '~' in text else text
278
-
279
# ------------ Simple word-by-word encode/decode ------------
def encode_simple(text: str, src_lang: str, target: str) -> str:
    """Replace every word in *text* with its conlang code (OOV -> reversible blob)."""
    if not text.strip():
        return ""

    def encode_word(match):
        word = match.group(0)
        if src_lang == "Español":
            key = norm_es(word)
            code = ES2MINI.get(key) if target == "Minimax-ASCII" else ES2KOMI.get(key)
            if code:
                return code
        else:
            key = norm_en(word)
            table = EN2MINI if target == "Minimax-ASCII" else EN2KOMI
            if table and key in table:
                return table[key]
        # Unknown word: emit a reversible out-of-vocabulary blob.
        return enc_oov_minimax(word) if target == "Minimax-ASCII" else enc_oov_komin(word)

    return WORD_RE.sub(encode_word, text)
292
-
293
def pluralize_es(word: str) -> str:
    """Pluralize a Spanish noun with simple heuristics plus exceptions."""
    exceptions = {"uno": "unos", "buen": "buenos", "hombre": "hombres"}
    if word in exceptions:
        return exceptions[word]
    if word.endswith("z"):
        # luz -> luces
        return word[:-1] + "ces"
    # BUGFIX: "u" was missing from the vowel tuple, so "tribu" became
    # "tribues" instead of "tribus"; unstressed vowels take plain -s.
    if word.endswith(("a", "e", "i", "o", "u")):
        return word + "s"
    return word + "es"


def pluralize_en(word: str) -> str:
    """Pluralize an English noun with simple heuristics plus exceptions."""
    exceptions = {"man": "men", "woman": "women", "child": "children"}
    if word in exceptions:
        return exceptions[word]
    if word.endswith("y") and len(word) > 1 and word[-2] not in "aeiou":
        # city -> cities
        return word[:-1] + "ies"
    if word.endswith(("s", "sh", "ch", "x", "z")):
        return word + "es"
    return word + "s"


def pluralize(word: str, tgt_lang: str) -> str:
    """Dispatch to the Spanish or English pluralizer by target language."""
    return pluralize_es(word) if tgt_lang == "Español" else pluralize_en(word)
306
-
307
# Matches a Minimax verb token '<code>·<tail>' where the tail encodes tense
# (P/T/F), optional person (1/2/3 + s/p), negation (N) and question (Q).
# BUGFIX: the class previously read [PTFNQ12sp] -- '3' was missing, so
# third-person tails such as 'P3s' were never recognized as verbs.
mini_tail_re = re.compile(r"^(?P<stem>.+?)·(?P<tail>[PTFNQ123sp]+)$")
308
-
309
def decode_simple(text: str, source: str, tgt_lang: str) -> str:
    """Decode a conlang string back to Spanish/English (semi-lossless path).

    Minimax input is scanned for a verb token ('code·tail'), which is
    re-conjugated; '[PL]'-tagged tokens are re-pluralized.  Kōmín input is a
    plain reverse-lexicon lookup.  Used only when no exact sidecar is present.
    """
    if not text.strip(): return ""
    code2es = MINI2ES if source=="Minimax-ASCII" else KOMI2ES
    code2en = MINI2EN if source=="Minimax-ASCII" else KOMI2EN
    if source=="Kōmín-CJK":
        # NOTE(review): both replace() calls below are identity no-ops; they
        # look like mojibake for full-width '？'/space -- confirm intent.
        text = text.replace("?","?").replace(" "," ")
        return " ".join([code2es.get(w,w) for w in text.split() if w!="?"])
    tokens = text.split()
    if not tokens: return ""
    lemma_tokens, pl_flags = [], []
    verb_idx=-1; verb_lemma=None; verb_tense="Pres"; verb_person="3s"; has_q=False; is_neg=False
    # BUGFIX: pl_flags used to receive TWO entries per token (one unconditional
    # append here plus one per branch below), drifting out of sync with
    # lemma_tokens so plural flags applied to the wrong words.  It is now
    # appended exactly once per token, in lockstep with lemma_tokens.
    for part in tokens:
        look = part.replace("[PL]","")
        had_pl = "[PL]" in part
        m = mini_tail_re.match(look)
        if m:
            verb_idx = len(lemma_tokens); stem=m.group("stem"); tail=m.group("tail")
            vlem_es = code2es.get(stem); vlem_en = code2en.get(stem) if code2en else None
            vlem = vlem_es if tgt_lang=="Español" else (vlem_en or vlem_es or stem)
            if not vlem: vlem = dec_oov_minimax(stem) if is_oov_minimax(stem) else stem
            lemma_tokens.append(vlem); pl_flags.append(False)
            if tail:
                if tail[0] in "PTF":
                    verb_tense = {"P":"Pres","T":"Past","F":"Fut"}[tail[0]]; pos=1
                    # Optional person digit + s/p number marker.
                    if len(tail)>pos and tail[pos] in "123":
                        pos+=1; verb_person = tail[pos-1] + (tail[pos] if len(tail)>pos and tail[pos] in "sp" else "s")
                        if len(tail)>pos and tail[pos] in "sp": pos+=1
                    is_neg = "N" in tail[pos:]; has_q = "Q" in tail[pos:]
            verb_lemma = vlem; continue
        w_es = code2es.get(look); w_en = code2en.get(look) if code2en else None
        w = w_es if tgt_lang=="Español" else (w_en or w_es or look)
        if not w: w = dec_oov_minimax(look) if is_oov_minimax(look) else look
        lemma_tokens.append(w); pl_flags.append(had_pl)
    out_parts=[]
    for idx, lem in enumerate(lemma_tokens):
        if idx==verb_idx:
            # Re-conjugate the verb from the recovered tense/person features.
            v_conj = _es_conj(verb_lemma, verb_tense, verb_person) if tgt_lang=="Español" else _en_conj(verb_lemma, verb_tense, verb_person)
            if is_neg: v_conj = ("no " if tgt_lang=="Español" else "not ") + v_conj
            out_parts.append(v_conj)
        else:
            out_parts.append(pluralize(lem, tgt_lang) if pl_flags[idx] else lem)
    out_text = " ".join(out_parts)
    if has_q:
        start_q = "¿" if tgt_lang=="Español" else ""
        out_text = f"{start_q}{out_text.capitalize()}?"
    return out_text
353
-
354
# ------------ Minimal conjugators ------------
def _es_conj_regular(lemma, tense, person):
    """Conjugate a regular Spanish -ar/-er/-ir verb; otherwise return lemma."""
    if not lemma.endswith(("ar", "er", "ir")):
        return lemma
    stem, conj_class = lemma[:-2], lemma[-2:]
    present = {
        "ar": {"1s":"o","2s":"as","3s":"a","1p":"amos","2p":"áis","3p":"an"},
        "er": {"1s":"o","2s":"es","3s":"e","1p":"emos","2p":"éis","3p":"en"},
        "ir": {"1s":"o","2s":"es","3s":"e","1p":"imos","2p":"ís","3p":"en"},
    }
    preterite = {
        "ar": {"1s":"é","2s":"aste","3s":"ó","1p":"amos","2p":"asteis","3p":"aron"},
        "er": {"1s":"í","2s":"iste","3s":"ió","1p":"imos","2p":"isteis","3p":"ieron"},
        "ir": {"1s":"í","2s":"iste","3s":"ió","1p":"imos","2p":"isteis","3p":"ieron"},
    }
    future = {"1s":"é","2s":"ás","3s":"á","1p":"emos","2p":"éis","3p":"án"}
    if tense == "Pres":
        endings = present[conj_class]
        return stem + endings.get(person, endings["3s"])
    if tense == "Past":
        endings = preterite[conj_class]
        return stem + endings.get(person, endings["3s"])
    # Future (and any other value) attaches to the full infinitive.
    return lemma + future.get(person, future["3s"])
368
def _es_conj(lemma, tense, person):
    """Conjugate Spanish: irregular ser/estar/ir from tables, else regular rules."""
    irregular = {
        "ser": {
            "Pres": {"1s":"soy","2s":"eres","3s":"es","1p":"somos","2p":"sois","3p":"son"},
            "Past": {"1s":"fui","2s":"fuiste","3s":"fue","1p":"fuimos","2p":"fuisteis","3p":"fueron"},
            "Fut":  {"1s":"seré","2s":"serás","3s":"será","1p":"seremos","2p":"seréis","3p":"serán"},
        },
        "estar": {
            "Pres": {"1s":"estoy","2s":"estás","3s":"está","1p":"estamos","2p":"estáis","3p":"están"},
            "Past": {"1s":"estuve","2s":"estuviste","3s":"estuvo","1p":"estuvimos","2p":"estuvisteis","3p":"estuvieron"},
            "Fut":  {"1s":"estaré","2s":"estarás","3s":"estará","1p":"estaremos","2p":"estaréis","3p":"estarán"},
        },
        "ir": {
            "Pres": {"1s":"voy","2s":"vas","3s":"va","1p":"vamos","2p":"vais","3p":"van"},
            "Past": {"1s":"fui","2s":"fuiste","3s":"fue","1p":"fuimos","2p":"fuisteis","3p":"fueron"},
            "Fut":  {"1s":"iré","2s":"irás","3s":"irá","1p":"iremos","2p":"iréis","3p":"irán"},
        },
    }
    table = irregular.get(lemma)
    if table is None:
        return _es_conj_regular(lemma, tense, person)
    forms = table[tense]
    # Unknown person falls back to third singular, as in the regular rules.
    return forms.get(person, forms["3s"])
385
def _en_conj(lemma, tense, person):
    """Conjugate English: irregular be/have/go/do, else regular -s/-ed rules."""
    if lemma == "be":
        if tense == "Pres":
            forms = {"1s":"am","2s":"are","3s":"is","1p":"are","2p":"are","3p":"are"}
            return forms.get(person, "is")
        if tense == "Past":
            forms = {"1s":"was","2s":"were","3s":"was","1p":"were","2p":"were","3p":"were"}
            return forms.get(person, "was")
        return "be"
    if lemma == "have":
        if tense == "Pres":
            return "has" if person == "3s" else "have"
        return "had" if tense == "Past" else "have"
    if lemma == "go":
        if tense == "Past":
            return "went"
        return "goes" if (tense == "Pres" and person == "3s") else "go"
    if lemma == "do":
        if tense == "Past":
            return "did"
        return "does" if (tense == "Pres" and person == "3s") else "do"
    # Regular verbs: consonant+y triggers the -ies/-ied spelling change.
    consonant_y = lemma.endswith("y") and (len(lemma) < 2 or lemma[-2] not in "aeiou")
    if tense == "Pres":
        if person != "3s":
            return lemma
        if consonant_y:
            return lemma[:-1] + "ies"
        if lemma.endswith(("s", "sh", "ch", "x", "z", "o")):
            return lemma + "es"
        return lemma + "s"
    if tense == "Past":
        if lemma.endswith("e"):
            return lemma + "d"
        if consonant_y:
            return lemma[:-1] + "ied"
        return lemma + "ed"
    # Future (and anything else) is the bare form ("will" is added upstream).
    return lemma
411
-
412
# ------------ Main routes ------------
def _build_with_spacy(text: str, src_lang: str, target: str,
                      drop_articles: bool, zero_copula: bool, semi_lossless: bool,
                      remove_pronouns: bool) -> str:
    """Parse with spaCy and realize in the requested conlang.

    Note: the realizers are always invoked with semi_lossless=True here,
    regardless of the semi_lossless argument (kept for signature stability).
    """
    pipeline = nlp_es if src_lang == "Español" else nlp_en
    parsed = pipeline(text)
    realize = realize_minimax if target == "Minimax-ASCII" else realize_komin
    return realize(parsed, src_lang, drop_articles, zero_copula,
                   semi_lossless=True, remove_pronouns=remove_pronouns)
422
-
423
def build_sentence(text: str, src_lang: str, target: str,
                   drop_articles: bool, zero_copula: bool, mode: str,
                   max_comp_exact: bool = False, remove_pronouns: bool = False) -> str:
    """ES/EN -> conlang; optionally append the exact-compression '~' sidecar."""
    if not text.strip():
        return ""
    if USE_SPACY:
        core = _build_with_spacy(text, src_lang, target, drop_articles, zero_copula, True, remove_pronouns)
    else:
        # No parser available: optionally strip pronouns textually, then
        # fall back to word-by-word encoding.
        if remove_pronouns:
            pronouns = PRON_ES if src_lang == "Español" else PRON_EN
            pieces = re.findall(r"\w+|[^\w\s]+", text)
            text = " ".join([p for p in pieces if p.lower() not in pronouns])
        core = encode_simple(text, src_lang, target)
    if max_comp_exact:
        # The sidecar stores the (possibly pronoun-stripped) input text.
        return custom_sidecar_enc(core, text)
    return core
436
-
437
def universal_translate(text: str, src: str, tgt: str,
                        drop_articles: bool, zero_copula: bool,
                        mode: str, max_comp_exact: bool = False,
                        remove_pronouns: bool = False) -> str:
    """Route a translation between any pair of {Español, English, Minimax-ASCII, Kōmín-CJK}.

    Natural -> conlang builds a sentence (optionally with an exact sidecar);
    conlang -> natural prefers an embedded sidecar and falls back to the
    semi-lossless decoder; conlang -> conlang re-encodes via Spanish lemmas.
    """
    if not text.strip(): return ""
    if src == tgt: return text
    if src in ("Español","English") and tgt in ("Minimax-ASCII","Kōmín-CJK"):
        return build_sentence(text, src, tgt, drop_articles, zero_copula, mode, max_comp_exact, remove_pronouns)
    if src in ("Minimax-ASCII","Kōmín-CJK") and tgt in ("Español","English"):
        # Exact recovery first: '~' sidecar, then '§(...)' sidecar.
        orig = extract_custom_sidecar(text)
        if orig is not None: return orig
        orig = extract_sidecar_b85(text)
        if orig is not None: return orig
        return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)
    if src in ("Español","English") and tgt in ("Español","English"):
        return translate_natural(text, src, tgt)
    if src in ("Minimax-ASCII","Kōmín-CJK") and tgt in ("Minimax-ASCII","Kōmín-CJK"):
        orig = extract_custom_sidecar(text)
        if orig is not None:
            # Sidecar present: re-encode the core and carry the exact
            # original over to the new conlang's sidecar.
            core = strip_custom_sidecar(text)
            es_lemmas = decode_simple(core, src, "Español")
            words = re.findall(r"\w+|[^\w\s]+", es_lemmas); out=[]
            for w in words:
                if re.fullmatch(r"\w+", w):
                    code = ES2MINI.get(norm_es(w)) if tgt=="Minimax-ASCII" else ES2KOMI.get(norm_es(w))
                    out.append(code or (enc_oov_minimax(w) if tgt=="Minimax-ASCII" else enc_oov_komin(w)))
                else: out.append(w)
            return custom_sidecar_enc(" ".join(out), orig)
        # No sidecar: decode to Spanish lemmas, then re-encode each word.
        es_lemmas = decode_simple(text, src, "Español")
        words = re.findall(r"\w+|[^\w\s]+", es_lemmas); out=[]
        for w in words:
            if re.fullmatch(r"\w+", w):
                code = ES2MINI.get(norm_es(w)) if tgt=="Minimax-ASCII" else ES2KOMI.get(norm_es(w))
                out.append(code or (enc_oov_minimax(w) if tgt=="Minimax-ASCII" else enc_oov_komin(w)))
            else: out.append(w)
        return " ".join(out)
    return "[No soportado]"
474
-
475
def translate_natural(text: str, src_lang: str, tgt_lang: str) -> str:
    """Lemma-level ES<->EN translation via the master lexicon (needs spaCy).

    Without spaCy the text is returned unchanged; unknown lemmas pass
    through as their normalized lemma.
    """
    if not text.strip():
        return ""
    if not USE_SPACY:
        return text
    pipeline = nlp_es if src_lang == "Español" else nlp_en
    mapping = ES2EN_LEMMA if src_lang == "Español" else EN2ES_LEMMA
    rendered = []
    for token in pipeline(text):
        if not getattr(token, "is_alpha", False):
            # Punctuation and numbers are copied verbatim.
            rendered.append(getattr(token, "text", ""))
            continue
        lemma = lemma_of(token, src_lang)
        translated = mapping.get(lemma)
        rendered.append(translated if translated else lemma)
    return " ".join(rendered)
488
-
489
def round_trip(text, src, tgt, mode, max_comp_exact):
    """Translate forward then back; returns (conlang, recovered) for comparison."""
    forward = universal_translate(text, src, tgt, True, False, mode, max_comp_exact, False)
    recovered = universal_translate(forward, tgt, src, True, False, mode, max_comp_exact, False)
    return forward, recovered
493
-
494
  # =====================================================================================
495
  # ========================= UI bilingüe y explicaciones claras ========================
496
  # =====================================================================================
497
 
498
  ALL_LANGS = ["Español","English","Minimax-ASCII","Kōmín-CJK"]
499
 
500
- # Secciones de ayuda (ES/EN) todas en el MISMO nivel, como acordeones
501
- COMPACT_ES = """
502
- **📏 Compactación orientativa (haz clic para desplegar)**
503
- - Sin casillas: **0%**
504
- - Omitir artículos: **~10–15%**
505
- - Cópula cero (presente afirm.): **~5–10%**
506
- - Ambas (artículos + cópula): **~15–20%**
507
- - Máx. Compresión Exacta: **~40–60%** en textos >100 caracteres (con `~...`). En textos muy cortos puede no reducir.
508
- """
509
- COMPACT_EN = """
510
- **📏 Typical compaction (click to expand)**
511
- - No options: **0%**
512
- - Drop articles: **~10–15%**
513
- - Zero copula (present affirmative): **~5–10%**
514
- - Both (articles + copula): **~15–20%**
515
- - Max Exact Compression: **~40–60%** for >100 chars (`~...`). Very short texts may not shrink.
516
- """
517
-
 
 
518
  EXPLAIN_TAB_TRANSLATE_ES = """
519
- **🔁 Traducir (haz clic para desplegar)**
520
- Convierte el *Texto* al *Destino*. Funciona para **cualquier combinación**: Español, English, Minimax-ASCII, Kōmín-CJK.
521
- - **Máx. Compresión Exacta** añade `~...` con el original comprimido para poder **recuperarlo exactamente** al decodificar.
522
- - **Omitir artículos / Cópula cero / Quitar pronombres** se aplican **solo cuando el destino es conlang** (Minimax/Kōmín).
523
  """
 
524
  EXPLAIN_TAB_BUILD_ES = """
525
- **🛠️ Construir (ES/EN → Conlang) (haz clic para desplegar)**
526
- Fuerza la salida **en conlang** desde Español o Inglés aplicando reglas de fraseo (orden, partículas/TAM) y tus **checkbox**.
527
- Útil para ver cómo quedaría la frase **directamente en Minimax/Kōmín** sin ambigüedad de direcciones.
528
  """
 
529
  EXPLAIN_TAB_DECODE_ES = """
530
- **🗝️ Decodificar (Conlang → ES/EN) (haz clic para desplegar)**
531
- Convierte **Minimax/Kōmín** a **Español o Inglés**.
532
- - Si hay `~...`, devuelve el **original exacto**.
533
- - Sin `~...`, la vuelta es **semi-lossless** usando el léxico y pistas simples.
534
  """
 
535
  EXPLAIN_TAB_ROUNDTRIP_ES = """
536
- **🔄 Prueba ida→vuelta (haz clic para desplegar)**
537
- Ejecuta **(ES/EN → Conlang) → (Conlang → ES/EN)** para comprobar **reversibilidad**.
538
- Con **Máx. Compresión Exacta**, la vuelta coincide **bit a bit** con la entrada.
539
  """
 
540
  EXPLAIN_CHECKBOX_ES = """
541
- **☑️ ¿Qué hace cada checkbox? (haz clic para desplegar)**
542
- - **Omitir artículos**: quita *el/la/los/las* (ES) y *a/an/the* (EN) → **~10–15%**.
543
- - **Cópula cero (presente afirm.)**: esconde *ser/estar/be* cuando suena natural → **~5–10%** extra.
544
- - **Quitar pronombres**: elimina pronombres de sujeto/objeto **evidentes** (ahorro variable).
545
- - **Máx. Compresión Exacta**: añade `~...` con zlib para recuperación exacta (**~40–60%** en >100 caracteres).
546
- """
547
 
548
- LEXICON_BUILD_ES = """
549
- **ℹ️ Léxico (OMW Minimax/Kōmín) (haz clic para desplegar)**
550
- 1) Desde **OMW/WordNet 1.4** se extraen **lemas ES** y sus **equivalentes EN** por sinset.
551
- 2) Se normalizan y ordenan por **frecuencia** (wordfreq).
552
- 3) Opcional: **spaCy** refina lemas; **Argos** puede rellenar EN faltantes.
553
- 4) Se asignan **códigos compactos** con alfabetos barajados por **SEED** hasta `MAXLEN_MINI`/`MAXLEN_CJK`.
554
- 5) Se exportan: `lexicon_minimax.json`, `lexicon_komin.json`, `lexicon_master.json` (+TSV).
555
- **Vista previa** de `lexicon_master.json` abajo.
 
 
 
556
  """
557
 
558
- # (EN) versiones cortas
559
  EXPLAIN_TAB_TRANSLATE_EN = """
560
- **🔁 Translate (click to expand)** Converts *Text* to *Target* (any pair: Spanish/English/Minimax/Kōmín).
561
- With **Max Exact Compression**, appends `~...` to recover the **exact original**. Checkboxes apply when **target is conlang**.
 
 
562
  """
 
563
  EXPLAIN_TAB_BUILD_EN = """
564
- **🛠️ Build (ES/EN → Conlang) (click to expand)** — Forces conlang output (Minimax/Kōmín) with phrasing rules and your checkboxes.
 
565
  """
 
566
  EXPLAIN_TAB_DECODE_EN = """
567
- **🗝️ Decode (Conlang → ES/EN) (click to expand)** — If `~...` is present, returns the **bit-perfect original**; otherwise semi-lossless.
 
 
 
568
  """
 
569
  EXPLAIN_TAB_ROUNDTRIP_EN = """
570
- **🔄 Round-trip (click to expand)** — Runs (ES/EN → Conlang) → (Conlang → ES/EN) to verify reversibility.
 
571
  """
 
572
  EXPLAIN_CHECKBOX_EN = """
573
- **☑️ Checkboxes (click to expand)**
574
- - **Drop articles**: ~10–15%
575
- - **Zero copula (present affirm.)**: ~5–10% extra
576
- - **Remove pronouns**: variable
577
- - **Max Exact Compression**: ~40–60% for >100 chars (`~...`), exact recovery.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
578
  """
 
579
  LEXICON_BUILD_EN = """
580
- **ℹ️ Lexicon (OMW → Minimax/Kōmín) (click to expand)** — OMW/WordNet ES lemmas + EN counterparts, normalized & frequency-sorted; optional spaCy/Argos; codes assigned with SEED-shuffled alphabets up to MAXLEN; exports JSON/TSV. Preview below.
 
 
 
 
 
 
 
 
581
  """
582
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
583
  def master_preview(n: int = 20) -> List[List[Any]]:
584
  try:
585
  entries = (MASTER_OBJ or {}).get("entries", [])
@@ -595,16 +181,22 @@ def master_preview(n: int = 20) -> List[List[Any]]:
595
  def make_group_es():
596
  with gr.Group(visible=True) as g:
597
  gr.Markdown("# 🌐 Universal Conlang Translator · Compresión Exacta (ES)")
598
- # Acordeones de EXPLICACIÓN — todos al MISMO nivel
599
  with gr.Row():
600
  with gr.Column():
601
- with gr.Accordion(EXPLAIN_TAB_TRANSLATE_ES, open=False): pass
602
- with gr.Accordion(EXPLAIN_TAB_BUILD_ES, open=False): pass
603
- with gr.Accordion(EXPLAIN_TAB_DECODE_ES, open=False): pass
604
- with gr.Accordion(EXPLAIN_TAB_ROUNDTRIP_ES, open=False): pass
 
 
 
 
605
  with gr.Column():
606
- with gr.Accordion(EXPLAIN_CHECKBOX_ES, open=False): gr.Markdown(COMPACT_ES)
607
- with gr.Accordion(LEXICON_BUILD_ES, open=False):
 
 
608
  n_rows = gr.Slider(5, 100, value=20, step=5, label="Filas a mostrar")
609
  table = gr.Dataframe(headers=["lemma_es","lemma_en","minimax","komin"], row_count=1, interactive=False)
610
  gr.Button("Actualizar vista").click(lambda n: master_preview(int(n)), [n_rows], [table])
@@ -626,14 +218,20 @@ def make_group_es():
626
  btn_tr = gr.Button("🚀 Traducir", variant="primary")
627
  btn_tr_cl = gr.Button("🧹 Limpiar")
628
  uni_out = gr.Textbox(lines=6, label="Traducción", show_copy_button=True)
 
 
 
 
 
 
629
 
630
- btn_tr.click(universal_translate,
631
  [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_mode, uni_maxc, uni_rmpr],
632
- [uni_out])
633
  btn_tr_cl.click(lambda: ("",""), None, [uni_text, uni_out])
634
 
635
- with gr.Accordion("Ayuda rápida (¿qué hace este botón?)", open=False):
636
- gr.Markdown(EXPLAIN_TAB_TRANSLATE_ES + "\n\n" + COMPACT_ES)
637
 
638
  with gr.Tab("🛠️ Construir (ES/EN → Conlang)"):
639
  with gr.Row():
@@ -650,14 +248,20 @@ def make_group_es():
650
  btn_b = gr.Button("🏗️ Construir", variant="primary")
651
  btn_b_cl = gr.Button("🧹 Limpiar")
652
  out = gr.Textbox(lines=6, label="Salida", show_copy_button=True)
 
653
 
654
- btn_b.click(build_sentence,
 
 
 
 
 
655
  [text_in, src_lang, target, drop_articles, zero_copula, mode_build, max_comp_build, rm_pron_build],
656
- [out])
657
  btn_b_cl.click(lambda: ("",""), None, [text_in, out])
658
 
659
- with gr.Accordion("Ayuda rápida (¿qué hace este botón?)", open=False):
660
- gr.Markdown(EXPLAIN_TAB_BUILD_ES + "\n\n" + COMPACT_ES)
661
 
662
  with gr.Tab("🗝️ Decodificar (Conlang → ES/EN)"):
663
  with gr.Row():
@@ -680,7 +284,7 @@ def make_group_es():
680
  btn_d.click(decode_lossless_aware, [code_in, src_code, tgt_lang], [out3])
681
  btn_d_cl.click(lambda: ("",""), None, [code_in, out3])
682
 
683
- with gr.Accordion("Ayuda rápida (¿qué hace este botón?)", open=False):
684
  gr.Markdown(EXPLAIN_TAB_DECODE_ES)
685
 
686
  with gr.Tab("🔄 Prueba ida→vuelta"):
@@ -699,7 +303,7 @@ def make_group_es():
699
  btn_rt.click(round_trip, [rt_text, rt_src, rt_tgt, rt_mode, rt_max_comp], [rt_out_conlang, rt_out_back])
700
  btn_rt_cl.click(lambda: ("","",""), None, [rt_text, rt_out_conlang, rt_out_back])
701
 
702
- with gr.Accordion("Ayuda rápida (¿qué hace este botón?)", open=False):
703
  gr.Markdown(EXPLAIN_TAB_ROUNDTRIP_ES)
704
  return g
705
 
@@ -708,13 +312,19 @@ def make_group_en():
708
  gr.Markdown("# 🌐 Universal Conlang Translator · Max Exact Compression (EN)")
709
  with gr.Row():
710
  with gr.Column():
711
- with gr.Accordion(EXPLAIN_TAB_TRANSLATE_EN, open=False): pass
712
- with gr.Accordion(EXPLAIN_TAB_BUILD_EN, open=False): pass
713
- with gr.Accordion(EXPLAIN_TAB_DECODE_EN, open=False): pass
714
- with gr.Accordion(EXPLAIN_TAB_ROUNDTRIP_EN, open=False): pass
 
 
 
 
715
  with gr.Column():
716
- with gr.Accordion(EXPLAIN_CHECKBOX_EN, open=False): gr.Markdown(COMPACT_EN)
717
- with gr.Accordion(LEXICON_BUILD_EN, open=False):
 
 
718
  n_rows = gr.Slider(5, 100, value=20, step=5, label="Rows to show")
719
  table = gr.Dataframe(headers=["lemma_es","lemma_en","minimax","komin"], row_count=1, interactive=False)
720
  gr.Button("Refresh").click(lambda n: master_preview(int(n)), [n_rows], [table])
@@ -735,14 +345,20 @@ def make_group_en():
735
  btn_tr = gr.Button("🚀 Translate", variant="primary")
736
  btn_tr_cl = gr.Button("🧹 Clear")
737
  uni_out = gr.Textbox(lines=6, label="Translation", show_copy_button=True)
 
 
 
 
 
 
738
 
739
- btn_tr.click(universal_translate,
740
  [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_mode, uni_maxc, uni_rmpr],
741
- [uni_out])
742
  btn_tr_cl.click(lambda: ("",""), None, [uni_text, uni_out])
743
 
744
- with gr.Accordion("Quick help (what does this button do?)", open=False):
745
- gr.Markdown(EXPLAIN_TAB_TRANSLATE_EN + "\n\n" + COMPACT_EN)
746
 
747
  with gr.Tab("🛠️ Build (ES/EN → Conlang)"):
748
  with gr.Row():
@@ -759,14 +375,20 @@ def make_group_en():
759
  btn_b = gr.Button("🏗️ Build", variant="primary")
760
  btn_b_cl = gr.Button("🧹 Clear")
761
  out = gr.Textbox(lines=6, label="Output", show_copy_button=True)
 
 
 
 
 
 
762
 
763
- btn_b.click(build_sentence,
764
  [text_in, src_lang, target, drop_articles, zero_copula, mode_build, max_comp_build, rm_pron_build],
765
- [out])
766
  btn_b_cl.click(lambda: ("",""), None, [text_in, out])
767
 
768
- with gr.Accordion("Quick help (what does this button do?)", open=False):
769
- gr.Markdown(EXPLAIN_TAB_BUILD_EN + "\n\n" + COMPACT_EN)
770
 
771
  with gr.Tab("🗝️ Decode (Conlang → ES/EN)"):
772
  with gr.Row():
@@ -775,7 +397,7 @@ def make_group_en():
775
  code_in = gr.Textbox(lines=3, label="Conlang text (may include `~...`)", show_copy_button=True)
776
  out3 = gr.Textbox(lines=6, label="Output", show_copy_button=True)
777
 
778
- def decode_lossless_aware(text, src, tgt):
779
  orig = extract_custom_sidecar(text)
780
  if orig is not None: return orig
781
  orig = extract_sidecar_b85(text)
@@ -786,10 +408,10 @@ def make_group_en():
786
  btn_d = gr.Button("🔓 Decode", variant="primary")
787
  btn_d_cl = gr.Button("🧹 Clear")
788
 
789
- btn_d.click(decode_lossless_aware, [code_in, src_code, tgt_lang], [out3])
790
  btn_d_cl.click(lambda: ("",""), None, [code_in, out3])
791
 
792
- with gr.Accordion("Quick help (what does this button do?)", open=False):
793
  gr.Markdown(EXPLAIN_TAB_DECODE_EN)
794
 
795
  with gr.Tab("🔄 Round-trip"):
@@ -808,7 +430,7 @@ def make_group_en():
808
  btn_rt.click(round_trip, [rt_text, rt_src, rt_tgt, rt_mode, rt_max_comp], [rt_out_conlang, rt_out_back])
809
  btn_rt_cl.click(lambda: ("","",""), None, [rt_text, rt_out_conlang, rt_out_back])
810
 
811
- with gr.Accordion("Quick help (what does this button do?)", open=False):
812
  gr.Markdown(EXPLAIN_TAB_ROUNDTRIP_EN)
813
  return g
814
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # =====================================================================================
2
  # ========================= UI bilingüe y explicaciones claras ========================
3
  # =====================================================================================
4
 
5
  ALL_LANGS = ["Español","English","Minimax-ASCII","Kōmín-CJK"]
6
 
7
+ # ---- Bloques de explicación (cortos para TÍTULO + largos para CONTENIDO) ----
8
+ ACC_TITLES_ES = {
9
+ "translate": "🔁 Traducir — ¿Qué hace? (haz clic para desplegar)",
10
+ "build": "🛠️ Construir (ES/EN → Conlang) — ¿Qué hace?",
11
+ "decode": "🗝️ Decodificar (Conlang → ES/EN) — ¿Qué hace?",
12
+ "roundtrip": "🔄 Prueba ida→vuelta ¿Qué hace?",
13
+ "checkbox": "☑️ Opciones y compactación (artículos, cópula, pronombres, exacta)",
14
+ "lexicon": "ℹ️ Léxico (OMW Minimax/Kōmín) explicación y vista previa"
15
+ }
16
+
17
+ ACC_TITLES_EN = {
18
+ "translate": "🔁 Translate — What does it do? (click to expand)",
19
+ "build": "🛠️ Build (ES/EN → Conlang) — What does it do?",
20
+ "decode": "🗝️ Decode (Conlang → ES/EN) — What does it do?",
21
+ "roundtrip": "🔄 Round-trip What does it do?",
22
+ "checkbox": "☑️ Options & compaction (articles, copula, pronouns, exact)",
23
+ "lexicon": "ℹ️ Lexicon (OMW → Minimax/Kōmín) — explainer & preview"
24
+ }
25
+
26
+ # Contenidos (Markdown) — ya limpios (se verán dentro del Accordion)
27
  EXPLAIN_TAB_TRANSLATE_ES = """
28
+ Convierte el **Texto** al **Destino**. Funciona para cualquier combinación: Español, English, Minimax-ASCII y Kōmín-CJK.
29
+
30
+ - Si activas **Máx. Compresión Exacta**, añade un remolque `~...` con el **original comprimido** para recuperarlo **exactamente** al decodificar.
31
+ - Los **checkbox** (Omitir artículos / Cópula cero / Quitar pronombres) **solo aplican** cuando el **Destino es un conlang** (Minimax o Kōmín).
32
  """
33
+
34
  EXPLAIN_TAB_BUILD_ES = """
35
+ Fuerza la salida **en conlang** (Minimax o Kōmín) desde Español o Inglés.
36
+ Aplica reglas de fraseo (orden, partículas/TAM) y las opciones de **compactación**.
 
37
  """
38
+
39
  EXPLAIN_TAB_DECODE_ES = """
40
+ Convierte **Minimax/Kōmín** a **Español o Inglés**.
41
+
42
+ - Si el texto trae `~...`, devuelve el **original exacto**.
43
+ - Si no hay `~...`, la reconstrucción es **semi-lossless** con léxico y pistas simples.
44
  """
45
+
46
  EXPLAIN_TAB_ROUNDTRIP_ES = """
47
+ Ejecuta **(ES/EN → Conlang) → (Conlang → ES/EN)** para comprobar **reversibilidad**.
48
+ Con **Máx. Compresión Exacta**, la vuelta coincide **bit a bit**.
 
49
  """
50
+
51
  EXPLAIN_CHECKBOX_ES = """
52
+ **Qué hace cada opción:**
 
 
 
 
 
53
 
54
+ - **Omitir artículos** (el/la/los/las; a/an/the): ahorro típico **~10–15%**.
55
+ - **Cópula cero (presente afirm.)**: omite *ser/estar/be* cuando suena natural → **~5–10%** extra.
56
+ - **Quitar pronombres**: elimina pronombres de sujeto/objeto evidentes → ahorro **variable**.
57
+ - **Máx. Compresión Exacta**: añade `~...` (zlib) para recuperación exacta. En >100 caracteres, **~40–60%**; en textos cortos puede no reducir.
58
+
59
+ **Referencia orientativa:**
60
+ - Sin casillas: **0%**
61
+ - Solo artículos: **~10–15%**
62
+ - Solo cópula: **~5–10%**
63
+ - Artículos + cópula: **~15–20%**
64
+ - Con exacta: **~40–60%** (si el texto es suficientemente largo)
65
  """
66
 
 
67
  EXPLAIN_TAB_TRANSLATE_EN = """
68
+ Converts **Text** to **Target**. Works for any pair: Spanish, English, Minimax-ASCII, Kōmín-CJK.
69
+
70
+ - **Max Exact Compression** appends `~...` with the **exact original** for perfect recovery.
71
+ - Checkboxes (Drop articles / Zero copula / Remove pronouns) apply **only when the Target is a conlang**.
72
  """
73
+
74
  EXPLAIN_TAB_BUILD_EN = """
75
+ Forces **conlang output** (Minimax or Kōmín) from Spanish/English.
76
+ Applies phrasing rules (order, particles/TAM) and **compaction** options.
77
  """
78
+
79
  EXPLAIN_TAB_DECODE_EN = """
80
+ Converts **Minimax/Kōmín** to **Spanish/English**.
81
+
82
+ - If `~...` is present, returns the **bit-perfect original**.
83
+ - Otherwise, reconstructs **semi-losslessly** using the lexicon.
84
  """
85
+
86
  EXPLAIN_TAB_ROUNDTRIP_EN = """
87
+ Runs **(ES/EN → Conlang) → (Conlang → ES/EN)** to verify **reversibility**.
88
+ With **Max Exact Compression**, the return matches bit-for-bit.
89
  """
90
+
91
  EXPLAIN_CHECKBOX_EN = """
92
+ **What each option does:**
93
+
94
+ - **Drop articles**: **~10–15%**.
95
+ - **Zero copula (present affirmative)**: **~5–10%** extra.
96
+ - **Remove pronouns**: variable savings.
97
+ - **Max Exact Compression**: `~...` (zlib) for exact recovery. For >100 chars, **~40–60%**; very short texts may not shrink.
98
+
99
+ **Reference (approx):**
100
+ - No options: **0%**
101
+ - Articles only: **~10–15%**
102
+ - Copula only: **~5–10%**
103
+ - Articles + Copula: **~15–20%**
104
+ - With exact: **~40–60%** (if text is long enough)
105
+ """
106
+
107
+ LEXICON_BUILD_ES = """
108
+ Se construyó así:
109
+
110
+ 1. De **OMW/WordNet 1.4** se extraen **lemas ES** y sus **equivalentes EN** por sinset.
111
+ 2. Normalización y orden por **frecuencia** (*wordfreq*).
112
+ 3. Opcional: **spaCy** refina lemas; **Argos** puede rellenar EN faltantes.
113
+ 4. Asignación de **códigos compactos** con alfabetos barajados por **SEED** hasta `MAXLEN_MINI`/`MAXLEN_CJK`.
114
+ 5. Exporta: `lexicon_minimax.json`, `lexicon_komin.json`, `lexicon_master.json` (+ TSV).
115
+
116
+ **Vista previa** de `lexicon_master.json` (elige cuántas filas ver) aquí abajo.
117
  """
118
+
119
  LEXICON_BUILD_EN = """
120
+ Built as follows:
121
+
122
+ 1. From **OMW/WordNet 1.4**, gather **ES lemmas** and **EN counterparts** by synset.
123
+ 2. Normalize and sort by **frequency** (*wordfreq*).
124
+ 3. Optional: **spaCy** refines lemmas; **Argos** may fill missing EN.
125
+ 4. Assign **compact codes** with **SEED-shuffled** alphabets up to `MAXLEN_MINI`/`MAXLEN_CJK`.
126
+ 5. Exports: `lexicon_minimax.json`, `lexicon_komin.json`, `lexicon_master.json` (+ TSV).
127
+
128
+ **Preview** of `lexicon_master.json` below.
129
  """
130
 
131
+ # ---------- Utilidad: cálculo de compactación ----------
132
+ def _pct_comp(original: str, result: str) -> float:
133
+ if not original: return 0.0
134
+ return max(0.0, 100.0 * (1.0 - (len(result) / len(original))))
135
+
136
+ def compaction_report_es(text, src, tgt, drop, zero, rm, maxc) -> str:
137
+ if not text.strip(): return "—"
138
+ if tgt not in ("Minimax-ASCII","Kōmín-CJK"):
139
+ return "La compactación aplica cuando el **Destino** es Minimax/Kōmín."
140
+ # Base (sin casillas, sin sidecar)
141
+ base = build_sentence(text, src, tgt, False, False, "Semi-lossless", False, False)
142
+ # Actual (con opciones, sin sidecar)
143
+ curr = build_sentence(text, src, tgt, drop, zero, "Semi-lossless", False, rm)
144
+ # Si el usuario marcó exacta, también medimos con sidecar
145
+ curr_exact = build_sentence(text, src, tgt, drop, zero, "Semi-lossless", True, rm) if maxc else None
146
+ p_base = _pct_comp(text, base)
147
+ p_curr = _pct_comp(text, curr)
148
+ msg = f"**Base (sin casillas):** {p_base:.1f}% · **Con tus opciones:** {p_curr:.1f}%"
149
+ if curr_exact is not None:
150
+ p_exact = _pct_comp(text, curr_exact)
151
+ msg += f" · **Con sidecar `~...`:** {p_exact:.1f}%"
152
+ return msg
153
+
154
+ def compaction_report_en(text, src, tgt, drop, zero, rm, maxc) -> str:
155
+ if not text.strip(): return "—"
156
+ if tgt not in ("Minimax-ASCII","Kōmín-CJK"):
157
+ return "Compaction applies when **Target** is Minimax/Kōmín."
158
+ base = build_sentence(text, src, tgt, False, False, "Semi-lossless", False, False)
159
+ curr = build_sentence(text, src, tgt, drop, zero, "Semi-lossless", False, rm)
160
+ curr_exact = build_sentence(text, src, tgt, drop, zero, "Semi-lossless", True, rm) if maxc else None
161
+ p_base = _pct_comp(text, base)
162
+ p_curr = _pct_comp(text, curr)
163
+ msg = f"**Base (no options):** {p_base:.1f}% · **With your options:** {p_curr:.1f}%"
164
+ if curr_exact is not None:
165
+ p_exact = _pct_comp(text, curr_exact)
166
+ msg += f" · **With `~...` sidecar:** {p_exact:.1f}%"
167
+ return msg
168
+
169
  def master_preview(n: int = 20) -> List[List[Any]]:
170
  try:
171
  entries = (MASTER_OBJ or {}).get("entries", [])
 
181
  def make_group_es():
182
  with gr.Group(visible=True) as g:
183
  gr.Markdown("# 🌐 Universal Conlang Translator · Compresión Exacta (ES)")
184
+ # Acordeones de explicación — MISMO nivel y con contenido Markdown dentro
185
  with gr.Row():
186
  with gr.Column():
187
+ with gr.Accordion(ACC_TITLES_ES["translate"], open=False):
188
+ gr.Markdown(EXPLAIN_TAB_TRANSLATE_ES)
189
+ with gr.Accordion(ACC_TITLES_ES["build"], open=False):
190
+ gr.Markdown(EXPLAIN_TAB_BUILD_ES)
191
+ with gr.Accordion(ACC_TITLES_ES["decode"], open=False):
192
+ gr.Markdown(EXPLAIN_TAB_DECODE_ES)
193
+ with gr.Accordion(ACC_TITLES_ES["roundtrip"], open=False):
194
+ gr.Markdown(EXPLAIN_TAB_ROUNDTRIP_ES)
195
  with gr.Column():
196
+ with gr.Accordion(ACC_TITLES_ES["checkbox"], open=False):
197
+ gr.Markdown(EXPLAIN_CHECKBOX_ES)
198
+ with gr.Accordion(ACC_TITLES_ES["lexicon"], open=False):
199
+ gr.Markdown(LEXICON_BUILD_ES)
200
  n_rows = gr.Slider(5, 100, value=20, step=5, label="Filas a mostrar")
201
  table = gr.Dataframe(headers=["lemma_es","lemma_en","minimax","komin"], row_count=1, interactive=False)
202
  gr.Button("Actualizar vista").click(lambda n: master_preview(int(n)), [n_rows], [table])
 
218
  btn_tr = gr.Button("🚀 Traducir", variant="primary")
219
  btn_tr_cl = gr.Button("🧹 Limpiar")
220
  uni_out = gr.Textbox(lines=6, label="Traducción", show_copy_button=True)
221
+ comp_out = gr.Markdown("") # indicador de compactación
222
+
223
+ def do_translate(text, src, tgt, drop, zero, mode, maxc, rm):
224
+ res = universal_translate(text, src, tgt, drop, zero, mode, maxc, rm)
225
+ rep = compaction_report_es(text, src, tgt, drop, zero, rm, maxc)
226
+ return res, rep
227
 
228
+ btn_tr.click(do_translate,
229
  [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_mode, uni_maxc, uni_rmpr],
230
+ [uni_out, comp_out])
231
  btn_tr_cl.click(lambda: ("",""), None, [uni_text, uni_out])
232
 
233
+ with gr.Accordion("Ayuda rápida", open=False):
234
+ gr.Markdown(EXPLAIN_TAB_TRANSLATE_ES)
235
 
236
  with gr.Tab("🛠️ Construir (ES/EN → Conlang)"):
237
  with gr.Row():
 
248
  btn_b = gr.Button("🏗️ Construir", variant="primary")
249
  btn_b_cl = gr.Button("🧹 Limpiar")
250
  out = gr.Textbox(lines=6, label="Salida", show_copy_button=True)
251
+ comp_out_b = gr.Markdown("")
252
 
253
+ def do_build(text, src, tgt, drop, zero, mode, maxc, rm):
254
+ res = build_sentence(text, src, tgt, drop, zero, mode, maxc, rm)
255
+ rep = compaction_report_es(text, src, tgt, drop, zero, rm, maxc)
256
+ return res, rep
257
+
258
+ btn_b.click(do_build,
259
  [text_in, src_lang, target, drop_articles, zero_copula, mode_build, max_comp_build, rm_pron_build],
260
+ [out, comp_out_b])
261
  btn_b_cl.click(lambda: ("",""), None, [text_in, out])
262
 
263
+ with gr.Accordion("Ayuda rápida", open=False):
264
+ gr.Markdown(EXPLAIN_TAB_BUILD_ES)
265
 
266
  with gr.Tab("🗝️ Decodificar (Conlang → ES/EN)"):
267
  with gr.Row():
 
284
  btn_d.click(decode_lossless_aware, [code_in, src_code, tgt_lang], [out3])
285
  btn_d_cl.click(lambda: ("",""), None, [code_in, out3])
286
 
287
+ with gr.Accordion("Ayuda rápida", open=False):
288
  gr.Markdown(EXPLAIN_TAB_DECODE_ES)
289
 
290
  with gr.Tab("🔄 Prueba ida→vuelta"):
 
303
  btn_rt.click(round_trip, [rt_text, rt_src, rt_tgt, rt_mode, rt_max_comp], [rt_out_conlang, rt_out_back])
304
  btn_rt_cl.click(lambda: ("","",""), None, [rt_text, rt_out_conlang, rt_out_back])
305
 
306
+ with gr.Accordion("Ayuda rápida", open=False):
307
  gr.Markdown(EXPLAIN_TAB_ROUNDTRIP_ES)
308
  return g
309
 
 
312
  gr.Markdown("# 🌐 Universal Conlang Translator · Max Exact Compression (EN)")
313
  with gr.Row():
314
  with gr.Column():
315
+ with gr.Accordion(ACC_TITLES_EN["translate"], open=False):
316
+ gr.Markdown(EXPLAIN_TAB_TRANSLATE_EN)
317
+ with gr.Accordion(ACC_TITLES_EN["build"], open=False):
318
+ gr.Markdown(EXPLAIN_TAB_BUILD_EN)
319
+ with gr.Accordion(ACC_TITLES_EN["decode"], open=False):
320
+ gr.Markdown(EXPLAIN_TAB_DECODE_EN)
321
+ with gr.Accordion(ACC_TITLES_EN["roundtrip"], open=False):
322
+ gr.Markdown(EXPLAIN_TAB_ROUNDTRIP_EN)
323
  with gr.Column():
324
+ with gr.Accordion(ACC_TITLES_EN["checkbox"], open=False):
325
+ gr.Markdown(EXPLAIN_CHECKBOX_EN)
326
+ with gr.Accordion(ACC_TITLES_EN["lexicon"], open=False):
327
+ gr.Markdown(LEXICON_BUILD_EN)
328
  n_rows = gr.Slider(5, 100, value=20, step=5, label="Rows to show")
329
  table = gr.Dataframe(headers=["lemma_es","lemma_en","minimax","komin"], row_count=1, interactive=False)
330
  gr.Button("Refresh").click(lambda n: master_preview(int(n)), [n_rows], [table])
 
345
  btn_tr = gr.Button("🚀 Translate", variant="primary")
346
  btn_tr_cl = gr.Button("🧹 Clear")
347
  uni_out = gr.Textbox(lines=6, label="Translation", show_copy_button=True)
348
+ comp_out = gr.Markdown("")
349
+
350
+ def do_translate_en(text, src, tgt, drop, zero, mode, maxc, rm):
351
+ res = universal_translate(text, src, tgt, drop, zero, mode, maxc, rm)
352
+ rep = compaction_report_en(text, src, tgt, drop, zero, rm, maxc)
353
+ return res, rep
354
 
355
+ btn_tr.click(do_translate_en,
356
  [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_mode, uni_maxc, uni_rmpr],
357
+ [uni_out, comp_out])
358
  btn_tr_cl.click(lambda: ("",""), None, [uni_text, uni_out])
359
 
360
+ with gr.Accordion("Quick help", open=False):
361
+ gr.Markdown(EXPLAIN_TAB_TRANSLATE_EN)
362
 
363
  with gr.Tab("🛠️ Build (ES/EN → Conlang)"):
364
  with gr.Row():
 
375
  btn_b = gr.Button("🏗️ Build", variant="primary")
376
  btn_b_cl = gr.Button("🧹 Clear")
377
  out = gr.Textbox(lines=6, label="Output", show_copy_button=True)
378
+ comp_out_b = gr.Markdown("")
379
+
380
+ def do_build_en(text, src, tgt, drop, zero, mode, maxc, rm):
381
+ res = build_sentence(text, src, tgt, drop, zero, mode, maxc, rm)
382
+ rep = compaction_report_en(text, src, tgt, drop, zero, rm, maxc)
383
+ return res, rep
384
 
385
+ btn_b.click(do_build_en,
386
  [text_in, src_lang, target, drop_articles, zero_copula, mode_build, max_comp_build, rm_pron_build],
387
+ [out, comp_out_b])
388
  btn_b_cl.click(lambda: ("",""), None, [text_in, out])
389
 
390
+ with gr.Accordion("Quick help", open=False):
391
+ gr.Markdown(EXPLAIN_TAB_BUILD_EN)
392
 
393
  with gr.Tab("🗝️ Decode (Conlang → ES/EN)"):
394
  with gr.Row():
 
397
  code_in = gr.Textbox(lines=3, label="Conlang text (may include `~...`)", show_copy_button=True)
398
  out3 = gr.Textbox(lines=6, label="Output", show_copy_button=True)
399
 
400
+ def decode_lossless_aware_en(text, src, tgt):
401
  orig = extract_custom_sidecar(text)
402
  if orig is not None: return orig
403
  orig = extract_sidecar_b85(text)
 
408
  btn_d = gr.Button("🔓 Decode", variant="primary")
409
  btn_d_cl = gr.Button("🧹 Clear")
410
 
411
+ btn_d.click(decode_lossless_aware_en, [code_in, src_code, tgt_lang], [out3])
412
  btn_d_cl.click(lambda: ("",""), None, [code_in, out3])
413
 
414
+ with gr.Accordion("Quick help", open=False):
415
  gr.Markdown(EXPLAIN_TAB_DECODE_EN)
416
 
417
  with gr.Tab("🔄 Round-trip"):
 
430
  btn_rt.click(round_trip, [rt_text, rt_src, rt_tgt, rt_mode, rt_max_comp], [rt_out_conlang, rt_out_back])
431
  btn_rt_cl.click(lambda: ("","",""), None, [rt_text, rt_out_conlang, rt_out_back])
432
 
433
+ with gr.Accordion("Quick help", open=False):
434
  gr.Markdown(EXPLAIN_TAB_ROUNDTRIP_EN)
435
  return g
436