Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
# app.py — Universal Conlang Translator (Max Compresión Exacta)
|
| 2 |
# Archivos necesarios en la raíz:
|
| 3 |
# - lexicon_minimax.json
|
| 4 |
# - lexicon_komin.json
|
|
@@ -15,8 +15,7 @@ import re
|
|
| 15 |
import json
|
| 16 |
import base64
|
| 17 |
import zlib
|
| 18 |
-
import
|
| 19 |
-
from typing import Dict, Tuple, Optional
|
| 20 |
import gradio as gr
|
| 21 |
|
| 22 |
# ------------ Archivos esperados ------------
|
|
@@ -78,7 +77,7 @@ def load_lexicons():
|
|
| 78 |
EN2MINI, EN2KOMI, MINI2EN, KOMI2EN,
|
| 79 |
ES2EN_LEMMA, EN2ES_LEMMA) = load_lexicons()
|
| 80 |
|
| 81 |
-
# ------------ OOV reversible (
|
| 82 |
ALPHA_MINI64 = "@ptkmnslraeiouy0123456789><=:/!?.+-_*#bcdfghjvqwxzACEGHIJKLMNOPRS"[:64]
|
| 83 |
CJK_BASE = (
|
| 84 |
"天地人日月山川雨風星火水木土金石光影花草鳥犬猫魚"
|
|
@@ -141,47 +140,7 @@ def lemma_of(tok, src_lang: str) -> str:
|
|
| 141 |
else:
|
| 142 |
return norm_en(tok.lemma_ if tok.lemma_ else tok.text)
|
| 143 |
|
| 144 |
-
# ------------
|
| 145 |
-
def pick_predicative_sentence(doc):
|
| 146 |
-
sents = list(doc.sents) if doc.has_annotation("SENT_START") else [doc]
|
| 147 |
-
candidates = []
|
| 148 |
-
for s in sents:
|
| 149 |
-
roots = [t for t in s if t.dep_ == "ROOT" and t.pos_ in ("VERB","AUX")]
|
| 150 |
-
if not roots:
|
| 151 |
-
continue
|
| 152 |
-
root = roots[0]
|
| 153 |
-
has_q = "?" in s.text
|
| 154 |
-
has_subj = any(t.dep_.startswith("nsubj") for t in root.children)
|
| 155 |
-
score = (1 if has_q else 0) + (1 if has_subj else 0) + (len(s) / 1000.0)
|
| 156 |
-
candidates.append((score, s))
|
| 157 |
-
if not candidates:
|
| 158 |
-
return doc
|
| 159 |
-
return sorted(candidates, key=lambda x: x[0], reverse=True)[0][1].as_doc()
|
| 160 |
-
|
| 161 |
-
def is_content_token(t) -> bool:
|
| 162 |
-
return True # No filtra para exactitud
|
| 163 |
-
|
| 164 |
-
# ------------ Mapeo lema→código ------------
|
| 165 |
-
def code_es(lemma: str, target: str) -> str:
|
| 166 |
-
lemma = norm_es(lemma)
|
| 167 |
-
if target == "Minimax-ASCII":
|
| 168 |
-
return ES2MINI.get(lemma) or enc_oov_minimax(lemma)
|
| 169 |
-
else:
|
| 170 |
-
return ES2KOMI.get(lemma) or enc_oov_komin(lemma)
|
| 171 |
-
|
| 172 |
-
def code_en(lemma: str, target: str) -> str:
|
| 173 |
-
lemma = norm_en(lemma)
|
| 174 |
-
if target == "Minimax-ASCII":
|
| 175 |
-
if EN2MINI: return EN2MINI.get(lemma) or enc_oov_minimax(lemma)
|
| 176 |
-
return enc_oov_minimax(lemma)
|
| 177 |
-
else:
|
| 178 |
-
if EN2KOMI: return EN2KOMI.get(lemma) or enc_oov_komin(lemma)
|
| 179 |
-
return enc_oov_komin(lemma)
|
| 180 |
-
|
| 181 |
-
# ------------ Fraseador compacto ------------
|
| 182 |
-
TAM_MINI = {"Pres":"P", "Past":"T", "Fut":"F", "UNK":"P"}
|
| 183 |
-
TAM_KOMI = {"Pres":"Ⓟ", "Past":"Ⓣ", "Fut":"Ⓕ", "UNK":"Ⓟ"}
|
| 184 |
-
|
| 185 |
def detect_polarity(doc) -> bool:
|
| 186 |
return "?" in doc.text
|
| 187 |
|
|
@@ -255,6 +214,26 @@ def _person_of_doc(doc, src_lang: str) -> Optional[str]:
|
|
| 255 |
except Exception:
|
| 256 |
return None
|
| 257 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 258 |
def realize_minimax(doc, src_lang: str, drop_articles=True, zero_copula=True, semi_lossless=False, person_hint="2s"):
|
| 259 |
root, subs, objs, obls, advs = extract_core(doc)
|
| 260 |
tense = detect_tense(root)
|
|
@@ -275,21 +254,14 @@ def realize_minimax(doc, src_lang: str, drop_articles=True, zero_copula=True, se
|
|
| 275 |
def realize_np(tokens):
|
| 276 |
outs=[]
|
| 277 |
for t in tokens:
|
| 278 |
-
if
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
if semi_lossless and USE_SPACY and (t.tag_ in ("NNS","NNPS") or "Number=Plur" in str(t.morph)):
|
| 282 |
-
code = f"{code}[PL]"
|
| 283 |
-
outs.append(code)
|
| 284 |
return outs
|
| 285 |
|
| 286 |
S = realize_np(subs)
|
| 287 |
O = realize_np(objs) + realize_np(obls)
|
| 288 |
-
ADV=[]
|
| 289 |
-
for a in advs:
|
| 290 |
-
if not USE_SPACY or is_content_token(a):
|
| 291 |
-
lem = lemma_of(a, src_lang) if USE_SPACY else a.text
|
| 292 |
-
ADV.append(code_es(lem, "Minimax-ASCII") if src_lang=="Español" else code_en(lem, "Minimax-ASCII"))
|
| 293 |
|
| 294 |
if zero_copula and not semi_lossless and vlem in ("ser","estar","be") and tense=="Pres" and not is_neg and not is_q:
|
| 295 |
parts = S + O + ADV
|
|
@@ -315,21 +287,14 @@ def realize_komin(doc, src_lang: str, drop_articles=True, zero_copula=True, semi
|
|
| 315 |
def realize_np(tokens, particle):
|
| 316 |
outs=[]
|
| 317 |
for t in tokens:
|
| 318 |
-
if
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
if semi_lossless and USE_SPACY and (t.tag_ in ("NNS","NNPS") or "Number=Plur" in str(t.morph)):
|
| 322 |
-
code = f"{code}[PL]"
|
| 323 |
-
outs.append(code + particle)
|
| 324 |
return outs
|
| 325 |
|
| 326 |
S = realize_np(subs, P_SUBJ)
|
| 327 |
O = realize_np(objs + obls, P_OBJ)
|
| 328 |
-
ADV=[]
|
| 329 |
-
for a in advs:
|
| 330 |
-
if not USE_SPACY or is_content_token(a):
|
| 331 |
-
lem = lemma_of(a, src_lang) if USE_SPACY else a.text
|
| 332 |
-
ADV.append(code_es(lem, "Kōmín-CJK") if src_lang=="Español" else code_en(lem, "Kōmín-CJK"))
|
| 333 |
|
| 334 |
v_form = vcode + TAM + (NEG_M if is_neg else "")
|
| 335 |
|
|
@@ -341,7 +306,7 @@ def realize_komin(doc, src_lang: str, drop_articles=True, zero_copula=True, semi
|
|
| 341 |
if is_q: out += " " + Q_FIN
|
| 342 |
return out
|
| 343 |
|
| 344 |
-
# ------------
|
| 345 |
SIDECAR_B85_RE = re.compile(r"\s?§\((?P<b85>[A-Za-z0-9!#$%&()*+\-;<=>?@^_`{|}~]+)\)$")
|
| 346 |
|
| 347 |
def b85_enc_raw(s: str) -> str:
|
|
@@ -367,7 +332,6 @@ def extract_sidecar_b85(text: str) -> Optional[str]:
|
|
| 367 |
def strip_sidecar_b85(text: str) -> str:
|
| 368 |
return SIDECAR_B85_RE.sub("", text).rstrip()
|
| 369 |
|
| 370 |
-
# ------------ Custom sidecar para max compresión exacta ------------
|
| 371 |
def custom_sidecar_enc(conlang_text: str, original_text: str) -> str:
|
| 372 |
comp = zlib.compress(original_text.encode("utf-8"), 9)
|
| 373 |
blob = to_custom_b64(comp, ALPHA_MINI64)
|
|
@@ -386,7 +350,7 @@ def extract_custom_sidecar(text: str) -> Optional[str]:
|
|
| 386 |
def strip_custom_sidecar(text: str) -> str:
|
| 387 |
return text.split('~')[0].rstrip() if '~' in text else text
|
| 388 |
|
| 389 |
-
# ------------ Codificar
|
| 390 |
def encode_simple(text: str, src_lang: str, target: str) -> str:
|
| 391 |
if not text.strip(): return ""
|
| 392 |
def repl_es(m):
|
|
@@ -419,9 +383,6 @@ def pluralize_en(word: str) -> str:
|
|
| 419 |
def pluralize(word: str, tgt_lang: str) -> str:
|
| 420 |
return pluralize_es(word) if tgt_lang == "Español" else pluralize_en(word)
|
| 421 |
|
| 422 |
-
PRON_ES = {"yo", "tú", "él", "ella", "nosotros", "vosotros", "ellos", "ellas", "usted", "ustedes"}
|
| 423 |
-
PRON_EN = {"i", "you", "he", "she", "it", "we", "they"}
|
| 424 |
-
|
| 425 |
mini_tail_re = re.compile(r"^(?P<stem>.+?)·(?P<tail>[PTFNQ12sp]+)$")
|
| 426 |
|
| 427 |
def decode_simple(text: str, source: str, tgt_lang: str) -> str:
|
|
@@ -429,7 +390,6 @@ def decode_simple(text: str, source: str, tgt_lang: str) -> str:
|
|
| 429 |
return ""
|
| 430 |
code2es = MINI2ES if source=="Minimax-ASCII" else KOMI2ES
|
| 431 |
code2en = MINI2EN if source=="Minimax-ASCII" else KOMI2EN
|
| 432 |
-
pron_set = PRON_ES if tgt_lang == "Español" else PRON_EN
|
| 433 |
|
| 434 |
if source == "Kōmín-CJK":
|
| 435 |
text = text.replace("?", "?").replace(" ", " ")
|
|
@@ -447,7 +407,7 @@ def decode_simple(text: str, source: str, tgt_lang: str) -> str:
|
|
| 447 |
has_q = False
|
| 448 |
is_neg = False
|
| 449 |
|
| 450 |
-
for
|
| 451 |
look = part.replace("[PL]", "")
|
| 452 |
had_pl = "[PL]" in part
|
| 453 |
pl_flags.append(had_pl)
|
|
@@ -468,7 +428,6 @@ def decode_simple(text: str, source: str, tgt_lang: str) -> str:
|
|
| 468 |
lemma_tokens.append(vlem)
|
| 469 |
pl_flags.append(False)
|
| 470 |
|
| 471 |
-
# Parse tail
|
| 472 |
if tail:
|
| 473 |
if len(tail) > 0 and tail[0] in "PTF":
|
| 474 |
verb_tense = {"P": "Pres", "T": "Past", "F": "Fut"}.get(tail[0], "Pres")
|
|
@@ -487,7 +446,6 @@ def decode_simple(text: str, source: str, tgt_lang: str) -> str:
|
|
| 487 |
verb_lemma = vlem
|
| 488 |
continue
|
| 489 |
|
| 490 |
-
# No verbo
|
| 491 |
w_es = code2es.get(look)
|
| 492 |
w_en = code2en.get(look) if code2en else None
|
| 493 |
w = w_es if tgt_lang == "Español" else (w_en or w_es or look)
|
|
@@ -500,50 +458,23 @@ def decode_simple(text: str, source: str, tgt_lang: str) -> str:
|
|
| 500 |
pl_flags.append(had_pl)
|
| 501 |
|
| 502 |
out_parts = []
|
| 503 |
-
greeting = None
|
| 504 |
-
wh = None
|
| 505 |
for idx, lem in enumerate(lemma_tokens):
|
| 506 |
if idx == verb_idx:
|
| 507 |
conj_func = _es_conj if tgt_lang == "Español" else _en_conj
|
| 508 |
v_conj = conj_func(verb_lemma, verb_tense, verb_person)
|
| 509 |
if is_neg:
|
| 510 |
-
|
| 511 |
-
v_conj = neg_prefix + v_conj
|
| 512 |
out_parts.append(v_conj)
|
| 513 |
continue
|
|
|
|
| 514 |
|
| 515 |
-
|
| 516 |
-
if w.lower() in {"hola", "hello", "hi", "hey"}:
|
| 517 |
-
greeting = w
|
| 518 |
-
elif w.lower() in {"como", "cómo", "what", "how"} and has_q:
|
| 519 |
-
wh = w
|
| 520 |
-
if tgt_lang == "Español" and w.lower() == "como":
|
| 521 |
-
wh = "cómo"
|
| 522 |
-
else:
|
| 523 |
-
out_parts.append(w)
|
| 524 |
-
|
| 525 |
-
# Reorden: Greeting + wh + S V O ADV
|
| 526 |
-
final_out = []
|
| 527 |
-
if greeting:
|
| 528 |
-
final_out.append(greeting.capitalize())
|
| 529 |
-
if wh:
|
| 530 |
-
final_out.append(wh)
|
| 531 |
-
final_out += out_parts
|
| 532 |
-
|
| 533 |
-
out_text = " ".join(final_out)
|
| 534 |
-
|
| 535 |
-
# Pregunta
|
| 536 |
if has_q:
|
| 537 |
start_q = "¿" if tgt_lang == "Español" else ""
|
| 538 |
-
|
| 539 |
-
out_text = f"{start_q}{out_text.capitalize()}{end_q}"
|
| 540 |
-
|
| 541 |
return out_text
|
| 542 |
|
| 543 |
# ------------ Conjugadores mínimos ------------
|
| 544 |
-
_ES_SUBJ = {"1s":"yo","2s":"tú","3s":"él/ella","1p":"nosotros","2p":"vosotros","3p":"ellos"}
|
| 545 |
-
_EN_SUBJ = {"1s":"I","2s":"you","3s":"he","1p":"we","2p":"you","3p":"they"}
|
| 546 |
-
|
| 547 |
def _es_conj_regular(lemma, tense, person):
|
| 548 |
if not lemma.endswith(("ar","er","ir")): return lemma
|
| 549 |
stem = lemma[:-2]; vtype = lemma[-2:]
|
|
@@ -572,7 +503,7 @@ def _es_conj(lemma, tense, person):
|
|
| 572 |
if lemma == "estar":
|
| 573 |
tab = {
|
| 574 |
"Pres":{"1s":"estoy","2s":"estás","3s":"está","1p":"estamos","2p":"estáis","3p":"están"},
|
| 575 |
-
"Past":{"1s":"
|
| 576 |
"Fut":{"1s":"estaré","2s":"estarás","3s":"estará","1p":"estaremos","2p":"estaréis","3p":"estarán"},
|
| 577 |
}; return tab[tense].get(person, tab[tense]["3s"])
|
| 578 |
if lemma == "ir":
|
|
@@ -616,7 +547,7 @@ def _en_conj(lemma, tense, person):
|
|
| 616 |
else:
|
| 617 |
return lemma
|
| 618 |
|
| 619 |
-
# ------------
|
| 620 |
def _build_with_spacy(text: str, src_lang: str, target: str,
|
| 621 |
drop_articles: bool, zero_copula: bool, semi_lossless: bool) -> str:
|
| 622 |
nlp = nlp_es if src_lang=="Español" else nlp_en
|
|
@@ -629,7 +560,7 @@ def _build_with_spacy(text: str, src_lang: str, target: str,
|
|
| 629 |
def build_sentence(text: str, src_lang: str, target: str,
|
| 630 |
drop_articles: bool, zero_copula: bool, mode: str, max_comp_exact: bool = False) -> str:
|
| 631 |
if not text.strip(): return ""
|
| 632 |
-
semi = True #
|
| 633 |
core = _build_with_spacy(text, src_lang, target, drop_articles, zero_copula and not semi, semi_lossless=semi) if USE_SPACY else encode_simple(text, src_lang, target)
|
| 634 |
if max_comp_exact:
|
| 635 |
return custom_sidecar_enc(core, text)
|
|
@@ -640,31 +571,19 @@ def universal_translate(text: str, src: str, tgt: str,
|
|
| 640 |
mode: str, max_comp_exact: bool = False) -> str:
|
| 641 |
if not text.strip(): return ""
|
| 642 |
if src == tgt: return text
|
| 643 |
-
|
| 644 |
-
# Natural → Conlang
|
| 645 |
if src in ("Español","English") and tgt in ("Minimax-ASCII","Kōmín-CJK"):
|
| 646 |
return build_sentence(text, src, tgt, drop_articles, zero_copula, mode, max_comp_exact)
|
| 647 |
-
|
| 648 |
-
# Conlang → Natural (considera sidecars)
|
| 649 |
if src in ("Minimax-ASCII","Kōmín-CJK") and tgt in ("Español","English"):
|
| 650 |
-
# Custom sidecar para exact
|
| 651 |
orig = extract_custom_sidecar(text)
|
| 652 |
if orig is not None: return orig
|
| 653 |
-
# Fallback b85 si hay
|
| 654 |
orig = extract_sidecar_b85(text)
|
| 655 |
if orig is not None: return orig
|
| 656 |
-
# Semi-lossless
|
| 657 |
return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)
|
| 658 |
-
|
| 659 |
-
# Natural ↔ Natural (lemas)
|
| 660 |
if src in ("Español","English") and tgt in ("Español","English"):
|
| 661 |
return translate_natural(text, src, tgt)
|
| 662 |
-
|
| 663 |
-
# Conlang ↔ Conlang (simplificado)
|
| 664 |
if src in ("Minimax-ASCII","Kōmín-CJK") and tgt in ("Minimax-ASCII","Kōmín-CJK"):
|
| 665 |
orig = extract_custom_sidecar(text)
|
| 666 |
if orig is not None:
|
| 667 |
-
# Preserva sidecar
|
| 668 |
core = strip_custom_sidecar(text)
|
| 669 |
es_lemmas = decode_simple(core, src, "Español")
|
| 670 |
words = re.findall(r"\w+|[^\w\s]+", es_lemmas)
|
|
@@ -672,28 +591,20 @@ def universal_translate(text: str, src: str, tgt: str,
|
|
| 672 |
for w in words:
|
| 673 |
if re.fullmatch(r"\w+", w):
|
| 674 |
code = ES2MINI.get(norm_es(w)) if tgt=="Minimax-ASCII" else ES2KOMI.get(norm_es(w))
|
| 675 |
-
if
|
| 676 |
-
code = enc_oov_minimax(w) if tgt=="Minimax-ASCII" else enc_oov_komin(w)
|
| 677 |
-
out.append(code)
|
| 678 |
else:
|
| 679 |
out.append(w)
|
| 680 |
-
|
| 681 |
-
|
| 682 |
-
# Sin sidecar, normal
|
| 683 |
-
core = text
|
| 684 |
-
es_lemmas = decode_simple(core, src, "Español")
|
| 685 |
words = re.findall(r"\w+|[^\w\s]+", es_lemmas)
|
| 686 |
out=[]
|
| 687 |
for w in words:
|
| 688 |
if re.fullmatch(r"\w+", w):
|
| 689 |
code = ES2MINI.get(norm_es(w)) if tgt=="Minimax-ASCII" else ES2KOMI.get(norm_es(w))
|
| 690 |
-
if
|
| 691 |
-
code = enc_oov_minimax(w) if tgt=="Minimax-ASCII" else enc_oov_komin(w)
|
| 692 |
-
out.append(code)
|
| 693 |
else:
|
| 694 |
out.append(w)
|
| 695 |
return " ".join(out)
|
| 696 |
-
|
| 697 |
return "[No soportado]"
|
| 698 |
|
| 699 |
def translate_natural(text: str, src_lang: str, tgt_lang: str) -> str:
|
|
@@ -719,91 +630,306 @@ def round_trip(text, src, tgt, mode, max_comp_exact):
|
|
| 719 |
back = universal_translate(conlang, tgt, src, True, False, mode, max_comp_exact)
|
| 720 |
return conlang, back
|
| 721 |
|
| 722 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 723 |
EXPLAIN_ES = """
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 729 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 730 |
|
| 731 |
-
|
|
|
|
| 732 |
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
|
|
|
|
|
|
|
|
|
| 736 |
|
| 737 |
-
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
|
| 744 |
-
|
| 745 |
-
|
| 746 |
-
|
| 747 |
-
|
| 748 |
-
|
| 749 |
-
|
| 750 |
-
|
| 751 |
-
|
| 752 |
-
|
| 753 |
-
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
with gr.
|
| 757 |
-
|
| 758 |
-
src_lang = gr.Dropdown(["Español","English"], value="Español", label="Fuente")
|
| 759 |
-
target = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
|
| 760 |
-
text_in = gr.Textbox(lines=3, label="Frase", value="")
|
| 761 |
-
with gr.Row():
|
| 762 |
-
drop_articles = gr.Checkbox(value=True, label="Omitir artículos")
|
| 763 |
-
zero_copula = gr.Checkbox(value=False, label="Cópula cero (presente afirm.)")
|
| 764 |
-
max_comp_build = gr.Checkbox(value=False, label="Max Compresión Exacta (sidecar oculto)")
|
| 765 |
-
mode_build = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
|
| 766 |
-
out = gr.Textbox(lines=6, label="Salida")
|
| 767 |
-
gr.Button("Construir").click(
|
| 768 |
-
build_sentence,
|
| 769 |
-
[text_in, src_lang, target, drop_articles, zero_copula, mode_build, max_comp_build],
|
| 770 |
-
[out]
|
| 771 |
-
)
|
| 772 |
-
|
| 773 |
-
# --- Decodificar (Conlang → ES/EN) ---
|
| 774 |
-
with gr.Tab("Decodificar (Conlang → ES/EN)"):
|
| 775 |
with gr.Row():
|
| 776 |
-
|
| 777 |
-
|
| 778 |
-
|
| 779 |
-
|
| 780 |
-
|
| 781 |
-
|
| 782 |
-
|
| 783 |
-
|
| 784 |
-
|
| 785 |
-
|
| 786 |
-
|
| 787 |
-
|
| 788 |
-
|
| 789 |
-
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 794 |
with gr.Row():
|
| 795 |
-
|
| 796 |
-
|
| 797 |
-
|
| 798 |
-
|
| 799 |
-
|
| 800 |
-
|
| 801 |
-
|
| 802 |
-
|
| 803 |
-
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 807 |
|
| 808 |
if __name__ == "__main__":
|
| 809 |
-
demo.launch()
|
|
|
|
| 1 |
+
# app.py — Universal Conlang Translator (Max Compresión Exacta) — UI bilingüe ES/EN
|
| 2 |
# Archivos necesarios en la raíz:
|
| 3 |
# - lexicon_minimax.json
|
| 4 |
# - lexicon_komin.json
|
|
|
|
| 15 |
import json
|
| 16 |
import base64
|
| 17 |
import zlib
|
| 18 |
+
from typing import Dict, Optional
|
|
|
|
| 19 |
import gradio as gr
|
| 20 |
|
| 21 |
# ------------ Archivos esperados ------------
|
|
|
|
| 77 |
EN2MINI, EN2KOMI, MINI2EN, KOMI2EN,
|
| 78 |
ES2EN_LEMMA, EN2ES_LEMMA) = load_lexicons()
|
| 79 |
|
| 80 |
+
# ------------ OOV reversible (Semi-lossless) ------------
|
| 81 |
ALPHA_MINI64 = "@ptkmnslraeiouy0123456789><=:/!?.+-_*#bcdfghjvqwxzACEGHIJKLMNOPRS"[:64]
|
| 82 |
CJK_BASE = (
|
| 83 |
"天地人日月山川雨風星火水木土金石光影花草鳥犬猫魚"
|
|
|
|
| 140 |
else:
|
| 141 |
return norm_en(tok.lemma_ if tok.lemma_ else tok.text)
|
| 142 |
|
| 143 |
+
# ------------ Utilidades de análisis sintáctico ------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 144 |
def detect_polarity(doc) -> bool:
    """Report whether the document text contains a question mark.

    Despite the name, the only thing inspected is whether an ASCII ``?``
    occurs anywhere in ``doc.text``.

    Args:
        doc: Any object exposing a ``text`` attribute.

    Returns:
        True when ``doc.text`` contains ``?``, otherwise False.
    """
    raw_text = doc.text
    has_question_mark = "?" in raw_text
    return has_question_mark
|
| 146 |
|
|
|
|
| 214 |
except Exception:
|
| 215 |
return None
|
| 216 |
|
| 217 |
+
# ------------ Mapeo lema→código y fraseadores ------------
|
| 218 |
+
def code_es(lemma: str, target: str) -> str:
    """Map a Spanish lemma to its code in the requested conlang.

    The lemma is normalised with ``norm_es`` first. For the target
    ``"Minimax-ASCII"`` the ``ES2MINI`` table is consulted; every other
    target value uses ``ES2KOMI``. When the table has no (or a falsy) entry,
    the matching ``enc_oov_*`` encoder produces the code instead.

    Args:
        lemma: Spanish lemma to encode.
        target: Name of the target conlang.

    Returns:
        The lexicon code, or the out-of-vocabulary encoding of the lemma.
    """
    key = norm_es(lemma)
    if target != "Minimax-ASCII":
        # Any non-Minimax target is handled as Kōmín.
        known = ES2KOMI.get(key)
        return known if known else enc_oov_komin(key)
    known = ES2MINI.get(key)
    return known if known else enc_oov_minimax(key)
|
| 224 |
+
|
| 225 |
+
def code_en(lemma: str, target: str) -> str:
    """Map an English lemma to its code in the requested conlang.

    The lemma is normalised with ``norm_en`` first. For the target
    ``"Minimax-ASCII"`` the ``EN2MINI`` table is used; every other target
    value uses ``EN2KOMI``. A falsy table (empty or missing) skips the lookup
    entirely, and a missing or falsy entry falls back to the matching
    ``enc_oov_*`` encoder.

    Args:
        lemma: English lemma to encode.
        target: Name of the target conlang.

    Returns:
        The lexicon code, or the out-of-vocabulary encoding of the lemma.
    """
    key = norm_en(lemma)
    if target == "Minimax-ASCII":
        known = EN2MINI.get(key) if EN2MINI else None
        return known or enc_oov_minimax(key)
    # Any non-Minimax target is handled as Kōmín.
    known = EN2KOMI.get(key) if EN2KOMI else None
    return known or enc_oov_komin(key)
|
| 233 |
+
|
| 234 |
+
TAM_MINI = {"Pres":"P", "Past":"T", "Fut":"F", "UNK":"P"}
|
| 235 |
+
TAM_KOMI = {"Pres":"Ⓟ", "Past":"Ⓣ", "Fut":"Ⓕ", "UNK":"Ⓟ"}
|
| 236 |
+
|
| 237 |
def realize_minimax(doc, src_lang: str, drop_articles=True, zero_copula=True, semi_lossless=False, person_hint="2s"):
|
| 238 |
root, subs, objs, obls, advs = extract_core(doc)
|
| 239 |
tense = detect_tense(root)
|
|
|
|
| 254 |
def realize_np(tokens):
|
| 255 |
outs=[]
|
| 256 |
for t in tokens:
|
| 257 |
+
lem = lemma_of(t, src_lang) if USE_SPACY else (t.text)
|
| 258 |
+
code = code_es(lem, "Minimax-ASCII") if src_lang=="Español" else code_en(lem, "Minimax-ASCII")
|
| 259 |
+
outs.append(code)
|
|
|
|
|
|
|
|
|
|
| 260 |
return outs
|
| 261 |
|
| 262 |
S = realize_np(subs)
|
| 263 |
O = realize_np(objs) + realize_np(obls)
|
| 264 |
+
ADV=[code_es(lemma_of(a, src_lang), "Minimax-ASCII") if src_lang=="Español" else code_en(lemma_of(a, src_lang), "Minimax-ASCII") for a in advs] if USE_SPACY else []
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
|
| 266 |
if zero_copula and not semi_lossless and vlem in ("ser","estar","be") and tense=="Pres" and not is_neg and not is_q:
|
| 267 |
parts = S + O + ADV
|
|
|
|
| 287 |
def realize_np(tokens, particle):
|
| 288 |
outs=[]
|
| 289 |
for t in tokens:
|
| 290 |
+
lem = lemma_of(t, src_lang) if USE_SPACY else t.text
|
| 291 |
+
code = code_es(lem, "Kōmín-CJK") if src_lang=="Español" else code_en(lem, "Kōmín-CJK")
|
| 292 |
+
outs.append(code + particle)
|
|
|
|
|
|
|
|
|
|
| 293 |
return outs
|
| 294 |
|
| 295 |
S = realize_np(subs, P_SUBJ)
|
| 296 |
O = realize_np(objs + obls, P_OBJ)
|
| 297 |
+
ADV=[code_es(lemma_of(a, src_lang), "Kōmín-CJK") if src_lang=="Español" else code_en(lemma_of(a, src_lang), "Kōmín-CJK") for a in advs] if USE_SPACY else []
|
|
|
|
|
|
|
|
|
|
|
|
|
| 298 |
|
| 299 |
v_form = vcode + TAM + (NEG_M if is_neg else "")
|
| 300 |
|
|
|
|
| 306 |
if is_q: out += " " + Q_FIN
|
| 307 |
return out
|
| 308 |
|
| 309 |
+
# ------------ Sidecars para compresión exacta ------------
|
| 310 |
SIDECAR_B85_RE = re.compile(r"\s?§\((?P<b85>[A-Za-z0-9!#$%&()*+\-;<=>?@^_`{|}~]+)\)$")
|
| 311 |
|
| 312 |
def b85_enc_raw(s: str) -> str:
|
|
|
|
| 332 |
def strip_sidecar_b85(text: str) -> str:
    """Drop the trailing base85 sidecar from ``text``.

    Whatever the module-level ``SIDECAR_B85_RE`` pattern matches is removed,
    then trailing whitespace is trimmed from the result.

    Args:
        text: Conlang text that may end with a sidecar.

    Returns:
        The text without the matched sidecar and without trailing whitespace.
    """
    without_sidecar = SIDECAR_B85_RE.sub("", text)
    return without_sidecar.rstrip()
|
| 334 |
|
|
|
|
| 335 |
def custom_sidecar_enc(conlang_text: str, original_text: str) -> str:
|
| 336 |
comp = zlib.compress(original_text.encode("utf-8"), 9)
|
| 337 |
blob = to_custom_b64(comp, ALPHA_MINI64)
|
|
|
|
| 350 |
def strip_custom_sidecar(text: str) -> str:
    """Return the part of ``text`` that precedes the first ``~``.

    When a ``~`` is present, everything from it onward is discarded and the
    remaining prefix is right-trimmed. Text without a ``~`` is returned
    unchanged, including any trailing whitespace.

    Args:
        text: Conlang text, optionally followed by a ``~`` sidecar.

    Returns:
        The text before the first ``~``, or ``text`` itself if there is none.
    """
    head, tilde, _ = text.partition("~")
    if not tilde:
        return text
    return head.rstrip()
|
| 352 |
|
| 353 |
+
# ------------ Codificar/decodificar léxico puro ------------
|
| 354 |
def encode_simple(text: str, src_lang: str, target: str) -> str:
|
| 355 |
if not text.strip(): return ""
|
| 356 |
def repl_es(m):
|
|
|
|
| 383 |
def pluralize(word: str, tgt_lang: str) -> str:
    """Pluralise ``word`` with the rules of the target language.

    Args:
        word: Singular word to pluralise.
        tgt_lang: Target language name; ``"Español"`` selects
            ``pluralize_es``, any other value selects ``pluralize_en``.

    Returns:
        The plural form produced by the selected helper.
    """
    if tgt_lang == "Español":
        return pluralize_es(word)
    return pluralize_en(word)
|
| 385 |
|
|
|
|
|
|
|
|
|
|
| 386 |
mini_tail_re = re.compile(r"^(?P<stem>.+?)·(?P<tail>[PTFNQ12sp]+)$")
|
| 387 |
|
| 388 |
def decode_simple(text: str, source: str, tgt_lang: str) -> str:
|
|
|
|
| 390 |
return ""
|
| 391 |
code2es = MINI2ES if source=="Minimax-ASCII" else KOMI2ES
|
| 392 |
code2en = MINI2EN if source=="Minimax-ASCII" else KOMI2EN
|
|
|
|
| 393 |
|
| 394 |
if source == "Kōmín-CJK":
|
| 395 |
text = text.replace("?", "?").replace(" ", " ")
|
|
|
|
| 407 |
has_q = False
|
| 408 |
is_neg = False
|
| 409 |
|
| 410 |
+
for part in tokens:
|
| 411 |
look = part.replace("[PL]", "")
|
| 412 |
had_pl = "[PL]" in part
|
| 413 |
pl_flags.append(had_pl)
|
|
|
|
| 428 |
lemma_tokens.append(vlem)
|
| 429 |
pl_flags.append(False)
|
| 430 |
|
|
|
|
| 431 |
if tail:
|
| 432 |
if len(tail) > 0 and tail[0] in "PTF":
|
| 433 |
verb_tense = {"P": "Pres", "T": "Past", "F": "Fut"}.get(tail[0], "Pres")
|
|
|
|
| 446 |
verb_lemma = vlem
|
| 447 |
continue
|
| 448 |
|
|
|
|
| 449 |
w_es = code2es.get(look)
|
| 450 |
w_en = code2en.get(look) if code2en else None
|
| 451 |
w = w_es if tgt_lang == "Español" else (w_en or w_es or look)
|
|
|
|
| 458 |
pl_flags.append(had_pl)
|
| 459 |
|
| 460 |
out_parts = []
|
|
|
|
|
|
|
| 461 |
for idx, lem in enumerate(lemma_tokens):
|
| 462 |
if idx == verb_idx:
|
| 463 |
conj_func = _es_conj if tgt_lang == "Español" else _en_conj
|
| 464 |
v_conj = conj_func(verb_lemma, verb_tense, verb_person)
|
| 465 |
if is_neg:
|
| 466 |
+
v_conj = ("no " if tgt_lang == "Español" else "not ") + v_conj
|
|
|
|
| 467 |
out_parts.append(v_conj)
|
| 468 |
continue
|
| 469 |
+
out_parts.append(pluralize(lem, tgt_lang) if pl_flags[idx] else lem)
|
| 470 |
|
| 471 |
+
out_text = " ".join(out_parts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 472 |
if has_q:
|
| 473 |
start_q = "¿" if tgt_lang == "Español" else ""
|
| 474 |
+
out_text = f"{start_q}{out_text.capitalize()}?"
|
|
|
|
|
|
|
| 475 |
return out_text
|
| 476 |
|
| 477 |
# ------------ Conjugadores mínimos ------------
|
|
|
|
|
|
|
|
|
|
| 478 |
def _es_conj_regular(lemma, tense, person):
|
| 479 |
if not lemma.endswith(("ar","er","ir")): return lemma
|
| 480 |
stem = lemma[:-2]; vtype = lemma[-2:]
|
|
|
|
| 503 |
if lemma == "estar":
|
| 504 |
tab = {
|
| 505 |
"Pres":{"1s":"estoy","2s":"estás","3s":"está","1p":"estamos","2p":"estáis","3p":"están"},
|
| 506 |
+
"Past":{"1s":"estuviste","2s":"estuviste","3s":"estuvo","1p":"estuvimos","2p":"estuvisteis","3p":"estuvieron"},
|
| 507 |
"Fut":{"1s":"estaré","2s":"estarás","3s":"estará","1p":"estaremos","2p":"estaréis","3p":"estarán"},
|
| 508 |
}; return tab[tense].get(person, tab[tense]["3s"])
|
| 509 |
if lemma == "ir":
|
|
|
|
| 547 |
else:
|
| 548 |
return lemma
|
| 549 |
|
| 550 |
+
# ------------ Rutas principales ------------
|
| 551 |
def _build_with_spacy(text: str, src_lang: str, target: str,
|
| 552 |
drop_articles: bool, zero_copula: bool, semi_lossless: bool) -> str:
|
| 553 |
nlp = nlp_es if src_lang=="Español" else nlp_en
|
|
|
|
| 560 |
def build_sentence(text: str, src_lang: str, target: str,
|
| 561 |
drop_articles: bool, zero_copula: bool, mode: str, max_comp_exact: bool = False) -> str:
|
| 562 |
if not text.strip(): return ""
|
| 563 |
+
semi = True # siempre semi-lossless
|
| 564 |
core = _build_with_spacy(text, src_lang, target, drop_articles, zero_copula and not semi, semi_lossless=semi) if USE_SPACY else encode_simple(text, src_lang, target)
|
| 565 |
if max_comp_exact:
|
| 566 |
return custom_sidecar_enc(core, text)
|
|
|
|
| 571 |
mode: str, max_comp_exact: bool = False) -> str:
|
| 572 |
if not text.strip(): return ""
|
| 573 |
if src == tgt: return text
|
|
|
|
|
|
|
| 574 |
if src in ("Español","English") and tgt in ("Minimax-ASCII","Kōmín-CJK"):
|
| 575 |
return build_sentence(text, src, tgt, drop_articles, zero_copula, mode, max_comp_exact)
|
|
|
|
|
|
|
| 576 |
if src in ("Minimax-ASCII","Kōmín-CJK") and tgt in ("Español","English"):
|
|
|
|
| 577 |
orig = extract_custom_sidecar(text)
|
| 578 |
if orig is not None: return orig
|
|
|
|
| 579 |
orig = extract_sidecar_b85(text)
|
| 580 |
if orig is not None: return orig
|
|
|
|
| 581 |
return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)
|
|
|
|
|
|
|
| 582 |
if src in ("Español","English") and tgt in ("Español","English"):
|
| 583 |
return translate_natural(text, src, tgt)
|
|
|
|
|
|
|
| 584 |
if src in ("Minimax-ASCII","Kōmín-CJK") and tgt in ("Minimax-ASCII","Kōmín-CJK"):
|
| 585 |
orig = extract_custom_sidecar(text)
|
| 586 |
if orig is not None:
|
|
|
|
| 587 |
core = strip_custom_sidecar(text)
|
| 588 |
es_lemmas = decode_simple(core, src, "Español")
|
| 589 |
words = re.findall(r"\w+|[^\w\s]+", es_lemmas)
|
|
|
|
| 591 |
for w in words:
|
| 592 |
if re.fullmatch(r"\w+", w):
|
| 593 |
code = ES2MINI.get(norm_es(w)) if tgt=="Minimax-ASCII" else ES2KOMI.get(norm_es(w))
|
| 594 |
+
out.append(code or (enc_oov_minimax(w) if tgt=="Minimax-ASCII" else enc_oov_komin(w)))
|
|
|
|
|
|
|
| 595 |
else:
|
| 596 |
out.append(w)
|
| 597 |
+
return custom_sidecar_enc(" ".join(out), orig)
|
| 598 |
+
es_lemmas = decode_simple(text, src, "Español")
|
|
|
|
|
|
|
|
|
|
| 599 |
words = re.findall(r"\w+|[^\w\s]+", es_lemmas)
|
| 600 |
out=[]
|
| 601 |
for w in words:
|
| 602 |
if re.fullmatch(r"\w+", w):
|
| 603 |
code = ES2MINI.get(norm_es(w)) if tgt=="Minimax-ASCII" else ES2KOMI.get(norm_es(w))
|
| 604 |
+
out.append(code or (enc_oov_minimax(w) if tgt=="Minimax-ASCII" else enc_oov_komin(w)))
|
|
|
|
|
|
|
| 605 |
else:
|
| 606 |
out.append(w)
|
| 607 |
return " ".join(out)
|
|
|
|
| 608 |
return "[No soportado]"
|
| 609 |
|
| 610 |
def translate_natural(text: str, src_lang: str, tgt_lang: str) -> str:
|
|
|
|
| 630 |
back = universal_translate(conlang, tgt, src, True, False, mode, max_comp_exact)
|
| 631 |
return conlang, back
|
| 632 |
|
| 633 |
+
# =====================================================================================
|
| 634 |
+
# ========================== UI bilingüe con selector global ==========================
|
| 635 |
+
# =====================================================================================
|
| 636 |
+
|
| 637 |
+
ALL_LANGS = ["Español","English","Minimax-ASCII","Kōmín-CJK"]
|
| 638 |
+
|
| 639 |
+
# Texto ES
|
| 640 |
EXPLAIN_ES = """
|
| 641 |
+
## 🌐 ¿Qué hace esta app?
|
| 642 |
+
Traduce entre **Español / Inglés** y dos lenguajes construidos:
|
| 643 |
+
- **Minimax-ASCII** (compacto y solo ASCII)
|
| 644 |
+
- **Kōmín-CJK** (estilo CJK con partículas)
|
| 645 |
+
|
| 646 |
+
También **comprime sin perder información** si activas **Máx. Compresión Exacta** (`~...` guarda el original).
|
| 647 |
+
Al **decodificar**, si existe ese `~...`, recuperas el texto **exacto**.
|
| 648 |
+
|
| 649 |
+
### 🧠 ¿Por qué me sirve?
|
| 650 |
+
- Para **reducir** tamaño de mensajes/notas.
|
| 651 |
+
- Para **codificar/decodificar** de forma legible y reversible.
|
| 652 |
+
- Para jugar con **conlangs** simples.
|
| 653 |
+
|
| 654 |
+
### ⚙️ Opciones (puedes ignorarlas al principio)
|
| 655 |
+
- **Omitir artículos**: quita *el/la/los/las* o *a/an/the*. Ahorra ~10–15%.
|
| 656 |
+
- **Cópula cero** (presente afirmativo): oculta *ser/estar/be* cuando suena natural. +~5–10%.
|
| 657 |
+
- **Máx. Compresión Exacta**: añade `~...` con el original comprimido (mejor en textos medianos/largos).
|
| 658 |
"""
|
| 659 |
+
FAQ_ES = """
|
| 660 |
+
### ❓ Preguntas rápidas
|
| 661 |
+
- **¿Se pierde info?** No, con **Máx. Compresión Exacta** el `~...` guarda el original.
|
| 662 |
+
- **¿Sin spaCy?** Funciona igual (modo léxico). Con spaCy suena más natural.
|
| 663 |
+
- **Privacidad**: todo corre dentro de este Space.
|
| 664 |
+
"""
|
| 665 |
+
TUTORIAL_ES = """
|
| 666 |
+
### 🏁 Empezar (3 pasos)
|
| 667 |
+
1. Elige **Fuente** y **Destino**.
|
| 668 |
+
2. Escribe tu frase.
|
| 669 |
+
3. Pulsa **Traducir**.
|
| 670 |
|
| 671 |
+
> Para recuperar **exactamente** el original más tarde, activa **Máx. Compresión Exacta**.
|
| 672 |
+
"""
|
| 673 |
|
| 674 |
+
# Texto EN
|
| 675 |
+
EXPLAIN_EN = """
|
| 676 |
+
## 🌐 What does this app do?
|
| 677 |
+
It translates between **Spanish / English** and two constructed languages:
|
| 678 |
+
- **Minimax-ASCII** (compact, ASCII-only)
|
| 679 |
+
- **Kōmín-CJK** (CJK-style with particles)
|
| 680 |
|
| 681 |
+
You can also **compress without losing information** by enabling **Max Exact Compression** (`~...` stores the original).
|
| 682 |
+
When **decoding**, if `~...` exists, you get the **exact original** back.
|
| 683 |
+
"""
|
| 684 |
+
FAQ_EN = """
|
| 685 |
+
### ❓ Quick answers
|
| 686 |
+
- **Any loss?** Not with **Max Exact Compression** — the `~...` keeps the original.
|
| 687 |
+
- **No spaCy?** Still works (lexical mode). With spaCy it reads more naturally.
|
| 688 |
+
- **Privacy**: everything runs inside this Space.
|
| 689 |
+
"""
|
| 690 |
+
TUTORIAL_EN = """
|
| 691 |
+
### 🏁 Quick start (3 steps)
|
| 692 |
+
1. Pick **Source** and **Target**.
|
| 693 |
+
2. Type your sentence.
|
| 694 |
+
3. Click **Translate**.
|
| 695 |
+
|
| 696 |
+
> To recover the **exact** original later, enable **Max Exact Compression**.
|
| 697 |
+
"""
|
| 698 |
+
|
| 699 |
+
def make_group_es():
    """Build the Spanish-language UI group and wire its event handlers.

    Creates a ``gr.Group`` (initially visible) holding the help accordions and
    four tabs: translate, build, decode and round-trip. It is called from
    within the top-level ``gr.Blocks`` context.

    Returns:
        The ``gr.Group`` container, so the caller can toggle its visibility.
    """
    with gr.Group(visible=True) as group:
        gr.Markdown("# 🌐 Universal Conlang Translator · Compresión Exacta (ES)")

        # Help area: summary + FAQ on the left, tutorial on the right.
        with gr.Row():
            with gr.Column(scale=1):
                with gr.Accordion("Resumen (ES)", open=True):
                    gr.Markdown(EXPLAIN_ES)
                with gr.Accordion("FAQ (ES)", open=False):
                    gr.Markdown(FAQ_ES)
            with gr.Column(scale=1):
                with gr.Accordion("Tutorial (ES)", open=True):
                    gr.Markdown(TUTORIAL_ES)
                gr.Markdown("**Consejo:** Los mensajes muy cortos pueden no reducirse por la cabecera del `~...`.")

        with gr.Tab("🔁 Traducir"):
            with gr.Row():
                uni_src = gr.Dropdown(ALL_LANGS, value="Español", label="Fuente")
                uni_tgt = gr.Dropdown(ALL_LANGS, value="Minimax-ASCII", label="Destino")
            uni_text = gr.Textbox(lines=3, label="Texto", placeholder="Ej.: Hola, ¿cómo estás?", show_copy_button=True)
            with gr.Row():
                uni_drop = gr.Checkbox(value=True, label="Omitir artículos (ES/EN → conlang)")
                uni_zero = gr.Checkbox(value=False, label="Cópula cero (presente afirm.)")
                uni_max_comp = gr.Checkbox(value=False, label="Máx. Compresión Exacta (sidecar `~...`)")
            # Hidden single-choice dropdown; its value is only forwarded to universal_translate.
            uni_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
            with gr.Row():
                btn_translate = gr.Button("🚀 Traducir", variant="primary")
                btn_reset = gr.Button("🧹 Limpiar")
            uni_out = gr.Textbox(lines=6, label="Traducción", show_copy_button=True)

            btn_translate.click(
                universal_translate,
                [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_mode, uni_max_comp],
                [uni_out]
            )
            # One empty string per output component: a handler must return as
            # many values as it has outputs, otherwise Gradio raises.
            btn_reset.click(lambda: ("", ""), None, [uni_text, uni_out])

            gr.Markdown("### 🔎 Ejemplos (clic para autocompletar)")
            ex1 = gr.Button("ES→Minimax: «Hola, ¿cómo estás?»")
            ex2 = gr.Button("EN→Kōmín: «This system keeps messages compact.»")
            ex3 = gr.Button("ES→Minimax (con compresión): «El clima hoy es excelente para pasear.»")
            ex4 = gr.Button("EN→Kōmín (con compresión): «Please decode this later with the sidecar.»")

            # Each example fills in (text, source language, target language).
            ex1.click(lambda: ("Hola, ¿cómo estás?", "Español", "Minimax-ASCII"), None, [uni_text, uni_src, uni_tgt])
            ex2.click(lambda: ("This system keeps messages compact.", "English", "Kōmín-CJK"), None, [uni_text, uni_src, uni_tgt])
            ex3.click(lambda: ("El clima hoy es excelente para pasear.", "Español", "Minimax-ASCII"), None, [uni_text, uni_src, uni_tgt])
            ex4.click(lambda: ("Please decode this later with the sidecar.", "English", "Kōmín-CJK"), None, [uni_text, uni_src, uni_tgt])

        with gr.Tab("🛠️ Construir (ES/EN → Conlang)"):
            with gr.Row():
                src_lang = gr.Dropdown(["Español","English"], value="Español", label="Fuente")
                target = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
            text_in = gr.Textbox(lines=3, label="Frase", show_copy_button=True)
            with gr.Row():
                drop_articles = gr.Checkbox(value=True, label="Omitir artículos")
                zero_copula = gr.Checkbox(value=False, label="Cópula cero (presente afirm.)")
                max_comp_build = gr.Checkbox(value=False, label="Máx. Compresión Exacta")
            mode_build = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
            with gr.Row():
                btn_build = gr.Button("🏗️ Construir", variant="primary")
                btn_build_clear = gr.Button("🧹 Limpiar")
            out = gr.Textbox(lines=6, label="Salida", show_copy_button=True)

            btn_build.click(
                build_sentence,
                [text_in, src_lang, target, drop_articles, zero_copula, mode_build, max_comp_build],
                [out]
            )
            btn_build_clear.click(lambda: ("", ""), None, [text_in, out])

        with gr.Tab("🗝️ Decodificar (Conlang → ES/EN)"):
            with gr.Row():
                src_code = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Fuente")
                tgt_lang = gr.Dropdown(["Español","English"], value="Español", label="Destino")
            code_in = gr.Textbox(lines=3, label="Texto en conlang (puede incluir `~...`)", show_copy_button=True)
            out3 = gr.Textbox(lines=6, label="Salida", show_copy_button=True)

            def decode_lossless_aware(text, src, tgt):
                """Return the sidecar's original text if present, else a lexical decode.

                Tries the custom sidecar first, then the base85 sidecar; when
                neither yields a result, both sidecars are stripped and the
                remainder goes through ``decode_simple``.
                """
                orig = extract_custom_sidecar(text)
                if orig is not None:
                    return orig
                orig = extract_sidecar_b85(text)
                if orig is not None:
                    return orig
                return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)

            with gr.Row():
                btn_decode = gr.Button("🔓 Decodificar", variant="primary")
                btn_decode_clear = gr.Button("🧹 Limpiar")

            btn_decode.click(decode_lossless_aware, [code_in, src_code, tgt_lang], [out3])
            btn_decode_clear.click(lambda: ("", ""), None, [code_in, out3])

            gr.Markdown("> **Tip:** si ves `~...`, la decodificación será 100% exacta.")

        with gr.Tab("🔄 Prueba ida→vuelta"):
            with gr.Row():
                rt_src = gr.Dropdown(["Español","English"], value="Español", label="Fuente")
                rt_tgt = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
            rt_text = gr.Textbox(lines=3, label="Frase", show_copy_button=True)
            rt_max_comp = gr.Checkbox(value=False, label="Máx. Compresión Exacta")
            rt_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
            rt_out_conlang = gr.Textbox(lines=3, label="Conlang (ida)", show_copy_button=True)
            rt_out_back = gr.Textbox(lines=3, label="Vuelta", show_copy_button=True)
            with gr.Row():
                btn_rt = gr.Button("▶️ Probar", variant="primary")
                btn_rt_clear = gr.Button("🧹 Limpiar")

            btn_rt.click(round_trip, [rt_text, rt_src, rt_tgt, rt_mode, rt_max_comp], [rt_out_conlang, rt_out_back])
            # Three outputs to clear, so three values are returned.
            btn_rt_clear.click(lambda: ("", "", ""), None, [rt_text, rt_out_conlang, rt_out_back])

        gr.Markdown("---")
        gr.Markdown("Hecho con ❤️ · **spaCy** (opcional) · Todo se ejecuta en este Space.")
    return group
|
| 809 |
+
|
| 810 |
+
def make_group_en():
    """Build the English-language UI group and wire its event handlers.

    Creates a ``gr.Group`` (initially hidden) holding the help accordions and
    four tabs: translate, build, decode and round-trip. It is called from
    within the top-level ``gr.Blocks`` context.

    Returns:
        The ``gr.Group`` container, so the caller can toggle its visibility.
    """
    with gr.Group(visible=False) as group:
        gr.Markdown("# 🌐 Universal Conlang Translator · Max Exact Compression (EN)")

        # Help area: summary + FAQ on the left, tutorial on the right.
        with gr.Row():
            with gr.Column(scale=1):
                with gr.Accordion("Summary (EN)", open=True):
                    gr.Markdown(EXPLAIN_EN)
                with gr.Accordion("FAQ (EN)", open=False):
                    gr.Markdown(FAQ_EN)
            with gr.Column(scale=1):
                with gr.Accordion("Tutorial (EN)", open=True):
                    gr.Markdown(TUTORIAL_EN)
                gr.Markdown("**Tip:** Very short messages may not shrink due to the `~...` header.")

        with gr.Tab("🔁 Translate"):
            with gr.Row():
                uni_src = gr.Dropdown(ALL_LANGS, value="English", label="Source")
                uni_tgt = gr.Dropdown(ALL_LANGS, value="Minimax-ASCII", label="Target")
            uni_text = gr.Textbox(lines=3, label="Text", placeholder="e.g., Hello, how are you?", show_copy_button=True)
            with gr.Row():
                uni_drop = gr.Checkbox(value=True, label="Drop articles (ES/EN → conlang)")
                uni_zero = gr.Checkbox(value=False, label="Zero copula (present affirmative)")
                uni_max_comp = gr.Checkbox(value=False, label="Max Exact Compression (sidecar `~...`)")
            # Hidden single-choice dropdown; its value is only forwarded to universal_translate.
            uni_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
            with gr.Row():
                btn_translate = gr.Button("🚀 Translate", variant="primary")
                btn_reset = gr.Button("🧹 Clear")
            uni_out = gr.Textbox(lines=6, label="Translation", show_copy_button=True)

            btn_translate.click(
                universal_translate,
                [uni_text, uni_src, uni_tgt, uni_drop, uni_zero, uni_mode, uni_max_comp],
                [uni_out]
            )
            # One empty string per output component: a handler must return as
            # many values as it has outputs, otherwise Gradio raises.
            btn_reset.click(lambda: ("", ""), None, [uni_text, uni_out])

            gr.Markdown("### 🔎 Examples (click to autofill)")
            ex1 = gr.Button("EN→Minimax: “Hello, how are you?”")
            ex2 = gr.Button("ES→Kōmín: “Este sistema mantiene los mensajes compactos.”")
            ex3 = gr.Button("EN→Minimax (compressed): “The weather today is perfect for a walk.”")
            ex4 = gr.Button("ES→Kōmín (compressed): “Por favor decodifica esto luego con el sidecar.”")

            # Each example fills in (text, source language, target language).
            ex1.click(lambda: ("Hello, how are you?", "English", "Minimax-ASCII"), None, [uni_text, uni_src, uni_tgt])
            ex2.click(lambda: ("Este sistema mantiene los mensajes compactos.", "Español", "Kōmín-CJK"), None, [uni_text, uni_src, uni_tgt])
            ex3.click(lambda: ("The weather today is perfect for a walk.", "English", "Minimax-ASCII"), None, [uni_text, uni_src, uni_tgt])
            ex4.click(lambda: ("Por favor decodifica esto luego con el sidecar.", "Español", "Kōmín-CJK"), None, [uni_text, uni_src, uni_tgt])

        with gr.Tab("🛠️ Build (ES/EN → Conlang)"):
            with gr.Row():
                src_lang = gr.Dropdown(["Español","English"], value="English", label="Source")
                target = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
            text_in = gr.Textbox(lines=3, label="Sentence", show_copy_button=True)
            with gr.Row():
                drop_articles = gr.Checkbox(value=True, label="Drop articles")
                zero_copula = gr.Checkbox(value=False, label="Zero copula (present affirmative)")
                max_comp_build = gr.Checkbox(value=False, label="Max Exact Compression")
            mode_build = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
            with gr.Row():
                btn_build = gr.Button("🏗️ Build", variant="primary")
                btn_build_clear = gr.Button("🧹 Clear")
            out = gr.Textbox(lines=6, label="Output", show_copy_button=True)

            btn_build.click(
                build_sentence,
                [text_in, src_lang, target, drop_articles, zero_copula, mode_build, max_comp_build],
                [out]
            )
            btn_build_clear.click(lambda: ("", ""), None, [text_in, out])

        # Tab label emoji restored from mojibake; matches the Spanish decode tab.
        with gr.Tab("🗝️ Decode (Conlang → ES/EN)"):
            with gr.Row():
                src_code = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Source")
                tgt_lang = gr.Dropdown(["Español","English"], value="English", label="Target")
            code_in = gr.Textbox(lines=3, label="Conlang text (may include `~...`)", show_copy_button=True)
            out3 = gr.Textbox(lines=6, label="Output", show_copy_button=True)

            def decode_lossless_aware(text, src, tgt):
                """Return the sidecar's original text if present, else a lexical decode.

                Tries the custom sidecar first, then the base85 sidecar; when
                neither yields a result, both sidecars are stripped and the
                remainder goes through ``decode_simple``.
                """
                orig = extract_custom_sidecar(text)
                if orig is not None:
                    return orig
                orig = extract_sidecar_b85(text)
                if orig is not None:
                    return orig
                return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)

            with gr.Row():
                btn_decode = gr.Button("🔓 Decode", variant="primary")
                btn_decode_clear = gr.Button("🧹 Clear")

            btn_decode.click(decode_lossless_aware, [code_in, src_code, tgt_lang], [out3])
            btn_decode_clear.click(lambda: ("", ""), None, [code_in, out3])

            gr.Markdown("> **Tip:** if you see `~...`, decoding will be bit-perfect.")

        with gr.Tab("🔄 Round-trip"):
            with gr.Row():
                rt_src = gr.Dropdown(["Español","English"], value="English", label="Source")
                rt_tgt = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
            rt_text = gr.Textbox(lines=3, label="Sentence", show_copy_button=True)
            rt_max_comp = gr.Checkbox(value=False, label="Max Exact Compression")
            rt_mode = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
            rt_out_conlang = gr.Textbox(lines=3, label="Outward (conlang)", show_copy_button=True)
            rt_out_back = gr.Textbox(lines=3, label="Back", show_copy_button=True)
            with gr.Row():
                btn_rt = gr.Button("▶️ Test", variant="primary")
                btn_rt_clear = gr.Button("🧹 Clear")

            btn_rt.click(round_trip, [rt_text, rt_src, rt_tgt, rt_mode, rt_max_comp], [rt_out_conlang, rt_out_back])
            # Three outputs to clear, so three values are returned.
            btn_rt_clear.click(lambda: ("", "", ""), None, [rt_text, rt_out_conlang, rt_out_back])

        gr.Markdown("---")
        gr.Markdown("Made with ❤️ · **spaCy** (optional) · Everything runs inside this Space.")
    return group
|
| 920 |
+
|
| 921 |
+
# Top-level app: a language selector that toggles between the ES and EN groups.
with gr.Blocks(title="Universal Conlang Translator", theme=gr.themes.Soft()) as demo:
    gr.Markdown("## 🌍 Idioma / Language")
    lang_select = gr.Radio(choices=["ES","EN"], value="ES", label="Selecciona / Select")
    group_es = make_group_es()
    group_en = make_group_en()

    def switch_lang(code):
        """Return visibility updates for (group_es, group_en) given the selected code."""
        english = code == "EN"
        # Any code other than "EN" shows the Spanish group.
        return gr.update(visible=not english), gr.update(visible=english)

    lang_select.change(
        fn=switch_lang,
        inputs=[lang_select],
        outputs=[group_es, group_en],
    )

if __name__ == "__main__":
    demo.launch()
|