|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import os, re, json, base64, zlib |
|
|
from typing import Dict, Optional, List, Any |
|
|
import gradio as gr |
|
|
|
|
|
|
|
|
# Lexicon files produced offline (see the lexicon help text further down):
# Spanish lemma -> Minimax-ASCII code.
LEX_MINI = "lexicon_minimax.json"
# Spanish lemma -> Kōmín-CJK code.
LEX_KOMI = "lexicon_komin.json"
# Master table: ES + EN lemma pairs plus both conlang codes.
LEX_MASTER = "lexicon_master.json"
|
|
|
|
|
|
|
|
# Matches runs of ASCII letters plus Spanish accented letters.
WORD_RE = re.compile(r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ]+", re.UNICODE)

# Accent-stripping table (Á->A, ñ->n, ...).
STRIP = str.maketrans("ÁÉÍÓÚÜÑáéíóúüñ", "AEIOUUNaeiouun")


def norm_es(w: str) -> str:
    """Lower-case a Spanish word, drop non-Spanish characters, strip accents."""
    cleaned = re.sub(r"[^a-záéíóúüñ]", "", (w or "").lower())
    return cleaned.translate(STRIP)


def norm_en(w: str) -> str:
    """Lower-case an English word and keep only the letters a-z."""
    return re.sub(r"[^a-z]", "", (w or "").lower())
|
|
|
|
|
|
|
|
def load_json(path: str):
    """Parse a UTF-8 JSON file; return None when the file does not exist."""
    if not os.path.exists(path):
        return None
    with open(path, "r", encoding="utf-8") as fh:
        return json.load(fh)
|
|
|
|
|
def load_lexicons():
    """Load the three lexicon JSONs and build every lookup table the app uses.

    Returns a 11-tuple:
    (es2mini, es2komi, mini2es, komi2es, en2mini, en2komi,
     mini2en, komi2en, es2en_lemma, en2es_lemma, master).
    Missing files degrade to empty mappings so the app still starts.
    """
    mm = load_json(LEX_MINI) or {}
    kk = load_json(LEX_KOMI) or {}
    master = load_json(LEX_MASTER) or {}

    # ES -> code tables come straight from the per-conlang files.
    es2mini = mm.get("mapping", {})
    es2komi = kk.get("mapping", {})
    # Reverse tables for decoding (assumes codes are unique within a lexicon).
    mini2es = {v:k for k,v in es2mini.items()}
    komi2es = {v:k for k,v in es2komi.items()}

    es2en_lemma: Dict[str,str] = {}
    en2es_lemma: Dict[str,str] = {}
    en2mini, en2komi = {}, {}
    mini2en, komi2en = {}, {}

    # The master file pairs ES/EN lemmas with both conlang codes.
    if isinstance(master, dict) and "entries" in master:
        for e in master["entries"]:
            es = norm_es(str(e.get("lemma_es",""))); en = norm_en(str(e.get("lemma_en","")))
            mi = str(e.get("minimax","")); ko = str(e.get("komin",""))
            if es and en:
                # setdefault keeps only the first pairing seen for a lemma.
                es2en_lemma.setdefault(es, en); en2es_lemma.setdefault(en, es)
            if en and mi: en2mini.setdefault(en, mi)
            if en and ko: en2komi.setdefault(en, ko)

    # Reverse EN tables for decoding conlang back into English.
    mini2en = {v:k for k,v in en2mini.items()}
    komi2en = {v:k for k,v in en2komi.items()}
    return (es2mini, es2komi, mini2es, komi2es, en2mini, en2komi, mini2en, komi2en, es2en_lemma, en2es_lemma, master)
|
|
|
|
|
# Global lookup tables, built once at import time.
(ES2MINI, ES2KOMI, MINI2ES, KOMI2ES, EN2MINI, EN2KOMI, MINI2EN, KOMI2EN, ES2EN_LEMMA, EN2ES_LEMMA, MASTER_OBJ) = load_lexicons()

# Pronouns that may be dropped when the "remove pronouns" option is enabled.
PRON_ES = {"yo","tú","vos","usted","él","ella","nosotros","vosotros","ustedes","ellos","ellas","me","te","se","nos","os"}
PRON_EN = {"i","you","he","she","it","we","they","me","him","her","us","them"}
|
|
|
|
|
|
|
|
# 64-symbol alphabet for Minimax OOV base64 (must contain 64 unique chars;
# the literal has 65 and is sliced down to 64).
ALPHA_MINI64 = "@ptkmnslraeiouy0123456789><=:/!?.+-_*#bcdfghjvqwxzACEGHIJKLMNOPRS"[:64]

# Pool of CJK ideograms used to build the Kōmín OOV alphabet.
CJK_BASE = (
    "天地人日月山川雨風星火水木土金石光影花草鳥犬猫魚"
    "東西南北中外上下午夜明暗手口目耳心言書家道路門"
    "大小長短早晚高低新古青紅白黒金銀銅玉米茶酒米"
    "文学楽音画体気電海空森林雪雲砂島橋城村国自由静"
)
# Fix: CJK_BASE contains repeated ideograms (e.g. 金 and 米 appear twice), so
# the previous (CJK_BASE * 2)[:64] slice produced an alphabet with duplicate
# symbols. Duplicates make the str.maketrans encode/decode tables ambiguous
# and silently break Kōmín OOV round-tripping. De-duplicate (order-preserving
# via dict.fromkeys) before taking the first 64 symbols; the pool holds well
# over 64 unique ideograms.
ALPHA_CJK64 = "".join(dict.fromkeys(CJK_BASE))[:64]
|
|
|
|
|
def to_custom_b64(b: bytes, alphabet: str) -> str:
    """Base64-encode *b* re-mapped onto a custom 64-char alphabet, no padding."""
    standard = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
    encoded = base64.b64encode(b).decode("ascii")
    return encoded.translate(str.maketrans(standard, alphabet)).rstrip("=")
|
|
def from_custom_b64(s: str, alphabet: str) -> bytes:
    """Inverse of to_custom_b64: map back to standard base64, re-pad, decode."""
    standard = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
    restored = s.translate(str.maketrans(alphabet, standard))
    restored += "=" * (-len(restored) % 4)
    return base64.b64decode(restored)
|
|
def enc_oov_minimax(token: str) -> str:
    """Reversible Minimax escape (~blob) for a word missing from the lexicon."""
    return "~" + to_custom_b64(token.encode("utf-8"), ALPHA_MINI64)


def dec_oov_minimax(code: str) -> str:
    """Decode a ~... Minimax OOV escape; return the input untouched on failure."""
    try:
        return from_custom_b64(code[1:], ALPHA_MINI64).decode("utf-8")
    except Exception:
        return code


def enc_oov_komin(token: str) -> str:
    """Reversible Kōmín escape, bracketed with 「 」."""
    return "「" + to_custom_b64(token.encode("utf-8"), ALPHA_CJK64) + "」"


def dec_oov_komin(code: str) -> str:
    """Decode a 「...」 Kōmín OOV escape; return the input untouched on failure."""
    try:
        return from_custom_b64(code[1:-1], ALPHA_CJK64).decode("utf-8")
    except Exception:
        return code


def is_oov_minimax(code: str) -> bool:
    """True for a non-empty ~... Minimax escape."""
    return code.startswith("~") and len(code) > 1


def is_oov_komin(code: str) -> bool:
    """True for a 「...」-bracketed Kōmín escape."""
    return len(code) >= 2 and code.startswith("「") and code.endswith("」")
|
|
|
|
|
|
|
|
# Optional spaCy pipelines: when both small models are installed the app does
# real parsing (lemmas, dependencies, morphology); otherwise everything falls
# back to the simple word-by-word encoder.
USE_SPACY = False
try:
    import spacy
    try:
        # Both models must load for the parse-based path to be enabled.
        nlp_es = spacy.load("es_core_news_sm"); nlp_en = spacy.load("en_core_web_sm"); USE_SPACY = True
    except Exception:
        # spaCy installed but models not downloaded — degrade gracefully.
        nlp_es = nlp_en = None
except Exception:
    # spaCy itself not installed.
    nlp_es = nlp_en = None
|
|
|
|
|
def lemma_of(tok, src_lang: str) -> str:
    """Normalized lemma of a token; falls back to the surface text when empty."""
    base = getattr(tok, "lemma_", "") or tok.text
    normalize = norm_es if src_lang == "Español" else norm_en
    return normalize(base)
|
|
|
|
|
|
|
|
def detect_polarity(doc) -> bool:
    """True when the sentence reads as a question (its text contains '?')."""
    return getattr(doc, "text", "").find("?") >= 0
|
|
def detect_neg(doc) -> bool:
    """True when any token negates the clause (dep 'neg' or no/not/n't)."""
    negators = ("no", "not", "n't")
    return any(
        getattr(t, "dep_", "") == "neg"
        or getattr(t, "lower_", "").lower() in negators
        for t in doc
    )
|
|
def detect_tense(root):
    """Best-effort tense from the root's morphology, then from AUX children.

    Falls back to "Pres" when nothing matches ("will" children force "Fut").
    """
    morph = str(getattr(root, "morph", ""))
    for feature, label in (("Tense=Past", "Past"), ("Tense=Fut", "Fut"), ("Tense=Pres", "Pres")):
        if feature in morph:
            return label
    for child in getattr(root, "children", []):
        if getattr(child, "pos_", "") != "AUX":
            continue
        if "Tense=Past" in str(getattr(child, "morph", "")):
            return "Past"
        if getattr(child, "lower_", "").lower() == "will":
            return "Fut"
    return "Pres"
|
|
def extract_core(doc):
    """Locate the clause root and bucket its children by grammatical role.

    Returns (root, subjects, objects, obliques, adverbs); each bucket is
    sorted by token position.
    """
    tokens = list(doc)
    root = next(
        (t for t in tokens
         if getattr(t, "dep_", "") == "ROOT" and getattr(t, "pos_", "") in ("VERB", "AUX")),
        tokens[0] if tokens else doc,
    )
    subjects, objects, obliques, adverbs = [], [], [], []
    for child in getattr(root, "children", []):
        dep = getattr(child, "dep_", "")
        pos = getattr(child, "pos_", "")
        if dep in ("nsubj", "nsubj:pass", "csubj"):
            subjects.append(child)
        elif dep in ("obj", "dobj", "iobj"):
            objects.append(child)
        elif dep in ("obl", "pobj"):
            obliques.append(child)
        elif dep in ("advmod", "advcl") and pos == "ADV":
            adverbs.append(child)
    for bucket in (subjects, objects, obliques, adverbs):
        bucket.sort(key=lambda t: getattr(t, "i", 0))
    return root, subjects, objects, obliques, adverbs
|
|
def _person_of_doc(doc, src_lang: str) -> Optional[str]:
    """Guess person+number ('1s'..'3p') from the root's subject token.

    Heuristic fallback used when the verb morphology carries no Person
    feature. Returns None when no subject is found or anything fails.
    """
    try:
        tokens = list(doc)
        root = next((t for t in tokens if getattr(t,"dep_","")=="ROOT"), tokens[0])
        subj = next((t for t in getattr(root,"children",[]) if getattr(t,"dep_","").startswith("nsubj")), None)
        if subj is None: return None
        # Plurality: morphology for Spanish, PTB tags (NNS/NNPS) for English.
        plur = ("Number=Plur" in str(getattr(subj,"morph",""))) if src_lang=="Español" else (getattr(subj,"tag_","") in ("NNS","NNPS"))
        low = getattr(subj,"lower_","").lower()
        if src_lang=="Español":
            # First try the surface form of the subject pronoun.
            if low in ("yo",): return "1p" if plur else "1s"
            if low in ("tú","vos"): return "2p" if plur else "2s"
            if low in ("usted","él","ella"): return "3p" if plur else "3s"
            # Surface form unknown: retry on the lemma (spaCy lemmatizes
            # nosotros/vosotros down to yo/tú in some models).
            lem = lemma_of(subj, "Español")
            if lem in ("yo","nosotros"): return "1p" if plur else "1s"
            if lem in ("tú","vosotros"): return "2p" if plur else "2s"
            return "3p" if plur else "3s"
        else:
            if low in ("i",): return "1p" if plur else "1s"
            if low in ("you",): return "2p" if plur else "2s"
            if low in ("he","she","it"): return "3p" if plur else "3s"
            # Any other subject (nouns, names) is treated as third person.
            return "3p" if plur else "3s"
    except Exception:
        # Defensive: malformed docs simply yield "unknown".
        return None
|
|
def detect_person(root, src_lang: str) -> Optional[str]:
    """Person+number code ('1s'..'3p') from morphology, else from the subject."""
    morph = str(getattr(root, "morph", ""))
    if "Person=" in morph:
        person, number = "3", "s"
        for feature in morph.split("|"):
            if feature.startswith("Person="):
                person = feature.split("=")[1]
            elif feature.startswith("Number="):
                number = "p" if feature.split("=")[1] == "Plur" else "s"
        return person + number
    # No Person feature on the verb — fall back to inspecting the subject.
    return _person_of_doc(root.doc, src_lang)
|
|
|
|
|
|
|
|
def code_es(lemma: str, target: str) -> str:
    """Spanish lemma -> conlang code, falling back to a reversible OOV escape."""
    key = norm_es(lemma)
    if target == "Minimax-ASCII":
        return ES2MINI.get(key) or enc_oov_minimax(key)
    return ES2KOMI.get(key) or enc_oov_komin(key)


def code_en(lemma: str, target: str) -> str:
    """English lemma -> conlang code, falling back to a reversible OOV escape."""
    key = norm_en(lemma)
    if target == "Minimax-ASCII":
        hit = EN2MINI.get(key) if EN2MINI else None
        return hit or enc_oov_minimax(key)
    hit = EN2KOMI.get(key) if EN2KOMI else None
    return hit or enc_oov_komin(key)
|
|
|
|
|
# Tense markers appended to verb codes: Present / Past / Future.
TAM_MINI = {"Pres":"P","Past":"T","Fut":"F","UNK":"P"}
# Kōmín uses circled letters for the same three tenses.
TAM_KOMI = {"Pres":"Ⓟ","Past":"Ⓣ","Fut":"Ⓕ","UNK":"Ⓟ"}
|
|
|
|
|
def realize_minimax(doc, src_lang: str, drop_articles=True, zero_copula=True, semi_lossless=False, person_hint="2s", remove_pronouns=False):
    """Render a parsed sentence as Minimax-ASCII.

    Verb code gets a "·<tail>" with tense (P/T/F), optional person+number
    (semi-lossless only), N for negation and Q for question. Output order is
    verb, subjects, objects/obliques, adverbs; present affirmative copulas
    may be dropped entirely ("zero copula").
    """
    root, subs, objs, obls, advs = extract_core(doc)
    tense = detect_tense(root); is_q, is_neg = detect_polarity(doc), detect_neg(doc)
    # Without spaCy there is no real root; guess a copula from the text.
    vlem = lemma_of(root, src_lang) if USE_SPACY else ("ser" if "?" in getattr(doc,"text","") else "estar")
    vcode = code_es(vlem, "Minimax-ASCII") if src_lang=="Español" else code_en(vlem, "Minimax-ASCII")
    tail = TAM_MINI.get(tense, "P")
    if semi_lossless: tail += (detect_person(root, src_lang) or person_hint)
    if is_neg: tail += "N"
    if is_q: tail += "Q"
    if tail: vcode = f"{vcode}·{tail}"

    def realize_np(tokens):
        # Encode a noun-phrase bucket, optionally skipping bare pronouns.
        outs=[]
        for t in tokens:
            if remove_pronouns:
                txt = (getattr(t,"text","") or "").lower()
                if (src_lang=="Español" and txt in PRON_ES) or (src_lang=="English" and txt in PRON_EN): continue
            lem = lemma_of(t, src_lang) if USE_SPACY else getattr(t,"text","")
            outs.append(code_es(lem,"Minimax-ASCII") if src_lang=="Español" else code_en(lem,"Minimax-ASCII"))
        return outs

    S = realize_np(subs); O = realize_np(objs)+realize_np(obls)
    ADV=[]
    for a in advs:
        lem = lemma_of(a, src_lang) if USE_SPACY else getattr(a,"text","")
        ADV.append(code_es(lem,"Minimax-ASCII") if src_lang=="Español" else code_en(lem,"Minimax-ASCII"))

    # Zero copula: a plain present affirmative ser/estar/be carries no
    # information, so it can be omitted (unless semi-lossless mode is on).
    parts = S+O+ADV if (zero_copula and not semi_lossless and vlem in ("ser","estar","be") and tense=="Pres" and not is_neg and not is_q) else [vcode]+S+O+ADV
    return " ".join(p for p in parts if p)
|
|
|
|
|
def realize_komin(doc, src_lang: str, drop_articles=True, zero_copula=True, semi_lossless=False, person_hint="2s", remove_pronouns=False):
    """Render a parsed sentence as Kōmín-CJK.

    SOV-like order: subject (marked ᵖ), object/oblique (marked ᵒ), adverbs,
    then the verb carrying a circled tense marker (Ⓟ/Ⓣ/Ⓕ) and optionally a
    [person] tag in semi-lossless mode. Questions get a trailing "?".
    NOTE(review): drop_articles and zero_copula are accepted but never read
    here — presumably reserved for parity with realize_minimax; confirm.
    """
    root, subs, objs, obls, advs = extract_core(doc)
    tense, is_q, is_neg = detect_tense(root), detect_polarity(doc), detect_neg(doc)
    # Without spaCy there is no real root; guess a copula from the text.
    vlem = lemma_of(root, src_lang) if USE_SPACY else ("ser" if "?" in getattr(doc,"text","") else "estar")
    vcode = code_es(vlem, "Kōmín-CJK") if src_lang=="Español" else code_en(vlem, "Kōmín-CJK")
    # Role particles and the sentence-final question marker.
    P_SUBJ, P_OBJ = "ᵖ", "ᵒ"; Q_FIN = "?"
    TAM = TAM_KOMI.get(tense,"Ⓟ")
    if semi_lossless: TAM = TAM + f"[{detect_person(root, src_lang) or person_hint}]"

    def realize_np(tokens, particle):
        # Encode a noun-phrase bucket with its role particle appended.
        outs=[]
        for t in tokens:
            if remove_pronouns:
                txt = (getattr(t,"text","") or "").lower()
                if (src_lang=="Español" and txt in PRON_ES) or (src_lang=="English" and txt in PRON_EN): continue
            lem = lemma_of(t, src_lang) if USE_SPACY else getattr(t,"text","")
            outs.append((code_es(lem,"Kōmín-CJK") if src_lang=="Español" else code_en(lem,"Kōmín-CJK")) + particle)
        return outs

    S = realize_np(subs, P_SUBJ); O = realize_np(objs+obls, P_OBJ)
    ADV=[]
    for a in advs:
        lem = lemma_of(a, src_lang) if USE_SPACY else getattr(a,"text","")
        ADV.append(code_es(lem,"Kōmín-CJK") if src_lang=="Español" else code_en(lem,"Kōmín-CJK"))
    parts = S+O+ADV+[vcode+TAM]
    out = " ".join(parts)
    if is_q: out += " " + Q_FIN
    return out
|
|
|
|
|
|
|
|
# Sidecar payloads are zlib + Ascii85 (adobe=False). Ascii85 output uses the
# printable range '!'(33)..'u'(117) plus 'z' for all-zero groups, so the
# character class must cover exactly that range. Fix: the previous class
# enumerated individual punctuation characters and missed several Ascii85
# symbols (e.g. '"', ',', '.', '/', ':', '[', '\\', ']'), which made
# extraction silently fail for many payloads.
SIDECAR_B85_RE = re.compile(r"\s?§\((?P<b85>[!-uz]+)\)$")


def b85_enc_raw(s: str) -> str:
    """UTF-8 encode, zlib-compress (level 9) and Ascii85-encode *s*."""
    return base64.a85encode(zlib.compress(s.encode("utf-8"), 9), adobe=False).decode("ascii")


def b85_dec_raw(b85s: str) -> str:
    """Inverse of b85_enc_raw: Ascii85-decode, inflate, UTF-8 decode."""
    return zlib.decompress(base64.a85decode(b85s.encode("ascii"), adobe=False)).decode("utf-8")


def attach_sidecar_b85(conlang_text: str, original_text: str) -> str:
    """Append a §(...) sidecar holding the compressed original text."""
    return f"{conlang_text} §({b85_enc_raw(original_text)})"


def extract_sidecar_b85(text: str) -> Optional[str]:
    """Recover the exact original from a trailing §(...) sidecar, else None."""
    m = SIDECAR_B85_RE.search(text)
    if not m:
        return None
    try:
        return b85_dec_raw(m.group("b85"))
    except Exception:
        # Corrupt payload — treat as "no sidecar" rather than crash.
        return None


def strip_sidecar_b85(text: str) -> str:
    """Remove a trailing §(...) sidecar, if present."""
    return SIDECAR_B85_RE.sub("", text).rstrip()
|
|
def custom_sidecar_enc(conlang_text: str, original_text: str) -> str:
    """Append a ~<blob> sidecar holding the zlib-compressed original text."""
    blob = to_custom_b64(zlib.compress(original_text.encode("utf-8"), 9), ALPHA_MINI64)
    return f"{conlang_text} ~{blob}"


def extract_custom_sidecar(text: str) -> Optional[str]:
    """Recover the original from the LAST ~<blob> segment, else None.

    NOTE(review): '~' also prefixes Minimax OOV word escapes, so this is
    heuristic — an OOV blob that happens to zlib-inflate cleanly would be
    misread as a sidecar. A distinct sidecar marker would remove the
    ambiguity; confirm before changing the wire format.
    """
    if '~' in text:
        _, blob = text.rsplit('~', 1)
        try:
            return zlib.decompress(from_custom_b64(blob, ALPHA_MINI64)).decode("utf-8")
        except Exception:
            return None
    return None


def strip_custom_sidecar(text: str) -> str:
    """Drop the trailing ~<blob> sidecar, keeping everything before it.

    Fix: split at the LAST '~' (matching extract_custom_sidecar), not the
    first — the old version truncated the text at the first OOV escape
    (~...) even when no sidecar was attached, losing all following words.
    """
    if '~' not in text:
        return text
    return text.rsplit('~', 1)[0].rstrip()
|
|
|
|
|
|
|
|
def encode_simple(text: str, src_lang: str, target: str) -> str:
    """Word-by-word encoder used when spaCy is unavailable.

    Replaces every word matched by WORD_RE with its lexicon code, falling
    back to a reversible OOV escape; punctuation and spacing pass through.
    """
    if not text.strip():
        return ""
    use_mini = target == "Minimax-ASCII"

    def _sub_es(match):
        word = match.group(0)
        hit = (ES2MINI if use_mini else ES2KOMI).get(norm_es(word))
        if hit:
            return hit
        return enc_oov_minimax(word) if use_mini else enc_oov_komin(word)

    def _sub_en(match):
        word = match.group(0)
        table = EN2MINI if use_mini else EN2KOMI
        key = norm_en(word)
        if table and key in table:
            return table[key]
        return enc_oov_minimax(word) if use_mini else enc_oov_komin(word)

    return WORD_RE.sub(_sub_es if src_lang == "Español" else _sub_en, text)
|
|
|
|
|
def pluralize_es(word: str) -> str:
    """Very rough Spanish pluralizer for lexicon lemmas.

    Rules: small irregular table, -z -> -ces, vowel -> +s, otherwise +es.
    """
    exceptions = {"uno":"unos","buen":"buenos","hombre":"hombres"}
    if word in exceptions:
        return exceptions[word]
    if word.endswith("z"):
        return word[:-1] + "ces"
    # Fix: include "u" — words ending in any plain vowel take -s
    # (e.g. "tribu" -> "tribus", not "tribues").
    if word.endswith(("a","e","i","o","u")):
        return word + "s"
    return word + "es"
|
|
def pluralize_en(word: str) -> str:
    """Rough English pluralizer: irregulars, consonant-y -> -ies, sibilants -> -es."""
    irregular = {"man": "men", "woman": "women", "child": "children"}
    special = irregular.get(word)
    if special is not None:
        return special
    if word.endswith("y") and len(word) > 1 and word[-2] not in "aeiou":
        return word[:-1] + "ies"
    if word.endswith(("s", "sh", "ch", "x", "z")):
        return word + "es"
    return word + "s"
|
|
def pluralize(word: str, tgt_lang: str) -> str:
    """Dispatch to the language-specific pluralizer."""
    if tgt_lang == "Español":
        return pluralize_es(word)
    return pluralize_en(word)
|
|
|
|
|
# Minimax verb tokens look like "<stem>·<tail>" where the tail holds a tense
# letter (P/T/F), optionally person+number (1/2/3 + s/p), and the flags
# N (negation) and Q (question).
# Fix: the character class previously lacked "3", so any third-person tail
# (e.g. "·P3s" — the most common case) failed to match and the verb was
# decoded as an ordinary word.
mini_tail_re = re.compile(r"^(?P<stem>.+?)·(?P<tail>[PTFNQ123sp]+)$")
|
|
|
|
|
def decode_simple(text: str, source: str, tgt_lang: str) -> str:
    """Best-effort decode of a conlang string back to Spanish or English.

    Minimax verb tokens ("stem·tail") are conjugated from the tail's tense /
    person / negation / question flags; plain tokens are looked up in the
    reverse lexicons (OOV ~escapes are inflated); "[PL]"-tagged tokens are
    pluralized.

    Fixes vs. the previous version:
    - pl_flags was appended twice per token (once unconditionally, once per
      branch), misaligning the plural flags with lemma_tokens.
    - `pos` could be referenced before assignment when a verb tail did not
      start with P/T/F.
    - The Kōmín branch ignored tgt_lang and always emitted Spanish.
    - The Kōmín branch contained no-op replace("?","?") calls; replaced with
      full-width "？"/ideographic-space normalization (presumably the
      original intent — confirm against real Kōmín input).
    """
    if not text.strip(): return ""
    code2es = MINI2ES if source == "Minimax-ASCII" else KOMI2ES
    code2en = MINI2EN if source == "Minimax-ASCII" else KOMI2EN

    if source == "Kōmín-CJK":
        # Normalize full-width typography that CJK input methods produce.
        text = text.replace("？", "?").replace("\u3000", " ")
        words = []
        for w in text.split():
            if w == "?":
                continue
            w_es = code2es.get(w); w_en = code2en.get(w) if code2en else None
            words.append((w_es if tgt_lang == "Español" else (w_en or w_es)) or w)
        return " ".join(words)

    tokens = text.split()
    if not tokens: return ""
    lemma_tokens, pl_flags = [], []
    verb_idx = -1; verb_lemma = None; verb_tense = "Pres"; verb_person = "3s"; has_q = False; is_neg = False

    for part in tokens:
        look = part.replace("[PL]", ""); had_pl = "[PL]" in part
        m = mini_tail_re.match(look)
        if m:
            # Verb token: decode the stem, then parse the TAM tail.
            verb_idx = len(lemma_tokens); stem = m.group("stem"); tail = m.group("tail")
            vlem_es = code2es.get(stem); vlem_en = code2en.get(stem) if code2en else None
            vlem = vlem_es if tgt_lang == "Español" else (vlem_en or vlem_es or stem)
            if not vlem: vlem = dec_oov_minimax(stem) if is_oov_minimax(stem) else stem
            lemma_tokens.append(vlem); pl_flags.append(False)
            pos = 0
            if tail and tail[0] in "PTF":
                verb_tense = {"P": "Pres", "T": "Past", "F": "Fut"}[tail[0]]; pos = 1
            if len(tail) > pos and tail[pos] in "123":
                digit = tail[pos]; pos += 1
                if len(tail) > pos and tail[pos] in "sp":
                    verb_person = digit + tail[pos]; pos += 1
                else:
                    verb_person = digit + "s"
            is_neg = "N" in tail[pos:]; has_q = "Q" in tail[pos:]
            verb_lemma = vlem; continue
        # Ordinary token: reverse-lexicon lookup, then OOV inflation.
        w_es = code2es.get(look); w_en = code2en.get(look) if code2en else None
        w = w_es if tgt_lang == "Español" else (w_en or w_es or look)
        if not w: w = dec_oov_minimax(look) if is_oov_minimax(look) else look
        lemma_tokens.append(w); pl_flags.append(had_pl)

    out_parts = []
    for idx, lem in enumerate(lemma_tokens):
        if idx == verb_idx:
            v = _es_conj(verb_lemma, verb_tense, verb_person) if tgt_lang == "Español" else _en_conj(verb_lemma, verb_tense, verb_person)
            if is_neg: v = ("no " if tgt_lang == "Español" else "not ") + v
            out_parts.append(v)
        else:
            out_parts.append(pluralize(lem, tgt_lang) if pl_flags[idx] else lem)
    out_text = " ".join(out_parts)
    if has_q:
        start_q = "¿" if tgt_lang == "Español" else ""
        out_text = f"{start_q}{out_text.capitalize()}?"
    return out_text
|
|
|
|
|
|
|
|
def _es_conj_regular(lemma, tense, person): |
|
|
if not lemma.endswith(("ar","er","ir")): return lemma |
|
|
stem, vtype = lemma[:-2], lemma[-2:] |
|
|
pres={"ar":{"1s":"o","2s":"as","3s":"a","1p":"amos","2p":"áis","3p":"an"}, |
|
|
"er":{"1s":"o","2s":"es","3s":"e","1p":"emos","2p":"éis","3p":"en"}, |
|
|
"ir":{"1s":"o","2s":"es","3s":"e","1p":"imos","2p":"ís","3p":"en"}} |
|
|
pret={"ar":{"1s":"é","2s":"aste","3s":"ó","1p":"amos","2p":"asteis","3p":"aron"}, |
|
|
"er":{"1s":"í","2s":"iste","3s":"ió","1p":"imos","2p":"isteis","3p":"rieron"}, |
|
|
"ir":{"1s":"í","2s":"iste","3s":"ió","1p":"imos","2p":"isteis","3p":"rieron"}} |
|
|
fut={"1s":"é","2s":"ás","3s":"á","1p":"emos","2p":"éis","3p":"án"} |
|
|
if tense=="Pres": return stem + pres[vtype].get(person, pres[vtype]["3s"]) |
|
|
if tense=="Past": return stem + pret[vtype].get(person, pret[vtype]["3s"]) |
|
|
return lemma + fut.get(person, fut["3s"]) |
|
|
def _es_conj(lemma, tense, person): |
|
|
if lemma=="ser": |
|
|
tab={"Pres":{"1s":"soy","2s":"eres","3s":"es","1p":"somos","2p":"sois","3p":"son"}, |
|
|
"Past":{"1s":"fui","2s":"fuiste","3s":"fue","1p":"fuimos","2p":"fuisteis","3p":"fueron"}, |
|
|
"Fut":{"1s":"seré","2s":"serás","3s":"será","1p":"seremos","2p":"seréis","3p":"serán"}} |
|
|
return tab[tense].get(person, tab[tense]["3s"]) |
|
|
if lemma=="estar": |
|
|
tab={"Pres":{"1s":"estoy","2s":"estás","3s":"está","1p":"estamos","2p":"estáis","3p":"están"}, |
|
|
"Past":{"1s":"estuviste","2s":"estuviste","3s":"estuvo","1p":"estuvimos","2p":"estuvisteis","3p":"estuvieron"}, |
|
|
"Fut":{"1s":"estaré","2s":"estarás","3s":"estará","1p":"estaremos","2p":"estaréis","3p":"estarán"}} |
|
|
return tab[tense].get(person, tab[tense]["3s"]) |
|
|
if lemma=="ir": |
|
|
tab={"Pres":{"1s":"voy","2s":"vas","3s":"va","1p":"vamos","2p":"vais","3p":"van"}, |
|
|
"Past":{"1s":"fui","2s":"fuiste","3s":"fue","1p":"fuimos","2p":"fuisteis","3p":"fueron"}, |
|
|
"Fut":{"1s":"iré","2s":"irás","3s":"irá","1p":"iremos","2p":"iréis","3p":"irán"}} |
|
|
return tab[tense].get(person, tab[tense]["3s"]) |
|
|
return _es_conj_regular(lemma, tense, person) |
|
|
def _en_conj(lemma, tense, person): |
|
|
if lemma=="be": |
|
|
if tense=="Pres": return {"1s":"am","2s":"are","3s":"is","1p":"are","2p":"are","3p":"are"}.get(person,"is") |
|
|
if tense=="Past": return {"1s":"was","2s":"were","3s":"was","1p":"were","2p":"were","3p":"were"}.get(person,"was") |
|
|
return "be" |
|
|
if lemma=="have": |
|
|
if tense=="Pres": return "has" if person=="3s" else "have" |
|
|
if tense=="Past": return "had" |
|
|
return "have" |
|
|
if lemma=="go": |
|
|
if tense=="Past": return "went" |
|
|
return "goes" if (tense=="Pres" and person=="3s") else "go" |
|
|
if lemma=="do": |
|
|
if tense=="Past": return "did" |
|
|
return "does" if (tense=="Pres" and person=="3s") else "do" |
|
|
if tense=="Pres": |
|
|
if person=="3s": |
|
|
if lemma.endswith("y") and (len(lemma)<2 or lemma[-2] not in "aeiou"): return lemma[:-1]+"ies" |
|
|
if lemma.endswith(("s","sh","ch","x","z","o")): return lemma+"es" |
|
|
return lemma+"s" |
|
|
return lemma |
|
|
if tense=="Past": |
|
|
if lemma.endswith("e"): return lemma+"d" |
|
|
if lemma.endswith("y") and (len(lemma)<2 or lemma[-2] not in "aeiou"): return lemma[:-1]+"ied" |
|
|
return lemma+"ed" |
|
|
return lemma |
|
|
|
|
|
|
|
|
def _build_with_spacy(text: str, src_lang: str, target: str, drop_articles: bool, zero_copula: bool, semi_lossless: bool, remove_pronouns: bool) -> str:
    """Parse with the source-language pipeline and realize into the target conlang."""
    pipeline = nlp_es if src_lang == "Español" else nlp_en
    parsed = pipeline(text)
    realize = realize_minimax if target == "Minimax-ASCII" else realize_komin
    return realize(parsed, src_lang, drop_articles, zero_copula, semi_lossless, remove_pronouns=remove_pronouns)
|
|
|
|
|
def build_sentence(text: str, src_lang: str, target: str, drop_articles: bool, zero_copula: bool, mode: str, max_comp_exact: bool = False, remove_pronouns: bool = False) -> str:
    """Encode an ES/EN sentence into the target conlang.

    NOTE(review): `semi` is hard-wired to True, so `zero_copula and not semi`
    is always False (the checkbox is neutralized) and `mode` is never read —
    presumably semi-lossless output was forced on purpose; confirm before
    relying on either parameter.
    """
    if not text.strip(): return ""
    semi = True
    # Parse-based path when spaCy models are loaded, naive word-by-word otherwise.
    core = _build_with_spacy(text, src_lang, target, drop_articles, zero_copula and not semi, semi, remove_pronouns) if USE_SPACY else encode_simple(text, src_lang, target)
    if max_comp_exact:
        # Attach the compressed original so decoding can be bit-exact.
        return custom_sidecar_enc(core, text)
    return core
|
|
|
|
|
def universal_translate(text: str, src: str, tgt: str, drop_articles: bool, zero_copula: bool, mode: str, max_comp_exact: bool = False, remove_pronouns: bool = False) -> str:
    """Route a translation between any two of ES / EN / Minimax / Kōmín.

    Four routes: natural→conlang (encode), conlang→natural (sidecar first,
    dictionary fallback), natural→natural (passthrough), conlang→conlang
    (pivot through Spanish lemmas). Unknown pairs yield "[No soportado]".
    """
    if not text.strip(): return ""
    if src == tgt: return text

    # Natural language -> conlang.
    if src in ("Español","English") and tgt in ("Minimax-ASCII","Kōmín-CJK"):
        return build_sentence(text, src, tgt, drop_articles, zero_copula, mode, max_comp_exact, remove_pronouns)

    # Conlang -> natural language: an exact sidecar (either format) wins.
    if src in ("Minimax-ASCII","Kōmín-CJK") and tgt in ("Español","English"):
        orig = extract_custom_sidecar(text)
        if orig is not None: return orig
        orig = extract_sidecar_b85(text)
        if orig is not None: return orig
        # No sidecar: best-effort dictionary reconstruction.
        return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)

    # ES <-> EN: no bilingual MT implemented, pass the text through.
    if src in ("Español","English") and tgt in ("Español","English"):
        return text

    # Conlang -> conlang: decode to Spanish lemmas, then re-encode.
    if src in ("Minimax-ASCII","Kōmín-CJK") and tgt in ("Minimax-ASCII","Kōmín-CJK"):
        core = strip_custom_sidecar(text)
        es_lemmas = decode_simple(core, src, "Español")
        # Keep punctuation runs as separate tokens so they survive re-encoding.
        words = re.findall(r"\w+|[^\w\s]+", es_lemmas)
        out=[]
        for w in words:
            if re.fullmatch(r"\w+", w):
                code = ES2MINI.get(norm_es(w)) if tgt=="Minimax-ASCII" else ES2KOMI.get(norm_es(w))
                if not code:
                    code = enc_oov_minimax(w) if tgt=="Minimax-ASCII" else enc_oov_komin(w)
                out.append(code)
            else:
                out.append(w)
        out_text = " ".join(out)
        # Carry an existing exact sidecar over to the re-encoded text.
        if extract_custom_sidecar(text) is not None:
            return custom_sidecar_enc(out_text, extract_custom_sidecar(text) or "")
        return out_text

    return "[No soportado]"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Languages selectable in the UI dropdowns (two natural, two conlangs).
ALL_LANGS = ["Español","English","Minimax-ASCII","Kōmín-CJK"]
|
|
|
|
|
|
|
|
EXPLAIN_TAB_TRANSLATE_ES = """ |
|
|
**¿Qué hace “Traducir”?** |
|
|
Convierte lo que escribes en **Texto** al **Destino** que elijas (ES/EN/Minimax/Kōmín). |
|
|
- Con **Máx. Compresión Exacta**, añade un final ~... con el **original comprimido** para recuperarlo tal cual al decodificar. |
|
|
- Las casillas de **compactación** (artículos, cópula, pronombres) **sólo se aplican si el Destino es conlang**. |
|
|
""" |
|
|
EXPLAIN_TAB_BUILD_ES = """ |
|
|
**¿Qué hace “Construir (ES/EN → Conlang)”?** |
|
|
Obliga a que la salida sea **Minimax** o **Kōmín** (desde ES/EN). Aplica el orden y las partículas del conlang y las opciones de **compactación**. |
|
|
""" |
|
|
EXPLAIN_TAB_DECODE_ES = """ |
|
|
**¿Qué hace “Decodificar (Conlang → ES/EN)”?** |
|
|
Convierte de **Minimax/Kōmín** a **Español/Inglés**. |
|
|
- Si el texto trae ~..., devolvemos el **original exacto**. |
|
|
- Si no, reconstruimos lo más fiel posible con el **diccionario**. |
|
|
""" |
|
|
EXPLAIN_TAB_ROUNDTRIP_ES = """ |
|
|
**¿Qué hace “Prueba ida→vuelta”?** |
|
|
Hace el camino **(ES/EN → Conlang) → (Conlang → ES/EN)** para comprobar la **reversibilidad**. |
|
|
Con **exacta**, la vuelta coincide **bit a bit**. |
|
|
""" |
|
|
EXPLAIN_CHECKBOX_ES = """ |
|
|
**Opciones de compactación (para conlang):** |
|
|
- **Omitir artículos** (*el/la/los/las*; *a/an/the*): ahorro típico **~10–15%**. |
|
|
- **Cópula cero** (presente afirmativo): oculta *ser/estar/be* → **~5–10%** extra. |
|
|
- **Quitar pronombres**: suprime pronombres obvios → ahorro **variable**. |
|
|
- **Máx. Compresión Exacta**: añade ~... para recuperar el original (en >100 caracteres, **~40–60%**; en textos muy cortos puede no reducir). |
|
|
**Guía rápida:** sin casillas **0%**; artículos+cópula **~15–20%**. |
|
|
""" |
|
|
|
|
|
EXPLAIN_CONLANGS_ES = """ |
|
|
### ¿Qué son Minimax-ASCII y Kōmín-CJK? |
|
|
|
|
|
Piensa en **dos “idiomas comprimidos”** que sirven para escribir frases de ES/EN con menos caracteres y, además, |
|
|
**poder volver al original**. Son como “zip para texto”, pero legibles. |
|
|
|
|
|
--- |
|
|
|
|
|
#### 1) Minimax-ASCII (compacto y tecleable) |
|
|
- Usa **sólo ASCII**, así que funciona en cualquier sitio (correo, móvil, código). |
|
|
- Cada **palabra** se cambia por un **código corto** (por frecuencia, lo común es más corto). |
|
|
- Los **verbos** llevan una colita con marcas: |
|
|
- **·P / ·T / ·F** → Presente / Pasado / Futuro |
|
|
- **1s, 2p, 3s…** → Persona y número (1=yo/nosotros, 2=tú/vosotros, 3=él/ellos; s=singular, p=plural) |
|
|
- **N** → negación; **Q** → pregunta |
|
|
- **Ejemplo**: “**¿Estás bien?**” → `k·P2sQ` (estar, Presente, 2ª persona, pregunta) |
|
|
|
|
|
**Cuándo usarlo**: si quieres **máxima compatibilidad** y **tamaño pequeño** sin símbolos raros. |
|
|
|
|
|
--- |
|
|
|
|
|
#### 2) Kōmín-CJK (visual y ultracorto) |
|
|
- Usa ideogramas CJK para **aún más compresión** y un aspecto muy limpio. |
|
|
- Añade **partículas**: |
|
|
- `ᵖ` marca el **sujeto**, `ᵒ` marca el **objeto**. |
|
|
- El verbo lleva un **círculo de tiempo**: |
|
|
- **Ⓟ / Ⓣ / Ⓕ** → Presente / Pasado / Futuro |
|
|
- Las **preguntas** suelen acabar en **?**. |
|
|
- **Ejemplo**: “**Los estudiantes leen libros.**” → `学生ᵖ 书ᵒ 读Ⓟ` |
|
|
|
|
|
**Cuándo usarlo**: si buscas **máxima compresión** y no te importa usar caracteres CJK. |
|
|
|
|
|
--- |
|
|
|
|
|
#### ¿Y si falta una palabra? |
|
|
- Si una palabra no está en el diccionario, se guarda **de forma reversible**: |
|
|
- En **Minimax**: `~A9f...` (base64 propio). |
|
|
- En **Kōmín**: `「...」`. |
|
|
Así **no se pierde nada**. |
|
|
|
|
|
#### “Compresión exacta” (el `~...`) |
|
|
- Opcionalmente se añade un **sidecar** `~...` con el **original comprimido**. |
|
|
- Si existe, al decodificar se recupera el **original al 100%** (puntuación, mayúsculas, etc.). |
|
|
- En textos largos ahorra mucho, con **ida/vuelta perfecta**. |
|
|
|
|
|
--- |
|
|
|
|
|
#### Mini-glosario |
|
|
- **Código**: forma corta de una palabra (p. ej., `g` para “que”). |
|
|
- **Partícula**: marca de función (sujeto `ᵖ`, objeto `ᵒ`). |
|
|
- **Cola verbal** (Minimax): `·P/·T/·F`, persona (`1s`, `3p`), `N`, `Q`. |
|
|
- **Sidecar**: `~...` con el original comprimido para **reconstruir exacto**. |
|
|
|
|
|
> Resumen: Minimax-ASCII = **universal y tecleable**. Kōmín-CJK = **más corto y visual**. Ambos son **reversibles** y aceptan **sidecar exacto**. |
|
|
""" |
|
|
|
|
|
|
|
|
EXPLAIN_TAB_TRANSLATE_EN = "Converts **Text → Target** (ES/EN/Minimax/Kōmín). With **Max Exact**, adds ~... to recover the **exact original**. Compaction checkboxes apply only when **Target is conlang**." |
|
|
EXPLAIN_TAB_BUILD_EN = "Forces **conlang output** (Minimax/Kōmín) from ES/EN, applying phrasing rules and compaction options." |
|
|
EXPLAIN_TAB_DECODE_EN = "Converts **Minimax/Kōmín → ES/EN**. If ~... exists, returns the bit-perfect original; else semi-lossless." |
|
|
EXPLAIN_TAB_ROUNDTRIP_EN = "Runs **(ES/EN→Conlang)→(Conlang→ES/EN)** to verify reversibility; with exact, it’s bit-for-bit." |
|
|
EXPLAIN_CHECKBOX_EN = "Drop articles ~10–15%, Zero copula ~5–10% extra, Remove pronouns variable, Max Exact 40–60% for >100 chars." |
|
|
EXPLAIN_CONLANGS_EN = """ |
|
|
### What are Minimax-ASCII and Kōmín-CJK? |
|
|
|
|
|
Think of **two “compressed languages”** that let you write ES/EN sentences with fewer characters while you can still |
|
|
**recover the original**. Like a human-readable “zip” for text. |
|
|
|
|
|
--- |
|
|
|
|
|
#### 1) Minimax-ASCII (compact & typeable) |
|
|
- Uses **ASCII only**, so it works everywhere (email, phones, code editors). |
|
|
- Each **word** becomes a **short code** (high-frequency words get the shortest codes). |
|
|
- **Verbs** get a small **tail**: |
|
|
- **·P / ·T / ·F** → Present / Past / Future |
|
|
- **1s, 2p, 3s…** → Person & number (1=I/we, 2=you, 3=he/they; s=singular, p=plural) |
|
|
- **N** → negation; **Q** → question |
|
|
- **Example**: “**Are you okay?**” → `k·P2sQ` (be, Present, 2nd person, question) |
|
|
|
|
|
**When to use**: you want **maximum compatibility** and **small size** without special symbols. |
|
|
|
|
|
--- |
|
|
|
|
|
#### 2) Kōmín-CJK (visual & ultra-short) |
|
|
- Uses CJK ideograms for **even tighter compression** and a clean visual look. |
|
|
- Adds **particles**: |
|
|
- `ᵖ` marks the **subject**, `ᵒ` marks the **object**. |
|
|
- Verb shows a **time bubble**: |
|
|
- **Ⓟ / Ⓣ / Ⓕ** → Present / Past / Future |
|
|
- **Questions** usually end with **?**. |
|
|
- **Example**: “**Students read books.**” → `学生ᵖ 书ᵒ 读Ⓟ` |
|
|
|
|
|
**When to use**: you want **maximum compression** and you’re fine with CJK. |
|
|
|
|
|
--- |
|
|
|
|
|
#### Unknown words? |
|
|
- If a word isn’t in the lexicon, it’s kept **reversibly**: |
|
|
- In **Minimax**: `~A9f...` (custom base64). |
|
|
- In **Kōmín**: `「...」`. |
|
|
Nothing is lost. |
|
|
|
|
|
#### “Exact compression” (the `~...` sidecar) |
|
|
- Optionally appends `~...` with the **compressed original**. |
|
|
- If present, decoding reproduces the **exact original** (punctuation, casing, etc.). |
|
|
- Great for longer texts: big savings with **perfect round-trip**. |
|
|
|
|
|
--- |
|
|
|
|
|
#### Tiny glossary |
|
|
- **Code**: short form for a word (e.g., `g` for “that/que”). |
|
|
- **Particle**: role marker (subject `ᵖ`, object `ᵒ`). |
|
|
- **Verb tail** (Minimax): `·P/·T/·F`, person (`1s`, `3p`), `N`, `Q`. |
|
|
- **Sidecar**: `~...` holding the compressed original for **bit-perfect recovery**. |
|
|
|
|
|
> TL;DR: Minimax-ASCII = **universal & typeable**. Kōmín-CJK = **shortest & visual**. Both are **reversible** and support the **exact sidecar**. |
|
|
""" |
|
|
|
|
|
|
|
|
LEXICON_FRIENDLY_ES = """ |
|
|
**¿De dónde sale el “diccionario” (léxico) y para qué sirve?** |
|
|
- Usamos **WordNet (OMW)** para listar palabras españolas y sus equivalentes en inglés. |
|
|
- Limpiamos y ordenamos por **frecuencia de uso**. |
|
|
- Asignamos un **código corto** a cada lema para **Minimax** y para **Kōmín**. |
|
|
- Guardamos tres archivos que la app usa al traducir: |
|
|
- lexicon_minimax.json (ES → Minimax) |
|
|
- lexicon_komin.json (ES → Kōmín) |
|
|
- lexicon_master.json (ES + EN + ambos códigos) |
|
|
**Así** podemos convertir tus frases en **códigos compactos** y volver a texto entendible. |
|
|
""" |
|
|
LEXICON_FRIENDLY_EN = "We use **WordNet (OMW)**, pair ES words with EN, sort by frequency, assign short codes (Minimax/Kōmín), and save three JSONs so the app can encode/decode compactly." |
|
|
|
|
|
|
|
|
def _pct_comp(original: str, result: str) -> float: |
|
|
if not original: return 0.0 |
|
|
return max(0.0, 100.0 * (1.0 - (len(result) / len(original)))) |
|
|
|
|
|
def compaction_line_es(text, src, tgt, drop, zero, rm, maxc) -> str:
    """Spanish status line comparing baseline vs. chosen compaction savings."""
    if not text.strip():
        return "—"
    if tgt not in ("Minimax-ASCII", "Kōmín-CJK"):
        return "La compactación aplica cuando el **Destino** es Minimax/Kōmín."
    baseline = build_sentence(text, src, tgt, False, False, "Semi-lossless", False, False)
    chosen = build_sentence(text, src, tgt, drop, zero, "Semi-lossless", False, rm)
    msg = f"**Base (sin casillas):** {_pct_comp(text, baseline):.1f}% · **Con tus opciones:** {_pct_comp(text, chosen):.1f}%"
    if maxc:
        exact = build_sentence(text, src, tgt, drop, zero, "Semi-lossless", True, rm)
        msg += f" · **Con sidecar ~...:** {_pct_comp(text, exact):.1f}%"
    return msg
|
|
|
|
|
def compaction_line_en(text, src, tgt, drop, zero, rm, maxc) -> str:
    """Build the English compaction-report Markdown line.

    Mirrors compaction_line_es: baseline vs current options, plus the
    exact-sidecar figure when *maxc* is set. Conlang targets only.
    """
    if not text.strip():
        return "—"
    if tgt not in ("Minimax-ASCII", "Kōmín-CJK"):
        return "Compaction applies when **Target** is Minimax/Kōmín."
    # Baseline: every toggle off; current: the user's toggles (no sidecar yet).
    base = build_sentence(text, src, tgt, False, False, "Semi-lossless", False, False)
    curr = build_sentence(text, src, tgt, drop, zero, "Semi-lossless", False, rm)
    parts = [
        f"**Base (no options):** {_pct_comp(text, base):.1f}%",
        f"**With your options:** {_pct_comp(text, curr):.1f}%",
    ]
    if maxc:
        # Re-encode with the exact-recovery sidecar appended.
        curr_exact = build_sentence(text, src, tgt, drop, zero, "Semi-lossless", True, rm)
        parts.append(f"**With ~... sidecar:** {_pct_comp(text, curr_exact):.1f}%")
    return " · ".join(parts)
|
|
|
|
|
def master_preview(n: int = 20) -> List[List[Any]]:
    """Return the first *n* master-lexicon entries as table rows.

    The first row is always the column header. Any failure (missing or
    malformed MASTER_OBJ, bad *n*) falls back to a single "(no data)" row —
    deliberate best-effort for the UI preview.
    """
    cols = ("lemma_es", "lemma_en", "minimax", "komin")
    try:
        entries = (MASTER_OBJ or {}).get("entries", [])
        limit = max(0, int(n))
        table = [list(cols)]
        for entry in entries[:limit]:
            table.append([entry.get(c, "") for c in cols])
        return table
    except Exception:
        return [list(cols), ["(no data)", "", "", ""]]
|
|
|
|
|
|
|
|
def make_panel_translate(lang="ES"):
    """Build the Translate workspace panel and return its gr.Group.

    lang ("ES" or "EN") only selects the label language; the translation
    logic is identical. This panel is the one visible by default.
    """
    with gr.Group(visible=True) as g:
        with gr.Accordion(("🔁 Traducir — ayuda" if lang=="ES" else "🔁 Translate — help"), open=False):
            gr.Markdown(EXPLAIN_TAB_TRANSLATE_ES if lang=="ES" else EXPLAIN_TAB_TRANSLATE_EN)
        with gr.Row():
            src = gr.Dropdown(ALL_LANGS, value=("Español" if lang=="ES" else "English"), label=("Fuente" if lang=="ES" else "Source"))
            tgt = gr.Dropdown(ALL_LANGS, value="Minimax-ASCII", label=("Destino" if lang=="ES" else "Target"))
        text = gr.Textbox(lines=3, label=("Texto" if lang=="ES" else "Text"), placeholder=("Ej.: Hola, ¿cómo estás?" if lang=="ES" else "e.g., Hello, how are you?"), show_copy_button=True)
        with gr.Row():
            drop = gr.Checkbox(True, label=("Omitir artículos (ES/EN → conlang)" if lang=="ES" else "Drop articles (ES/EN → conlang)"))
            zero = gr.Checkbox(False, label=("Cópula cero (presente afirm.)" if lang=="ES" else "Zero copula (present affirmative)"))
            rmpr = gr.Checkbox(False, label=("Quitar pronombres" if lang=="ES" else "Remove pronouns"))
            exact = gr.Checkbox(False, label=("Máx. Compresión Exacta (sidecar ~...)" if lang=="ES" else "Max Exact Compression (sidecar ~...)"))
        # Translation mode is fixed; kept as a hidden dropdown so it can be
        # passed positionally as an event input like the visible controls.
        mode_hidden = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
        out = gr.Textbox(lines=6, label=("Traducción" if lang=="ES" else "Translation"), show_copy_button=True)
        comp = gr.Markdown("")
        def run(text, s, t, d, z, m, e, r):
            # Returns (translation, compaction report); empty input clears both.
            if not text.strip(): return "", ""
            res = universal_translate(text, s, t, d, z, m, e, r)
            rep = (compaction_line_es if lang=="ES" else compaction_line_en)(text, s, t, d, z, r, e)
            return res, rep
        # Live translation: re-run on any change to an interactive control.
        for c in [text, src, tgt, drop, zero, rmpr, exact]:
            c.change(run, [text, src, tgt, drop, zero, mode_hidden, exact, rmpr], [out, comp])
    return g
|
|
|
|
|
def make_panel_build(lang="ES"):
    """Build the 'Construir' panel: ES/EN input forced into conlang output.

    Hidden until the mode radio selects Build. lang ("ES"/"EN") only picks
    the label language.
    """
    with gr.Group(visible=False) as g:
        with gr.Accordion(("🛠️ Construir — ayuda" if lang=="ES" else "🛠️ Build — help"), open=False):
            gr.Markdown(EXPLAIN_TAB_BUILD_ES if lang=="ES" else EXPLAIN_TAB_BUILD_EN)
        with gr.Row():
            src = gr.Dropdown(["Español","English"], value=("Español" if lang=="ES" else "English"), label=("Fuente" if lang=="ES" else "Source"))
            tgt = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
        text = gr.Textbox(lines=3, label=("Frase" if lang=="ES" else "Sentence"), show_copy_button=True)
        with gr.Row():
            drop = gr.Checkbox(True, label=("Omitir artículos" if lang=="ES" else "Drop articles"))
            zero = gr.Checkbox(False, label=("Cópula cero (presente afirm.)" if lang=="ES" else "Zero copula (present affirmative)"))
            rmpr = gr.Checkbox(False, label=("Quitar pronombres" if lang=="ES" else "Remove pronouns"))
            exact = gr.Checkbox(False, label=("Máx. Compresión Exacta" if lang=="ES" else "Max Exact Compression"))
        # Fixed translation mode, hidden so it can be passed as an event input.
        mode_hidden = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
        out = gr.Textbox(lines=6, label=("Salida" if lang=="ES" else "Output"), show_copy_button=True)
        comp = gr.Markdown("")
        def run(text, s, t, d, z, m, e, r):
            # Returns (conlang output, compaction report); empty input clears both.
            if not text.strip(): return "", ""
            res = build_sentence(text, s, t, d, z, m, e, r)
            rep = (compaction_line_es if lang=="ES" else compaction_line_en)(text, s, t, d, z, r, e)
            return res, rep
        # Live update on any change to an interactive control.
        for c in [text, src, tgt, drop, zero, rmpr, exact]:
            c.change(run, [text, src, tgt, drop, zero, mode_hidden, exact, rmpr], [out, comp])
    return g
|
|
|
|
|
def make_panel_decode(lang="ES"):
    """Build the 'Decodificar' panel: conlang text back to ES/EN.

    If the input carries a ~... sidecar, the exact original text is
    recovered instead of a lossy decode. Hidden until Decode mode is
    selected; lang only picks the label language.
    """
    with gr.Group(visible=False) as g:
        with gr.Accordion(("🗝️ Decodificar — ayuda" if lang=="ES" else "🗝️ Decode — help"), open=False):
            gr.Markdown(EXPLAIN_TAB_DECODE_ES if lang=="ES" else EXPLAIN_TAB_DECODE_EN)
        with gr.Row():
            src = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label=("Fuente" if lang=="ES" else "Source"))
            tgt = gr.Dropdown(["Español","English"], value=("Español" if lang=="ES" else "English"), label=("Destino" if lang=="ES" else "Target"))
        text = gr.Textbox(lines=3, label=("Texto en conlang (puede incluir ~...)" if lang=="ES" else "Conlang text (may include ~...)"), show_copy_button=True)
        out = gr.Textbox(lines=6, label=("Salida" if lang=="ES" else "Output"), show_copy_button=True)
        def run(t, s, d):
            if not t.strip(): return ""
            # Prefer bit-perfect recovery from either sidecar format, if present.
            orig = extract_custom_sidecar(t)
            if orig is not None: return orig
            orig = extract_sidecar_b85(t)
            if orig is not None: return orig
            # Otherwise strip any sidecar remnants and decode lossily.
            return decode_simple(strip_custom_sidecar(strip_sidecar_b85(t)), s, d)
        # Live decode on any change to an interactive control.
        for c in [text, src, tgt]:
            c.change(run, [text, src, tgt], [out])
    return g
|
|
|
|
|
def make_panel_roundtrip(lang="ES"):
    """Build the round-trip panel: source → conlang → back again.

    Lets the user eyeball reversibility. Hidden until Round-trip mode is
    selected; lang only picks the label language.
    """
    with gr.Group(visible=False) as g:
        with gr.Accordion(("🔄 Prueba ida→vuelta — ayuda" if lang=="ES" else "🔄 Round-trip — help"), open=False):
            gr.Markdown(EXPLAIN_TAB_ROUNDTRIP_ES if lang=="ES" else EXPLAIN_TAB_ROUNDTRIP_EN)
        with gr.Row():
            src = gr.Dropdown(["Español","English"], value=("Español" if lang=="ES" else "English"), label=("Fuente" if lang=="ES" else "Source"))
            tgt = gr.Dropdown(["Minimax-ASCII","Kōmín-CJK"], value="Minimax-ASCII", label="Conlang")
        text = gr.Textbox(lines=3, label=("Frase" if lang=="ES" else "Sentence"), show_copy_button=True)
        exact = gr.Checkbox(False, label=("Máx. Compresión Exacta" if lang=="ES" else "Max Exact Compression"))
        # Fixed translation mode, hidden so it can be passed as an event input.
        mode_hidden = gr.Dropdown(["Semi-lossless"], value="Semi-lossless", visible=False)
        out1 = gr.Textbox(lines=3, label=("Conlang (ida)" if lang=="ES" else "Outward (conlang)"), show_copy_button=True)
        out2 = gr.Textbox(lines=3, label=("Vuelta" if lang=="ES" else "Back"), show_copy_button=True)
        def run(t, s, c, m, e):
            # Translate forward to the conlang, then back, with fixed options
            # (articles dropped, zero-copula off, pronouns kept).
            if not t.strip(): return "", ""
            conlang = universal_translate(t, s, c, True, False, m, e, False)
            back = universal_translate(conlang, c, s, True, False, m, e, False)
            return conlang, back
        # Live update on any change to an interactive control.
        for c in [text, src, tgt, exact]:
            c.change(run, [text, src, tgt, mode_hidden, exact], [out1, out2])
    return g
|
|
|
|
|
|
|
|
with gr.Blocks(title="Universal Conlang Translator", theme=gr.themes.Soft()) as demo:
    # ---- Dictionary downloads (PDF) ----
    gr.Markdown("### 📥 Diccionarios (PDF)")
    # Scoped CSS: shrink the two download buttons slightly.
    gr.HTML("""
    <style>
    #btn_pdf_es button, #btn_pdf_en button {
    font-size: 0.85rem;
    padding: 0.4rem 0.8rem;
    }
    </style>
    """)
    with gr.Row():
        gr.DownloadButton(
            label="⬇️ Español >> Minimax-ASCII y Kōmín-CJK (PDF)",
            value="dictionary_ES_to_Minimax_Komin.pdf",
            elem_id="btn_pdf_es",
            variant="secondary"
        )
        gr.DownloadButton(
            label="⬇️ English >> Minimax-ASCII y Kōmín-CJK (PDF)",
            value="dictionary_EN_to_Minimax_Komin.pdf",
            elem_id="btn_pdf_en",
            variant="secondary"
        )

    # ---- UI language selector ----
    gr.Markdown("## 🌍 Idioma / Language")
    lang = gr.Radio(["ES","EN"], value="ES", label="Selecciona / Select")

    # ---- Static help accordions (each exists in an ES and an EN copy;
    # visibility is toggled by _on_lang_or_mode below) ----
    acc_conlangs_es = gr.Accordion("🧩 ¿Qué son Minimax-ASCII y Kōmín-CJK? (ES)", open=False, visible=True)
    with acc_conlangs_es: gr.Markdown(EXPLAIN_CONLANGS_ES)
    acc_conlangs_en = gr.Accordion("🧩 What are Minimax-ASCII and Kōmín-CJK? (EN)", open=False, visible=False)
    with acc_conlangs_en: gr.Markdown(EXPLAIN_CONLANGS_EN)

    acc_modes_es = gr.Accordion("📖 ¿Qué hace cada botón / modo? (ES)", open=False, visible=True)
    with acc_modes_es: gr.Markdown(
        "- **🔁 Traducir**: Texto → Destino (ES/EN/Minimax/Kōmín), con opciones de compactación y % mostrado.\n"
        "- **🛠️ Construir**: Obliga salida en conlang (Minimax/Kōmín) desde ES/EN.\n"
        "- **🗝️ Decodificar**: Conlang → ES/EN (si hay ~..., devuelve el original exacto).\n"
        "- **🔄 Prueba ida→vuelta**: Comprueba reversibilidad."
    )
    acc_modes_en = gr.Accordion("📖 What does each button / mode do? (EN)", open=False, visible=False)
    with acc_modes_en: gr.Markdown(
        "- **🔁 Translate**: Text → Target (ES/EN/Minimax/Kōmín) with compaction and %.\n"
        "- **🛠️ Build**: Force conlang output from ES/EN.\n"
        "- **🗝️ Decode**: Conlang → ES/EN (if ~..., exact original).\n"
        "- **🔄 Round-trip**: Check reversibility."
    )

    acc_intro_es = gr.Accordion("☑️ Opciones y compactación — guía rápida (ES)", open=False, visible=True)
    with acc_intro_es: gr.Markdown(EXPLAIN_CHECKBOX_ES)
    acc_intro_en = gr.Accordion("☑️ Options & compaction — quick guide (EN)", open=False, visible=False)
    with acc_intro_en: gr.Markdown(EXPLAIN_CHECKBOX_EN)

    # ---- Lexicon explainer + live preview of the master lexicon ----
    acc_lex_es = gr.Accordion("ℹ️ Léxico — explicación y vista previa (ES)", open=False, visible=True)
    with acc_lex_es:
        gr.Markdown(LEXICON_FRIENDLY_ES)
        n_rows_es = gr.Slider(5, 100, value=20, step=5, label="Filas a mostrar")
        table_es = gr.Dataframe(headers=["lemma_es","lemma_en","minimax","komin"], row_count=1, interactive=False)
        gr.Button("Actualizar vista").click(lambda n: master_preview(int(n)), [n_rows_es], [table_es])

    acc_lex_en = gr.Accordion("ℹ️ Lexicon — explainer & preview (EN)", open=False, visible=False)
    with acc_lex_en:
        gr.Markdown(LEXICON_FRIENDLY_EN)
        n_rows_en = gr.Slider(5, 100, value=20, step=5, label="Rows to show")
        table_en = gr.Dataframe(headers=["lemma_es","lemma_en","minimax","komin"], row_count=1, interactive=False)
        gr.Button("Refresh").click(lambda n: master_preview(int(n)), [n_rows_en], [table_en])

    # ---- Mode selector: which workspace panel is shown ----
    # Single source of truth for the mode labels (previously duplicated
    # between the Radio and _mode_to_flags).
    MODE_CHOICES = [
        "🔁 Traducir / Translate",
        "🛠️ Construir (ES/EN → Conlang) / Build",
        "🗝️ Decodificar (Conlang → ES/EN) / Decode",
        "🔄 Prueba ida→vuelta / Round-trip",
    ]
    gr.Markdown("### 🧭 Modo de uso (elige uno)")
    mode = gr.Radio(
        choices=MODE_CHOICES,
        value=MODE_CHOICES[0],
        label=None,
    )

    # ---- Workspace: one panel per mode, duplicated per UI language ----
    gr.Markdown("### 🧪 Área de trabajo")
    panel_tr_es = make_panel_translate("ES"); panel_bu_es = make_panel_build("ES")
    panel_de_es = make_panel_decode("ES"); panel_rt_es = make_panel_roundtrip("ES")
    panel_tr_en = make_panel_translate("EN"); panel_bu_en = make_panel_build("EN")
    panel_de_en = make_panel_decode("EN"); panel_rt_en = make_panel_roundtrip("EN")

    def _vis(yes):
        """Gradio visibility update for a single component."""
        return gr.update(visible=bool(yes))

    def _mode_to_flags(mode_str):
        """Map a mode label to one-hot booleans (translate, build, decode, round-trip).

        Unknown labels fall back to the first mode (Translate).
        """
        chosen = mode_str if mode_str in MODE_CHOICES else MODE_CHOICES[0]
        return [chosen == o for o in MODE_CHOICES], chosen

    def switch_everything(lang_code, tr, bu, de, rt):
        """Compute visibility updates for all accordions and panels.

        Exactly one of tr/bu/de/rt wins (Translate is the fallback), and
        only the components of the selected UI language are shown. The
        returned list order must match _SWITCH_OUTPUTS below.
        """
        tr2, bu2, de2, rt2 = False, False, False, False
        if tr or (not bu and not de and not rt): tr2 = True
        elif bu: bu2 = True
        elif de: de2 = True
        else: rt2 = True

        is_en = (lang_code == "EN")
        vis_es = not is_en; vis_en = is_en
        # Eight accordion updates (four ES/EN pairs)...
        updates = [
            _vis(vis_es), _vis(vis_en),
            _vis(vis_es), _vis(vis_en),
            _vis(vis_es), _vis(vis_en),
            _vis(vis_es), _vis(vis_en),
        ]
        # ...then eight panel updates (ES quartet, EN quartet).
        updates += [
            _vis(vis_es and tr2), _vis(vis_es and bu2), _vis(vis_es and de2), _vis(vis_es and rt2),
            _vis(vis_en and tr2), _vis(vis_en and bu2), _vis(vis_en and de2), _vis(vis_en and rt2),
        ]
        return updates

    def _on_lang_or_mode(lang_code, mode_str):
        """Shared handler for lang.change, mode.change and demo.load."""
        flags, chosen = _mode_to_flags(mode_str)
        tr, bu, de, rt = flags
        return switch_everything(lang_code, tr, bu, de, rt)

    # Shared output list for the three registrations below (was pasted
    # verbatim three times). Order must match switch_everything's returns.
    _SWITCH_OUTPUTS = [
        acc_conlangs_es, acc_conlangs_en,
        acc_modes_es, acc_modes_en,
        acc_intro_es, acc_intro_en,
        acc_lex_es, acc_lex_en,
        panel_tr_es, panel_bu_es, panel_de_es, panel_rt_es,
        panel_tr_en, panel_bu_en, panel_de_en, panel_rt_en,
    ]
    lang.change(_on_lang_or_mode, [lang, mode], _SWITCH_OUTPUTS)
    mode.change(_on_lang_or_mode, [lang, mode], _SWITCH_OUTPUTS)
    # Apply the initial visibility state once the page loads.
    demo.load(_on_lang_or_mode, [lang, mode], _SWITCH_OUTPUTS)
|
|
# Script entry point: start the Gradio server when run directly.
if __name__ == "__main__":
    demo.launch()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|