Update app.py
Browse files
app.py
CHANGED
|
@@ -119,7 +119,7 @@ def dec_oov_komin(code: str) -> str:
|
|
| 119 |
def is_oov_minimax(code: str) -> bool:
    """Return True when *code* is an out-of-vocabulary Minimax-ASCII token.

    OOV tokens are marked with a leading "~" followed by at least one
    payload character; a bare "~" is not a valid token.
    """
    if not code.startswith("~"):
        return False
    return len(code) > 1
|
| 121 |
def is_oov_komin(code: str) -> bool:
    """Return True when *code* is an out-of-vocabulary Komin token.

    Komin OOV tokens are fully bracketed as 「payload」.  The original check
    only tested the opening bracket, so an unterminated fragment like "「x"
    was misclassified as a valid OOV code; require the closing bracket too.
    """
    return len(code) >= 2 and code.startswith("「") and code.endswith("」")
|
| 123 |
|
| 124 |
# ------------ spaCy opcional ------------
|
| 125 |
USE_SPACY = False
|
|
@@ -185,7 +185,7 @@ def extract_core(doc):
|
|
| 185 |
objs.append(t)
|
| 186 |
elif t.dep_ in ("obl","pobj"):
|
| 187 |
obls.append(t)
|
| 188 |
-
elif t.dep_ in ("advmod","advcl")
|
| 189 |
advs.append(t)
|
| 190 |
subs.sort(key=lambda x: x.i); objs.sort(key=lambda x: x.i)
|
| 191 |
obls.sort(key=lambda x: x.i); advs.sort(key=lambda x: x.i)
|
|
@@ -196,7 +196,7 @@ def _person_of_doc(doc, src_lang: str) -> Optional[str]:
|
|
| 196 |
root = next((t for t in doc if t.dep_=="ROOT"), doc[0])
|
| 197 |
subj = next((t for t in root.children if t.dep_.startswith("nsubj")), None)
|
| 198 |
if subj is None: return None
|
| 199 |
-
plur = ("Number=Plur" in str(subj.morph)) if src_lang=="Español" else (subj.tag_ in ("
|
| 200 |
low = subj.lower_
|
| 201 |
if src_lang=="Español":
|
| 202 |
if low in ("yo",): return "1p" if plur else "1s"
|
|
@@ -263,7 +263,7 @@ def realize_minimax(doc, src_lang: str, drop_articles=True, zero_copula=True, se
|
|
| 263 |
O = realize_np(objs) + realize_np(obls)
|
| 264 |
ADV=[code_es(lemma_of(a, src_lang), "Minimax-ASCII") if src_lang=="Español" else code_en(lemma_of(a, src_lang), "Minimax-ASCII") for a in advs] if USE_SPACY else []
|
| 265 |
|
| 266 |
-
if zero_copula
|
| 267 |
parts = S + O + ADV
|
| 268 |
else:
|
| 269 |
parts = [vcode] + S + O + ADV
|
|
@@ -298,7 +298,7 @@ def realize_komin(doc, src_lang: str, drop_articles=True, zero_copula=True, semi
|
|
| 298 |
|
| 299 |
v_form = vcode + TAM + (NEG_M if is_neg else "")
|
| 300 |
|
| 301 |
-
if zero_copula
|
| 302 |
parts = S + O + ADV
|
| 303 |
else:
|
| 304 |
parts = S + O + ADV + [v_form]
|
|
@@ -360,7 +360,7 @@ def encode_simple(text: str, src_lang: str, target: str) -> str:
|
|
| 360 |
def repl_en(m):
|
| 361 |
key = norm_en(m.group(0))
|
| 362 |
table = EN2MINI if target=="Minimax-ASCII" else EN2KOMI
|
| 363 |
-
if table
|
| 364 |
return table[key]
|
| 365 |
return enc_oov_minimax(m.group(0)) if target=="Minimax-ASCII" else enc_oov_komin(m.group(0))
|
| 366 |
repl = repl_es if src_lang=="Español" else repl_en
|
|
@@ -386,7 +386,7 @@ def pluralize(word: str, tgt_lang: str) -> str:
|
|
| 386 |
mini_tail_re = re.compile(r"^(?P<stem>.+?)·(?P<tail>[PTFNQ12sp]+)$")
|
| 387 |
|
| 388 |
def decode_simple(text: str, source: str, tgt_lang: str) -> str:
|
| 389 |
-
if not text.strip():
|
| 390 |
return ""
|
| 391 |
code2es = MINI2ES if source=="Minimax-ASCII" else KOMI2ES
|
| 392 |
code2en = MINI2EN if source=="Minimax-ASCII" else KOMI2EN
|
|
@@ -466,7 +466,7 @@ def decode_simple(text: str, source: str, tgt_lang: str) -> str:
|
|
| 466 |
v_conj = ("no " if tgt_lang == "Español" else "not ") + v_conj
|
| 467 |
out_parts.append(v_conj)
|
| 468 |
continue
|
| 469 |
-
|
| 470 |
out_parts.append(pluralize(lem, tgt_lang) if pl_flags[idx] else lem)
|
| 471 |
|
| 472 |
out_text = " ".join(out_parts)
|
|
@@ -531,11 +531,11 @@ def _en_conj(lemma, tense, person):
|
|
| 531 |
return "goes" if (tense=="Pres" and person=="3s") else "go"
|
| 532 |
if lemma == "do":
|
| 533 |
if tense == "Past": return "did"
|
| 534 |
-
return "does" if (tense=="Pres"
|
| 535 |
|
| 536 |
if tense == "Pres":
|
| 537 |
if person == "3s":
|
| 538 |
-
if lemma.endswith("y")
|
| 539 |
return lemma[:-1] + "ies"
|
| 540 |
if lemma.endswith(("s","sh","ch","x","z","o")):
|
| 541 |
return lemma + "es"
|
|
@@ -543,7 +543,7 @@ def _en_conj(lemma, tense, person):
|
|
| 543 |
return lemma
|
| 544 |
elif tense == "Past":
|
| 545 |
if lemma.endswith("e"): return lemma + "d"
|
| 546 |
-
if lemma.endswith("y")
|
| 547 |
return lemma + "ed"
|
| 548 |
else:
|
| 549 |
return lemma
|
|
@@ -562,7 +562,7 @@ def build_sentence(text: str, src_lang: str, target: str,
|
|
| 562 |
drop_articles: bool, zero_copula: bool, mode: str, max_comp_exact: bool = False) -> str:
|
| 563 |
if not text.strip(): return ""
|
| 564 |
semi = True # siempre semi-lossless
|
| 565 |
-
core = _build_with_spacy(text, src_lang, target, drop_articles, zero_copula
|
| 566 |
if max_comp_exact:
|
| 567 |
return custom_sidecar_enc(core, text)
|
| 568 |
return core
|
|
@@ -578,7 +578,7 @@ def universal_translate(text: str, src: str, tgt: str,
|
|
| 578 |
orig = extract_custom_sidecar(text)
|
| 579 |
if orig is not None: return orig
|
| 580 |
orig = extract_sidecar_b85(text)
|
| 581 |
-
if orig
|
| 582 |
return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)
|
| 583 |
if src in ("Español","English") and tgt in ("Español","English"):
|
| 584 |
return translate_natural(text, src, tgt)
|
|
|
|
| 119 |
def is_oov_minimax(code: str) -> bool:
    """True iff *code* is a Minimax-ASCII OOV token: "~" plus a non-empty payload."""
    return len(code) > 1 and code[0] == "~"
|
| 121 |
def is_oov_komin(code: str) -> bool:
    """True iff *code* is a Komin OOV token bracketed as 「payload」."""
    if len(code) < 2:
        return False
    return code.startswith("「") and code.endswith("」")
|
| 123 |
|
| 124 |
# ------------ spaCy opcional ------------
|
| 125 |
USE_SPACY = False
|
|
|
|
| 185 |
objs.append(t)
|
| 186 |
elif t.dep_ in ("obl","pobj"):
|
| 187 |
obls.append(t)
|
| 188 |
+
elif t.dep_ in ("advmod","advcl") and t.pos_ == "ADV":
|
| 189 |
advs.append(t)
|
| 190 |
subs.sort(key=lambda x: x.i); objs.sort(key=lambda x: x.i)
|
| 191 |
obls.sort(key=lambda x: x.i); advs.sort(key=lambda x: x.i)
|
|
|
|
| 196 |
root = next((t for t in doc if t.dep_=="ROOT"), doc[0])
|
| 197 |
subj = next((t for t in root.children if t.dep_.startswith("nsubj")), None)
|
| 198 |
if subj is None: return None
|
| 199 |
+
plur = ("Number=Plur" in str(subj.morph)) if src_lang=="Español" else (subj.tag_ in ("NSS","NNPS","NNS"))
|
| 200 |
low = subj.lower_
|
| 201 |
if src_lang=="Español":
|
| 202 |
if low in ("yo",): return "1p" if plur else "1s"
|
|
|
|
| 263 |
O = realize_np(objs) + realize_np(obls)
|
| 264 |
ADV=[code_es(lemma_of(a, src_lang), "Minimax-ASCII") if src_lang=="Español" else code_en(lemma_of(a, src_lang), "Minimax-ASCII") for a in advs] if USE_SPACY else []
|
| 265 |
|
| 266 |
+
if zero_copula and not semi_lossless and vlem in ("ser","estar","be") and tense=="Pres" and not is_neg and not is_q:
|
| 267 |
parts = S + O + ADV
|
| 268 |
else:
|
| 269 |
parts = [vcode] + S + O + ADV
|
|
|
|
| 298 |
|
| 299 |
v_form = vcode + TAM + (NEG_M if is_neg else "")
|
| 300 |
|
| 301 |
+
if zero_copula and not semi_lossless and vlem in ("ser","estar","be") and tense=="Pres" and not is_neg and not is_q:
|
| 302 |
parts = S + O + ADV
|
| 303 |
else:
|
| 304 |
parts = S + O + ADV + [v_form]
|
|
|
|
| 360 |
def repl_en(m):
|
| 361 |
key = norm_en(m.group(0))
|
| 362 |
table = EN2MINI if target=="Minimax-ASCII" else EN2KOMI
|
| 363 |
+
if table and key in table:
|
| 364 |
return table[key]
|
| 365 |
return enc_oov_minimax(m.group(0)) if target=="Minimax-ASCII" else enc_oov_komin(m.group(0))
|
| 366 |
repl = repl_es if src_lang=="Español" else repl_en
|
|
|
|
| 386 |
mini_tail_re = re.compile(r"^(?P<stem>.+?)·(?P<tail>[PTFNQ12sp]+)$")
|
| 387 |
|
| 388 |
def decode_simple(text: str, source: str, tgt_lang: str) -> str:
|
| 389 |
+
if not text.strip():
|
| 390 |
return ""
|
| 391 |
code2es = MINI2ES if source=="Minimax-ASCII" else KOMI2ES
|
| 392 |
code2en = MINI2EN if source=="Minimax-ASCII" else KOMI2EN
|
|
|
|
| 466 |
v_conj = ("no " if tgt_lang == "Español" else "not ") + v_conj
|
| 467 |
out_parts.append(v_conj)
|
| 468 |
continue
|
| 469 |
+
# resto
|
| 470 |
out_parts.append(pluralize(lem, tgt_lang) if pl_flags[idx] else lem)
|
| 471 |
|
| 472 |
out_text = " ".join(out_parts)
|
|
|
|
| 531 |
return "goes" if (tense=="Pres" and person=="3s") else "go"
|
| 532 |
if lemma == "do":
|
| 533 |
if tense == "Past": return "did"
|
| 534 |
+
return "does" if (tense=="Pres" and person=="3s") else "do"
|
| 535 |
|
| 536 |
if tense == "Pres":
|
| 537 |
if person == "3s":
|
| 538 |
+
if lemma.endswith("y") and (len(lemma)<2 or lemma[-2] not in "aeiou"):
|
| 539 |
return lemma[:-1] + "ies"
|
| 540 |
if lemma.endswith(("s","sh","ch","x","z","o")):
|
| 541 |
return lemma + "es"
|
|
|
|
| 543 |
return lemma
|
| 544 |
elif tense == "Past":
|
| 545 |
if lemma.endswith("e"): return lemma + "d"
|
| 546 |
+
if lemma.endswith("y") and (len(lemma)<2 or lemma[-2] not in "aeiou"): return lemma[:-1] + "ied"
|
| 547 |
return lemma + "ed"
|
| 548 |
else:
|
| 549 |
return lemma
|
|
|
|
| 562 |
drop_articles: bool, zero_copula: bool, mode: str, max_comp_exact: bool = False) -> str:
|
| 563 |
if not text.strip(): return ""
|
| 564 |
semi = True # siempre semi-lossless
|
| 565 |
+
core = _build_with_spacy(text, src_lang, target, drop_articles, zero_copula and not semi, semi_lossless=semi) if USE_SPACY else encode_simple(text, src_lang, target)
|
| 566 |
if max_comp_exact:
|
| 567 |
return custom_sidecar_enc(core, text)
|
| 568 |
return core
|
|
|
|
| 578 |
orig = extract_custom_sidecar(text)
|
| 579 |
if orig is not None: return orig
|
| 580 |
orig = extract_sidecar_b85(text)
|
| 581 |
+
if orig is not None: return orig
|
| 582 |
return decode_simple(strip_custom_sidecar(strip_sidecar_b85(text)), src, tgt)
|
| 583 |
if src in ("Español","English") and tgt in ("Español","English"):
|
| 584 |
return translate_natural(text, src, tgt)
|