LoloSemper commited on
Commit
82fcbaa
·
verified ·
1 Parent(s): 15f09de

Upload 3 files

Browse files
Files changed (3) hide show
  1. HF_Pairs_ES_NI.csv +0 -0
  2. Iberia-Georgeos.ttf +0 -0
  3. app.py +318 -0
HF_Pairs_ES_NI.csv ADDED
The diff for this file is too large to render. See raw diff
 
Iberia-Georgeos.ttf ADDED
Binary file (6.58 kB). View file
 
app.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %%capture
2
+ # Neoíbero — ES→NI con wordfreq (sin spaCy/pandas) + CSV + PDF (Platypus) + copias a /content
3
+ !pip -q install wordfreq==3.0 reportlab==4.2.2
4
+
5
+ # ───────────── Config ─────────────
6
+ N_MAX = 33000
7
+ ZIPF_MIN = 3.0
8
+ PER_LETTER_MIN = 120
9
+ GLOBAL_SALT = "neoibero_v1.7_particles_keysfix"
10
+ OUT_DIR = "salida"
11
+
12
+ import os, re, csv, hashlib, shutil
13
+ from collections import defaultdict
14
+ os.makedirs(OUT_DIR, exist_ok=True)
# ----- Attested lexicon (KEEP list) -----
# Spanish lemma -> attested Neo-Iberian form; "" means "no overt form".
ATTESTED_MAP = {
    # Basic numerals and quantifiers
    "uno": "ban", "dos": "bi", "tres": "irur", "cuatro": "laur",
    "cinco": "borste", "seis": "Ε›ei", "siete": "sisbi", "ocho": "sorse",
    "diez": "abaΕ•", "veinte": "oΕ•kei",
    # Particles
    "y": "ne", "o": "o", "no": "eΕ›",
    "a": "ka",       # directional base; the translator handles dative (mi) and DOM (te)
    "para": "ka",
    # Determination
    "el": "", "la": "", "los": "", "las": "",            # definite article -> zero
    "un": "ban", "una": "ban", "unos": "", "unas": "",   # indef. sg. -> ban; pl. -> zero by default
    "este": "aΕ•e", "esta": "aΕ•e", "estos": "aΕ•e", "estas": "aΕ•e",
    # Spanish contractions are resolved by rule; zero here keeps the dictionary clean
    "al": "", "del": "",
    # Documented lexicon (examples)
    "cuervo": "belai", "perdiz": "ebee", "cereal": "bars", "tributo": "kebel",
    "medida eku": "eku", "medida kitev": "kitei",
}

# Part-of-speech tag for every attested lemma.
ATTESTED_POS = {
    **{w: "NUM" for w in ["uno", "dos", "tres", "cuatro", "cinco", "seis",
                          "siete", "ocho", "diez", "veinte"]},
    **{w: "PART" for w in ["y", "o", "no", "a", "para", "al", "del"]},
    **{w: "DET" for w in ["el", "la", "los", "las", "un", "una", "unos",
                          "unas", "este", "esta", "estos", "estas"]},
    **{w: "N" for w in ["cuervo", "perdiz", "cereal", "tributo",
                        "medida eku", "medida kitev"]},
}

# Default provenance used when no per-entry source is recorded.
ATTESTED_SOURCE = {"default": ("consenso/compendiado", "β€”")}
# ----- STOP list: discard unless the word is also in the attested KEEP map -----
STOP_SKIP = {
    # articles / contractions
    "el", "la", "los", "las", "lo", "un", "una", "unos", "unas", "al", "del",
    # frequent prepositions and conjunctions
    "de", "en", "con", "sin", "por", "sobre", "entre", "hasta", "desde",
    "hacia", "segΓΊn", "tras",
    "pero", "aunque", "sino", "que", "como", "si", "porque", "cuando",
    "donde", "mientras",
    # very general adverbs
    "muy", "ya", "sΓ­", "no", "tambiΓ©n", "solo", "sΓ³lo", "aΓΊn", "aun",
    "mΓ‘s", "menos",
    # determiners / pronouns
    "mi", "mis", "tu", "tus", "su", "sus", "nuestro", "nuestra",
    "nuestros", "nuestras",
    "esto", "eso", "aquello", "ese", "esa", "esos", "esas", "aquel",
    "aquella", "aquellos", "aquellas",
    "quien", "quiΓ©n", "cual", "cuΓ‘l", "cuales", "cuΓ‘les", "cuyo", "cuya",
    "cuyos", "cuyas",
    # interjections
    "eh", "ay", "oh", "uy", "ah", "aja", "jeje", "jaja", "aah", "ahh",
    "ohh", "uhh",
}
# ----- Neo-Iberian root generator -----
V = ["a", "e", "i", "o", "u"]
C_CORE = ["b", "d", "t", "g", "k", "s", "Ε›", "l", "r", "Ε•", "n", "m"]
CODAS = ["", "n", "s", "Ε›", "r", "Ε•", "l", "m", "k", "t"]
NOMINALIZERS = ["-ar", "-en", "-tu", "-la", "-Ε•a", "-si"]
VERBALIZERS = ["-ke", "-ta", "-ni", "-bo", "-ri"]
ADJ_SUFFIX = "-si"


def legal_onset(c):
    """A syllable may not begin with a rhotic."""
    return c not in ("r", "Ε•")


def pick(seq, key):
    """Deterministically choose an element of seq from a SHA-256 hash of key."""
    digest = hashlib.sha256(key.encode()).digest()
    return seq[int.from_bytes(digest[:4], "big") % len(seq)]


def gen_syll(seed, i):
    """Build the i-th pseudo-random onset+vowel+coda syllable for seed."""
    onset = pick([""] + [c for c in C_CORE if legal_onset(c)], f"{seed}:on:{i}")
    nucleus = pick(V, f"{seed}:v:{i}")
    coda = pick(CODAS, f"{seed}:co:{i}")
    return onset + nucleus + coda


def bad_boundary(a, b):
    """Reject sibilant/rhotic clashes across a syllable boundary."""
    if not a or not b:
        return False
    return (a[-1], b[0]) in {("s", "Ε›"), ("Ε›", "s"), ("r", "Ε•"), ("Ε•", "r")}


def make_root(seed):
    """Derive a 1-3 syllable root for seed, retrying around phonotactic clashes.

    Falls back to the minimal root "ba" if 32 attempts all fail.
    """
    n = int(pick(["1", "2", "2", "2", "3", "3"], f"{seed}:n"))
    for attempt in range(32):
        parts = []
        valid = True
        for i in range(n):
            syl = gen_syll(seed, i + attempt)
            # No rhotic onset on the first syllable.
            if i == 0 and syl and syl[0] in ("r", "Ε•"):
                valid = False
                break
            # No duplicated syllables or illegal clusters at the seam.
            if parts and (parts[-1] == syl or bad_boundary(parts[-1], syl)):
                valid = False
                break
            parts.append(syl)
        if valid:
            return "".join(parts) or "ba"
    return "ba"
def build_ni(es, pos):
    """Map a Spanish lemma to a generated NI lemma and its NI POS tag.

    The root is derived deterministically from the lemma plus GLOBAL_SALT;
    verbs, adjectives, and nouns each take their own suffix family.
    """
    seed = es + "|" + GLOBAL_SALT
    root = make_root(seed)
    if pos == "V":
        return root + pick(VERBALIZERS, seed + ":V"), "V"
    if pos == "ADJ":
        return root + ADJ_SUFFIX, "ADJ"
    return root + pick(NOMINALIZERS, seed + ":N"), "N"
# ----- Signary (β€Ή...β€Ί tokens -> Iberia-Georgeos keys) -----
# Syllabic CV signs for the plosive series, indexed by vowel (a,e,i,o,u).
SYL_FOR = {
    "b": ["β€ΉBAβ€Ί", "β€ΉBEβ€Ί", "β€ΉBIβ€Ί", "β€ΉBOβ€Ί", "β€ΉBUβ€Ί"],
    "d": ["β€ΉDAβ€Ί", "β€ΉDEβ€Ί", "β€ΉDIβ€Ί", "β€ΉDOβ€Ί", "β€ΉDUβ€Ί"],
    "t": ["β€ΉTAβ€Ί", "β€ΉTEβ€Ί", "β€ΉTIβ€Ί", "β€ΉTOβ€Ί", "β€ΉTUβ€Ί"],
    "g": ["β€ΉGAβ€Ί", "β€ΉGEβ€Ί", "β€ΉGIβ€Ί", "β€ΉGOβ€Ί", "β€ΉGUβ€Ί"],
    "k": ["β€ΉKAβ€Ί", "β€ΉKEβ€Ί", "β€ΉKIβ€Ί", "β€ΉKOβ€Ί", "β€ΉKUβ€Ί"],
}
# Alphabetic signs for vowels and continuants.
# FIX: the "s" entry was corrupted/truncated in the source ("β€ΉSοΏ½οΏ½οΏ½", missing
# the closing delimiter); restored to "β€ΉSβ€Ί" to match CODA_FOR["s"] and the
# β€Ή...β€Ί token pattern that georgeos_keys' regex extracts.
ALPHA_FOR = {
    "a": "β€ΉAβ€Ί", "e": "β€ΉEβ€Ί", "i": "β€ΉIβ€Ί", "o": "β€ΉOβ€Ί", "u": "β€ΉUβ€Ί",
    "s": "β€ΉSβ€Ί", "Ε›": "β€ΉΕšβ€Ί", "l": "β€ΉLβ€Ί", "r": "β€ΉRβ€Ί", "Ε•": "β€ΉΕ”β€Ί",
    "n": "β€ΉNβ€Ί", "m": "β€ΉMβ€Ί",
}
# Coda consonant signs; the empty coda maps to the empty string.
CODA_FOR = {
    "": "", "n": "β€ΉNβ€Ί", "s": "β€ΉSβ€Ί", "Ε›": "β€ΉΕšβ€Ί", "r": "β€ΉRβ€Ί",
    "Ε•": "β€ΉΕ”β€Ί", "l": "β€ΉLβ€Ί", "m": "β€ΉMβ€Ί", "k": "β€ΉKβ€Ί", "t": "β€ΉTβ€Ί",
}
def tokens_from_latin(ni):
    """Transliterate a Latin NI lemma into β€Ή...β€Ί signary tokens.

    Scans left to right: '-' becomes a dash separator, plosive+vowel pairs
    become syllabic signs (greedily absorbing a following coda consonant),
    and any other character falls back to its alphabetic sign, or itself
    when no sign exists.
    """
    pieces = []
    i = 0
    length = len(ni)
    while i < length:
        ch = ni[i]
        if ch == "-":
            # Morpheme separator.
            pieces.append("–")
            i += 1
        elif ch in V:
            pieces.append(ALPHA_FOR[ch])
            i += 1
        elif ch in SYL_FOR and i + 1 < length and ni[i + 1] in V:
            sign = SYL_FOR[ch]["aeiou".index(ni[i + 1])]
            nxt = ni[i + 2] if i + 2 < length else ""
            # NOTE(review): a following consonant is always read as a coda,
            # even when it might be the next syllable's onset — confirm this
            # greediness is intended.
            if nxt in CODA_FOR and nxt != "":
                pieces.append(sign + CODA_FOR[nxt])
                i += 3
            else:
                pieces.append(sign)
                i += 2
        else:
            pieces.append(ALPHA_FOR.get(ch, ch))
            i += 1
    return "".join(pieces)
# ----- Hand-curated key overrides for specific short attested forms -----
KEYS_OVERRIDE = {
    "ka": "K",   # KA
    "mi": "MI",  # M + I
    "te": "TE",  # T + E
    "ne": "N",   # NA (ligature), simplified
    "o": "O",
    "eΕ›": "X",   # Ś -> X (convention)
}
def georgeos_keys(token_str, ni_plain):
    """Collapse β€Ή...β€Ί signary tokens into Iberia-Georgeos keyboard strokes.

    Attested short lemmas take a hand-curated override; otherwise each
    extracted token is reduced to one key (CV syllables to their consonant).
    """
    lemma = (ni_plain or "").lower()
    override = KEYS_OVERRIDE.get(lemma)
    if override is not None:
        return override
    keys = []
    for tok in re.findall(r"β€Ή(.*?)β€Ί", token_str):
        if len(tok) == 2 and tok[0] in "BDTGK":
            keys.append(tok[0])       # CV syllable -> B/D/T/G/K
        elif tok in ("A", "E", "I", "O", "U"):
            keys.append(tok)
        elif tok == "Ś":
            keys.append("X")
        elif tok == "Ε”":
            keys.append("r")          # lowercase r stands for the Ε” sign
        else:
            keys.append(tok[0].upper())
    return "".join(keys)
# ----- Spanish vocabulary (wordfreq) with balancing filters -----
from wordfreq import top_n_list, zipf_frequency


def get_spanish_vocab_balanced(n_max, zipf_min, per_letter_min=120):
    """Select up to n_max frequent Spanish lemmas, balanced by initial letter.

    Pipeline: take the wordfreq top list, keep words above the Zipf
    threshold, drop stopwords and noisy tokens (attested lemmas exempt),
    guess a rough POS from the ending, bucket by first letter, guarantee
    per_letter_min entries per letter, top up to n_max, then force-include
    every attested lemma.
    """
    candidates = top_n_list("es", 200000)
    candidates = [w for w in candidates if zipf_frequency(w, "es") >= zipf_min]

    seen = set()
    buckets = defaultdict(list)
    for word in candidates:
        # Letters only (accented Spanish letters included).
        if not re.match(r"^[A-Za-zΓΓ‰ΓΓ“ΓšΓœΓ‘Γ‘Γ©Γ­Γ³ΓΊΓΌΓ±]+$", word):
            continue
        word = word.lower()

        # Stopword, unless it is an attested KEEP entry.
        if (word in STOP_SKIP) and (word not in ATTESTED_MAP):
            continue

        # Anti-noise filters (attested words are exempt).
        if word not in ATTESTED_MAP:
            if len(word) < 2:
                continue
            if re.search(r"(.)\1\1", word):          # aaa, ahhh...
                continue
            if not re.search(r"[aeiouÑéíóúü]", word):
                continue
            if re.fullmatch(r"(a+h+|ah+|eh+|oh+|uh+|uy+|ay+|ey+)", word):
                continue

        if word in seen:
            continue
        seen.add(word)

        # Crude POS guess from the lemma ending.
        if word in ATTESTED_MAP:
            pos = ATTESTED_POS.get(word, "PART")
        elif re.search(r"(ar|er|ir)$", word):
            pos = "V"
        elif word.endswith(("o", "a", "e", "al", "il", "oso", "osa",
                            "ivo", "iva")) and not word.endswith(("os", "as", "es")):
            pos = "ADJ"
        else:
            pos = "N"

        buckets[word[0]].append((word, pos))

    letters = [chr(code) for code in range(ord('a'), ord('z') + 1)] + ['Γ±']

    # First pass: honor the per-letter quota.
    selection = []
    for letter in letters:
        if letter in buckets:
            selection.extend(buckets[letter][:per_letter_min])

    # Second pass: top up to n_max with the remainder of each bucket.
    if len(selection) < n_max:
        for letter in letters:
            if letter in buckets:
                for item in buckets[letter][per_letter_min:]:
                    if len(selection) >= n_max:
                        break
                    selection.append(item)
                if len(selection) >= n_max:
                    break

    # Always include every attested lemma, even past the quota.
    for attested in ATTESTED_MAP:
        if all(attested != es for es, _ in selection):
            selection.append((attested, ATTESTED_POS.get(attested, "N")))
    return selection[:n_max]
# ----- Projection ES -> NI and dictionary-row assembly -----
def project(rows):
    """Turn (spanish, pos) pairs into dictionary rows, attested or generated.

    Attested lemmas keep their documented NI form; every other lemma gets a
    deterministic generated one, re-salted on collision (up to 64 retries)
    so NI forms stay unique. Rows come back sorted by (es, pos_es).
    """
    used = set(ATTESTED_MAP.values())
    entries = []
    for es, pos in rows:
        es = es.strip().lower()
        pos = (ATTESTED_POS.get(es) or pos or "N").upper()

        if es in ATTESTED_MAP:
            ni = ATTESTED_MAP[es]
            tok = tokens_from_latin(ni) if ni else ""
            keys = georgeos_keys(tok, ni) if ni else ""
            entries.append({
                "es": es, "pos_es": pos, "ni_lemma": ni, "pos_ni": "",
                "evidencia": "consenso/inscripciΓ³n", "fuente": "β€”", "autor": "β€”",
                "ni_tokens": tok, "georgeos_keys": keys,
            })
            used.add(ni)
            continue

        # Generated lemma; bump a salt until the NI form is unique.
        ni, pos_ni = build_ni(es, pos)
        salt = 0
        while ni in used:
            salt += 1
            ni, pos_ni = build_ni(es + f":{salt}", pos)
            if salt > 64:
                break
        used.add(ni)
        tok = tokens_from_latin(ni)
        keys = georgeos_keys(tok, ni)
        entries.append({
            "es": es, "pos_es": pos, "ni_lemma": ni, "pos_ni": pos_ni,
            "evidencia": "conjetural", "fuente": "β€”", "autor": "β€”",
            "ni_tokens": tok, "georgeos_keys": keys,
        })
    entries.sort(key=lambda row: (row["es"], row["pos_es"]))
    return entries
# Build the balanced vocabulary and project it onto Neo-Iberian.
rows = get_spanish_vocab_balanced(N_MAX, ZIPF_MIN, PER_LETTER_MIN)
mapped = project(rows)
def write_csv(path, rows, fields):
    """Write rows (dicts) to path as a UTF-8 CSV with header fields.

    Keys missing from a row are emitted as empty strings.
    """
    with open(path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=fields)
        writer.writeheader()
        for row in rows:
            writer.writerow({field: row.get(field, "") for field in fields})
# ----- Write the three CSV artifacts -----
csv_es_ni = os.path.join(OUT_DIR, "diccionario_es_neoibero.csv")
csv_ni_es = os.path.join(OUT_DIR, "diccionario_neoibero_es.csv")
csv_pairs = os.path.join(OUT_DIR, "hf_pairs.csv")

# ES -> NI with full metadata.
write_csv(csv_es_ni, mapped,
          ["es", "pos_es", "ni_lemma", "pos_ni", "evidencia", "fuente",
           "autor", "ni_tokens", "georgeos_keys"])

# NI -> ES, inverted and re-sorted by (ni_lemma, es).
inv = [{"ni_lemma": r["ni_lemma"], "es": r["es"], "pos_ni": r["pos_ni"],
        "pos_es": r["pos_es"], "evidencia": r["evidencia"],
        "fuente": r["fuente"], "autor": r["autor"]} for r in mapped]
inv.sort(key=lambda x: (x["ni_lemma"], x["es"]))
write_csv(csv_ni_es, inv,
          ["ni_lemma", "es", "pos_ni", "pos_es", "evidencia", "fuente", "autor"])

# Bare source/target pairs (HF-style training data).
write_csv(csv_pairs,
          [{"source_es": r["es"], "target_ni": r["ni_lemma"]} for r in mapped],
          ["source_es", "target_ni"])

print("CSV OK β€” Entradas:", len(mapped))
print("CSV en:", csv_es_ni, "|", csv_ni_es, "|", csv_pairs)
# ----- PDF (Platypus, two columns, no overlaps) -----
from reportlab.lib.pagesizes import A4
from reportlab.lib.units import mm
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.platypus import (BaseDocTemplate, PageTemplate, Frame,
                                Paragraph, Spacer, KeepTogether)
from reportlab.lib.styles import ParagraphStyle
from reportlab.lib.enums import TA_LEFT
from google.colab import files

# Ask for the signary font plus a Unicode font for the Latin line.
print("Sube 1) Iberia-Georgeos.ttf y 2) una fuente Unicode (DejaVuSans.ttf / NotoSans-Regular.ttf):")
up_fonts = files.upload()
font_sign = next(k for k in up_fonts
                 if k.lower().endswith((".ttf", ".otf")) and "georgeos" in k.lower())
font_lat = None
for uploaded in up_fonts:
    lower_name = uploaded.lower()
    if (lower_name.endswith((".ttf", ".otf"))
            and ("dejavu" in lower_name or "noto" in lower_name or "unicode" in lower_name)
            and "georgeos" not in lower_name):
        font_lat = uploaded
        break
if font_lat is None:
    # No Unicode font in the first batch: prompt again.
    print("Sube ahora una fuente Unicode para la lΓ­nea latina (DejaVuSans.ttf / NotoSans-Regular.ttf):")
    up2 = files.upload()
    font_lat = next(k for k in up2 if k.lower().endswith((".ttf", ".otf")))

pdfmetrics.registerFont(TTFont("IberiaGeorgeos", font_sign))
pdfmetrics.registerFont(TTFont("UniLatin", font_lat))
def clean_keys(s):
    """Normalize a key string for the signary font line.

    Dashes become ' / ' separators, characters outside the allowed key
    alphabet are dropped, whitespace is collapsed, and everything is
    uppercased except 'r' (which stands for the Ε” sign).
    """
    text = s or ""
    text = text.replace("β€”", " ").replace("–", " / ").replace("-", " / ")
    text = text.replace("β€’", "Β·")
    text = re.sub(r"[^A-Za-z r/\\Β·\.,;:]", " ", text)
    text = re.sub(r"\s*/\s*", " / ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return "".join(ch if ch == "r" else ch.upper() for ch in text)
# ----- Paragraph styles and the two-column page template -----
LINE_ES, LINE_NI, LINE_SIG = 10, 10, 18   # font sizes: Spanish, NI, signary
style_es = ParagraphStyle("es", fontName="UniLatin", fontSize=LINE_ES,
                          leading=LINE_ES * 1.2, alignment=TA_LEFT, spaceAfter=0)
style_ni = ParagraphStyle("ni", fontName="UniLatin", fontSize=LINE_NI,
                          leading=LINE_NI * 1.2, alignment=TA_LEFT, spaceAfter=2)
style_sig = ParagraphStyle("sig", fontName="IberiaGeorgeos", fontSize=LINE_SIG,
                           leading=LINE_SIG * 1.08, alignment=TA_LEFT, spaceAfter=4)

PAGE_W, PAGE_H = A4
M, GAP, COLS = 16 * mm, 8 * mm, 2   # page margin, column gap, column count
COL_W = (PAGE_W - 2 * M - (COLS - 1) * GAP) / COLS
frames = [Frame(M + i * (COL_W + GAP), M, COL_W, PAGE_H - 2 * M, id=f"col{i}")
          for i in range(COLS)]


def on_page(canvas, doc):
    """Draw the running title at the top of every page."""
    canvas.setFont("UniLatin", 12)
    canvas.drawString(M, PAGE_H - M + 2,
                      "Diccionario EspaΓ±ol β†’ NeoΓ­bero (conlang; atestiguado vs conjetural)")


doc = BaseDocTemplate("Diccionario_ES_Neoibero.pdf", pagesize=A4,
                      leftMargin=M, rightMargin=M, topMargin=M, bottomMargin=M)
doc.addPageTemplates(PageTemplate(id="TwoCol", frames=frames, onPage=on_page))
# ----- Assemble the story: one kept-together block per dictionary entry -----
story = []
for r in mapped:
    es, pos, ni = r["es"], r["pos_es"], r["ni_lemma"]
    keys = clean_keys(r.get("georgeos_keys", ""))
    block = [
        Paragraph(f"<b>{es}</b>", style_es),
        Paragraph(f"[{pos}] β€” {ni if ni != '' else 'Ø'}", style_ni),
    ]
    if keys:  # a zero (empty) target gets no Iberian line
        block.append(Paragraph(keys, style_sig))
    block.append(Spacer(1, 4))
    story.append(KeepTogether(block))
doc.build(story)
# ----- Copy the CSVs to /content and download everything -----
from google.colab import files as _f

root_csv_es_ni = "Diccionario_ES_Neoibero.csv"
root_csv_ni_es = "Diccionario_Neoibero_ES.csv"
root_csv_pairs = "HF_Pairs_ES_NI.csv"
shutil.copyfile(csv_es_ni, root_csv_es_ni)
shutil.copyfile(csv_ni_es, root_csv_ni_es)
shutil.copyfile(csv_pairs, root_csv_pairs)

print("Copiados a /content:")
print(os.path.abspath(root_csv_es_ni))
print(os.path.abspath(root_csv_ni_es))
print(os.path.abspath(root_csv_pairs))
print("PDF:", os.path.abspath("Diccionario_ES_Neoibero.pdf"))

# Auto-download (comment these four lines out to skip downloading now).
_f.download(root_csv_es_ni)
_f.download(root_csv_ni_es)
_f.download(root_csv_pairs)
_f.download("Diccionario_ES_Neoibero.pdf")

# NOTE(review): the trailing characters below look mojibake-corrupted
# (probably an emoji in the original) — confirm against the source file.
print("Listo οΏ½οΏ½οΏ½")