Upload 3 files
Browse files- HF_Pairs_ES_NI.csv +0 -0
- Iberia-Georgeos.ttf +0 -0
- app.py +318 -0
HF_Pairs_ES_NI.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Iberia-Georgeos.ttf
ADDED
|
Binary file (6.58 kB). View file
|
|
|
app.py
ADDED
|
@@ -0,0 +1,318 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# %%capture
# NeoÍbero — ES→NI dictionary builder using wordfreq (no spaCy/pandas),
# producing CSVs + a PDF (Platypus) and copying results to /content (Colab).
!pip -q install wordfreq==3.0 reportlab==4.2.2

# ───────────── Config ─────────────
N_MAX = 33000                  # maximum number of dictionary entries
ZIPF_MIN = 3.0                 # minimum Zipf frequency for a Spanish word to qualify
PER_LETTER_MIN = 120           # per-initial-letter quota for balanced coverage
GLOBAL_SALT = "neoibero_v1.7_particles_keysfix"  # versioned salt: keeps generated roots reproducible
OUT_DIR = "salida"             # output directory for the CSV files

import os, re, csv, hashlib, shutil
from collections import defaultdict
os.makedirs(OUT_DIR, exist_ok=True)
|
| 15 |
+
|
| 16 |
+
# ───────────── Attested forms (KEEP) ─────────────
# Fixed Spanish → NI mappings (documented/consensus forms). These bypass the
# procedural generator and override the STOP list. "" means Ø (empty target).
ATTESTED_MAP = {
    # Numerals and basic quantifiers
    "uno":"ban","dos":"bi","tres":"irur","cuatro":"laur","cinco":"borste","seis":"Εei",
    "siete":"sisbi","ocho":"sorse","diez":"abaΕ","veinte":"oΕkei",
    # Particles
    "y":"ne","o":"o","no":"eΕ",
    "a":"ka",  # directional base; the translator will handle dative (mi) and DOM (te)
    "para":"ka",
    # Determination
    "el":"", "la":"", "los":"", "las":"",  # definite article → Ø
    "un":"ban", "una":"ban", "unos":"", "unas":"",  # indef. sg. → ban; pl. → Ø by default
    "este":"aΕe","esta":"aΕe","estos":"aΕe","estas":"aΕe",
    # Spanish contractions: resolved by rule; Ø here so the dictionary stays clean
    "al":"", "del":"",
    # Documented lexicon (examples)
    "cuervo":"belai","perdiz":"ebee","cereal":"bars","tributo":"kebel",
    "medida eku":"eku","medida kitev":"kitei",
}
# Part-of-speech tags for the attested entries; keys mirror ATTESTED_MAP.
ATTESTED_POS = {
    **{k:"NUM" for k in ["uno","dos","tres","cuatro","cinco","seis","siete","ocho","diez","veinte"]},
    **{k:"PART" for k in ["y","o","no","a","para","al","del"]},
    **{k:"DET" for k in ["el","la","los","las","un","una","unos","unas","este","esta","estos","estas"]},
    **{k:"N" for k in ["cuervo","perdiz","cereal","tributo","medida eku","medida kitev"]},
}
# Default provenance tag for attested entries (source label, author placeholder).
ATTESTED_SOURCE = {"default": ("consenso/compendiado","β")}
|
| 42 |
+
|
| 43 |
+
# ───────────── STOP list (discard unless present in KEEP) ─────────────
# High-frequency Spanish function words skipped by the vocabulary builder,
# except when they also appear in ATTESTED_MAP.
# NOTE(review): several entries carry mojibake (e.g. "segΓΊn" for "según") —
# they must match wordfreq output to be effective; verify encoding.
STOP_SKIP = {
    # articles / contractions
    "el","la","los","las","lo","un","una","unos","unas","al","del",
    # frequent prepositions and conjunctions
    "de","en","con","sin","por","sobre","entre","hasta","desde","hacia","segΓΊn","tras",
    "pero","aunque","sino","que","como","si","porque","cuando","donde","mientras",
    # very general adverbs
    "muy","ya","sΓ","no","tambiΓ©n","solo","sΓ³lo","aΓΊn","aun","mΓ‘s","menos",
    # determiners / pronouns
    "mi","mis","tu","tus","su","sus","nuestro","nuestra","nuestros","nuestras",
    "esto","eso","aquello","ese","esa","esos","esas","aquel","aquella","aquellos","aquellas",
    "quien","quiΓ©n","cual","cuΓ‘l","cuales","cuΓ‘les","cuyo","cuya","cuyos","cuyas",
    # interjections
    "eh","ay","oh","uy","ah","aja","jeje","jaja","aah","ahh","ohh","uhh"
}
|
| 59 |
+
|
| 60 |
+
# ───────────── NI generator alphabet ─────────────
V = ["a","e","i","o","u"]                                    # vowels
C_CORE = ["b","d","t","g","k","s","Ε","l","r","Ε","n","m"]   # core consonant inventory
CODAS = ["","n","s","Ε","r","Ε","l","m","k","t"]             # legal codas ("" = open syllable)
NOMINALIZERS=["-ar","-en","-tu","-la","-Εa","-si"]           # noun-forming suffixes
VERBALIZERS=["-ke","-ta","-ni","-bo","-ri"]                  # verb-forming suffixes
ADJ_SUFFIX="-si"                                             # adjective suffix
|
| 67 |
+
|
| 68 |
+
def legal_onset(c):
    """Return True when consonant *c* may begin a syllable (rhotics may not)."""
    return c not in ("r", "Ε")
|
| 69 |
+
def pick(seq, key):
    """Deterministically choose one element of *seq* from the SHA-256 of *key*."""
    digest = hashlib.sha256(key.encode()).digest()
    index = int.from_bytes(digest[:4], "big") % len(seq)
    return seq[index]
|
| 70 |
+
def gen_syll(seed, i):
    """Derive syllable #*i* for *seed*: optional onset + vowel + optional coda.

    Fully deterministic — each slot is drawn via pick() from a slot-specific key.
    """
    onset_pool = [""] + [c for c in C_CORE if legal_onset(c)]
    onset = pick(onset_pool, f"{seed}:on:{i}")
    nucleus = pick(V, f"{seed}:v:{i}")
    coda = pick(CODAS, f"{seed}:co:{i}")
    return onset + nucleus + coda
|
| 75 |
+
def bad_boundary(a, b):
    """True when joining syllables *a* + *b* creates a forbidden sibilant/rhotic cluster."""
    if not a or not b:
        return False
    return (a[-1], b[0]) in {("s","Ε"),("Ε","s"),("r","Ε"),("Ε","r")}
|
| 76 |
+
def make_root(seed):
    """Build a deterministic, phonotactically legal root of 1–3 syllables.

    Retries up to 32 times with shifted syllable indices when a draw violates
    the constraints; falls back to "ba" if every attempt fails.
    """
    n_syll = int(pick(["1", "2", "2", "2", "3", "3"], f"{seed}:n"))
    for attempt in range(32):
        parts = []
        legal = True
        for i in range(n_syll):
            syl = gen_syll(seed, i + attempt)
            # first syllable must not open with a rhotic
            if i == 0 and syl and syl[0] in ("r", "Ε"):
                legal = False
                break
            # reject immediate repetition and illegal clusters at the seam
            if parts and (parts[-1] == syl or bad_boundary(parts[-1], syl)):
                legal = False
                break
            parts.append(syl)
        if legal:
            return "".join(parts) or "ba"
    return "ba"
|
| 87 |
+
def build_ni(es, pos):
    """Generate a conjectural NI lemma for Spanish word *es* with POS *pos*.

    Returns (lemma, ni_pos): verbs receive a verbalizer suffix, adjectives the
    adjective suffix, and everything else a nominalizer (tagged "N").
    """
    seed = es + "|" + GLOBAL_SALT
    root = make_root(seed)
    if pos == "V":
        return root + pick(VERBALIZERS, seed + ":V"), "V"
    if pos == "ADJ":
        return root + ADJ_SUFFIX, "ADJ"
    return root + pick(NOMINALIZERS, seed + ":N"), "N"
|
| 93 |
+
|
| 94 |
+
# ───────────── Signary (⟨…⟩ tokens → Iberia-Georgeos keys) ─────────────
# CV syllabograms for the occlusive series, indexed by vowel order a,e,i,o,u.
SYL_FOR={"b":["βΉBAβΊ","βΉBEβΊ","βΉBIβΊ","βΉBOβΊ","βΉBUβΊ"],"d":["βΉDAβΊ","βΉDEβΊ","βΉDIβΊ","βΉDOβΊ","βΉDUβΊ"],
         "t":["βΉTAβΊ","βΉTEβΊ","βΉTIβΊ","βΉTOβΊ","βΉTUβΊ"],"g":["βΉGAβΊ","βΉGEβΊ","βΉGIβΊ","βΉGOβΊ","βΉGUβΊ"],
         "k":["βΉKAβΊ","βΉKEβΊ","βΉKIβΊ","βΉKOβΊ","βΉKUβΊ"]}
# Alphabetic signs for vowels and continuants.
# FIX: the "s" entry was corrupted/truncated in the source (no closing
# bracket, replacement characters); restored to "βΉSβΊ", matching the pattern
# of every other sign and CODA_FOR["s"] below.
ALPHA_FOR={"a":"βΉAβΊ","e":"βΉEβΊ","i":"βΉIβΊ","o":"βΉOβΊ","u":"βΉUβΊ","s":"βΉSβΊ","Ε":"βΉΕβΊ","l":"βΉLβΊ","r":"βΉRβΊ","Ε":"βΉΕβΊ","n":"βΉNβΊ","m":"βΉMβΊ"}
# Coda signs appended after a CV syllabogram ("" = no coda).
CODA_FOR={"":"","n":"βΉNβΊ","s":"βΉSβΊ","Ε":"βΉΕβΊ","r":"βΉRβΊ","Ε":"βΉΕβΊ","l":"βΉLβΊ","m":"βΉMβΊ","k":"βΉKβΊ","t":"βΉTβΊ"}
|
| 100 |
+
|
| 101 |
+
def tokens_from_latin(ni):
    """Transliterate a latin-script NI lemma into ⟨…⟩ signary tokens.

    Occlusive+vowel pairs become CV syllabograms, greedily absorbing one
    following coda consonant; vowels and continuants map through ALPHA_FOR;
    "-" becomes the morpheme-separator glyph.
    """
    pieces = []
    i = 0
    total = len(ni)
    while i < total:
        ch = ni[i]
        if ch == "-":                       # morpheme separator
            pieces.append("β")
            i += 1
        elif ch in V:                       # bare vowel sign
            pieces.append(ALPHA_FOR[ch])
            i += 1
        elif ch in SYL_FOR and i + 1 < total and ni[i + 1] in V:
            # CV syllabogram, optionally followed by an attached coda sign
            syllabogram = SYL_FOR[ch]["aeiou".index(ni[i + 1])]
            coda = ni[i + 2] if i + 2 < total else ""
            takes_coda = coda != "" and coda in CODA_FOR
            pieces.append(syllabogram + (CODA_FOR[coda] if takes_coda else ""))
            i += 3 if takes_coda else 2
        else:                               # continuant or passthrough char
            pieces.append(ALPHA_FOR.get(ch, ch))
            i += 1
    return "".join(pieces)
|
| 115 |
+
|
| 116 |
+
# Whole-lemma keystroke overrides for particles whose signs do not follow the
# generic token→key derivation in georgeos_keys().
KEYS_OVERRIDE = {
    "ka":"K",   # KA
    "mi":"MI",  # M + I
    "te":"TE",  # T + E
    "ne":"N",   # NA (ligature), simplified
    "o":"O",
    "eΕ":"X",   # Ε → X (convention)
}
|
| 124 |
+
def georgeos_keys(token_str, ni_plain):
    """Reduce a ⟨…⟩ token string to Iberia-Georgeos font keystrokes.

    A whole-lemma override from KEYS_OVERRIDE wins outright; otherwise each
    token yields one key: CV syllabograms their consonant letter, vowels
    themselves, Ε→"X", Ε→"r", anything else its first letter uppercased.
    """
    low = (ni_plain or "").lower()
    override = KEYS_OVERRIDE.get(low)
    if override is not None:
        return override
    keys = []
    for tok in re.findall(r"βΉ(.*?)βΊ", token_str):
        if len(tok) == 2 and tok[0] in "BDTGK":
            keys.append(tok[0])            # CV syllabogram → B/D/T/G/K
        elif tok in ("A", "E", "I", "O", "U"):
            keys.append(tok)
        elif tok == "Ε":
            keys.append("X")
        elif tok == "Ε":
            keys.append("r")
        else:
            keys.append(tok[0].upper())
    return "".join(keys)
|
| 136 |
+
|
| 137 |
+
# ───────────── Spanish vocab (wordfreq) + filters ─────────────
from wordfreq import top_n_list, zipf_frequency
def get_spanish_vocab_balanced(n_max, zipf_min, per_letter_min=120):
    """Select up to *n_max* Spanish (word, POS) pairs, balanced by initial letter.

    Candidates come from wordfreq's top-200k Spanish list filtered by
    Zipf >= *zipf_min*.  Each initial letter first contributes up to
    *per_letter_min* words; remaining slots are then filled letter by letter.
    POS is guessed from the word ending unless the word is attested.
    Every ATTESTED_MAP entry is guaranteed to be present in the result.
    """
    base = top_n_list("es", 200000)
    base = [w for w in base if zipf_frequency(w, "es") >= zipf_min]
    seen=set(); buckets=defaultdict(list)   # candidate buckets keyed by first letter
    for w in base:
        # keep purely alphabetic tokens only
        # NOTE(review): this character class looks mojibake-damaged — confirm it
        # still admits á é í ó ú ü ñ as intended.
        if not re.match(r"^[A-Za-zΓΓΓΓΓΓΓÑéΓΓ³ΓΊΓΌΓ±]+$", w): continue
        w = w.lower()

        if (w in STOP_SKIP) and (w not in ATTESTED_MAP):  # stop word unless attested (KEEP)
            continue

        # Anti-noise filters (skipped for attested entries)
        if w not in ATTESTED_MAP:
            if len(w) < 2: continue
            if re.search(r"(.)\1\1", w): continue          # aaa, ahhh…
            if not re.search(r"[aeiouÑéΓΓ³ΓΊΓΌ]", w): continue  # must contain a vowel
            if re.fullmatch(r"(a+h+|ah+|eh+|oh+|uh+|uy+|ay+|ey+)", w): continue  # interjections

        if w in seen: continue
        seen.add(w)

        # crude POS guess from word shape
        if w in ATTESTED_MAP:
            pos = ATTESTED_POS.get(w, "PART")
        elif re.search(r"(ar|er|ir)$", w):
            pos = "V"       # infinitive ending → verb
        elif w.endswith(("o","a","e","al","il","oso","osa","ivo","iva")) and not w.endswith(("os","as","es")):
            pos = "ADJ"     # adjective-like ending (excluding plurals)
        else:
            pos = "N"

        buckets[w[0]].append((w,pos))

    # pass 1: per-letter quota; pass 2: fill remaining capacity in letter order
    letters=[chr(c) for c in range(ord('a'),ord('z')+1)]+['Γ±']
    sel=[]
    for L in letters:
        if L in buckets: sel.extend(buckets[L][:per_letter_min])
    if len(sel)<n_max:
        for L in letters:
            if L in buckets:
                for itm in buckets[L][per_letter_min:]:
                    if len(sel)>=n_max: break
                    sel.append(itm)
            if len(sel)>=n_max: break
    # guarantee attested entries survive selection
    for es_fixed in ATTESTED_MAP:
        if all(es_fixed!=es for es,_ in sel):
            sel.append((es_fixed, ATTESTED_POS.get(es_fixed,"N")))
    return sel[:n_max]
|
| 186 |
+
|
| 187 |
+
# ───────────── Projection + CSV ─────────────
def project(rows):
    """Project (es, pos) pairs into full dictionary records.

    Attested words use their fixed NI form (empty string = Ø target, which
    gets no tokens/keys); all others receive a generated lemma, re-salted on
    collision so NI lemmas stay unique.  Returns records sorted by
    (es, pos_es).
    """
    used=set(ATTESTED_MAP.values()); out=[]   # NI lemmas already claimed
    for es,pos in rows:
        es=es.strip().lower(); pos=(ATTESTED_POS.get(es) or pos or "N").upper()
        if es in ATTESTED_MAP:
            ni=ATTESTED_MAP[es]
            tok=tokens_from_latin(ni) if ni else ""    # Ø target → no signary line
            keys=georgeos_keys(tok, ni) if ni else ""
            out.append({"es":es,"pos_es":pos,"ni_lemma":ni,"pos_ni":"",
                        "evidencia":"consenso/inscripciΓ³n","fuente":"β","autor":"β",
                        "ni_tokens":tok,"georgeos_keys":keys})
            used.add(ni); continue
        # conjectural entry: regenerate with an incrementing salt until unique
        ni,pos_ni=build_ni(es,pos); salt=0
        while ni in used:
            salt+=1; ni,pos_ni=build_ni(es+f":{salt}",pos)
            if salt>64: break   # give up on uniqueness after 64 attempts
        used.add(ni)
        tok=tokens_from_latin(ni); keys=georgeos_keys(tok, ni)
        out.append({"es":es,"pos_es":pos,"ni_lemma":ni,"pos_ni":pos_ni,
                    "evidencia":"conjetural","fuente":"β","autor":"β",
                    "ni_tokens":tok,"georgeos_keys":keys})
    out.sort(key=lambda d:(d["es"],d["pos_es"]))
    return out
|
| 211 |
+
|
| 212 |
+
# Build the balanced vocabulary and project it into dictionary records.
rows = get_spanish_vocab_balanced(N_MAX, ZIPF_MIN, PER_LETTER_MIN)
mapped = project(rows)
|
| 214 |
+
|
| 215 |
+
def write_csv(path, rows, fields):
|
| 216 |
+
with open(path,"w",newline="",encoding="utf-8") as f:
|
| 217 |
+
w=csv.DictWriter(f, fieldnames=fields); w.writeheader()
|
| 218 |
+
for r in rows: w.writerow({k:r.get(k,"") for k in fields})
|
| 219 |
+
|
| 220 |
+
# Output paths inside OUT_DIR.
csv_es_ni = os.path.join(OUT_DIR,"diccionario_es_neoibero.csv")
csv_ni_es = os.path.join(OUT_DIR,"diccionario_neoibero_es.csv")
csv_pairs = os.path.join(OUT_DIR,"hf_pairs.csv")

# ES→NI dictionary with full metadata.
write_csv(csv_es_ni, mapped,
          ["es","pos_es","ni_lemma","pos_ni","evidencia","fuente","autor","ni_tokens","georgeos_keys"])
# Inverse (NI→ES) view, sorted by NI lemma.
inv=[{"ni_lemma":r["ni_lemma"],"es":r["es"],"pos_ni":r["pos_ni"],"pos_es":r["pos_es"],
      "evidencia":r["evidencia"],"fuente":r["fuente"],"autor":r["autor"]} for r in mapped]
inv.sort(key=lambda x:(x["ni_lemma"],x["es"]))
write_csv(csv_ni_es, inv, ["ni_lemma","es","pos_ni","pos_es","evidencia","fuente","autor"])
# Plain source/target pairs (e.g. for a Hugging Face dataset).
write_csv(csv_pairs, [{"source_es":r["es"],"target_ni":r["ni_lemma"]} for r in mapped],
          ["source_es","target_ni"])
print("CSV OK β Entradas:", len(mapped))
print("CSV en:", csv_es_ni, "|", csv_ni_es, "|", csv_pairs)
|
| 234 |
+
|
| 235 |
+
# ───────────── PDF (Platypus, two columns, no overlaps) ─────────────
from reportlab.lib.pagesizes import A4
from reportlab.lib.units import mm
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.platypus import BaseDocTemplate, PageTemplate, Frame, Paragraph, Spacer, KeepTogether
from reportlab.lib.styles import ParagraphStyle
from reportlab.lib.enums import TA_LEFT
from google.colab import files

# Interactive font upload (Colab): the signary font plus a Unicode latin font.
print("Sube 1) Iberia-Georgeos.ttf y 2) una fuente Unicode (DejaVuSans.ttf / NotoSans-Regular.ttf):")
up_fonts = files.upload()
# Signary font: any uploaded .ttf/.otf whose filename mentions "georgeos".
font_sign = next(k for k in up_fonts if k.lower().endswith((".ttf",".otf")) and "georgeos" in k.lower())
# Latin font: look for a DejaVu/Noto/"unicode" font among the uploads.
font_lat = None
for k in up_fonts:
    nm=k.lower()
    if nm.endswith((".ttf",".otf")) and ("dejavu" in nm or "noto" in nm or "unicode" in nm) and "georgeos" not in nm:
        font_lat = k; break
if font_lat is None:
    # Nothing suitable in the first batch — prompt for a second upload.
    print("Sube ahora una fuente Unicode para la lΓnea latina (DejaVuSans.ttf / NotoSans-Regular.ttf):")
    up2 = files.upload()
    font_lat = next(k for k in up2 if k.lower().endswith((".ttf",".otf")))
pdfmetrics.registerFont(TTFont("IberiaGeorgeos", font_sign))
pdfmetrics.registerFont(TTFont("UniLatin", font_lat))
| 259 |
+
|
| 260 |
+
def clean_keys(s):
    """Normalise a georgeos_keys string for rendering with the signary font.

    Converts separators to " / ", strips every character that is not a key
    glyph, collapses whitespace, and uppercases everything except 'r'
    (which encodes Ε).  None/empty input yields "".
    """
    text = s or ""
    text = text.replace("β", " ").replace("β", " / ").replace("-", " / ")
    text = text.replace("β’", "Β·")
    text = re.sub(r"[^A-Za-z r/\\Β·\.,;:]", " ", text)   # keep only key characters
    text = re.sub(r"\s*/\s*", " / ", text)               # uniform separator spacing
    text = re.sub(r"\s+", " ", text).strip()
    return "".join(c if c == "r" else c.upper() for c in text)
|
| 269 |
+
|
| 270 |
+
# Font sizes (pt): Spanish headword, NI line, signary line.
LINE_ES, LINE_NI, LINE_SIG = 10, 10, 18
style_es = ParagraphStyle("es", fontName="UniLatin", fontSize=LINE_ES, leading=LINE_ES*1.2, alignment=TA_LEFT, spaceAfter=0)
style_ni = ParagraphStyle("ni", fontName="UniLatin", fontSize=LINE_NI, leading=LINE_NI*1.2, alignment=TA_LEFT, spaceAfter=2)
style_sig = ParagraphStyle("sig", fontName="IberiaGeorgeos", fontSize=LINE_SIG, leading=LINE_SIG*1.08,alignment=TA_LEFT, spaceAfter=4)

# Two-column A4 geometry: margin M, inter-column gap GAP.
PAGE_W, PAGE_H = A4; M, GAP, COLS = 16*mm, 8*mm, 2
COL_W = (PAGE_W - 2*M - (COLS-1)*GAP) / COLS
frames = [Frame(M + i*(COL_W+GAP), M, COL_W, PAGE_H - 2*M, id=f"col{i}") for i in range(COLS)]
|
| 278 |
+
def on_page(canvas, doc):
    """Platypus page callback: draw the running title just above the top margin."""
    title = "Diccionario EspaΓ±ol β NeoΓbero (conlang; atestiguado vs conjetural)"
    canvas.setFont("UniLatin", 12)
    canvas.drawString(M, PAGE_H - M + 2, title)
|
| 281 |
+
|
| 282 |
+
# Two-column document; each dictionary entry is wrapped in KeepTogether so it
# never splits across a column or page boundary.
doc = BaseDocTemplate("Diccionario_ES_Neoibero.pdf", pagesize=A4, leftMargin=M, rightMargin=M, topMargin=M, bottomMargin=M)
doc.addPageTemplates(PageTemplate(id="TwoCol", frames=frames, onPage=on_page))

story=[]
for r in mapped:
    es = r["es"]; pos = r["pos_es"]; ni = r["ni_lemma"]
    keys = clean_keys(r.get("georgeos_keys",""))
    block = [Paragraph(f"<b>{es}</b>", style_es),
             Paragraph(f"[{pos}] β {ni if ni!='' else 'Γ'}", style_ni)]
    if keys:  # Ø target → no Iberian signary line
        block.append(Paragraph(keys, style_sig))
    block.append(Spacer(1, 4))
    story.append(KeepTogether(block))
doc.build(story)
|
| 296 |
+
|
| 297 |
+
# ───────────── Copy CSVs to /content and download everything ─────────────
from google.colab import files as _f
# Publication filenames in the /content root.
root_csv_es_ni = "Diccionario_ES_Neoibero.csv"
root_csv_ni_es = "Diccionario_Neoibero_ES.csv"
root_csv_pairs = "HF_Pairs_ES_NI.csv"
shutil.copyfile(csv_es_ni, root_csv_es_ni)
shutil.copyfile(csv_ni_es, root_csv_ni_es)
shutil.copyfile(csv_pairs, root_csv_pairs)

print("Copiados a /content:")
print(os.path.abspath(root_csv_es_ni))
print(os.path.abspath(root_csv_ni_es))
print(os.path.abspath(root_csv_pairs))
print("PDF:", os.path.abspath("Diccionario_ES_Neoibero.pdf"))

# Auto-download (comment out these four lines to skip downloading now)
_f.download(root_csv_es_ni)
_f.download(root_csv_ni_es)
_f.download(root_csv_pairs)
_f.download("Diccionario_ES_Neoibero.pdf")

print("Listo οΏ½οΏ½οΏ½")
|