LoloSemper commited on
Commit
82fcbaa
·
verified ·
1 Parent(s): 15f09de

Upload 3 files

Browse files
Files changed (3) hide show
  1. HF_Pairs_ES_NI.csv +0 -0
  2. Iberia-Georgeos.ttf +0 -0
  3. app.py +318 -0
HF_Pairs_ES_NI.csv ADDED
The diff for this file is too large to render. See raw diff
 
Iberia-Georgeos.ttf ADDED
Binary file (6.58 kB). View file
 
app.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # %%capture
2
+ # Neoíbero — ES→NI con wordfreq (sin spaCy/pandas) + CSV + PDF (Platypus) + copias a /content
3
+ !pip -q install wordfreq==3.0 reportlab==4.2.2
4
+
5
+ # ───────────── Config ─────────────
6
+ N_MAX = 33000
7
+ ZIPF_MIN = 3.0
8
+ PER_LETTER_MIN = 120
9
+ GLOBAL_SALT = "neoibero_v1.7_particles_keysfix"
10
+ OUT_DIR = "salida"
11
+
12
+ import os, re, csv, hashlib, shutil
13
+ from collections import defaultdict
14
+ os.makedirs(OUT_DIR, exist_ok=True)
# ----- Attested lexicon (KEEP list) -----
# Spanish lemma -> attested Neo-Iberian form; "" means "no overt form".
ATTESTED_MAP = {
    # Basic numerals and quantifiers
    "uno": "ban", "dos": "bi", "tres": "irur", "cuatro": "laur",
    "cinco": "borste", "seis": "Ε›ei", "siete": "sisbi", "ocho": "sorse",
    "diez": "abaΕ•", "veinte": "oΕ•kei",
    # Particles
    "y": "ne", "o": "o", "no": "eΕ›",
    "a": "ka",       # directional base; the translator handles dative (mi) and DOM (te)
    "para": "ka",
    # Determination
    "el": "", "la": "", "los": "", "las": "",            # definite article -> zero
    "un": "ban", "una": "ban", "unos": "", "unas": "",   # indef. sg. -> ban; pl. -> zero by default
    "este": "aΕ•e", "esta": "aΕ•e", "estos": "aΕ•e", "estas": "aΕ•e",
    # Spanish contractions are resolved by rule; zero here keeps the dictionary clean
    "al": "", "del": "",
    # Documented lexicon (examples)
    "cuervo": "belai", "perdiz": "ebee", "cereal": "bars", "tributo": "kebel",
    "medida eku": "eku", "medida kitev": "kitei",
}

# Part-of-speech tag for every attested lemma.
ATTESTED_POS = {
    **{w: "NUM" for w in ["uno", "dos", "tres", "cuatro", "cinco", "seis",
                          "siete", "ocho", "diez", "veinte"]},
    **{w: "PART" for w in ["y", "o", "no", "a", "para", "al", "del"]},
    **{w: "DET" for w in ["el", "la", "los", "las", "un", "una", "unos",
                          "unas", "este", "esta", "estos", "estas"]},
    **{w: "N" for w in ["cuervo", "perdiz", "cereal", "tributo",
                        "medida eku", "medida kitev"]},
}

# Default provenance used when no per-entry source is recorded.
ATTESTED_SOURCE = {"default": ("consenso/compendiado", "β€”")}
# ----- STOP list: discard unless the word is also in the attested KEEP map -----
STOP_SKIP = {
    # articles / contractions
    "el", "la", "los", "las", "lo", "un", "una", "unos", "unas", "al", "del",
    # frequent prepositions and conjunctions
    "de", "en", "con", "sin", "por", "sobre", "entre", "hasta", "desde",
    "hacia", "segΓΊn", "tras",
    "pero", "aunque", "sino", "que", "como", "si", "porque", "cuando",
    "donde", "mientras",
    # very general adverbs
    "muy", "ya", "sΓ­", "no", "tambiΓ©n", "solo", "sΓ³lo", "aΓΊn", "aun",
    "mΓ‘s", "menos",
    # determiners / pronouns
    "mi", "mis", "tu", "tus", "su", "sus", "nuestro", "nuestra",
    "nuestros", "nuestras",
    "esto", "eso", "aquello", "ese", "esa", "esos", "esas", "aquel",
    "aquella", "aquellos", "aquellas",
    "quien", "quiΓ©n", "cual", "cuΓ‘l", "cuales", "cuΓ‘les", "cuyo", "cuya",
    "cuyos", "cuyas",
    # interjections
    "eh", "ay", "oh", "uy", "ah", "aja", "jeje", "jaja", "aah", "ahh",
    "ohh", "uhh",
}
# ----- Neo-Iberian root generator -----
V = ["a", "e", "i", "o", "u"]
C_CORE = ["b", "d", "t", "g", "k", "s", "Ε›", "l", "r", "Ε•", "n", "m"]
CODAS = ["", "n", "s", "Ε›", "r", "Ε•", "l", "m", "k", "t"]
NOMINALIZERS = ["-ar", "-en", "-tu", "-la", "-Ε•a", "-si"]
VERBALIZERS = ["-ke", "-ta", "-ni", "-bo", "-ri"]
ADJ_SUFFIX = "-si"


def legal_onset(c):
    """A syllable may not begin with a rhotic."""
    return c not in ("r", "Ε•")


def pick(seq, key):
    """Deterministically choose an element of seq from a SHA-256 hash of key."""
    digest = hashlib.sha256(key.encode()).digest()
    return seq[int.from_bytes(digest[:4], "big") % len(seq)]


def gen_syll(seed, i):
    """Build the i-th pseudo-random onset+vowel+coda syllable for seed."""
    onset = pick([""] + [c for c in C_CORE if legal_onset(c)], f"{seed}:on:{i}")
    nucleus = pick(V, f"{seed}:v:{i}")
    coda = pick(CODAS, f"{seed}:co:{i}")
    return onset + nucleus + coda


def bad_boundary(a, b):
    """Reject sibilant/rhotic clashes across a syllable boundary."""
    if not a or not b:
        return False
    return (a[-1], b[0]) in {("s", "Ε›"), ("Ε›", "s"), ("r", "Ε•"), ("Ε•", "r")}


def make_root(seed):
    """Derive a 1-3 syllable root for seed, retrying around phonotactic clashes.

    Falls back to the minimal root "ba" if 32 attempts all fail.
    """
    n = int(pick(["1", "2", "2", "2", "3", "3"], f"{seed}:n"))
    for attempt in range(32):
        parts = []
        valid = True
        for i in range(n):
            syl = gen_syll(seed, i + attempt)
            # No rhotic onset on the first syllable.
            if i == 0 and syl and syl[0] in ("r", "Ε•"):
                valid = False
                break
            # No duplicated syllables or illegal clusters at the seam.
            if parts and (parts[-1] == syl or bad_boundary(parts[-1], syl)):
                valid = False
                break
            parts.append(syl)
        if valid:
            return "".join(parts) or "ba"
    return "ba"
def build_ni(es, pos):
    """Map a Spanish lemma to a generated NI lemma and its NI POS tag.

    The root is derived deterministically from the lemma plus GLOBAL_SALT;
    verbs, adjectives, and nouns each take their own suffix family.
    """
    seed = es + "|" + GLOBAL_SALT
    root = make_root(seed)
    if pos == "V":
        return root + pick(VERBALIZERS, seed + ":V"), "V"
    if pos == "ADJ":
        return root + ADJ_SUFFIX, "ADJ"
    return root + pick(NOMINALIZERS, seed + ":N"), "N"
# ----- Signary (β€Ή...β€Ί tokens -> Iberia-Georgeos keys) -----
# Syllabic CV signs for the plosive series, indexed by vowel (a,e,i,o,u).
SYL_FOR = {
    "b": ["β€ΉBAβ€Ί", "β€ΉBEβ€Ί", "β€ΉBIβ€Ί", "β€ΉBOβ€Ί", "β€ΉBUβ€Ί"],
    "d": ["β€ΉDAβ€Ί", "β€ΉDEβ€Ί", "β€ΉDIβ€Ί", "β€ΉDOβ€Ί", "β€ΉDUβ€Ί"],
    "t": ["β€ΉTAβ€Ί", "β€ΉTEβ€Ί", "β€ΉTIβ€Ί", "β€ΉTOβ€Ί", "β€ΉTUβ€Ί"],
    "g": ["β€ΉGAβ€Ί", "β€ΉGEβ€Ί", "β€ΉGIβ€Ί", "β€ΉGOβ€Ί", "β€ΉGUβ€Ί"],
    "k": ["β€ΉKAβ€Ί", "β€ΉKEβ€Ί", "β€ΉKIβ€Ί", "β€ΉKOβ€Ί", "β€ΉKUβ€Ί"],
}
# Alphabetic signs for vowels and continuants.
# FIX: the "s" entry was corrupted/truncated in the source ("β€ΉSοΏ½οΏ½οΏ½", missing
# the closing delimiter); restored to "β€ΉSβ€Ί" to match CODA_FOR["s"] and the
# β€Ή...β€Ί token pattern that georgeos_keys' regex extracts.
ALPHA_FOR = {
    "a": "β€ΉAβ€Ί", "e": "β€ΉEβ€Ί", "i": "β€ΉIβ€Ί", "o": "β€ΉOβ€Ί", "u": "β€ΉUβ€Ί",
    "s": "β€ΉSβ€Ί", "Ε›": "β€ΉΕšβ€Ί", "l": "β€ΉLβ€Ί", "r": "β€ΉRβ€Ί", "Ε•": "β€ΉΕ”β€Ί",
    "n": "β€ΉNβ€Ί", "m": "β€ΉMβ€Ί",
}
# Coda consonant signs; the empty coda maps to the empty string.
CODA_FOR = {
    "": "", "n": "β€ΉNβ€Ί", "s": "β€ΉSβ€Ί", "Ε›": "β€ΉΕšβ€Ί", "r": "β€ΉRβ€Ί",
    "Ε•": "β€ΉΕ”β€Ί", "l": "β€ΉLβ€Ί", "m": "β€ΉMβ€Ί", "k": "β€ΉKβ€Ί", "t": "β€ΉTβ€Ί",
}
def tokens_from_latin(ni):
    """Transliterate a Latin NI lemma into β€Ή...β€Ί signary tokens.

    Scans left to right: '-' becomes a dash separator, plosive+vowel pairs
    become syllabic signs (greedily absorbing a following coda consonant),
    and any other character falls back to its alphabetic sign, or itself
    when no sign exists.
    """
    pieces = []
    i = 0
    length = len(ni)
    while i < length:
        ch = ni[i]
        if ch == "-":
            # Morpheme separator.
            pieces.append("–")
            i += 1
        elif ch in V:
            pieces.append(ALPHA_FOR[ch])
            i += 1
        elif ch in SYL_FOR and i + 1 < length and ni[i + 1] in V:
            sign = SYL_FOR[ch]["aeiou".index(ni[i + 1])]
            nxt = ni[i + 2] if i + 2 < length else ""
            # NOTE(review): a following consonant is always read as a coda,
            # even when it might be the next syllable's onset — confirm this
            # greediness is intended.
            if nxt in CODA_FOR and nxt != "":
                pieces.append(sign + CODA_FOR[nxt])
                i += 3
            else:
                pieces.append(sign)
                i += 2
        else:
            pieces.append(ALPHA_FOR.get(ch, ch))
            i += 1
    return "".join(pieces)
# ----- Hand-curated key overrides for specific short attested forms -----
KEYS_OVERRIDE = {
    "ka": "K",   # KA
    "mi": "MI",  # M + I
    "te": "TE",  # T + E
    "ne": "N",   # NA (ligature), simplified
    "o": "O",
    "eΕ›": "X",   # Ś -> X (convention)
}
def georgeos_keys(token_str, ni_plain):
    """Collapse β€Ή...β€Ί signary tokens into Iberia-Georgeos keyboard strokes.

    Attested short lemmas take a hand-curated override; otherwise each
    extracted token is reduced to one key (CV syllables to their consonant).
    """
    lemma = (ni_plain or "").lower()
    override = KEYS_OVERRIDE.get(lemma)
    if override is not None:
        return override
    keys = []
    for tok in re.findall(r"β€Ή(.*?)β€Ί", token_str):
        if len(tok) == 2 and tok[0] in "BDTGK":
            keys.append(tok[0])       # CV syllable -> B/D/T/G/K
        elif tok in ("A", "E", "I", "O", "U"):
            keys.append(tok)
        elif tok == "Ś":
            keys.append("X")
        elif tok == "Ε”":
            keys.append("r")          # lowercase r stands for the Ε” sign
        else:
            keys.append(tok[0].upper())
    return "".join(keys)
# ----- Spanish vocabulary (wordfreq) with balancing filters -----
from wordfreq import top_n_list, zipf_frequency


def get_spanish_vocab_balanced(n_max, zipf_min, per_letter_min=120):
    """Select up to n_max frequent Spanish lemmas, balanced by initial letter.

    Pipeline: take the wordfreq top list, keep words above the Zipf
    threshold, drop stopwords and noisy tokens (attested lemmas exempt),
    guess a rough POS from the ending, bucket by first letter, guarantee
    per_letter_min entries per letter, top up to n_max, then force-include
    every attested lemma.
    """
    candidates = top_n_list("es", 200000)
    candidates = [w for w in candidates if zipf_frequency(w, "es") >= zipf_min]

    seen = set()
    buckets = defaultdict(list)
    for word in candidates:
        # Letters only (accented Spanish letters included).
        if not re.match(r"^[A-Za-zΓΓ‰ΓΓ“ΓšΓœΓ‘Γ‘Γ©Γ­Γ³ΓΊΓΌΓ±]+$", word):
            continue
        word = word.lower()

        # Stopword, unless it is an attested KEEP entry.
        if (word in STOP_SKIP) and (word not in ATTESTED_MAP):
            continue

        # Anti-noise filters (attested words are exempt).
        if word not in ATTESTED_MAP:
            if len(word) < 2:
                continue
            if re.search(r"(.)\1\1", word):          # aaa, ahhh...
                continue
            if not re.search(r"[aeiouÑéíóúü]", word):
                continue
            if re.fullmatch(r"(a+h+|ah+|eh+|oh+|uh+|uy+|ay+|ey+)", word):
                continue

        if word in seen:
            continue
        seen.add(word)

        # Crude POS guess from the lemma ending.
        if word in ATTESTED_MAP:
            pos = ATTESTED_POS.get(word, "PART")
        elif re.search(r"(ar|er|ir)$", word):
            pos = "V"
        elif word.endswith(("o", "a", "e", "al", "il", "oso", "osa",
                            "ivo", "iva")) and not word.endswith(("os", "as", "es")):
            pos = "ADJ"
        else:
            pos = "N"

        buckets[word[0]].append((word, pos))

    letters = [chr(code) for code in range(ord('a'), ord('z') + 1)] + ['Γ±']

    # First pass: honor the per-letter quota.
    selection = []
    for letter in letters:
        if letter in buckets:
            selection.extend(buckets[letter][:per_letter_min])

    # Second pass: top up to n_max with the remainder of each bucket.
    if len(selection) < n_max:
        for letter in letters:
            if letter in buckets:
                for item in buckets[letter][per_letter_min:]:
                    if len(selection) >= n_max:
                        break
                    selection.append(item)
                if len(selection) >= n_max:
                    break

    # Always include every attested lemma, even past the quota.
    for attested in ATTESTED_MAP:
        if all(attested != es for es, _ in selection):
            selection.append((attested, ATTESTED_POS.get(attested, "N")))
    return selection[:n_max]
# ----- Projection ES -> NI and dictionary-row assembly -----
def project(rows):
    """Turn (spanish, pos) pairs into dictionary rows, attested or generated.

    Attested lemmas keep their documented NI form; every other lemma gets a
    deterministic generated one, re-salted on collision (up to 64 retries)
    so NI forms stay unique. Rows come back sorted by (es, pos_es).
    """
    used = set(ATTESTED_MAP.values())
    entries = []
    for es, pos in rows:
        es = es.strip().lower()
        pos = (ATTESTED_POS.get(es) or pos or "N").upper()

        if es in ATTESTED_MAP:
            ni = ATTESTED_MAP[es]
            tok = tokens_from_latin(ni) if ni else ""
            keys = georgeos_keys(tok, ni) if ni else ""
            entries.append({
                "es": es, "pos_es": pos, "ni_lemma": ni, "pos_ni": "",
                "evidencia": "consenso/inscripciΓ³n", "fuente": "β€”", "autor": "β€”",
                "ni_tokens": tok, "georgeos_keys": keys,
            })
            used.add(ni)
            continue

        # Generated lemma; bump a salt until the NI form is unique.
        ni, pos_ni = build_ni(es, pos)
        salt = 0
        while ni in used:
            salt += 1
            ni, pos_ni = build_ni(es + f":{salt}", pos)
            if salt > 64:
                break
        used.add(ni)
        tok = tokens_from_latin(ni)
        keys = georgeos_keys(tok, ni)
        entries.append({
            "es": es, "pos_es": pos, "ni_lemma": ni, "pos_ni": pos_ni,
            "evidencia": "conjetural", "fuente": "β€”", "autor": "β€”",
            "ni_tokens": tok, "georgeos_keys": keys,
        })
    entries.sort(key=lambda row: (row["es"], row["pos_es"]))
    return entries
# Build the balanced vocabulary and project it onto Neo-Iberian.
rows = get_spanish_vocab_balanced(N_MAX, ZIPF_MIN, PER_LETTER_MIN)
mapped = project(rows)
def write_csv(path, rows, fields):
    """Write rows (dicts) to path as a UTF-8 CSV with header fields.

    Keys missing from a row are emitted as empty strings.
    """
    with open(path, "w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=fields)
        writer.writeheader()
        for row in rows:
            writer.writerow({field: row.get(field, "") for field in fields})
# ----- Write the three CSV artifacts -----
csv_es_ni = os.path.join(OUT_DIR, "diccionario_es_neoibero.csv")
csv_ni_es = os.path.join(OUT_DIR, "diccionario_neoibero_es.csv")
csv_pairs = os.path.join(OUT_DIR, "hf_pairs.csv")

# ES -> NI with full metadata.
write_csv(csv_es_ni, mapped,
          ["es", "pos_es", "ni_lemma", "pos_ni", "evidencia", "fuente",
           "autor", "ni_tokens", "georgeos_keys"])

# NI -> ES, inverted and re-sorted by (ni_lemma, es).
inv = [{"ni_lemma": r["ni_lemma"], "es": r["es"], "pos_ni": r["pos_ni"],
        "pos_es": r["pos_es"], "evidencia": r["evidencia"],
        "fuente": r["fuente"], "autor": r["autor"]} for r in mapped]
inv.sort(key=lambda x: (x["ni_lemma"], x["es"]))
write_csv(csv_ni_es, inv,
          ["ni_lemma", "es", "pos_ni", "pos_es", "evidencia", "fuente", "autor"])

# Bare source/target pairs (HF-style training data).
write_csv(csv_pairs,
          [{"source_es": r["es"], "target_ni": r["ni_lemma"]} for r in mapped],
          ["source_es", "target_ni"])

print("CSV OK β€” Entradas:", len(mapped))
print("CSV en:", csv_es_ni, "|", csv_ni_es, "|", csv_pairs)
# ----- PDF (Platypus, two columns, no overlaps) -----
from reportlab.lib.pagesizes import A4
from reportlab.lib.units import mm
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.platypus import (BaseDocTemplate, PageTemplate, Frame,
                                Paragraph, Spacer, KeepTogether)
from reportlab.lib.styles import ParagraphStyle
from reportlab.lib.enums import TA_LEFT
from google.colab import files

# Ask for the signary font plus a Unicode font for the Latin line.
print("Sube 1) Iberia-Georgeos.ttf y 2) una fuente Unicode (DejaVuSans.ttf / NotoSans-Regular.ttf):")
up_fonts = files.upload()
font_sign = next(k for k in up_fonts
                 if k.lower().endswith((".ttf", ".otf")) and "georgeos" in k.lower())
font_lat = None
for uploaded in up_fonts:
    lower_name = uploaded.lower()
    if (lower_name.endswith((".ttf", ".otf"))
            and ("dejavu" in lower_name or "noto" in lower_name or "unicode" in lower_name)
            and "georgeos" not in lower_name):
        font_lat = uploaded
        break
if font_lat is None:
    # No Unicode font in the first batch: prompt again.
    print("Sube ahora una fuente Unicode para la lΓ­nea latina (DejaVuSans.ttf / NotoSans-Regular.ttf):")
    up2 = files.upload()
    font_lat = next(k for k in up2 if k.lower().endswith((".ttf", ".otf")))

pdfmetrics.registerFont(TTFont("IberiaGeorgeos", font_sign))
pdfmetrics.registerFont(TTFont("UniLatin", font_lat))
def clean_keys(s):
    """Normalize a key string for the signary font line.

    Dashes become ' / ' separators, characters outside the allowed key
    alphabet are dropped, whitespace is collapsed, and everything is
    uppercased except 'r' (which stands for the Ε” sign).
    """
    text = s or ""
    text = text.replace("β€”", " ").replace("–", " / ").replace("-", " / ")
    text = text.replace("β€’", "Β·")
    text = re.sub(r"[^A-Za-z r/\\Β·\.,;:]", " ", text)
    text = re.sub(r"\s*/\s*", " / ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return "".join(ch if ch == "r" else ch.upper() for ch in text)
# ----- Paragraph styles and the two-column page template -----
LINE_ES, LINE_NI, LINE_SIG = 10, 10, 18   # font sizes: Spanish, NI, signary
style_es = ParagraphStyle("es", fontName="UniLatin", fontSize=LINE_ES,
                          leading=LINE_ES * 1.2, alignment=TA_LEFT, spaceAfter=0)
style_ni = ParagraphStyle("ni", fontName="UniLatin", fontSize=LINE_NI,
                          leading=LINE_NI * 1.2, alignment=TA_LEFT, spaceAfter=2)
style_sig = ParagraphStyle("sig", fontName="IberiaGeorgeos", fontSize=LINE_SIG,
                           leading=LINE_SIG * 1.08, alignment=TA_LEFT, spaceAfter=4)

PAGE_W, PAGE_H = A4
M, GAP, COLS = 16 * mm, 8 * mm, 2   # page margin, column gap, column count
COL_W = (PAGE_W - 2 * M - (COLS - 1) * GAP) / COLS
frames = [Frame(M + i * (COL_W + GAP), M, COL_W, PAGE_H - 2 * M, id=f"col{i}")
          for i in range(COLS)]


def on_page(canvas, doc):
    """Draw the running title at the top of every page."""
    canvas.setFont("UniLatin", 12)
    canvas.drawString(M, PAGE_H - M + 2,
                      "Diccionario EspaΓ±ol β†’ NeoΓ­bero (conlang; atestiguado vs conjetural)")


doc = BaseDocTemplate("Diccionario_ES_Neoibero.pdf", pagesize=A4,
                      leftMargin=M, rightMargin=M, topMargin=M, bottomMargin=M)
doc.addPageTemplates(PageTemplate(id="TwoCol", frames=frames, onPage=on_page))
# ----- Assemble the story: one kept-together block per dictionary entry -----
story = []
for r in mapped:
    es, pos, ni = r["es"], r["pos_es"], r["ni_lemma"]
    keys = clean_keys(r.get("georgeos_keys", ""))
    block = [
        Paragraph(f"<b>{es}</b>", style_es),
        Paragraph(f"[{pos}] β€” {ni if ni != '' else 'Ø'}", style_ni),
    ]
    if keys:  # a zero (empty) target gets no Iberian line
        block.append(Paragraph(keys, style_sig))
    block.append(Spacer(1, 4))
    story.append(KeepTogether(block))
doc.build(story)
# ----- Copy the CSVs to /content and download everything -----
from google.colab import files as _f

root_csv_es_ni = "Diccionario_ES_Neoibero.csv"
root_csv_ni_es = "Diccionario_Neoibero_ES.csv"
root_csv_pairs = "HF_Pairs_ES_NI.csv"
shutil.copyfile(csv_es_ni, root_csv_es_ni)
shutil.copyfile(csv_ni_es, root_csv_ni_es)
shutil.copyfile(csv_pairs, root_csv_pairs)

print("Copiados a /content:")
print(os.path.abspath(root_csv_es_ni))
print(os.path.abspath(root_csv_ni_es))
print(os.path.abspath(root_csv_pairs))
print("PDF:", os.path.abspath("Diccionario_ES_Neoibero.pdf"))

# Auto-download (comment these four lines out to skip downloading now).
_f.download(root_csv_es_ni)
_f.download(root_csv_ni_es)
_f.download(root_csv_pairs)
_f.download("Diccionario_ES_Neoibero.pdf")

# NOTE(review): the trailing characters below look mojibake-corrupted
# (probably an emoji in the original) — confirm against the source file.
print("Listo οΏ½οΏ½οΏ½")