Commit ·
6942462
1
Parent(s): e68e0f6
Upload 7 files
Browse files- .gitattributes +2 -0
- HF_Pairs_ES_NI.csv +0 -0
- HF_Pairs_ES_NI_RICH.csv +0 -0
- Iberia-Georgeos.ttf +0 -0
- app.py +1676 -0
- requirements.txt +6 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
HF_Pairs_ES_NI_RICH_v4.csv filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
HF_Pairs_NI_ES_Translator_v4.csv filter=lfs diff=lfs merge=lfs -text
|
HF_Pairs_ES_NI.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
HF_Pairs_ES_NI_RICH.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Iberia-Georgeos.ttf
ADDED
|
Binary file (5.72 kB). View file
|
|
|
app.py
ADDED
|
@@ -0,0 +1,1676 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# app.py — Traductor Español ↔ Neoíbero v4.4.1 FIXED
|
| 2 |
+
# UI clásica (v2.3 LTS) + motor bidireccional v4.4, con Línea Ibérica y Locución
|
| 3 |
+
# 2025-01 – Actualizado para CSVs v4.4 ULTRA-DEFINITIVO + FIXES CRÍTICOS
|
| 4 |
+
# Cambios v4.4:
|
| 5 |
+
# - Compatible con HF_Pairs_ES_NI_RICH_v4.csv (783K pares)
|
| 6 |
+
# - Compatible con HF_Pairs_NI_ES_Translator_v4.csv (783K pares)
|
| 7 |
+
# - Usa campos nuevos: ni_surface, ni_tam, ni_pn
|
| 8 |
+
# - Números 1-100 invariables funcionando
|
| 9 |
+
# - Subjuntivos irregulares corregidos (vengas ✅)
|
| 10 |
+
# - RESPETA caracteres especiales (ŕ, ś) completamente
|
| 11 |
+
# Cambios v4.4.1 FIXED:
|
| 12 |
+
# - FIX: Carga correcta de CSV NI→ES (índices y minúsculas corregidos)
|
| 13 |
+
# - FIX: Sistema de conjugación con persona/número completo (1S,2S,3S,1P,2P,3P)
|
| 14 |
+
# - FIX: Interrogativas y exclamativas (-na/-ba) mejoradas
|
| 15 |
+
# - FIX: Preservación de nombres propios en traducción NI→ES
|
| 16 |
+
|
| 17 |
+
import gradio as gr
|
| 18 |
+
import os, csv, re, base64, unicodedata
|
| 19 |
+
import torch
|
| 20 |
+
from transformers import AutoProcessor, VitsModel
|
| 21 |
+
import numpy as np
|
| 22 |
+
from html import escape # ← para escapar la línea ibérica en HTML
|
| 23 |
+
|
| 24 |
+
# Caches locales (si existen)
|
| 25 |
+
os.environ['TRANSFORMERS_CACHE'] = os.environ.get('TRANSFORMERS_CACHE', '/tmp/cache')
|
| 26 |
+
os.environ['HF_HOME'] = os.environ.get('HF_HOME', '/tmp/hf')
|
| 27 |
+
|
| 28 |
+
DEBUG_MODE = False  # flip to True to get [DEBUG] traces on stdout

def debug_print(msg):
    """Emit *msg* to stdout with a [DEBUG] prefix when DEBUG_MODE is set."""
    if not DEBUG_MODE:
        return
    print(f"[DEBUG] {msg}")
|
| 32 |
+
|
| 33 |
+
# =========================
# LEXICON AND STRUCTURES v4.4
# =========================
# << PATHS AT THE REPO ROOT >>
# Candidate ES→NI dictionary files; presumably tried in this order with
# the richest v4 file preferred — TODO confirm against the loader code.
CSV_CANDIDATES = [
    "HF_Pairs_ES_NI_RICH_v4.csv",  # ← NEW in v4.4
    "HF_Pairs_ES_NI_RICH.csv",
    "HF_Pairs_ES_NI.csv",
    "Diccionario_ES_Neoibero.csv",
]
# Candidate NI→ES dictionary files, same fallback scheme.
CSV_NI_ES = [
    "HF_Pairs_NI_ES_Translator_v4.csv",  # ← NEW in v4.4
    "HF_Pairs_NI_ES_Translator.csv",
]
|
| 47 |
+
|
| 48 |
+
# ES→NI lookup tables, populated from the CSVs at startup.
SURF_RICH = {}   # (es_lower, tag) -> ni_surface
LEX_FORM = {}    # es_form -> ni_lemma/surface
LEX_LEMMA = {}   # es_lemma -> ni_lemma
FOLD_FORM = {}   # es_form_no_diacritics -> ni_lemma
LEX_META = {}    # es_form/lemma -> {"pos":..., "tam_ok":...}
FORCE_KEYS = set()

# NI→ES lookup tables.
NI_TO_ES_SURF = {}   # (ni_surface, ni_tam) -> es_surface
NI_TO_ES_FORM = {}   # ni_form/root -> es_form
NI_TO_ES_LEMMA = {}  # ni_root -> es_lemma

# Neo-Iberian person suffix → Spanish person/number code.
NI_PERSON_MAP = {
    "-mu": "1S",  # yo
    "-su": "2S",  # tú
    "-i": "3S",   # él/ella
    "-gu": "1P",  # nosotros
    "-zu": "2P",  # vosotros
    "-te": "3P",  # ellos/ellas
}
|
| 70 |
+
|
| 71 |
+
# =========================
# MORPHOLOGY – SPANISH
# =========================
# Gerund / participle ending patterns.  NOTE: the participle pattern is
# deliberately loose ("to|so|cho" also match many non-verbal words);
# downstream rules are expected to tolerate the false positives.
RE_GER = re.compile(r"(ando|iendo|yendo)$", re.I)
RE_PART = re.compile(r"(ado|ido|to|so|cho)$", re.I)

# Personal-ending suffix tuples per paradigm (identical 1S/3S forms are
# merged, so tuples are not strictly one-entry-per-person).
FUT_END = ("é","ás","á","emos","éis","án")            # future (added to infinitive)
COND_END = ("ía","ías","ía","íamos","íais","ían")      # conditional
PRET_AR = ("é","aste","ó","amos","asteis","aron")      # preterite, -ar verbs
PRET_ERIR = ("í","iste","ió","imos","isteis","ieron")  # preterite, -er/-ir verbs
IMPF_AR = ("aba","abas","ábamos","abais","aban")       # imperfect, -ar verbs
IMPF_ERIR = ("ía","ías","íamos","íais","ían")          # imperfect, -er/-ir verbs
SUBJ_AR = ("e","es","e","emos","éis","en")             # present subjunctive, -ar
SUBJ_ERIR = ("a","as","a","amos","áis","an")           # present subjunctive, -er/-ir
SUBJ_PAST_AR = ("ara","aras","ara","áramos","arais","aran","ase","ases","ase","ásemos","aseis","asen")
SUBJ_PAST_ERIR = ("iera","ieras","iera","iéramos","ierais","ieran","iese","ieses","iese","iésemos","ieseis","iesen")
PRS_AR = ("o","as","a","amos","áis","an")              # present indicative, -ar
PRS_ER = ("o","es","e","emos","éis","en")              # present indicative, -er
PRS_IR = ("o","es","e","imos","ís","en")               # present indicative, -ir

# ——— Patches for accent-less FUT/COND typing + irregular stems ———
# e.g. "hablaria", "comerias" typed without the accent.
RE_COND_NT_REG = re.compile(r"(?:ar|er|ir)(?:ia|ias|iamos|iais|ian)$", re.I)  # hablaria, comerias...
RE_COND_NT_IRR = re.compile(r"(tendr|vendr|pondr|saldr|valdr|podr|habr|sabr|cabr|querr|dir|har)(?:ia|ias|iamos|iais|ian)$", re.I)
RE_FUT_NT_IRR = re.compile(r"(tendr|vendr|pondr|saldr|valdr|podr|habr|sabr|cabr|querr|dir|har)(?:re|ras|ra|remos|reis|ran)$", re.I)
|
| 95 |
+
|
| 96 |
+
def _strip_any(w, ends):
|
| 97 |
+
for s in sorted(ends, key=len, reverse=True):
|
| 98 |
+
if w.endswith(s):
|
| 99 |
+
return w[:-len(s)], s
|
| 100 |
+
return None, None
|
| 101 |
+
|
| 102 |
+
def _guess_class_from_ending(ending):
    """Infer the conjugation class from a personal ending.

    Returns "ar" when the ending belongs to any of the -ar paradigm
    tables, otherwise "er" (which also stands in for -ir verbs here).
    """
    is_ar_class = (ending in PRET_AR or ending in IMPF_AR
                   or ending in SUBJ_AR or ending in PRS_AR)
    return "ar" if is_ar_class else "er"
|
| 106 |
+
|
| 107 |
+
# Irregular Spanish form → lemma lookup used by guess_infinitive_es.
# FIX: the original literal contained duplicate keys ("di" mapped first
# to "dar" then to "decir"; "ven" first to "venir" then to "ver") — the
# earlier entries were dead code, silently overwritten at load time.
# The dead duplicates are removed; the surviving values reproduce the
# original runtime dictionary exactly.
IRREG_LEMMA = {
    # ser / ir / haber / estar
    "fui":"ir","fuiste":"ir","fue":"ir","fuimos":"ir","fuisteis":"ir","fueron":"ir",
    "voy":"ir","vas":"ir","va":"ir","vamos":"ir","vais":"ir","van":"ir",
    "soy":"ser","eres":"ser","es":"ser","somos":"ser","sois":"ser","son":"ser",
    "era":"ser","eras":"ser","éramos":"ser","erais":"ser","eran":"ser",
    "he":"haber","has":"haber","ha":"haber","hemos":"haber","habéis":"haber","han":"haber",
    "hube":"haber","hubo":"haber","hubimos":"haber","hubiste":"haber","hubisteis":"haber","hubieron":"haber",
    "estoy":"estar","estás":"estar","está":"estar","estamos":"estar","estáis":"estar","están":"estar",
    "estuve":"estar","estuviste":"estar","estuvo":"estar","estuvimos":"estar","estuvisteis":"estar","estuvieron":"estar",
    "estaba":"estar","estabas":"estar","estábamos":"estar","estabais":"estar","estaban":"estar",

    # Strong (irregular-stem) preterites
    "tuve":"tener","tuviste":"tener","tuvo":"tener","tuvimos":"tener","tuvisteis":"tener","tuvieron":"tener",
    "vine":"venir","viniste":"venir","vino":"venir","vinimos":"venir","vinisteis":"venir","vinieron":"venir",
    "hice":"hacer","hiciste":"hacer","hizo":"hacer","hicimos":"hacer","hicisteis":"hacer","hicieron":"hacer",
    "puse":"poner","pusiste":"poner","puso":"poner","pusimos":"poner","pusisteis":"poner","pusieron":"poner",
    "pude":"poder","pudiste":"poder","pudo":"poder","pudimos":"poder","pudisteis":"poder","pudieron":"poder",
    "quise":"querer","quisiste":"querer","quiso":"querer","quisimos":"querer","quisisteis":"querer","quisieron":"querer",
    "supe":"saber","supiste":"saber","supo":"saber","supimos":"saber","supisteis":"saber","supieron":"saber",
    "traje":"traer","trajiste":"traer","trajo":"traer","trajimos":"traer","trajisteis":"traer","trajeron":"traer",
    "dije":"decir","dijiste":"decir","dijo":"decir","dijimos":"decir","dijisteis":"decir","dijeron":"decir",
    "conduje":"conducir","condujiste":"conducir","condujo":"conducir","condujimos":"conducir","condujisteis":"conducir","condujeron":"conducir",
    "anduve":"andar","anduviste":"andar","anduvo":"andar","anduvimos":"andar","anduvisteis":"andar","anduvieron":"andar",
    "cupe":"caber","cupiste":"caber","cupo":"caber","cupimos":"caber","cupisteis":"caber","cupieron":"caber",
    # NOTE: "di" is ambiguous (preterite of dar / imperative of decir);
    # the "decir" reading below is the one that was effective originally.
    "diste":"dar","dio":"dar","dimos":"dar","disteis":"dar","dieron":"dar",
    "vi":"ver","viste":"ver","vio":"ver","vimos":"ver","visteis":"ver","vieron":"ver",

    # Irregular first-person presents
    "tengo":"tener","vengo":"venir","pongo":"poner","salgo":"salir","traigo":"traer","caigo":"caer",
    "hago":"hacer","oigo":"oír","digo":"decir","valgo":"valer","sigo":"seguir",

    # Stem-changing presents
    "tienes":"tener","tiene":"tener","tienen":"tener",
    "vienes":"venir","viene":"venir","vienen":"venir",
    "pienso":"pensar","piensas":"pensar","piensa":"pensar","piensan":"pensar",
    "quiero":"querer","quieres":"querer","quiere":"querer","quieren":"querer",
    "prefiero":"preferir","prefieres":"preferir","prefiere":"preferir","prefieren":"preferir",

    # Present subjunctives
    "vaya":"ir","vayas":"ir","vayamos":"ir","vayáis":"ir","vayan":"ir",
    "sea":"ser","seas":"ser","seamos":"ser","seáis":"ser","sean":"ser",
    "haya":"haber","hayas":"haber","hayamos":"haber","hayáis":"haber","hayan":"haber",
    "dé":"dar","des":"dar","demos":"dar","deis":"dar","den":"dar",
    "esté":"estar","estés":"estar","estemos":"estar","estéis":"estar","estén":"estar",
    "tenga":"tener","tengas":"tener","tengamos":"tener","tengáis":"tener","tengan":"tener",
    "venga":"venir","vengas":"venir","vengamos":"venir","vengáis":"venir","vengan":"venir",  # ← FIX v4.4

    # Imperatives ("ven" used to sit here mapped to "venir" but was dead:
    # "ven":"ver" below always won the key)
    "ve":"ir","id":"ir",
    "sé":"ser","sed":"ser",
    "haz":"hacer","haced":"hacer",
    "pon":"poner","poned":"poner",
    "venid":"venir",
    "ten":"tener","tened":"tener",
    "sal":"salir","salid":"salir",
    "di":"decir","decid":"decir",

    # dar / ver / oír presents, preterites and imperfects
    "doy":"dar","das":"dar","da":"dar","damos":"dar","dais":"dar","dan":"dar",
    "veo":"ver","ves":"ver","vemos":"ver","veis":"ver","ven":"ver",
    "oí":"oír","oíste":"oír","oyó":"oír","oímos":"oír","oísteis":"oír","oyeron":"oír",
    "iba":"ir","ibas":"ir","íbamos":"ir","ibais":"ir","iban":"ir",
    "veía":"ver","veías":"ver","veíamos":"ver","veíais":"ver","veían":"ver",

    # Past subjunctives
    "vinieras":"venir","lloviera":"llover",
}
# NEW: lemmas for the archaic future subjunctive
IRREG_LEMMA.update({
    "viniere":"venir","vinieres":"venir","vinieren":"venir",
    "hiciere":"hacer","hicieres":"hacer","hicieren":"hacer",
    "tuviere":"tener","tuvieres":"tener","tuvieren":"tener",
})
|
| 173 |
+
|
| 174 |
+
# Irregular Spanish form → TAM tag, consulted by es_morph_tag before any
# suffix heuristics run.
# FIX 1: the *venga* row contained "vayáis" (a stray duplicate of the
#   *vaya* row), so "vengáis" was never tagged SBJV — corrected to
#   "vengáis", matching the IRREG_LEMMA v4.4 fix.
# FIX 2: the clitic-imperative row contained the mojibake "d??melo";
#   restored to "dímelo" (cf. the neighbouring "dime"/"dínoslo").
IRREG_MORPH_TAGS = {
    # Subjunctives
    "vaya":"SBJV","vayas":"SBJV","vayamos":"SBJV","vayáis":"SBJV","vayan":"SBJV",
    "sea":"SBJV","seas":"SBJV","seamos":"SBJV","seáis":"SBJV","sean":"SBJV",
    "haya":"SBJV","hayas":"SBJV","hayamos":"SBJV","hayáis":"SBJV","hayan":"SBJV",
    "dé":"SBJV","des":"SBJV","demos":"SBJV","deis":"SBJV","den":"SBJV",
    "esté":"SBJV","estés":"SBJV","estemos":"SBJV","estéis":"SBJV","estén":"SBJV",
    "tenga":"SBJV","tengas":"SBJV","tengamos":"SBJV","tengáis":"SBJV","tengan":"SBJV",
    "venga":"SBJV","vengas":"SBJV","vengamos":"SBJV","vengáis":"SBJV","vengan":"SBJV",
    "haga":"SBJV","hagas":"SBJV","hagamos":"SBJV","hagáis":"SBJV","hagan":"SBJV",
    "pueda":"SBJV","puedas":"SBJV","podamos":"SBJV","podáis":"SBJV","puedan":"SBJV",

    # Imperatives
    "id":"IMP","sed":"IMP",
    "haz":"IMP","haced":"IMP","pon":"IMP","poned":"IMP","ven":"IMP","venid":"IMP",
    "ten":"IMP","tened":"IMP","sal":"IMP","salid":"IMP","decid":"IMP",

    # Imperatives with clitics
    "llámame":"IMP","llámalo":"IMP","llámala":"IMP","llámanos":"IMP","llámalos":"IMP","llámalas":"IMP",
    "dime":"IMP","dímelo":"IMP","dinos":"IMP","dínoslo":"IMP",
    "hazme":"IMP","hazlo":"IMP","hazla":"IMP","haznos":"IMP",
    "ponme":"IMP","ponlo":"IMP","ponla":"IMP","ponnos":"IMP",
    "dame":"IMP","dámelo":"IMP","danos":"IMP","dánoslo":"IMP",
    "tráeme":"IMP","tráelo":"IMP","tráela":"IMP","tráenos":"IMP",
    "díselo":"IMP","pónselo":"IMP","házselo":"IMP",

    # Future subjunctive (archaic)
    "viniere":"FUT_SBJV","vinieres":"FUT_SBJV","vinieren":"FUT_SBJV",
    "hiciere":"FUT_SBJV","hicieres":"FUT_SBJV","hicieren":"FUT_SBJV",
    "fuere":"FUT_SBJV","fueres":"FUT_SBJV","fueren":"FUT_SBJV",
    "hubiere":"FUT_SBJV","hubieres":"FUT_SBJV","hubieren":"FUT_SBJV",

    # Preterites
    "creísteis":"PST","dijisteis":"PST","hicisteis":"PST","pusisteis":"PST",
    "supisteis":"PST","quisisteis":"PST","trajisteis":"PST",
    "vi":"PST","dio":"PST","fue":"PST","fui":"PST",

    # Imperfects
    "iba":"IPFV","ibas":"IPFV","íbamos":"IPFV","ibais":"IPFV","iban":"IPFV",
    "veía":"IPFV","veías":"IPFV","veíamos":"IPFV","veíais":"IPFV","veían":"IPFV",
}
|
| 215 |
+
|
| 216 |
+
def looks_like_verb_form_strict(w: str) -> bool:
    """Heuristically decide whether *w* looks like a Spanish verb form.

    Purely suffix/stem based: infinitive endings, gerund/participle
    patterns, accented personal endings, the paradigm suffix tables
    (future, conditional, preterite, imperfect, past subjunctive),
    accent-less FUT/COND patterns, and a list of common irregular
    preterite stems.  Because only endings are inspected, non-verbal
    words sharing these endings can still return True.
    """
    w = (w or "").lower()
    if w.endswith(("ar","er","ir")): return True
    if RE_GER.search(w) or RE_PART.search(w): return True
    if re.search(r"(á|ás|áis|és|éis|ís)$", w): return True
    if _strip_any(w, FUT_END+COND_END)[0] is not None: return True
    if _strip_any(w, PRET_AR+PRET_ERIR)[0] is not None: return True
    if _strip_any(w, IMPF_AR+IMPF_ERIR)[0] is not None: return True
    if _strip_any(w, SUBJ_PAST_AR+SUBJ_PAST_ERIR)[0] is not None: return True
    # Accent-less irregular FUT/COND + accent-less regular COND
    if RE_COND_NT_REG.search(w): return True
    if RE_COND_NT_IRR.search(w): return True
    if RE_FUT_NT_IRR.search(w): return True
    # Irregular preterites and friends
    if re.search(r"(anduve|anduviste|anduvo|anduvimos|anduvieron|conduje|traduje|produje|reduje|introduje|supe|quise|pude|puse|hice|hizo|dije|dijo|traje|trajo|tuve|tuvo|vine|vino|cupe|cupo)$", w):
        return True
    return False
|
| 233 |
+
|
| 234 |
+
def _zco_guess(w:str)->str:
|
| 235 |
+
if w.endswith("uzco"): return w[:-4] + "ucir"
|
| 236 |
+
if w.endswith("ezco"): return w[:-4] + "ecer"
|
| 237 |
+
if w.endswith("ozco"): return w[:-4] + "ocer"
|
| 238 |
+
if w.endswith("azco"): return w[:-4] + "acer"
|
| 239 |
+
return ""
|
| 240 |
+
|
| 241 |
+
def guess_infinitive_es(w: str) -> str:
    """Best-effort guess of the infinitive (lemma) of a Spanish verb form.

    Resolution order: IRREG_LEMMA lookup, -zco / -go irregular presents,
    forms already ending in -ar/-er/-ir, gerunds, participles (with an
    irregular-participle table), accented future/conditional, accent-less
    conditional (regular + irregular) and accent-less future (irregular
    stems only, to avoid ambiguity), accented personal endings, and
    finally the regular paradigm suffix tables.  Returns "" when no rule
    applies.

    FIX: the regular-participle branch previously returned base + "er"
    for every participle, so -ado forms of -ar verbs got a wrong lemma
    ("hablado" → "habler").  -ado now maps back to -ar; -ido and the
    loose "to/so/cho" endings keep the original -er fallback.
    """
    w = (w or "").lower()
    if w in IRREG_LEMMA: return IRREG_LEMMA[w]
    if w in ("vámonos","vamonos"): return "ir"
    if w.endswith("zco"):
        z = _zco_guess(w)
        if z: return z
    if w.endswith("go"):
        # Irregular 1st-person presents in -go ("tengo" → "tener", ...).
        base = w[:-2]
        map_go = {"ten":"tener","ven":"venir","pon":"poner","sal":"salir","tra":"traer","ca":"caer","ha":"hacer","oi":"oír","di":"decir","val":"valer","si":"seguir"}
        for k,v in map_go.items():
            if base.startswith(k): return v
    if w.endswith(("ar","er","ir")): return w
    m = RE_GER.search(w)
    if m:
        # Gerund: "-ando" → -ar verb, "-iendo"/"-yendo" → -er/-ir.
        base = w[:m.start()]
        return base + ("ar" if m.group(0)=="ando" else "er")
    m = RE_PART.search(w)
    if m:
        base = w[:m.start()]
        part_irreg = {
            "hecho":"hacer","dicho":"decir","visto":"ver","puesto":"poner","escrito":"escribir",
            "abierto":"abrir","cubierto":"cubrir","muerto":"morir","roto":"romper",
            "vuelto":"volver","resuelto":"resolver","frito":"freír","impreso":"imprimir",
            "satisfecho":"satisfacer","provisto":"proveer"
        }
        if w in part_irreg: return part_irreg[w]
        # "-ado" participles belong to -ar verbs; everything else ("-ido",
        # loose "to/so/cho" matches) falls back to "-er" as before.
        return base + ("ar" if m.group(0) == "ado" else "er")
    # Accented FUT/COND: the ending attaches to the infinitive itself.
    base, end = _strip_any(w, FUT_END+COND_END)
    if base is not None:
        irreg = {"saldr":"salir","vendr":"venir","tendr":"tener","pondr":"poner","valdr":"valer","podr":"poder",
                 "habr":"haber","sabr":"saber","cabr":"caber","querr":"querer","dir":"decir","har":"hacer"}
        if base in irreg: return irreg[base]
        return base
    # Accent-less COND (irregular stems, then regular)
    m = RE_COND_NT_IRR.search(w)
    if m:
        irreg = {"saldr":"salir","vendr":"venir","tendr":"tener","pondr":"poner","valdr":"valer","podr":"poder",
                 "habr":"haber","sabr":"saber","cabr":"caber","querr":"querer","dir":"decir","har":"hacer"}
        stem = m.group(1)
        return irreg.get(stem, "")
    m = RE_COND_NT_REG.search(w)
    if m:
        # m.group(0) is "<ar|er|ir><ia|ias|...>" — peel the class marker
        # off to recover just the person suffix, then strip it.
        suf = m.group(0).replace("ar","",1).replace("er","",1).replace("ir","",1)  # "ia" / "ias" / ...
        return w[:-len(suf)]  # removing "ia/ias/..." leaves the infinitive
    # Accent-less FUT (irregular stems only, to avoid ambiguity)
    m = RE_FUT_NT_IRR.search(w)
    if m:
        irreg = {"saldr":"salir","vendr":"venir","tendr":"tener","pondr":"poner","valdr":"valer","podr":"poder",
                 "habr":"haber","sabr":"saber","cabr":"caber","querr":"querer","dir":"decir","har":"hacer"}
        stem = m.group(1)
        return irreg.get(stem, "")
    # Other accented-ending heuristics
    if w.endswith("áis"): return w[:-3] + "ar"
    if w.endswith("éis"): return w[:-3] + "er"
    if w.endswith("ís"): return w[:-2] + "ir"
    if w.endswith("ás"): return w[:-2] + "ar"
    if w.endswith("és"): return w[:-2] + "er"
    if w.endswith("á"): return w[:-1] + "ar"
    # Regular paradigm tables, most specific first
    for group in (PRET_AR+PRET_ERIR, IMPF_AR+IMPF_ERIR, SUBJ_AR+SUBJ_ERIR, PRS_AR+PRS_ER+PRS_IR):
        base, end = _strip_any(w, group)
        if base is not None:
            return base + _guess_class_from_ending(end)
    base, end = _strip_any(w, SUBJ_PAST_AR)
    if base is not None: return base + "ar"
    base, end = _strip_any(w, SUBJ_PAST_ERIR)
    if base is not None: return base + "er"
    return ""
|
| 310 |
+
|
| 311 |
+
def es_morph_tag(w: str) -> str:
    """Tag a Spanish word form with a coarse TAM label.

    Returns one of "SBJV", "FUT_SBJV", "IMP", "PST", "IPFV", "FUT",
    "COND", "INF", "PRS" or "UNK".  Rule order matters: explicit
    irregular forms first, then clitic imperatives/periphrases, then
    accent-less FUT/COND, then the suffix tables from most to least
    specific.
    """
    w = (w or "").lower()
    if w in IRREG_MORPH_TAGS: return IRREG_MORPH_TAGS[w]

    # Imperatives with clitics / periphrases
    if re.search(r"^(llám|dím|házm|pónm|vén|dám|tén|tráe)(a|e)?(me|te|lo|la|nos|os|les|se|melo|telo|selo)$", w): return "IMP"
    if re.search(r"(adme|edme|idme|adlo|edle|idle|adnos|ednos)$", w): return "IMP"
    if re.search(r"(?:ad|ed|id|ád|éd|íd)(?:me|te|se|lo|la|nos|os|les|melo|telo|selo|noslo|oslo|sela|selas|selos)$", w): return "IMP"
    # Gerund + clitic ("-ándome", "-iéndolo", ...) tagged as imperfective
    if re.search(r"^.*[áéí]ndo(me|te|se|lo|la|nos|os|les|melo|telo|selo)$", w): return "IPFV"
    if re.search(r"(melo|telo|selo|noslo|oslo|sela|selas|selos)$", w):
        base = re.sub(r"(melo|telo|selo|noslo|oslo|sela|selas|selos)$", "", w)
        if base and len(base) > 2: return "IMP"

    # Accent-less FUT/COND (priority before the other suffix rules)
    if RE_FUT_NT_IRR.search(w): return "FUT"
    if RE_COND_NT_IRR.search(w): return "COND"
    if RE_COND_NT_REG.search(w): return "COND"

    if w.endswith(("ar","er","ir")): return "INF"
    if RE_GER.search(w): return "IPFV"
    if RE_PART.search(w): return "PST"
    if _strip_any(w, PRET_AR+PRET_ERIR)[0] is not None: return "PST"
    if _strip_any(w, IMPF_AR+IMPF_ERIR)[0] is not None: return "IPFV"
    if _strip_any(w, FUT_END)[0] is not None: return "FUT"
    if _strip_any(w, COND_END)[0] is not None: return "COND"
    if re.search(r"(á|ás|áis|és|éis|ís)$", w): return "PRS"
    if _strip_any(w, SUBJ_AR+SUBJ_ERIR)[0] is not None: return "SBJV"
    if _strip_any(w, PRS_AR+PRS_ER+PRS_IR)[0] is not None: return "PRS"
    if _strip_any(w, SUBJ_PAST_AR+SUBJ_PAST_ERIR)[0] is not None: return "SBJV"
    # Irregular preterite stems
    if re.search(r"(anduve|conduje|traduje|produje|reduje|introduje|supe|quise|pude|puse|hice|hizo|dije|dijo|traje|trajo|tuve|tuvo|vine|vino|cupe|cupo)$", w):
        return "PST"
    # Bare vosotros imperative ("-ad/-ed/-id")
    if re.search(r"^.+[aei]d$", w): return "IMP"
    return "UNK"
|
| 344 |
+
|
| 345 |
+
# =========================
# MORPHOLOGY – NEO-IBERIAN
# =========================
# TAM (tense/aspect/mood) suffixes of Neo-Iberian verb forms.
NI_TAM_SUFFIXES = {
    "-ke": "PRS", "-ei": "PST", "-ta": "IPFV", "-na": "FUT",
    "-ne": "COND", "-ni": "SBJV", "-tu": "IMP", "-ra": "FUT_SBJV",
}

def detect_ni_tam(word: str):
    """Split a Neo-Iberian verb form into (root, TAM tag, TAM suffix).

    Tolerates a pronominal tail after the TAM marker (e.g. "-ei-mu"):
    the tail is cut only when doing so exposes a known TAM suffix.
    Forms with no recognizable TAM suffix return (word, "INF", "").
    """
    word = (word or "").lower().strip()
    for tail in ("-i", "-mu", "-su", "-gu", "-zu", "-te"):
        if not word.endswith(tail):
            continue
        trimmed = word[:-len(tail)]
        # Only cut the pronoun when a TAM marker then becomes visible.
        if any(trimmed.endswith(tam) for tam in NI_TAM_SUFFIXES):
            word = trimmed
            break
    for tam, tag in NI_TAM_SUFFIXES.items():
        if word.endswith(tam):
            return word[:-len(tam)], tag, tam
    return word, "INF", ""
|
| 366 |
+
|
| 367 |
+
# =========================
|
| 368 |
+
# UTILIDADES
|
| 369 |
+
# =========================
|
| 370 |
+
def fold(s:str)->str:
    """Strip diacritics: NFD-decompose and drop combining marks."""
    decomposed = unicodedata.normalize('NFD', s)
    return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")
|
| 372 |
+
def has_diacritic(s:str)->bool:
    """True when *s* contains a Spanish accented vowel or ü (either case)."""
    return re.search(r"[áéíóúüÁÉÍÓÚÜ]", s or "") is not None
|
| 374 |
+
|
| 375 |
+
def _canon_pos(p: str) -> str:
|
| 376 |
+
p = (p or "").strip().upper()
|
| 377 |
+
MAP = {"V":"V","VERB":"V","N":"N","NOUN":"N","ADJ":"ADJ","ADJECTIVE":"ADJ","ADV":"ADV","ADVERB":"ADV",
|
| 378 |
+
"INTJ":"INTJ","INTERJ":"INTJ","INTERJECTION":"INTJ","PRON":"PRON","PRONOUN":"PRON",
|
| 379 |
+
"PART":"PART","PARTICLE":"PART","POSTP":"POSTP","ADP":"POSTP","ADPOSITION":"POSTP",
|
| 380 |
+
"NUM":"NUM","DET":"DET"}
|
| 381 |
+
return MAP.get(p, "")
|
| 382 |
+
def _boolish(x):
|
| 383 |
+
if x is None: return None
|
| 384 |
+
s = str(x).strip().lower()
|
| 385 |
+
if s in ("1","true","t","yes","y","si","sí"): return True
|
| 386 |
+
if s in ("0","false","f","no","n"): return False
|
| 387 |
+
return None
|
| 388 |
+
def _meta_set(form_es:str, pos:str=None, tam_ok=None):
    """Record POS / TAM-eligibility metadata for a Spanish form (first write wins)."""
    if not form_es:
        return
    entry = LEX_META.setdefault(form_es, {})
    if pos and not entry.get("pos"):
        entry["pos"] = pos
    if tam_ok is not None and entry.get("tam_ok") is None:
        entry["tam_ok"] = bool(tam_ok)
|
| 393 |
+
def pos_of_es(token_low:str) -> str:
    """POS for a lowercase Spanish token: lexicon metadata first, then a verb-shape guess."""
    meta = LEX_META.get(token_low, {})
    pos = meta.get("pos")
    if pos:
        return pos
    return "V" if looks_like_verb_form_strict(token_low) else ""
|
| 397 |
+
def tam_allowed_for_es(token_low:str) -> bool:
    """Whether a TAM suffix may attach to this Spanish token (lexicon override, else verbs only)."""
    flag = LEX_META.get(token_low, {}).get("tam_ok")
    if flag is not None:
        return bool(flag)
    return pos_of_es(token_low) == "V"
|
| 401 |
+
|
| 402 |
+
# =========================
# TTS (Meta MMS)
# =========================
# Load the Meta MMS Spanish text-to-speech model once at import time.
# On any failure (e.g. no network, missing weights) `processor`/`model`
# remain None and synthesize_speech() degrades to returning None.
print("Cargando modelo de voz...")
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = model = None
try:
    processor = AutoProcessor.from_pretrained("facebook/mms-tts-spa")
    model = VitsModel.from_pretrained("facebook/mms-tts-spa").to(device)
    print("Modelo de voz cargado.")
except Exception as e:
    print(f"ERROR TTS: {e}")
|
| 414 |
+
|
| 415 |
+
PAUSE_LEVEL=3
def add_reading_pauses(text: str, level:int=3) -> str:
    """Duplicate punctuation so the TTS voice pauses longer while reading.

    level 1 = text untouched, level 2 = extra pause at commas,
    level 3 = also at periods and semicolons.
    """
    if level <= 1:
        return text
    t = text
    if level >= 2:
        t = re.sub(r",\s*", ", , ", t)
    if level >= 3:
        t = re.sub(r"\.\s*", ". . ", t)
        t = re.sub(r";\s*", "; ; ", t)
    return re.sub(r"\s+", " ", t).strip()

def hispanize_for_tts(ni_text: str) -> str:
    """Rewrite Neo-Iberian text into Spanish-pronounceable form for the TTS.

    Maps the Iberian characters ŕ → rr and ś → s, turns hyphens into
    spaces, drops bracketed [SIN-LEX:…] placeholders, and injects the
    reading pauses.

    Fix: the original chained `.replace('eś','es')`, `.replace('ŕa','rra')`
    and `.replace('aŕe','arre')` after already replacing 'ś'/'ŕ', so those
    calls could never match (dead code); they are removed with no change
    in behavior.
    """
    text = (ni_text or "").lower()
    # CRITICAL: respect Iberian characters (ŕ and ś carry distinct sounds).
    text = text.replace('ŕ', 'rr').replace('ś', 's')
    text = text.replace('-', ' ')
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return add_reading_pauses(text, PAUSE_LEVEL)
|
| 432 |
+
|
| 433 |
+
def synthesize_speech(text):
    """Synthesize Spanish speech for *text* with the MMS TTS model.

    Returns (sample_rate, float32 waveform) suitable for a Gradio audio
    component, or None when the model is unavailable, the text is empty,
    or inference fails.
    """
    if not text or not text.strip() or model is None or processor is None: return None
    try:
        inputs = processor(text=hispanize_for_tts(text), return_tensors="pt").to(device)
        with torch.no_grad(): output = model(**inputs).waveform
        speech_np = output.cpu().numpy().squeeze()
        # Peak-normalize to 0.9 full scale to avoid clipping.
        mx = max(abs(speech_np.min()), abs(speech_np.max()))
        if mx>0: speech_np = speech_np/mx*0.9
        # NOTE(review): 16000 Hz assumed to match the MMS model's output rate — confirm.
        return (16000, speech_np.astype(np.float32))
    except Exception as e:
        print(f"Error TTS: {e}"); return None
|
| 444 |
+
|
| 445 |
+
# =========================
# IBERIAN LINE (Georgeos keys)
# =========================
KEYS_MODE = "explicit"
V = "aeiou"
# Syllabograms for the five stop consonants (one glyph per CV pair).
SYL_FOR={"b":["‹BA›","‹BE›","‹BI›","‹BO›","‹BU›"],
         "d":["‹DA›","‹DE›","‹DI›","‹DO›","‹DU›"],
         "t":["‹TA›","‹TE›","‹TI›","‹TO›","‹TU›"],
         "g":["‹GA›","‹GE›","‹GI›","‹GO›","‹GU›"],
         "k":["‹KA›","‹KE›","‹KI›","‹KO›","‹KU›"]}
# Alphabetic (non-syllabic) signs: vowels, sibilants, liquids, nasals.
ALPHA_FOR={"a":"‹A›","e":"‹E›","i":"‹I›","o":"‹O›","u":"‹U›","s":"‹S›","ś":"‹Ś›","l":"‹L›","r":"‹R›","ŕ":"‹Ŕ›","n":"‹N›","m":"‹M›"}
# Signs allowed to close a syllable.
CODA_FOR={"":"","n":"‹N›","s":"‹S›","ś":"‹Ś›","r":"‹R›","ŕ":"‹Ŕ›","l":"‹L›","m":"‹M›","k":"‹K›","t":"‹T›"}

def tokens_from_latin(ni:str)->str:
    """Transliterate a Latin-script Neo-Iberian word into ‹…› sign tokens.

    Stops (b/d/t/g/k) followed by a vowel become CV syllabograms; other
    letters map to alphabetic signs; "-" becomes the separator "—"; "p"
    folds into "b" (no independent /p/ in the script).

    Fix: a stop consonant that opens the NEXT syllable (i.e. is itself
    followed by a vowel) is no longer swallowed as a coda of the current
    syllable, so e.g. "bake" renders ‹BA›‹KE› instead of ‹BA›‹K›‹E›.
    """
    out=[]; i=0; ni=(ni or "").lower()
    while i<len(ni):
        c=ni[i]
        if c=="p": c="b" # no independent /p/
        if c=="-": out.append("—"); i+=1; continue
        if c in V:
            out.append(ALPHA_FOR[c]); i+=1; continue
        if c in SYL_FOR and i+1<len(ni) and ni[i+1] in V:
            idx=V.index(ni[i+1]); tok=SYL_FOR[c][idx]
            coda=ni[i+2] if i+2<len(ni) else ""
            # Only close the syllable when `coda` is a true coda: a stop
            # followed by a vowel starts a new CV syllable instead.
            starts_next_syllable = (coda in SYL_FOR and i+3 < len(ni) and ni[i+3] in V)
            if coda in CODA_FOR and coda!="" and not starts_next_syllable:
                tok+=CODA_FOR[coda]; i+=3
            else:
                i+=2
            out.append(tok); continue
        out.append(ALPHA_FOR.get(c, c.upper())); i+=1
    return "".join(out)
|
| 476 |
+
|
| 477 |
+
KEYS_OVERRIDE={"ka":"K","mi":"MI","te":"TE","ne":"N","o":"O","eś":"X"}
def georgeos_keys(token_str:str, ni_plain:str)->str:
    """Map ‹…› sign tokens of one word to Georgeos-font key strokes.

    A few frequent words are hard-coded in KEYS_OVERRIDE.  In "compact"
    mode syllabograms collapse to their consonant letter; otherwise the
    full CV pair is kept.  Ś maps to "X" and Ŕ to lowercase "r".
    """
    plain = (ni_plain or "").lower()
    if plain in KEYS_OVERRIDE:
        return KEYS_OVERRIDE[plain]
    keys = []
    for sign in re.findall(r"‹(.*?)›", token_str):
        if KEYS_MODE == "compact":
            if len(sign) == 2 and sign[0] in "BDTGK":
                keys.append(sign[0])
            elif sign in ("A", "E", "I", "O", "U"):
                keys.append(sign)
            elif sign == "Ś":
                keys.append("X")
            elif sign == "Ŕ":
                keys.append("r")
            else:
                keys.append(sign[0].upper())
        else:
            if len(sign) == 2 and sign[0] in "BDTGK":
                keys.append(sign)
            elif sign == "Ś":
                keys.append("X")
            elif sign == "Ŕ":
                keys.append("r")
            else:
                keys.append(sign.upper())
    return "".join(keys)
|
| 496 |
+
|
| 497 |
+
# Separator drawn between adjacent words on the Iberian line.
TRIDOT = "/"
VISIBLE_PUNCT = {",", ".", ";", "; ", ":", "…", "(", ")", "[", "]", "{", "}",
                 "\"", "'", "«", "»", "—", "–", "\u201c", "\u201d", "\u2018", "\u2019"}
# Punctuation that resets sentence-level state in the translator.
HARD_BOUND = {".", ";", "—", "–", ":", "(", ")", "«", "»"}

def render_ib_with_tridots(toks):
    """Join Iberian key tokens, inserting TRIDOT between adjacent words."""
    parts = []
    previous_was_word = False
    for token in toks:
        if token in VISIBLE_PUNCT:
            parts.append(" " + token + " ")
            previous_was_word = False
            continue
        if previous_was_word:
            parts.append(" " + TRIDOT + " ")
        parts.append(token)
        previous_was_word = True
    return "".join(parts).strip()
|
| 511 |
+
|
| 512 |
+
# =========================
# TRANSLATOR ES→NI
# =========================
# Spanish TAM tag → Neo-Iberian verb suffix ("" = bare infinitive).
TAM_SUFFIX={"PRS":"-ke","PST":"-ei","FUT":"-na","IPFV":"-ta",
            "COND":"-ne","SBJV":"-ni","IMP":"-tu","INF":"","FUT_SBJV":"-ra","UNK":"-ke"}
VERB_TAM = ("-ke","-na","-ei","-ta","-ni","-ne","-tu","-ra")

def strip_ni_tam(lemma: str):
    """Split a Neo-Iberian verb form into (root, TAM suffix); suffix is "" when absent."""
    form = lemma or ""
    for suffix in sorted(VERB_TAM, key=len, reverse=True):
        if form.endswith(suffix):
            return form[:-len(suffix)], suffix
    return form, ""
|
| 524 |
+
|
| 525 |
+
STOP=set("""
|
| 526 |
+
el la los las lo un una unos unas al del de en con sin por sobre entre hasta desde hacia según tras
|
| 527 |
+
pero aunque sino que como si porque cuando donde mientras
|
| 528 |
+
muy ya sí no también solo sólo aún aun más menos
|
| 529 |
+
mi mis tu tus su sus nuestro nuestra nuestros nuestras
|
| 530 |
+
esto eso aquello ese esa esos esas aquel aquella aquellos aquellas
|
| 531 |
+
quien quién quiénes cual cuál cuales cuáles cuyo cuya cuyos cuyas
|
| 532 |
+
eh ay oh uy ah aja jeje jaja aah ahh ohh uhh
|
| 533 |
+
""".split())
|
| 534 |
+
|
| 535 |
+
# --- Rules for "a" → ka/mi/te
def rule_a(prev_tok:str, token:str, next_tok:str)->str:
    """Disambiguate Spanish "a": dative "mi" after giving/telling verbs,
    personal "te" before known first names, default "ka" otherwise."""
    giving_verbs = {"dar", "decir", "contar", "enviar", "ofrecer",
                    "mostrar", "prestar", "regalar", "entregar"}
    if prev_tok in giving_verbs:
        return "mi"
    first_names = {"ana", "marta", "juan", "pedro", "luis", "maría",
                   "jose", "carlos", "laura"}
    return "te" if next_tok in first_names else "ka"
|
| 542 |
+
|
| 543 |
+
# Interrogative / exclamative enclitics attached to the last finite verb.
Q_ENCLITIC_INT = "-na"
Q_ENCLITIC_EXC = "-ba"
# Spanish wh-words (with and without written accent) that trigger the
# interrogative enclitic even when no terminal "?" is present.
WH_WORDS = {
    "qué","quien","quién","quienes","quiénes","cual","cuál","cuales","cuáles",
    "donde","dónde","cuando","cuándo","como","cómo",
    "cuanto","cuánto","cuanta","cuánta","cuantos","cuántos","cuantas","cuántas"
}
|
| 550 |
+
def is_wh_token(t: str) -> bool:
    """True when *t* is a Spanish wh-word (accentless spellings included)."""
    lowered = (t or "").lower()
    if lowered in WH_WORDS:
        return True
    folded = fold(lowered)
    return folded in {"que", "quien", "quienes", "cual", "cuales",
                      "donde", "cuando", "como", "cuanto", "cuanta",
                      "cuantos", "cuantas"}
|
| 555 |
+
|
| 556 |
+
def has_wh_outside_parens(toks) -> bool:
    """True when a wh-word occurs outside any parenthesis/quote span."""
    depth = 0
    openers = {"(", "«", "\u201c", "\u2018"}
    closers = {")", "»", "\u201d", "\u2019"}
    for token in toks:
        if token in openers:
            depth += 1
        elif token in closers:
            depth = max(0, depth - 1)
        elif depth == 0 and is_wh_token(token):
            return True
    return False
|
| 563 |
+
|
| 564 |
+
# Finite forms of "estar"/"haber" used to spot periphrastic tenses
# (estar + gerund → IPFV, haber + participle → PST).
ESTAR_SET={"estoy","estás","está","estamos","estáis","están","estaba","estabas","estábamos","estabais","estaban"}
HABER_SET={"he","has","ha","hemos","habéis","han","había","habías","habíamos","habíais","habían"}
|
| 566 |
+
|
| 567 |
+
def detect_tam_with_context(toks, i, sentence_start=False):
    """Detect the TAM tag of token *i* using its neighbours.

    Refines the morphology-only guess from es_morph_tag() with context:
    clause-initial imperatives, subjunctive-selecting subordinators, and
    the estar+gerund / haber+participle / "ir a"+infinitive periphrases.
    Falls back to "PRS" when nothing matches.
    """
    t=toks[i].lower()
    prev=toks[i-1].lower() if i>0 else ""
    prev2=toks[i-2].lower() if i>1 else ""
    nxt=toks[i+1].lower() if i+1<len(toks) else ""
    tag=es_morph_tag(t)

    # Imperatives with attached clitics at clause start (dámelo, díselo, …).
    if re.search(r"(melo|telo|selo|noslo|oslo)$", t):
        if sentence_start or prev in {",", ".", "!", "¡", ";", ":"}: return "IMP"
    if i == 0 or prev in {",", ".", "!", "¡", ";", ":"}:
        # Monosyllabic command verbs at clause start.
        if t in {"ve","ven","haz","pon","sal","di","ten","sé","id","venid","tened"}: return "IMP"

    # Subordinators that commonly select the subjunctive.
    if prev in {"que","si","cuando","aunque","mientras","hasta","para"}:
        if tag=="SBJV": return "SBJV"
        if tag=="UNK" and re.search(r"(e|a)$", t) and not t.endswith(("ar","er","ir")): return "SBJV"

    if t in ESTAR_SET or t in HABER_SET: return "PRS"
    # NOTE(review): the next two rules test `nxt` (the FOLLOWING token)
    # while `prev` is the auxiliary — confirm this ordering is intended.
    if prev in ESTAR_SET and RE_GER.search(nxt): return "IPFV"
    if prev in HABER_SET and RE_PART.search(nxt): return "PST"
    # Periphrastic future: voy/vas/… + a + infinitive.
    if prev == "a" and prev2 in {"voy","vas","va","vamos","vais","van"} and t.endswith(("ar","er","ir")): return "FUT"
    if RE_GER.search(t): return "IPFV"
    if RE_PART.search(t): return "PST"
    return tag if tag!="UNK" else "PRS"
|
| 591 |
+
|
| 592 |
+
def forced_lemma_with_context(low:str, prev:str, nxt:str)->str:
    """Context-forced lemma overrides (currently only "visto de" → vestir)."""
    return "vestir" if (low == "visto" and nxt == "de") else ""
|
| 595 |
+
|
| 596 |
+
def has_tilde_equiv_lookup(low:str)->str:
    """For an accented non-verb form, look up its accentless spelling in the lexicon."""
    if not has_diacritic(low) or looks_like_verb_form_strict(low):
        return ""
    folded = fold(low)
    if folded in LEX_FORM:
        return LEX_FORM[folded]
    if folded in FOLD_FORM:
        return FOLD_FORM[folded]
    return ""
|
| 602 |
+
|
| 603 |
+
def lookup_form_lemma(token:str, prev:str, nxt:str):
    """Resolve a Spanish token to its Neo-Iberian lemma.

    Lookup order: context-forced lemma → direct surface form → accentless
    equivalent → infinitive guess for verb-shaped tokens.
    Returns (ni_lemma, found_flag); ("", False) when nothing matches.
    """
    if not token: return "", False
    low=token.lower()
    fl=forced_lemma_with_context(low, prev, nxt)
    if fl and fl in LEX_LEMMA: return LEX_LEMMA[fl], True
    if low in LEX_FORM: return LEX_FORM[low], True
    til=has_tilde_equiv_lookup(low)
    if til: return til, True
    if looks_like_verb_form_strict(low):
        # Fall back to guessing the Spanish infinitive and looking it up.
        lem=guess_infinitive_es(low)
        if lem and lem in LEX_LEMMA: return LEX_LEMMA[lem], True
    return "", False
|
| 615 |
+
|
| 616 |
+
def attach_enclitic(out_words, ib_keys, plain, attach_idx, encl):
    """Append enclitic *encl* to the word at attach_idx, refreshing its Iberian keys.

    No-op when the index is out of range or the enclitic is already there.
    Mutates the three parallel lists in place.
    """
    if attach_idx is None or not (0 <= attach_idx < len(out_words)):
        return
    current = out_words[attach_idx] or ""
    if current.endswith(encl):
        return  # already attached
    out_words[attach_idx] = current + encl
    plain[attach_idx] = (plain[attach_idx] or "") + encl
    ib_keys[attach_idx] = georgeos_keys(tokens_from_latin(plain[attach_idx]), plain[attach_idx])
|
| 623 |
+
|
| 624 |
+
def ensure_terminal_qmark(out_words, ib_keys, plain):
    """Guarantee the output ends with "?": replace a final "." or append one.

    Mutates the three parallel lists in place; appended "?" entries carry
    an empty Iberian key so they render as plain punctuation.
    """
    def _append_q():
        out_words.append("?"); ib_keys.append(""); plain.append("?")

    if not out_words:
        _append_q()
        return
    j = len(out_words) - 1
    while j >= 0 and not out_words[j]:
        j -= 1
    if j < 0:
        _append_q()
        return
    if out_words[j] == ".":
        out_words[j], ib_keys[j], plain[j] = "?", "", "?"
    elif out_words[j] not in {"?", "!"}:
        _append_q()
|
| 636 |
+
def normalize_surface_by_pos(ni_surface:str, pos:str) -> str:
    """Strip any verbal TAM suffix from non-verb surface forms; verbs pass through."""
    if not ni_surface or pos == "V":
        return ni_surface
    root, _suffix = strip_ni_tam(ni_surface)
    return root
|
| 642 |
+
|
| 643 |
+
def translate_sentence(sent:str):
    """Translate one Spanish sentence into Neo-Iberian.

    Returns (ni_text, ib_key_tokens): the Neo-Iberian surface string and
    the list of Georgeos key strings for the Iberian line.  Handles
    stopword dropping, negation ("no" → "eś" before the verb), TAM
    suffix selection, the contractions al/del, the "a" disambiguation
    rule, and the interrogative/exclamative enclitics -na/-ba.
    """
    toks = re.sub(r"\s+"," ", (sent or "").strip())
    # Also split curly quotes off as separate tokens.
    toks = re.sub(r"([,.;:!?¡¿…()\[\]{}\"'«»—–“”‘’])", r" \1 ", toks)
    toks = [t for t in toks.split() if t]

    out_words=[]; ib_keys=[]; plain=[]
    neg_next=False; last_finite_idx=None; has_qmark=False
    saw_wh = has_wh_outside_parens(toks)
    sentence_start=True

    for i,t in enumerate(toks):
        if t in {"¿","¡"}:
            sentence_start=True; continue
        if t in {"?","!"}:
            # Attach the matching enclitic to the last finite verb (or,
            # failing that, the last real word), then keep the mark itself.
            if t=="?": has_qmark=True
            encl = Q_ENCLITIC_INT if t=="?" else Q_ENCLITIC_EXC
            attach_idx = last_finite_idx
            if attach_idx is None:
                for j in range(len(out_words)-1, -1, -1):
                    if out_words[j] and out_words[j] not in VISIBLE_PUNCT:
                        attach_idx = j; break
            if attach_idx is not None: attach_enclitic(out_words, ib_keys, plain, attach_idx, encl)
            out_words.append(t); ib_keys.append(""); plain.append(t)
            sentence_start=True; continue

        if t in VISIBLE_PUNCT:
            out_words.append(t); ib_keys.append(t); plain.append(t)
            if t in HARD_BOUND:
                # Hard boundaries reset the "last finite verb" anchor.
                last_finite_idx=None
                sentence_start = (t in {".",":",";","—","–"})
            continue

        low=t.lower()
        prev = toks[i-1].lower() if i>0 else ""
        nxt = toks[i+1].lower() if i+1<len(toks) else ""

        # Imperative: clause-initial command verbs or verb forms with clitic tails.
        if (sentence_start and t in {"ve","ven","haz","pon","sal","di","ten","sé","id","venid","tened"}) or \
           (re.search(r"(me|te|lo|la|nos|os|les|se)$", low) and looks_like_verb_form_strict(low)):
            tag_detected="IMP"
        else:
            tag_detected = detect_tam_with_context(toks, i, sentence_start)

        pos_hint = pos_of_es(low)
        is_verb_like = looks_like_verb_form_strict(low) or (pos_hint=="V")
        tam_ok = tam_allowed_for_es(low)

        # Negation is emitted as "eś" right before the next verb-like token.
        if low=="no": neg_next=True; continue

        # Contractions: al/del → (a/de + el)
        if low == "al":
            for ni in ("ka","do"):
                out_words.append(ni); ib_keys.append(georgeos_keys(tokens_from_latin(ni),ni)); plain.append(ni)
            sentence_start=False; continue
        if low == "del":
            for ni in ("ta","do"):
                out_words.append(ni); ib_keys.append(georgeos_keys(tokens_from_latin(ni),ni)); plain.append(ni)
            sentence_start=False; continue

        if low=="a":
            ni=rule_a(prev,low,nxt)
            out_words.append(ni); ib_keys.append(georgeos_keys(tokens_from_latin(ni),ni)); plain.append(ni)
            continue
        if low == "un":
            ni="banu"; out_words.append(ni); ib_keys.append(georgeos_keys(tokens_from_latin(ni),ni)); plain.append(ni); continue
        if low == "una":
            ni="bana"; out_words.append(ni); ib_keys.append(georgeos_keys(tokens_from_latin(ni),ni)); plain.append(ni); continue
        if low == "uno":
            ni="ban"; out_words.append(ni); ib_keys.append(georgeos_keys(tokens_from_latin(ni),ni)); plain.append(ni); continue

        # Drop stopwords unless the lexicon has an explicit entry for them.
        if (low in STOP) and (low not in LEX_FORM):
            continue

        ni_direct = SURF_RICH.get((low, tag_detected))
        if neg_next and is_verb_like:
            out_words.append("eś"); ib_keys.append(georgeos_keys(tokens_from_latin("eś"),"eś")); plain.append("eś")
            neg_next=False
        if ni_direct:
            # Rich surface hit: keep its TAM if present, else normalize by POS.
            if any(ni_direct.endswith(s) for s in VERB_TAM):
                ni=ni_direct
            else:
                ni=normalize_surface_by_pos(ni_direct, "V" if tam_ok else (pos_hint or ""))
            out_words.append(ni); ib_keys.append(georgeos_keys(tokens_from_latin(ni),ni)); plain.append(ni)
            if tam_ok and any(ni.endswith(s) for s in VERB_TAM): last_finite_idx=len(out_words)-1
            sentence_start=False; continue

        ni_lemma, ok = lookup_form_lemma(t, prev, nxt)
        if ok:
            if low in FORCE_KEYS:
                # Forms that must keep their lexicon surface verbatim.
                ni = LEX_FORM.get(low, ni_lemma)
                out_words.append(ni); ib_keys.append(georgeos_keys(tokens_from_latin(ni),ni)); plain.append(ni)
                last_finite_idx=len(out_words)-1; sentence_start=False; continue
            root, old_suf = strip_ni_tam(ni_lemma or "")
            if tag_detected=="IMP":
                ni=root+"-tu"
                out_words.append(ni); ib_keys.append(georgeos_keys(tokens_from_latin(ni),ni)); plain.append(ni)
                last_finite_idx=len(out_words)-1; sentence_start=False; continue
            if old_suf=="-tu":
                # Lexicon already stores an imperative — keep it as-is.
                ni=ni_lemma
                out_words.append(ni); ib_keys.append(georgeos_keys(tokens_from_latin(ni),ni)); plain.append(ni)
                last_finite_idx=len(out_words)-1; sentence_start=False; continue
            if tam_ok and is_verb_like:
                # Regular case: root + TAM suffix for the detected tense.
                suf=TAM_SUFFIX.get(tag_detected,"-ke")
                base=root or (ni_lemma or "")
                ni= base+suf if suf else base
                out_words.append(ni); ib_keys.append(georgeos_keys(tokens_from_latin(ni),ni)); plain.append(ni)
                last_finite_idx=len(out_words)-1
            else:
                ni=normalize_surface_by_pos(ni_lemma if ni_lemma!="" else "Ø", pos_hint or "")
                out_words.append(ni); ib_keys.append(georgeos_keys(tokens_from_latin(ni),ni)); plain.append(ni)
            sentence_start=False; continue

        # Untranslatable token: keep it wrapped in a visible placeholder.
        placeholder=f"[SIN-LEX:{t}]"
        out_words.append(placeholder); ib_keys.append(placeholder); plain.append(placeholder)
        sentence_start=False

    # Wh-question without a terminal "?": force the -na enclitic and a "?".
    appended_na=False
    if saw_wh and not has_qmark:
        encl=Q_ENCLITIC_INT
        attach_idx=last_finite_idx
        if attach_idx is None:
            for j in range(len(out_words)-1,-1,-1):
                if out_words[j] and out_words[j] not in VISIBLE_PUNCT and out_words[j] not in {"?","!"} and not out_words[j].startswith("["):
                    attach_idx=j; break
        if attach_idx is not None and not (out_words[attach_idx].endswith("-na") or out_words[attach_idx].endswith("-ba")):
            attach_enclitic(out_words, ib_keys, plain, attach_idx, encl); appended_na=True
    if appended_na and not has_qmark: ensure_terminal_qmark(out_words, ib_keys, plain)

    ib_clean=[k for k in ib_keys if k!=""]
    return " ".join(out_words), ib_clean
|
| 773 |
+
|
| 774 |
+
def translate(text:str):
    """Translate multi-line Spanish text; returns (ni_text, iberian_line_text)."""
    ni_out = []
    ib_out = []
    for line in (text or "").split("\n"):
        if not line.strip():
            continue  # skip blank lines entirely
        ni, ib_tokens = translate_sentence(line)
        ni_out.append(ni)
        ib_out.append(render_ib_with_tridots(ib_tokens))
    return "\n".join(ni_out), "\n".join(ib_out)
|
| 781 |
+
|
| 782 |
+
# =========================
# TRANSLATOR NI→ES (improved)
# =========================
# Most common nominal (non-TAM) suffixes, used as a soft fallback when a
# Neo-Iberian form has no direct lexicon match.
NI_NOMINAL_SUFFIXES = ("-ar","-en","-ka","-la","-si","-ŕa")
|
| 787 |
+
|
| 788 |
+
def normalize_ni(text: str) -> str:
    """Normalize NI input: drop "/" separators, unwrap [SIN-LEX:…], squeeze spaces."""
    cleaned = (text or "").replace("/", " ")
    cleaned = re.sub(r"\[SIN-LEX:([^\]]+)\]", r"\1", cleaned)
    return re.sub(r"\s+", " ", cleaned.strip())
|
| 793 |
+
|
| 794 |
+
def tokenize_ni(text: str):
    """Tokenize NI text, splitting punctuation (incl. curly quotes) into separate tokens."""
    spaced = re.sub(r"([,.;:!?¡¿…()\[\]{}\"'«»—–“”‘’])", r" \1 ", text)
    return [tok for tok in spaced.split() if tok]
|
| 798 |
+
|
| 799 |
+
# --- NUEVO: solo conjugar si el lema ES termina en -ar/-er/-ir
|
| 800 |
+
def _is_spanish_verb_lemma(lemma: str) -> bool:
|
| 801 |
+
return isinstance(lemma, str) and re.search(r"(ar|er|ir)$", lemma)
|
| 802 |
+
|
| 803 |
+
def _detect_ni_person(ni_form: str):
    """Detect person/number from a Neo-Iberian suffix; returns (person, cleaned_root)."""
    for suffix, person in NI_PERSON_MAP.items():
        if ni_form.endswith(suffix):
            return person, ni_form[:-len(suffix)]
    return "3S", ni_form  # default: third person singular
|
| 809 |
+
|
| 810 |
+
def _conj_es_from_lemma(lemma: str, tag: str, person: str = "3S"):
    """Conjugate a Spanish verb for the given TAM tag and person/number.

    *person* is one of "1S","2S","3S","1P","2P","3P".  Non-verb lemmas
    (anything not ending in -ar/-er/-ir) are returned unchanged.  Fully
    irregular forms are looked up first; otherwise the regular paradigms
    for each conjugation class are applied.
    """
    if not _is_spanish_verb_lemma(lemma):
        return lemma

    lemma = lemma.lower()

    # Fully irregular forms, keyed by (lemma, tag, person).
    IRREG_FULL = {
        # SER
        ("ser", "PRS", "1S"): "soy", ("ser", "PRS", "2S"): "eres", ("ser", "PRS", "3S"): "es",
        ("ser", "PRS", "1P"): "somos", ("ser", "PRS", "2P"): "sois", ("ser", "PRS", "3P"): "son",
        ("ser", "PST", "1S"): "fui", ("ser", "PST", "2S"): "fuiste", ("ser", "PST", "3S"): "fue",
        ("ser", "PST", "1P"): "fuimos", ("ser", "PST", "2P"): "fuisteis", ("ser", "PST", "3P"): "fueron",
        ("ser", "IPFV", "1S"): "era", ("ser", "IPFV", "2S"): "eras", ("ser", "IPFV", "3S"): "era",
        ("ser", "IPFV", "1P"): "éramos", ("ser", "IPFV", "2P"): "erais", ("ser", "IPFV", "3P"): "eran",
        ("ser", "SBJV", "1S"): "sea", ("ser", "SBJV", "2S"): "seas", ("ser", "SBJV", "3S"): "sea",
        ("ser", "SBJV", "1P"): "seamos", ("ser", "SBJV", "2P"): "seáis", ("ser", "SBJV", "3P"): "sean",
        # IR
        ("ir", "PRS", "1S"): "voy", ("ir", "PRS", "2S"): "vas", ("ir", "PRS", "3S"): "va",
        ("ir", "PRS", "1P"): "vamos", ("ir", "PRS", "2P"): "vais", ("ir", "PRS", "3P"): "van",
        ("ir", "PST", "1S"): "fui", ("ir", "PST", "2S"): "fuiste", ("ir", "PST", "3S"): "fue",
        ("ir", "PST", "1P"): "fuimos", ("ir", "PST", "2P"): "fuisteis", ("ir", "PST", "3P"): "fueron",
        ("ir", "IPFV", "1S"): "iba", ("ir", "IPFV", "2S"): "ibas", ("ir", "IPFV", "3S"): "iba",
        ("ir", "IPFV", "1P"): "íbamos", ("ir", "IPFV", "2P"): "ibais", ("ir", "IPFV", "3P"): "iban",
        ("ir", "SBJV", "1S"): "vaya", ("ir", "SBJV", "2S"): "vayas", ("ir", "SBJV", "3S"): "vaya",
        ("ir", "SBJV", "1P"): "vayamos", ("ir", "SBJV", "2P"): "vayáis", ("ir", "SBJV", "3P"): "vayan",
        # ESTAR
        ("estar", "PRS", "1S"): "estoy", ("estar", "PRS", "2S"): "estás", ("estar", "PRS", "3S"): "está",
        ("estar", "PRS", "1P"): "estamos", ("estar", "PRS", "2P"): "estáis", ("estar", "PRS", "3P"): "están",
        ("estar", "SBJV", "1S"): "esté", ("estar", "SBJV", "2S"): "estés", ("estar", "SBJV", "3S"): "esté",
        ("estar", "SBJV", "1P"): "estemos", ("estar", "SBJV", "2P"): "estéis", ("estar", "SBJV", "3P"): "estén",
        # TENER
        ("tener", "PRS", "1S"): "tengo", ("tener", "PRS", "2S"): "tienes", ("tener", "PRS", "3S"): "tiene",
        ("tener", "PRS", "1P"): "tenemos", ("tener", "PRS", "2P"): "tenéis", ("tener", "PRS", "3P"): "tienen",
        ("tener", "SBJV", "1S"): "tenga", ("tener", "SBJV", "2S"): "tengas", ("tener", "SBJV", "3S"): "tenga",
        ("tener", "SBJV", "1P"): "tengamos", ("tener", "SBJV", "2P"): "tengáis", ("tener", "SBJV", "3P"): "tengan",
        # VENIR
        ("venir", "PRS", "1S"): "vengo", ("venir", "PRS", "2S"): "vienes", ("venir", "PRS", "3S"): "viene",
        ("venir", "PRS", "1P"): "venimos", ("venir", "PRS", "2P"): "venís", ("venir", "PRS", "3P"): "vienen",
        ("venir", "SBJV", "1S"): "venga", ("venir", "SBJV", "2S"): "vengas", ("venir", "SBJV", "3S"): "venga",
        ("venir", "SBJV", "1P"): "vengamos", ("venir", "SBJV", "2P"): "vengáis", ("venir", "SBJV", "3P"): "vengan",
        # HACER
        ("hacer", "PRS", "1S"): "hago", ("hacer", "PRS", "2S"): "haces", ("hacer", "PRS", "3S"): "hace",
        ("hacer", "PRS", "1P"): "hacemos", ("hacer", "PRS", "2P"): "hacéis", ("hacer", "PRS", "3P"): "hacen",
        ("hacer", "SBJV", "1S"): "haga", ("hacer", "SBJV", "2S"): "hagas", ("hacer", "SBJV", "3S"): "haga",
        ("hacer", "SBJV", "1P"): "hagamos", ("hacer", "SBJV", "2P"): "hagáis", ("hacer", "SBJV", "3P"): "hagan",
        ("hacer", "PST", "1S"): "hice", ("hacer", "PST", "3S"): "hizo",
        # PONER
        ("poner", "PRS", "1S"): "pongo", ("poner", "PRS", "2S"): "pones", ("poner", "PRS", "3S"): "pone",
        ("poner", "PRS", "1P"): "ponemos", ("poner", "PRS", "2P"): "ponéis", ("poner", "PRS", "3P"): "ponen",
        ("poner", "SBJV", "1S"): "ponga", ("poner", "SBJV", "2S"): "pongas", ("poner", "SBJV", "3S"): "ponga",
        ("poner", "SBJV", "1P"): "pongamos", ("poner", "SBJV", "2P"): "pongáis", ("poner", "SBJV", "3P"): "pongan",
        # DAR
        ("dar", "PRS", "1S"): "doy", ("dar", "PRS", "2S"): "das", ("dar", "PRS", "3S"): "da",
        ("dar", "PRS", "1P"): "damos", ("dar", "PRS", "2P"): "dais", ("dar", "PRS", "3P"): "dan",
        ("dar", "SBJV", "1S"): "dé", ("dar", "SBJV", "2S"): "des", ("dar", "SBJV", "3S"): "dé",
        ("dar", "SBJV", "1P"): "demos", ("dar", "SBJV", "2P"): "deis", ("dar", "SBJV", "3P"): "den",
        # HABER
        ("haber", "PRS", "1S"): "he", ("haber", "PRS", "2S"): "has", ("haber", "PRS", "3S"): "ha",
        ("haber", "PRS", "1P"): "hemos", ("haber", "PRS", "2P"): "habéis", ("haber", "PRS", "3P"): "han",
        ("haber", "SBJV", "1S"): "haya", ("haber", "SBJV", "2S"): "hayas", ("haber", "SBJV", "3S"): "haya",
        ("haber", "SBJV", "1P"): "hayamos", ("haber", "SBJV", "2P"): "hayáis", ("haber", "SBJV", "3P"): "hayan",
        # PODER
        ("poder", "PRS", "1S"): "puedo", ("poder", "PRS", "2S"): "puedes", ("poder", "PRS", "3S"): "puede",
        ("poder", "PRS", "1P"): "podemos", ("poder", "PRS", "2P"): "podéis", ("poder", "PRS", "3P"): "pueden",
        ("poder", "SBJV", "1S"): "pueda", ("poder", "SBJV", "2S"): "puedas", ("poder", "SBJV", "3S"): "pueda",
        ("poder", "SBJV", "1P"): "podamos", ("poder", "SBJV", "2P"): "podáis", ("poder", "SBJV", "3P"): "puedan",
        # DECIR
        ("decir", "PRS", "1S"): "digo", ("decir", "PRS", "2S"): "dices", ("decir", "PRS", "3S"): "dice",
        ("decir", "PRS", "1P"): "decimos", ("decir", "PRS", "2P"): "decís", ("decir", "PRS", "3P"): "dicen",
        # SABER
        ("saber", "PRS", "1S"): "sé", ("saber", "PRS", "2S"): "sabes", ("saber", "PRS", "3S"): "sabe",
        # VER
        ("ver", "PRS", "1S"): "veo", ("ver", "PRS", "2S"): "ves", ("ver", "PRS", "3S"): "ve",
        ("ver", "PRS", "1P"): "vemos", ("ver", "PRS", "2P"): "veis", ("ver", "PRS", "3P"): "ven",
    }

    # Look for a fully irregular form first.
    if (lemma, tag, person) in IRREG_FULL:
        return IRREG_FULL[(lemma, tag, person)]

    # Irregular future/conditional stems.
    irr_stems = {
        "salir":"saldr","venir":"vendr","tener":"tendr","poner":"pondr","valer":"valdr","poder":"podr",
        "haber":"habr","saber":"sabr","caber":"cabr","querer":"querr","decir":"dir","hacer":"har"
    }

    # Regular conjugation.
    root = lemma[:-2]
    verb_class = lemma[-2:]  # ar, er, ir

    # PRESENT
    if tag == "PRS":
        endings_ar = {"1S":"o","2S":"as","3S":"a","1P":"amos","2P":"áis","3P":"an"}
        endings_er = {"1S":"o","2S":"es","3S":"e","1P":"emos","2P":"éis","3P":"en"}
        endings_ir = {"1S":"o","2S":"es","3S":"e","1P":"imos","2P":"ís","3P":"en"}
        endings = endings_ar if verb_class == "ar" else (endings_ir if verb_class == "ir" else endings_er)
        return root + endings.get(person, "a")

    # PRETERITE
    if tag == "PST":
        endings_ar = {"1S":"é","2S":"aste","3S":"ó","1P":"amos","2P":"asteis","3P":"aron"}
        endings_er = {"1S":"í","2S":"iste","3S":"ió","1P":"imos","2P":"isteis","3P":"ieron"}
        # -er and -ir share the regular preterite endings.
        endings = endings_ar if verb_class == "ar" else endings_er
        return root + endings.get(person, "ó")

    # FUTURE (built on the full infinitive, or an irregular stem)
    if tag == "FUT":
        stem = irr_stems.get(lemma, lemma)
        endings = {"1S":"é","2S":"ás","3S":"á","1P":"emos","2P":"éis","3P":"án"}
        return stem + endings.get(person, "á")

    # CONDITIONAL (same stem as the future)
    if tag == "COND":
        stem = irr_stems.get(lemma, lemma)
        endings = {"1S":"ía","2S":"ías","3S":"ía","1P":"íamos","2P":"íais","3P":"ían"}
        return stem + endings.get(person, "ía")

    # PRESENT SUBJUNCTIVE (endings swap: -ar takes e-, -er/-ir take a-)
    if tag == "SBJV":
        if verb_class == "ar":
            endings = {"1S":"e","2S":"es","3S":"e","1P":"emos","2P":"éis","3P":"en"}
        else:
            endings = {"1S":"a","2S":"as","3S":"a","1P":"amos","2P":"áis","3P":"an"}
        return root + endings.get(person, "e" if verb_class == "ar" else "a")

    # IMPERFECT
    if tag == "IPFV":
        if verb_class == "ar":
            endings = {"1S":"aba","2S":"abas","3S":"aba","1P":"ábamos","2P":"abais","3P":"aban"}
        else:
            endings = {"1S":"ía","2S":"ías","3S":"ía","1P":"íamos","2P":"íais","3P":"ían"}
        return root + endings.get(person, "aba" if verb_class == "ar" else "ía")

    # IMPERATIVE (only 2S handled; other persons fall back to the lemma)
    if tag == "IMP":
        if person == "2S":
            return root + ("a" if verb_class == "ar" else "e")
        return lemma  # other persons use subjunctive forms

    # INFINITIVE/GERUND/PARTICIPLE
    if tag in {"INF","UNK"}:
        return lemma

    # Default
    return lemma
|
| 957 |
+
|
| 958 |
+
# Backwards compatibility with older call sites.
def _conj_es_3sg(lemma:str, tag:str) -> str:
    """Compatibility wrapper: conjugate *lemma* for third person singular."""
    return _conj_es_from_lemma(lemma, tag, "3S")
|
| 962 |
+
|
| 963 |
+
def _strip_nominal_suffix(base: str):
    """Fallback: try to peel a common nominal suffix off *base*; returns (stem, suffix)."""
    for candidate in sorted(NI_NOMINAL_SUFFIXES, key=len, reverse=True):
        if base.endswith(candidate):
            return base[:-len(candidate)], candidate
    return base, ""
|
| 969 |
+
|
| 970 |
+
def _cleanup_es_spaces(s: str) -> str:
|
| 971 |
+
s = re.sub(r"\s+([,.;:!?])", r"\1", s)
|
| 972 |
+
s = re.sub(r"\(\s+", "(", s)
|
| 973 |
+
s = re.sub(r"\s+\)", ")", s)
|
| 974 |
+
s = re.sub(r"\s{2,}", " ", s).strip()
|
| 975 |
+
# micro-limpiezas
|
| 976 |
+
s = s.replace("a a ", " a ")
|
| 977 |
+
return s
|
| 978 |
+
|
| 979 |
+
def translate_ni_to_es(sent: str):
    """Translate a Neo-Iberian sentence to Spanish, token by token.

    Lookup cascade per token (first hit wins):
      1) punctuation and capitalized proper nouns pass through verbatim;
      2) direct surface/root lookup in NI_TO_ES_FORM;
      3) TAM suffix + person detection, then exact-surface lookup in
         NI_TO_ES_SURF, then lemma lookup + Spanish conjugation;
      4) nominal-suffix-stripping fallback (never conjugated);
      5) unknown tokens are wrapped in a soft "[?:token]" marker.
    Returns the joined Spanish string after _cleanup_es_spaces().
    """
    toks = tokenize_ni(normalize_ni(sent))
    out=[]
    for i, t in enumerate(toks):
        # Preserve punctuation verbatim.
        if t in VISIBLE_PUNCT or t in {"?", "!", "¿", "¡"}:
            out.append(t)
            continue

        # FIX: preserve proper nouns (initial capital, not all-caps, >1 char).
        if t and t[0].isupper() and not t.isupper() and len(t) > 1:
            out.append(t)
            continue

        low=t.lower()

        # Strip -na / -ba enclitics (interrogative/exclamative) for lookup ONLY;
        # the suffix is exactly 3 chars including the hyphen.
        lookup_form = low[:-3] if (low.endswith("-na") or low.endswith("-ba")) else low

        # 1) Direct form (surface or root).
        if lookup_form in NI_TO_ES_FORM:
            out.append(NI_TO_ES_FORM[lookup_form])
            continue

        # 2) TAM by suffix + PERSON detection.
        root, tam_tag, tam_suffix = detect_ni_tam(lookup_form)
        person, root_clean = _detect_ni_person(root)

        # 2a) Exact surface keyed by (form, TAM tag).
        es_direct = NI_TO_ES_SURF.get((lookup_form, tam_tag))
        if es_direct:
            out.append(es_direct)
            continue

        # 2b) Conjugate the Spanish lemma with the detected TAM + person.
        if tam_tag not in {"INF","UNK"} and root_clean in NI_TO_ES_LEMMA:
            es_lemma = NI_TO_ES_LEMMA[root_clean]
            out.append(_conj_es_from_lemma(es_lemma, tam_tag, person))
            continue

        # 2c) Known root -> Spanish form/lemma.
        if root_clean in NI_TO_ES_FORM:
            out.append(NI_TO_ES_FORM[root_clean])
            continue
        if root_clean in NI_TO_ES_LEMMA:
            es_lemma = NI_TO_ES_LEMMA[root_clean]
            out.append(_conj_es_from_lemma(es_lemma, tam_tag, person))
            continue

        # 3) Soft fallback for nominals: strip -ar/-en/-ka/-la/-si/-ŕa and retry.
        base2, suf2 = _strip_nominal_suffix(root_clean if root_clean else lookup_form)
        if base2 != (root_clean if root_clean else lookup_form):
            if base2 in NI_TO_ES_FORM:
                out.append(NI_TO_ES_FORM[base2])
                continue
            if base2 in NI_TO_ES_LEMMA:
                # Nominal route: do NOT conjugate even if it is a verb;
                # emit the bare lemma.
                out.append(NI_TO_ES_LEMMA[base2])
                continue
            # Last resort: emit the cleaned stem itself.
            out.append(base2)
            continue

        # 4) Unknown token -> soft marker so failures are visible but non-fatal.
        out.append(f"[?:{t}]")

    return _cleanup_es_spaces(" ".join(out))
|
| 1046 |
+
|
| 1047 |
+
# =========================
|
| 1048 |
+
# CARGA DE LÉXICO
|
| 1049 |
+
# =========================
|
| 1050 |
+
def load_lexicon():
    """Load the ES→NI lexicon from the candidate CSV files.

    Side effects on module-level tables:
      - SURF_RICH[(es_form, es_tag)] -> NI surface (rich v4.4 CSV format)
      - LEX_FORM[es_form]   -> NI form
      - LEX_LEMMA[es_lemma] -> NI lemma
      - FOLD_FORM[folded]   -> NI form (accent-insensitive fallback)
    POS/TAM metadata is recorded via _meta_set(). Finally a minimal
    built-in vocabulary (KEEP_MIN / BUILTIN_LEMMA / FORCE_FORMS) is
    seeded so the translator still works without any CSV.

    Returns True if at least one CSV was read successfully.
    """
    loaded=False
    total_rich=total_simple=0
    for p in CSV_CANDIDATES:
        if not os.path.exists(p): continue
        try:
            with open(p, encoding="utf-8") as f:
                rd=csv.DictReader(f); flds=set(rd.fieldnames or [])
                # v4.4: new format with ni_surface column.
                if {"source_es","es_morph","ni_surface"}.issubset(flds):
                    for r in rd:
                        es=(r.get("source_es") or "").strip().lower()
                        tag=(r.get("es_morph") or "").strip().upper()
                        surf=(r.get("ni_surface") or "").strip()
                        if not surf:
                            # Rebuild the surface from root+suffix when absent.
                            root=(r.get("ni_root") or "").strip(); suf=(r.get("ni_suffix") or "").strip()
                            if root or suf: surf=f"{root}{suf}"
                        if es and tag and surf: SURF_RICH[(es,tag)] = surf; total_rich+=1

                        ni=(r.get("target_ni") or "").strip()
                        es_lem=(r.get("es_lemma") or "").strip().lower()

                        # The POS column name varies across CSV vintages; try them all.
                        pos = _canon_pos(r.get("pos") or r.get("es_pos") or r.get("target_pos") or r.get("pos_es") or r.get("ni_pos") or "")
                        tam_ok = _boolish(r.get("tam_ok"))

                        if es: _meta_set(es, pos=pos, tam_ok=(tam_ok if tam_ok is not None else (pos=="V" if pos else None)))
                        if es_lem:
                            _meta_set(es_lem, pos=("V" if es_lem.endswith(("ar","er","ir")) else (pos or "")),
                                      tam_ok=(tam_ok if tam_ok is not None else (pos=="V" if pos else None)))

                        if es and ni!="": LEX_FORM.setdefault(es,ni)
                        if es_lem and ni!="": LEX_LEMMA.setdefault(es_lem,ni)
                    loaded=True; continue

                # Simple two-column format: source_es,target_ni.
                if {"source_es","target_ni"}.issubset(flds):
                    for r in rd:
                        es=(r.get("source_es") or "").strip().lower()
                        ni=(r.get("target_ni") or "").strip()
                        if not es: continue
                        LEX_FORM.setdefault(es,ni); total_simple+=1
                        _meta_set(es, pos="", tam_ok=None)
                        if looks_like_verb_form_strict(es):
                            lem=guess_infinitive_es(es)
                            if lem:
                                LEX_LEMMA.setdefault(lem,ni); _meta_set(lem, pos="V", tam_ok=True)
                    loaded=True; continue

                # Legacy two-column format: es,ni_lemma.
                if {"es","ni_lemma"}.issubset(flds):
                    for r in rd:
                        es=(r.get("es") or "").strip().lower()
                        ni=(r.get("ni_lemma") or "").strip()
                        if not es: continue
                        LEX_FORM.setdefault(es,ni); total_simple+=1
                        _meta_set(es, pos="", tam_ok=None)
                        if looks_like_verb_form_strict(es):
                            lem=guess_infinitive_es(es)
                            if lem:
                                LEX_LEMMA.setdefault(lem,ni); _meta_set(lem, pos="V", tam_ok=True)
                    loaded=True; continue
        except Exception as e:
            print(f"[WARN] No se pudo leer {p}: {e}")
    if total_rich or total_simple:
        print(f"✓ ES→NI: {total_rich} superficies ricas, {total_simple} pares simples")

    # Accent-insensitive fallback index (long, non-verbal keys only, to
    # avoid collapsing distinct verb forms).
    global FOLD_FORM
    FOLD_FORM={}
    for k,v in LEX_FORM.items():
        fk=fold(k)
        if fk!=k and len(k)>=5 and not looks_like_verb_form_strict(k):
            FOLD_FORM.setdefault(fk,v)

    # Minimal coverage seeded regardless of CSVs (setdefault: CSVs win).
    KEEP_MIN={
        "y":"ne","o":"o","no":"eś","a":"ka","para":"kara","eso":"kok","tarta":"gatel",
        "el":"do", "la":"da", "los":"don", "las":"dan",
        "un":"banu","una":"bana","uno":"ban",
        "este":"aŕe","esta":"aŕa","estos":"aŕen","estas":"aŕan",

        # Basic numerals (words)
        # NOTE(review): "nueve" maps to "lauŕbi" here while digit "9" maps
        # to "bedar" below — confirm which numeral is intended for 9.
        "dos":"bi","tres":"irur","cuatro":"laur","cinco":"borste","seis":"śei",
        "siete":"sisbi","ocho":"sorse","nueve":"lauŕbi","diez":"abaŕ","veinte":"oŕkei",

        # Numerals 1-100 (digits). Vigesimal scheme established by the
        # table itself: D1..D9 = "score-ke-unit" after a pure score
        # (20/40/60/80) and "score-abaŕ-ke-unit" after score+10
        # (30/50/70/90).
        # FIX: 41-49, 61-69 and 81-89 previously duplicated the following
        # decade (e.g. "41" had the same value as "51"), making the pairs
        # indistinguishable; corrected to the score-ke-unit pattern of 21-29.
        "1":"ban","2":"bi","3":"irur","4":"laur","5":"borste",
        "6":"śei","7":"sisbi","8":"sorse","9":"bedar","10":"abaŕ",
        "11":"abaŕ-ke-ban","12":"abaŕ-ke-bi","13":"abaŕ-ke-irur","14":"abaŕ-ke-laur","15":"abaŕ-ke-borste",
        "16":"abaŕ-ke-śei","17":"abaŕ-ke-sisbi","18":"abaŕ-ke-sorse","19":"abaŕ-ke-bedar","20":"oŕkei",
        "21":"oŕkei-ke-ban","22":"oŕkei-ke-bi","23":"oŕkei-ke-irur","24":"oŕkei-ke-laur","25":"oŕkei-ke-borste",
        "26":"oŕkei-ke-śei","27":"oŕkei-ke-sisbi","28":"oŕkei-ke-sorse","29":"oŕkei-ke-bedar","30":"oŕkei-abaŕ",
        "31":"oŕkei-abaŕ-ke-ban","32":"oŕkei-abaŕ-ke-bi","33":"oŕkei-abaŕ-ke-irur","34":"oŕkei-abaŕ-ke-laur","35":"oŕkei-abaŕ-ke-borste",
        "36":"oŕkei-abaŕ-ke-śei","37":"oŕkei-abaŕ-ke-sisbi","38":"oŕkei-abaŕ-ke-sorse","39":"oŕkei-abaŕ-ke-bedar","40":"binoŕkei",
        "41":"binoŕkei-ke-ban","42":"binoŕkei-ke-bi","43":"binoŕkei-ke-irur","44":"binoŕkei-ke-laur","45":"binoŕkei-ke-borste",
        "46":"binoŕkei-ke-śei","47":"binoŕkei-ke-sisbi","48":"binoŕkei-ke-sorse","49":"binoŕkei-ke-bedar","50":"binoŕkei-abaŕ",
        "51":"binoŕkei-abaŕ-ke-ban","52":"binoŕkei-abaŕ-ke-bi","53":"binoŕkei-abaŕ-ke-irur","54":"binoŕkei-abaŕ-ke-laur","55":"binoŕkei-abaŕ-ke-borste",
        "56":"binoŕkei-abaŕ-ke-śei","57":"binoŕkei-abaŕ-ke-sisbi","58":"binoŕkei-abaŕ-ke-sorse","59":"binoŕkei-abaŕ-ke-bedar","60":"iruŕokei",
        "61":"iruŕokei-ke-ban","62":"iruŕokei-ke-bi","63":"iruŕokei-ke-irur","64":"iruŕokei-ke-laur","65":"iruŕokei-ke-borste",
        "66":"iruŕokei-ke-śei","67":"iruŕokei-ke-sisbi","68":"iruŕokei-ke-sorse","69":"iruŕokei-ke-bedar","70":"iruŕokei-abaŕ",
        "71":"iruŕokei-abaŕ-ke-ban","72":"iruŕokei-abaŕ-ke-bi","73":"iruŕokei-abaŕ-ke-irur","74":"iruŕokei-abaŕ-ke-laur","75":"iruŕokei-abaŕ-ke-borste",
        "76":"iruŕokei-abaŕ-ke-śei","77":"iruŕokei-abaŕ-ke-sisbi","78":"iruŕokei-abaŕ-ke-sorse","79":"iruŕokei-abaŕ-ke-bedar","80":"lauŕokei",
        "81":"lauŕokei-ke-ban","82":"lauŕokei-ke-bi","83":"lauŕokei-ke-irur","84":"lauŕokei-ke-laur","85":"lauŕokei-ke-borste",
        "86":"lauŕokei-ke-śei","87":"lauŕokei-ke-sisbi","88":"lauŕokei-ke-sorse","89":"lauŕokei-ke-bedar","90":"lauŕokei-abaŕ",
        "91":"lauŕokei-abaŕ-ke-ban","92":"lauŕokei-abaŕ-ke-bi","93":"lauŕokei-abaŕ-ke-irur","94":"lauŕokei-abaŕ-ke-laur","95":"lauŕokei-abaŕ-ke-borste",
        "96":"lauŕokei-abaŕ-ke-śei","97":"lauŕokei-abaŕ-ke-sisbi","98":"lauŕokei-abaŕ-ke-sorse","99":"lauŕokei-abaŕ-ke-bedar","100":"atun",

        # Numerals in words
        "once":"abaŕ-ke-ban","doce":"abaŕ-ke-bi","trece":"abaŕ-ke-irur","catorce":"abaŕ-ke-laur","quince":"abaŕ-ke-borste",
        "dieciséis":"abaŕ-ke-śei","dieciseis":"abaŕ-ke-śei","diecisiete":"abaŕ-ke-sisbi","dieciocho":"abaŕ-ke-sorse","diecinueve":"abaŕ-ke-bedar",
        "veintiuno":"oŕkei-ke-ban","veintidós":"oŕkei-ke-bi","veintidos":"oŕkei-ke-bi","veintitrés":"oŕkei-ke-irur","veintitres":"oŕkei-ke-irur",
        "veinticuatro":"oŕkei-ke-laur","veinticinco":"oŕkei-ke-borste","veintiséis":"oŕkei-ke-śei","veintiseis":"oŕkei-ke-śei",
        "veintisiete":"oŕkei-ke-sisbi","veintiocho":"oŕkei-ke-sorse","veintinueve":"oŕkei-ke-bedar",
        "treinta":"oŕkei-abaŕ","cuarenta":"binoŕkei","cincuenta":"binoŕkei-abaŕ","sesenta":"iruŕokei",
        "setenta":"iruŕokei-abaŕ","ochenta":"lauŕokei","noventa":"lauŕokei-abaŕ","cien":"atun",

        # Pronouns and particles
        "yo":"ni","tú":"zu","él":"nar","ella":"nar",
        "nosotros":"gu","nosotras":"gu","vosotros":"zuek","vosotras":"zuek",
        "ellos":"narek","ellas":"narek",
        "que":"ze","si":"baldin","cuando":"noiz","donde":"non",
        "como":"nola","porque":"zeren","mientras":"bitarte",
        "versión":"bertsi","test":"froga","prueba":"froga",
        "ejemplo":"adibid","texto":"testu","palabra":"hitz"
    }
    for k,v in KEEP_MIN.items():
        LEX_FORM.setdefault(k,v)
        # Classify each seeded key so the POS/TAM gate treats it correctly.
        if k in {"yo","tú","él","ella","nosotros","nosotras","vosotros","vosotras","ellos","ellas"}:
            _meta_set(k, pos="PRON", tam_ok=False)
        elif k in {"que","si","cuando","donde","como","porque","mientras"}:
            _meta_set(k, pos="PART", tam_ok=False)
        elif k.isdigit() or k in {"uno","dos","tres","cuatro","cinco","seis","siete","ocho","nueve","diez","once","doce","trece","catorce","quince","dieciséis","dieciseis","diecisiete","dieciocho","diecinueve","veinte","veintiuno","veintidós","veintidos","veintitrés","veintitres","veinticuatro","veinticinco","veintiséis","veintiseis","veintisiete","veintiocho","veintinueve","treinta","cuarenta","cincuenta","sesenta","setenta","ochenta","noventa","cien"}:
            _meta_set(k, pos="NUM", tam_ok=False)
        else:
            _meta_set(k, pos=_canon_pos("PART" if k in {"y","o","no","a","para"} else "DET"), tam_ok=False)

    # Safety net in case a core verb is missing from the CSVs.
    BUILTIN_LEMMA={
        "llover":"euŕak","llamar":"deitu","venir":"nuker","ir":"nitus",
        "hacer":"giotael","tener":"giokk","poder":"binbel","poner":"pusen",
        "ser":"izan","estar":"egon"
    }
    for k,v in BUILTIN_LEMMA.items():
        LEX_LEMMA.setdefault(k,v); _meta_set(k, pos="V", tam_ok=True)

    # Irregular Spanish forms pinned to fixed NI surfaces; these OVERRIDE
    # CSV entries (direct assignment, not setdefault).
    FORCE_FORMS = {
        "voy":"nitus-ke","vas":"nitus-ke","va":"nitus-ke","vamos":"nitus-ke","vais":"nitus-ke","van":"nitus-ke",
        "vengo":"nuker-ke","vienes":"nuker-ke","viene":"nuker-ke","venimos":"nuker-ke","venís":"nuker-ke","vienen":"nuker-ke",
        "ven":"nuker-tu","haz":"giotael-tu","pon":"pusen-tu","di":"siśnesir-tu","sal":"salku-tu","ten":"giokk-tu","sé":"suber-tu"
    }
    for form, ni in FORCE_FORMS.items():
        LEX_FORM[form] = ni; _meta_set(form, pos="V", tam_ok=True)
    global FORCE_KEYS
    FORCE_KEYS = set(FORCE_FORMS.keys())
    return loaded
|
| 1202 |
+
|
| 1203 |
+
def load_lexicon_ni_es():
    """Load the NI→ES lexicon from the CSV_NI_ES candidate files.

    Two parsing strategies per file:
      1) header-based DictReader, matching column names fuzzily;
      2) positional fallback for the fixed v4.4 column layout.
    Fills NI_TO_ES_FORM, NI_TO_ES_SURF and NI_TO_ES_LEMMA as side
    effects, then seeds a minimal built-in vocabulary (KEEP_MIN_NI).
    Returns True if at least one CSV was read successfully.
    """
    loaded=False
    total=0
    # 1) Try DictReader with a header row.
    for p in CSV_NI_ES:
        if not os.path.exists(p):
            debug_print(f"CSV NI→ES no encontrado: {p}")
            continue
        try:
            with open(p, encoding="utf-8") as f:
                sniffer = csv.Sniffer()
                sample = f.read(4096)
                f.seek(0)
                has_header = sniffer.has_header(sample)
                if has_header:
                    dr = csv.DictReader(f)
                    fieldnames = [x.lower() for x in (dr.fieldnames or [])]
                    # Fuzzy-match plausible column names.
                    fn_source = next((c for c in fieldnames if "source" in c and ("ni" in c or "neo" in c)), None)
                    fn_target = next((c for c in fieldnames if "target" in c and ("es" in c or "spa" in c)), None)
                    fn_eslem = next((c for c in fieldnames if "es_lem" in c or c=="es_lemma" or "lemma_es" in c), None)
                    # v4.4: the CSV uses 'ni_tam' for the morphology tag.
                    fn_morph = next((c for c in fieldnames if c in {"ni_tam","ni_morph","ni_tag"} or "morph" in c), None)
                    fn_root = next((c for c in fieldnames if "ni_root" in c or c=="root" or "ni_lemma" in c), None)

                    if fn_source and fn_target:
                        debug_print(f"Cargando {p} con cabecera: source={fn_source}, target={fn_target}")
                        for r in dr:
                            # FIX 1: do NOT lowercase NI surfaces (diacritics
                            # like ś/ŕ and casing are significant for lookup).
                            source_ni = (r.get(fn_source) or "").strip()
                            target_es = (r.get(fn_target) or "").strip()
                            es_lemma = (r.get(fn_eslem) or "").strip().lower() if fn_eslem else ""
                            ni_morph = (r.get(fn_morph) or "").strip().upper() if fn_morph else ""
                            ni_root = (r.get(fn_root) or "").strip().lower() if fn_root else ""

                            if source_ni and target_es:
                                # FIX 2: overwrite instead of setdefault, so
                                # later (richer) rows win.
                                NI_TO_ES_FORM[source_ni] = target_es
                                if ni_morph:
                                    NI_TO_ES_SURF[(source_ni, ni_morph)] = target_es
                                if ni_root and es_lemma:
                                    NI_TO_ES_LEMMA.setdefault(ni_root, es_lemma)
                                if ni_root and target_es:
                                    NI_TO_ES_FORM.setdefault(ni_root, target_es)
                                total+=1
                        print(f"✓ Cargadas {total} filas NI→ES (cabecera) desde {p}")
                        loaded=True
                        continue  # move on to the next file, if any

                # 2) Positional fallback (no usable header found).
                f.seek(0)
                reader=csv.reader(f)
                count=0
                for row in reader:
                    if not row: continue
                    # Skip a header-looking first row.
                    if count==0 and any("source" in (c or "").lower() or "ni_" in (c or "").lower() or "target" in (c or "").lower() for c in row):
                        count+=1
                        continue

                    # FIX 3: column indices per the v4.4 CSV layout:
                    # source_ni, target_es, ni_tam, ni_pn, es_morph, es_pn, ni_root, ni_suffix, es_lemma, pos_es, evidencia
                    #     0          1        2      3       4        5       6         7          8        9        10
                    source_ni = (row[0] if len(row)>0 else "").strip()  # NOT lowercased (see FIX 1)
                    target_es = (row[1] if len(row)>1 else "").strip()
                    ni_tam = (row[2] if len(row)>2 else "").strip().upper()
                    ni_root = (row[6] if len(row)>6 else "").strip().lower()
                    es_lemma = (row[8] if len(row)>8 else "").strip().lower()

                    if source_ni and target_es:
                        NI_TO_ES_FORM[source_ni] = target_es  # overwrite (see FIX 2)
                        if ni_tam:
                            NI_TO_ES_SURF[(source_ni, ni_tam)] = target_es
                        if ni_root and es_lemma:
                            NI_TO_ES_LEMMA.setdefault(ni_root, es_lemma)
                        if ni_root and target_es:
                            NI_TO_ES_FORM.setdefault(ni_root, target_es)
                        count+=1
                        total+=1

                if count>0:
                    print(f"✓ Cargadas {count} filas NI→ES (posicional) desde {p}")
                    loaded=True
        except Exception as e:
            print(f"[WARN] Error leyendo {p}: {e}")
            import traceback
            traceback.print_exc()

    # FIX 4: expanded minimal vocabulary (setdefault: CSV entries win).
    KEEP_MIN_NI = {
        # Particles
        "ne":"y","o":"o","eś":"no","ka":"a","mi":"a","te":"a",
        "kin":"con","tan":"en","ta":"de","kara":"para",

        # Pronouns
        "ni":"yo","zu":"tú","nar":"él","gu":"nosotros",
        "ban":"un","banu":"un","bana":"una",

        # Articles
        "do":"el","da":"la","don":"los","dan":"las",

        # Demonstratives
        "aŕe":"este","aŕa":"esta","aŕen":"estos","aŕan":"estas",

        # Base verbs
        "nuker":"venir","siśnesir":"decir","giotael":"hacer",
        "izan":"ser","egon":"estar","giokk":"tener",
        "pusen":"poner","binbel":"poder","nitus":"ir",
        "deitu":"llamar","euŕak":"llover",

        # Interjections
        "batsornel":"hola","sabernel":"adiós",

        # Common nouns
        "domśaldum":"pan","śesilmen":"café","kuknomtok":"restaurante",
        "sikliskoŕ":"casa","śaldalbam":"mercado","bekmil":"cine",
        "seŕtuŕgok":"año","kordo":"pueblo","tokbatkir":"ciudad",
        "eskom":"amigo","nintos":"madre","śimnas":"padre",
    }

    for k,v in KEEP_MIN_NI.items():
        NI_TO_ES_FORM.setdefault(k,v)

    if total:
        print(f"✓ NI→ES: {total} pares cargados (incluyendo {len(KEEP_MIN_NI)} mínimos)")
    else:
        print(f"⚠ NI→ES: No se cargaron pares desde CSV, usando {len(KEEP_MIN_NI)} mínimos")

    # FIX 5: debug aid — show a sample of what was loaded.
    if DEBUG_MODE and total > 0:
        print("\n[DEBUG] Muestra de NI_TO_ES_FORM:")
        samples = list(NI_TO_ES_FORM.items())[:30]
        for k, v in samples:
            print(f"  {k} → {v}")

    return loaded
|
| 1338 |
+
|
| 1339 |
+
# Populate both lexicons at import time (module-level side effect).
print("Cargando léxico ES→NI..."); load_lexicon()
print("Cargando léxico NI→ES..."); load_lexicon_ni_es()
|
| 1341 |
+
|
| 1342 |
+
# =========================
|
| 1343 |
+
# UI CLÁSICA (con dirección)
|
| 1344 |
+
# =========================
|
| 1345 |
+
# UI strings per interface language ("ES"/"EN"). Keys are consumed by the
# Gradio layout below: titles, input/output labels, placeholders, button
# text, dropdown labels and the ordered accordion titles (which must stay
# index-aligned with DOC[lang]).
LABELS={
    "ES":{
        "title":"Traductor Español ↔ Neoíbero v4.4",
        "subtitle":"Explora el renacimiento ibérico con tecnología moderna — ULTRA-DEFINITIVO",
        "in_label_es":"✏️ Entrada (Español)",
        "in_label_ni":"✏️ Entrada (Neoíbero)",
        "in_ph_es":"Escribe aquí. Ej.: Veo a Ana y doy pan a Marta.",
        "in_ph_ni":"Idatzi hemen. Adib.: nitus-ke ni etxe-ka.",
        "out_lat_esni":"📜 Salida: Neoíbero (latín)",
        "out_lat_nies":"📜 Salida: Español",
        "out_ib":"🗿 Línea ibérica",
        "out_audio":"🔊 Locución (Audio)",
        "btn":"🔄 Traducir",
        "combo":"🌍 Idioma (UI + explicación)",
        "dir":"🔁 Dirección",
        "dir_opts":["ES → NI","NI → ES"],
        "doc_header":"📚 Documentación y Referencia",
        "acc_titles":[
            "🎓 Marco académico y decisiones del neoíbero",
            "🏛️ Herencia posible del íbero histórico",
            "🎨 Diseño de la conlang (neoíbero)",
            "⚙️ Pipeline del traductor (paso a paso)",
            "🔤 Ortografía, línea ibérica y claves",
            "❓/❗ Modalidad presunto vascoide (-na / -ba)",
            "📖 Gramática de referencia (v1.2)",
            "📚 Bibliografía de base",
            "🧾 Siglas y glosario"
        ]
    },
    "EN":{
        "title":"Spanish ↔ Neo-Iberian Translator v4.4",
        "subtitle":"Explore the revival of Neo-Iberian with modern tech — ULTRA-DEFINITIVE",
        "in_label_es":"✏️ Input (Spanish)",
        "in_label_ni":"✏️ Input (Neo-Iberian)",
        "in_ph_es":"Type here. E.g., Veo a Ana y doy pan a Marta.",
        "in_ph_ni":"Type here. E.g., nitus-ke ni etxe-ka.",
        "out_lat_esni":"📜 Output: Neo-Iberian (Latin)",
        "out_lat_nies":"📜 Output: Spanish",
        "out_ib":"🗿 Iberian line",
        "out_audio":"🔊 Speech (Audio)",
        "btn":"🔄 Translate",
        "combo":"🌍 Language (UI + docs)",
        "dir":"🔁 Direction",
        "dir_opts":["ES → NI","NI → ES"],
        "doc_header":"📚 Documentation & Reference",
        "acc_titles":[
            "🎓 Background & design choices",
            "🏛️ Possible inheritance from ancient Iberian",
            "🎨 Conlang design (Neo-Iberian)",
            "⚙️ Translator pipeline (step by step)",
            "🔤 Orthography, Iberian line & keys",
            "❓/❗ 'Vascoid' mood (-na / -ba)",
            "📖 Reference grammar (v1.2)",
            "📚 Core references",
            "🧾 Acronyms & glossary"
        ]
    }
}
|
| 1403 |
+
|
| 1404 |
+
# Documentación completa del appOld.py
|
| 1405 |
+
DOC_ES_0 = """**Escritura y datos.**
|
| 1406 |
+
El *neoíbero* se diseña como una **lengua conjetural** que toma como base el corpus ibérico (ss. V–I a.C.) conocido, más una morfología y un léxico especulativos construidos con plausibilidad histórica y tipológica.
|
| 1407 |
+
"""
|
| 1408 |
+
DOC_ES_1 = """**Herencia antigua posible.**
|
| 1409 |
+
- Raíces documentadas en inscripciones ibéricas reales: *ban*, *bi*, *irur*, *laur*, *borste*, *śei*, *sisbi*, *sorse* (numerales); *belai* (cuervo), *ebee* (perdiz), etc.
|
| 1410 |
+
- **CV(C)** phonotactics; no **/p/** fonémico; *r/ŕ* desaconsejado en inicio de palabra.
|
| 1411 |
+
- Postposiciones/sufijos nominales: **-k** (pl), **-te** (agente), **-ar/-en** (genitivo/origen), **-ka** (dat./loc./dist.), **-i** (ac. con PN).
|
| 1412 |
+
- Partículas: **ne** 'y', **o** 'o', **eś** 'no'.
|
| 1413 |
+
- Numerales: *ban, bi, irur, laur, borste, śei, sisbi, sorse, lauŕbi, abaŕ (10), oŕkei (20).*
|
| 1414 |
+
"""
|
| 1415 |
+
DOC_ES_2 = """**Diseño de la conlang:**
|
| 1416 |
+
- **TAM (v3.2-LTS):** PRS **-ke**, PST **-bo**, FUT **-ta**, IPFV **-ri**, IMP **-tu**, **SBJV -ni**, **COND -ne**.
|
| 1417 |
+
- Derivación: verbos (-ke/-ta/-bo/-ri/-ni/-ne), adjetivos (-si), sustantivos (-ar/-en/-tu/-la/-ŕa/-si).
|
| 1418 |
+
- Orden preferido **SOV**.
|
| 1419 |
+
"""
|
| 1420 |
+
DOC_ES_3 = """**Pipeline (resumen):**
|
| 1421 |
+
1) Tokenizar; partir **al→ka do**, **del→ta do**.
|
| 1422 |
+
2) `a` → `ka`/`mi`/`te`.
|
| 1423 |
+
3) CSV rico da **superficie** NI; si no, CSV simple → **lema** NI.
|
| 1424 |
+
4) **Puerta POS/TAM**: solo verbos obtienen TAM; otros se normalizan a lema/raíz.
|
| 1425 |
+
5) Negación **eś** antes del primer verbo finito.
|
| 1426 |
+
6) ?/! → enclíticos **-na/-ba** en el último verbo finito (o último constituyente).
|
| 1427 |
+
7) WH desnudo añade **-na** e inserta `?`.
|
| 1428 |
+
8) Línea ibérica: solo puntuación visible; separador de palabras = **"/"** (tridots).
|
| 1429 |
+
"""
|
| 1430 |
+
DOC_ES_4 = """**Ortografía y claves:**
|
| 1431 |
+
- Modo de claves **explicit** (BA/BE/BI/BO/BU).
|
| 1432 |
+
- Separador de palabras = "/".
|
| 1433 |
+
- Atajos: `ka`→**K**, `mi`→**MI**, `te`→**TE**, `ne`→**N**, `o`→**O**, `eś`→**X**.
|
| 1434 |
+
"""
|
| 1435 |
+
DOC_ES_5 = """**Modalidad (-na/-ba):**
|
| 1436 |
+
- **-na** interrogativa; **-ba** exclamativa, se une al último verbo finito (o último constituyente).
|
| 1437 |
+
"""
|
| 1438 |
+
DOC_ES_6 = """**Gramática mínima (NI):**
|
| 1439 |
+
- Verbo: raíz + **TAM**; negación preverbal **eś**.
|
| 1440 |
+
- Casos productivos: -k (pl), -te (agente), -ka (dat/loc), -ar/-en (genitivo/origen).
|
| 1441 |
+
"""
|
| 1442 |
+
DOC_ES_7 = """**Referencias principales:** Untermann; de Hoz; Ferrer i Jané; Correa; gramáticas/corpora bascoide seleccionados."""
|
| 1443 |
+
DOC_ES_8 = """**Acrónimos (v3.2-LTS):**
|
| 1444 |
+
- **TAM** (PRS, PST, FUT, IPFV, SBJV, COND, IMP, FUT_SBJV); **PN**; **POS**; **LEMMa/SURFACE**; **RT**; **LTS**; **SOV**; **CV(C)**; **CSV**; **Enclítico**.
|
| 1445 |
+
"""
|
| 1446 |
+
|
| 1447 |
+
DOC_EN_0 = """**Writing & data.**
|
| 1448 |
+
*Neo-Iberian* is designed as a **conlang** that takes the known Iberian corpus (5th–1st c. BCE) as a base, plus a speculative morphology and lexicon built with historical and typological plausibility.
|
| 1449 |
+
"""
|
| 1450 |
+
DOC_EN_1 = """**Possible ancient heritage.**
|
| 1451 |
+
- Roots documented in real Iberian inscriptions: *ban*, *bi*, *irur*, *laur*, *borste*, *śei*, *sisbi*, *sorse* (numerals); *belai* (raven), *ebee* (partridge), etc.
|
| 1452 |
+
- **CV(C)** phonotactics; no phonemic **/p/**; *r/ŕ* disallowed word-initially.
|
| 1453 |
+
- Postpositions/nominal suffixes: **-k** (pl), **-te** (agent), **-ar/-en** (genitive/origin), **-ka** (dat./loc./dist.), **-i** (acc. with PN).
|
| 1454 |
+
- Particles: **ne** 'and', **o** 'or', **eś** 'not'.
|
| 1455 |
+
- Numerals: *ban, bi, irur, laur, borste, śei, sisbi, sorse, lauŕbi, abaŕ (10), oŕkei (20).*
|
| 1456 |
+
"""
|
| 1457 |
+
DOC_EN_2 = """**Conlang design:**
|
| 1458 |
+
- **TAM (v3.2-LTS):** PRS **-ke**, PST **-bo**, FUT **-ta**, IPFV **-ri**, IMP **-tu**, **SBJV -ni**, **COND -ne**.
|
| 1459 |
+
- Derivation: verbs (-ke/-ta/-bo/-ri/-ni/-ne), adjectives (-si), nouns (-ar/-en/-tu/-la/-ŕa/-si).
|
| 1460 |
+
- Preferred order **SOV**.
|
| 1461 |
+
"""
|
| 1462 |
+
DOC_EN_3 = """**Pipeline (summary):**
|
| 1463 |
+
1) Tokenize; split **al→ka do**, **del→ta do**.
|
| 1464 |
+
2) `a` → `ka`/`mi`/`te`.
|
| 1465 |
+
3) Rich CSV gives NI **surface**; else simple CSV → NI **lemma**.
|
| 1466 |
+
4) **POS/TAM gating**: only verbs get TAM; others normalize to lemma/root.
|
| 1467 |
+
5) Negation **eś** before the first finite verb.
|
| 1468 |
+
6) ?/! → enclitics **-na/-ba** on the last finite verb (or last constituent).
|
| 1469 |
+
7) Bare WH adds **-na** and inserts `?`.
|
| 1470 |
+
8) Iberian line: visible punctuation only; word separator is **"/"** (tridots).
|
| 1471 |
+
"""
|
| 1472 |
+
DOC_EN_4 = """**Orthography & keys:**
|
| 1473 |
+
- Keys mode **explicit** (BA/BE/BI/BO/BU).
|
| 1474 |
+
- Word separator = "/".
|
| 1475 |
+
- Shortcuts: `ka`→**K**, `mi`→**MI**, `te`→**TE**, `ne`→**N**, `o`→**O**, `eś`→**X**.
|
| 1476 |
+
"""
|
| 1477 |
+
DOC_EN_5 = """**Modality (-na/-ba):**
|
| 1478 |
+
- **-na** interrogative; **-ba** exclamative, attached to the last finite verb (or last constituent).
|
| 1479 |
+
"""
|
| 1480 |
+
DOC_EN_6 = """**Minimal grammar (NI):**
|
| 1481 |
+
- Verb: root + **TAM**; preverbal negation **eś**.
|
| 1482 |
+
- Productive cases: -k (pl), -te (agent), -ka (dat/loc), -ar/-en (genitive/origin).
|
| 1483 |
+
"""
|
| 1484 |
+
DOC_EN_7 = """**Core references:** Untermann; de Hoz; Ferrer i Jané; Correa; selected Bascoid grammars/corpora."""
|
| 1485 |
+
DOC_EN_8 = """**Acronyms (v3.2-LTS):**
|
| 1486 |
+
- **TAM** (PRS, PST, FUT, IPFV, SBJV, COND, IMP, FUT_SBJV); **PN**; **POS**; **LEMMa/SURFACE**; **RT**; **LTS**; **SOV**; **CV(C)**; **CSV**; **Enclitic**.
|
| 1487 |
+
"""
|
| 1488 |
+
|
| 1489 |
+
# Documentation panels per UI language; list order must stay aligned with
# LABELS[lang]["acc_titles"] (accordion i shows DOC[lang][i]).
DOC={
    "ES":[DOC_ES_0, DOC_ES_1, DOC_ES_2, DOC_ES_3, DOC_ES_4, DOC_ES_5, DOC_ES_6, DOC_ES_7, DOC_ES_8],
    "EN":[DOC_EN_0, DOC_EN_1, DOC_EN_2, DOC_EN_3, DOC_EN_4, DOC_EN_5, DOC_EN_6, DOC_EN_7, DOC_EN_8]
}
|
| 1493 |
+
|
| 1494 |
+
# CSS reproducing the original design (appOld.py).
def build_css():
    """Build the app-wide CSS string.

    Embeds the Iberian TTF font ("Iberia-Georgeos.ttf") as a base64 data
    URI when the file exists next to the app; otherwise falls back to a
    generic local sans-serif so the UI still renders.
    """
    b64=None
    if os.path.exists("Iberia-Georgeos.ttf"):
        with open("Iberia-Georgeos.ttf","rb") as f:
            b64=base64.b64encode(f.read()).decode("ascii")
    font_src = f"url(data:font/ttf;base64,{b64}) format('truetype')" if b64 else "local('sans-serif')"
    # NOTE: doubled braces {{ }} are literal CSS braces inside the f-string.
    return f"""
    @font-face {{
      font-family: 'IberiaGeorgeos';
      src: {font_src};
      font-weight: normal; font-style: normal;
    }}
    :root {{
      --iberian-clay:#8B4513; --iberian-ochre:#CC7722; --iberian-stone:#5C5C5C;
      --iberian-sand:#D2B48C; --iberian-rust:#A0522D; --iberian-bronze:#CD7F32;
    }}
    .gradio-container {{ background:linear-gradient(135deg,#f4e8d8 0%,#e8d5c4 50%,#d4c4b0 100%)!important;
      font-family:'Georgia','Times New Roman',serif!important; }}
    .gradio-container h1,.gradio-container h2,.gradio-container h3 {{
      color:var(--iberian-clay)!important; text-shadow:2px 2px 4px rgba(139,69,19,.15)!important;
      border-bottom:3px solid var(--iberian-bronze)!important; padding-bottom:.5rem!important; letter-spacing:.5px!important;
    }}
    .gradio-container .gr-group {{ background:linear-gradient(to bottom,#f9f6f0,#ede6dc)!important;
      border:2px solid var(--iberian-sand)!important; border-radius:8px!important; box-shadow:0 4px 12px rgba(139,69,19,.2), inset 0 1px 0 rgba(255,255,255,.5)!important;
      padding:1.5rem!important; margin-bottom:1.5rem!important; }}
    .gradio-container .gr-accordion {{ background:linear-gradient(145deg,#ebe3d5,#d9cec0)!important;
      border:2px solid var(--iberian-rust)!important; border-radius:6px!important; margin-bottom:.8rem!important; box-shadow:2px 2px 6px rgba(0,0,0,.15)!important; }}
    .gradio-container .gr-accordion .label-wrap {{ background:linear-gradient(to right,var(--iberian-ochre),var(--iberian-rust))!important;
      color:#fff!important; font-weight:600!important; padding:.8rem 1rem!important; border-radius:4px!important; text-shadow:1px 1px 2px rgba(0,0,0,.3)!important; }}
    .gradio-container .gr-textbox textarea,.gradio-container .gr-textbox input {{ background:linear-gradient(to bottom,#faf8f3,#f5f0e8)!important;
      border:2px solid var(--iberian-sand)!important; border-radius:6px!important; color:var(--iberian-stone)!important;
      font-family:'Georgia',serif!important; box-shadow:inset 2px 2px 4px rgba(139,69,19,.1)!important; }}
    .gradio-container .gr-textbox textarea:focus,.gradio-container .gr-textbox input:focus {{
      border-color:var(--iberian-bronze)!important; box-shadow:inset 2px 2px 4px rgba(139,69,19,.1), 0 0 8px rgba(205,127,50,.3)!important; }}
    .gradio-container .gr-button.gr-button-primary {{ background:linear-gradient(145deg,var(--iberian-bronze),var(--iberian-rust))!important;
      border:2px solid var(--iberian-clay)!important; color:#fff!important; font-weight:bold!important; text-shadow:1px 1px 2px rgba(0,0,0,.4)!important;
      box-shadow:0 4px 8px rgba(139,69,19,.3), inset 0 1px 0 rgba(255,255,255,.2)!important; border-radius:8px!important; padding:.8rem 1.5rem!important; transition:all .3s ease!important; }}
    .gradio-container .gr-button.gr-button-primary:hover {{ background:linear-gradient(145deg,var(--iberian-rust),var(--iberian-bronze))!important;
      transform:translateY(-2px)!important; box-shadow:0 6px 12px rgba(139,69,19,.4)!important; }}
    .ib-line {{ font-family:'IberiaGeorgeos',monospace,sans-serif!important; font-size:1.9rem!important; line-height:2.4rem!important; white-space:pre-wrap!important;
      background:linear-gradient(135deg,#e8dcc8 0%,#d4c4a8 50%,#c4b098 100%)!important; padding:24px!important; border-radius:10px!important;
      border:3px solid var(--iberian-rust)!important; border-left:6px solid var(--iberian-bronze)!important;
      box-shadow:0 4px 15px rgba(139,69,19,.25), inset 0 2px 4px rgba(0,0,0,.1)!important; color:var(--iberian-clay)!important; position:relative!important; }}
    .ib-line::before {{ content:''!important; position:absolute!important; inset:0!important;
      background-image:repeating-linear-gradient(0deg,transparent,transparent 2px, rgba(139,69,19,.03) 2px, rgba(139,69,19,.03) 4px)!important;
      pointer-events:none!important; border-radius:10px!important; }}
    @media (max-width:768px) {{
      .ib-line {{ font-size:1.5rem!important; line-height:2rem!important; padding:16px!important; }}
      .gradio-container .gr-group {{ padding:1rem!important; }}
      .gradio-container h1 {{ font-size:1.8rem!important; }}
    }}
    @media (max-width:480px) {{
      .ib-line {{ font-size:1.3rem!important; line-height:1.8rem!important; padding:12px!important; }}
      .gradio-container h1 {{ font-size:1.5rem!important; }}
    }}
    """
# Built once at import time and passed to gr.Blocks(css=CSS).
CSS = build_css()
|
| 1552 |
+
|
| 1553 |
+
# =========================
|
| 1554 |
+
# INTERFAZ GRADIO
|
| 1555 |
+
# =========================
|
| 1556 |
+
with gr.Blocks(css=CSS, theme=gr.themes.Soft(primary_hue="indigo", secondary_hue="purple")) as demo:
|
| 1557 |
+
with gr.Group():
|
| 1558 |
+
title = gr.Markdown(f"# {LABELS['ES']['title']}")
|
| 1559 |
+
subtitle = gr.Markdown(f"*{LABELS['ES']['subtitle']}*")
|
| 1560 |
+
|
| 1561 |
+
with gr.Row():
|
| 1562 |
+
combo = gr.Dropdown(choices=["ES","EN"], value="ES", label=LABELS["ES"]["combo"])
|
| 1563 |
+
direction = gr.Radio(choices=LABELS["ES"]["dir_opts"], value="ES → NI", label=LABELS["ES"]["dir"])
|
| 1564 |
+
|
| 1565 |
+
with gr.Group():
|
| 1566 |
+
doc_header = gr.Markdown(f"## {LABELS['ES']['doc_header']}")
|
| 1567 |
+
acc_titles = LABELS["ES"]["acc_titles"]
|
| 1568 |
+
with gr.Accordion(acc_titles[0], open=False) as acc1: md1 = gr.Markdown(DOC["ES"][0])
|
| 1569 |
+
with gr.Accordion(acc_titles[1], open=False) as acc2: md2 = gr.Markdown(DOC["ES"][1])
|
| 1570 |
+
with gr.Accordion(acc_titles[2], open=False) as acc3: md3 = gr.Markdown(DOC["ES"][2])
|
| 1571 |
+
with gr.Accordion(acc_titles[3], open=False) as acc4: md4 = gr.Markdown(DOC["ES"][3])
|
| 1572 |
+
with gr.Accordion(acc_titles[4], open=False) as acc5: md5 = gr.Markdown(DOC["ES"][4])
|
| 1573 |
+
with gr.Accordion(acc_titles[5], open=False) as acc6: md6 = gr.Markdown(DOC["ES"][5])
|
| 1574 |
+
with gr.Accordion(acc_titles[6], open=False) as acc7: md7 = gr.Markdown(DOC["ES"][6])
|
| 1575 |
+
with gr.Accordion(acc_titles[7], open=False) as acc8: md8 = gr.Markdown(DOC["ES"][7])
|
| 1576 |
+
with gr.Accordion(acc_titles[8], open=False) as acc9: md9 = gr.Markdown(DOC["ES"][8])
|
| 1577 |
+
|
| 1578 |
+
with gr.Group():
|
| 1579 |
+
es_in = gr.Textbox(label=LABELS["ES"]["in_label_es"], placeholder=LABELS["ES"]["in_ph_es"], lines=5)
|
| 1580 |
+
btn_tr = gr.Button(LABELS["ES"]["btn"], variant="primary")
|
| 1581 |
+
with gr.Row():
|
| 1582 |
+
with gr.Column(scale=2):
|
| 1583 |
+
ni_out = gr.Textbox(label=LABELS["ES"]["out_lat_esni"], lines=5, interactive=False)
|
| 1584 |
+
loc_btn = gr.Button("🔊 Locutar", variant="secondary", visible=False)
|
| 1585 |
+
audio_out = gr.Audio(label=LABELS["ES"]["out_audio"], type="numpy")
|
| 1586 |
+
with gr.Column(scale=1):
|
| 1587 |
+
ib_out = gr.HTML(label=LABELS["ES"]["out_ib"])
|
| 1588 |
+
|
| 1589 |
+
def do_translate(text, dir_label):
|
| 1590 |
+
if not text or not text.strip():
|
| 1591 |
+
return (gr.update(value=""),
|
| 1592 |
+
gr.update(value="<div class='ib-line'></div>"),
|
| 1593 |
+
gr.update(visible=False),
|
| 1594 |
+
gr.update(value=None))
|
| 1595 |
+
if dir_label.startswith("ES"):
|
| 1596 |
+
latin, ib = translate(text)
|
| 1597 |
+
ib_html = "<div class='ib-line'>" + escape(ib) + "</div>"
|
| 1598 |
+
return (gr.update(label=LABELS["ES"]["out_lat_esni"], value=latin),
|
| 1599 |
+
gr.update(value=ib_html),
|
| 1600 |
+
gr.update(visible=True),
|
| 1601 |
+
gr.update(value=None))
|
| 1602 |
+
else:
|
| 1603 |
+
es_text = translate_ni_to_es(text)
|
| 1604 |
+
return (gr.update(label=LABELS["ES"]["out_lat_nies"], value=es_text),
|
| 1605 |
+
gr.update(value="<div class='ib-line'></div>"),
|
| 1606 |
+
gr.update(visible=False),
|
| 1607 |
+
gr.update(value=None))
|
| 1608 |
+
|
| 1609 |
+
btn_tr.click(do_translate, [es_in, direction], [ni_out, ib_out, loc_btn, audio_out])
|
| 1610 |
+
|
| 1611 |
+
def run_locution(latin_text, dir_label):
|
| 1612 |
+
if dir_label.startswith("ES"):
|
| 1613 |
+
return synthesize_speech(latin_text)
|
| 1614 |
+
return None
|
| 1615 |
+
|
| 1616 |
+
loc_btn.click(run_locution, [ni_out, direction], audio_out)
|
| 1617 |
+
|
| 1618 |
+
def switch_lang(sel_lang, dir_label):
|
| 1619 |
+
L=LABELS[sel_lang]; T=L["acc_titles"]; D=DOC[sel_lang]
|
| 1620 |
+
# Input/Output labels dependen de la dirección
|
| 1621 |
+
in_label = L["in_label_es"] if dir_label.startswith("ES") else L["in_label_ni"]
|
| 1622 |
+
in_ph = L["in_ph_es"] if dir_label.startswith("ES") else L["in_ph_ni"]
|
| 1623 |
+
out_lab = L["out_lat_esni"] if dir_label.startswith("ES") else L["out_lat_nies"]
|
| 1624 |
+
return (
|
| 1625 |
+
gr.update(value=f"# {L['title']}"),
|
| 1626 |
+
gr.update(value=f"*{L['subtitle']}*"),
|
| 1627 |
+
gr.update(label=L["combo"], value=sel_lang),
|
| 1628 |
+
gr.update(label=L["dir"], choices=L["dir_opts"], value=dir_label),
|
| 1629 |
+
gr.update(value=f"## {L['doc_header']}"),
|
| 1630 |
+
gr.update(label=T[0]), gr.update(value=D[0]),
|
| 1631 |
+
gr.update(label=T[1]), gr.update(value=D[1]),
|
| 1632 |
+
gr.update(label=T[2]), gr.update(value=D[2]),
|
| 1633 |
+
gr.update(label=T[3]), gr.update(value=D[3]),
|
| 1634 |
+
gr.update(label=T[4]), gr.update(value=D[4]),
|
| 1635 |
+
gr.update(label=T[5]), gr.update(value=D[5]),
|
| 1636 |
+
gr.update(label=T[6]), gr.update(value=D[6]),
|
| 1637 |
+
gr.update(label=T[7]), gr.update(value=D[7]),
|
| 1638 |
+
gr.update(label=T[8]), gr.update(value=D[8]),
|
| 1639 |
+
gr.update(label=in_label, placeholder=in_ph),
|
| 1640 |
+
gr.update(label=out_lab),
|
| 1641 |
+
gr.update(label=L["out_ib"]),
|
| 1642 |
+
gr.update(label=L["out_audio"]),
|
| 1643 |
+
gr.update(value=L["btn"])
|
| 1644 |
+
)
|
| 1645 |
+
|
| 1646 |
+
combo.change(
|
| 1647 |
+
switch_lang,
|
| 1648 |
+
[combo, direction],
|
| 1649 |
+
[title, subtitle, combo, direction, doc_header,
|
| 1650 |
+
acc1, md1, acc2, md2, acc3, md3, acc4, md4, acc5, md5, acc6, md6, acc7, md7, acc8, md8, acc9, md9,
|
| 1651 |
+
es_in, ni_out, ib_out, audio_out, btn_tr]
|
| 1652 |
+
)
|
| 1653 |
+
|
| 1654 |
+
def switch_direction(dir_label, sel_lang):
|
| 1655 |
+
# Solo cambia etiquetas y visibilidad de Locutar/Línea ibérica
|
| 1656 |
+
L=LABELS[sel_lang]
|
| 1657 |
+
in_label = L["in_label_es"] if dir_label.startswith("ES") else L["in_label_ni"]
|
| 1658 |
+
in_ph = L["in_ph_es"] if dir_label.startswith("ES") else L["in_ph_ni"]
|
| 1659 |
+
out_lab = L["out_lat_esni"] if dir_label.startswith("ES") else L["out_lat_nies"]
|
| 1660 |
+
# Locución solo para ES → NI
|
| 1661 |
+
loc_vis = True if dir_label.startswith("ES") else False
|
| 1662 |
+
# Línea ibérica visible solo para ES → NI (tras traducir)
|
| 1663 |
+
return (gr.update(label=in_label, placeholder=in_ph),
|
| 1664 |
+
gr.update(label=out_lab, value=""),
|
| 1665 |
+
gr.update(value="<div class='ib-line'></div>"),
|
| 1666 |
+
gr.update(visible=loc_vis),
|
| 1667 |
+
gr.update(value=None))
|
| 1668 |
+
|
| 1669 |
+
direction.change(
|
| 1670 |
+
switch_direction,
|
| 1671 |
+
[direction, combo],
|
| 1672 |
+
[es_in, ni_out, ib_out, loc_btn, audio_out]
|
| 1673 |
+
)
|
| 1674 |
+
|
| 1675 |
+
if __name__ == "__main__":
|
| 1676 |
+
demo.queue().launch()
|
requirements.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
--extra-index-url https://download.pytorch.org/whl/cpu
|
| 2 |
+
torch==2.3.1
|
| 3 |
+
transformers
|
| 4 |
+
soundfile
|
| 5 |
+
gradio
|
| 6 |
+
sentencepiece
|