import os
import re
import csv
import gradio as gr
import sentencepiece as spm
MODEL_FILE = "ast_spm.model"
# --- TSV files with ALLA normalization rules ---
TSV_FILES = [
"alla_replace_secure.tsv",
]
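
# The loader below expects one rule per line with at least two tab-separated
# columns: the form to replace, then its ALLA-normalized replacement. Blank
# lines and lines starting with '#' are skipped. A hypothetical excerpt
# (illustrative only, not taken from the real file):
#
#   # comment lines are ignored
#   tambien<TAB>tamién
#   pa que<TAB>pa qué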
# =========================
# 1. LOAD THE SENTENCEPIECE MODEL
# =========================
tokenizer = None
init_error = None
try:
if not os.path.exists(MODEL_FILE):
init_error = f"Nun s'alcuentra el ficheru de modelu: {MODEL_FILE}"
else:
tokenizer = spm.SentencePieceProcessor(model_file=MODEL_FILE)
except Exception as e:
init_error = f"Fallu cargando el modelu: {e}"
# =========================
# 2. SIMPLE NORMALIZER
# =========================
def load_tsv_rules(file_path):
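    """Read (orig, repl) rule pairs from a tab-separated file.

    Blank lines and lines starting with '#' are skipped; any columns
    beyond the first two are ignored.
    """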
rules = []
if os.path.exists(file_path):
with open(file_path, "r", encoding="utf-8") as f:
for line in f:
if line.strip() and not line.startswith("#"):
parts = line.strip().split("\t")
if len(parts) >= 2:
rules.append((parts[0], parts[1]))
return rules
def load_all_rules():
base_dir = os.path.dirname(__file__)
all_rules = []
for file_name in TSV_FILES:
path = os.path.join(base_dir, file_name)
all_rules += load_tsv_rules(path)
print(f"✅ {len(all_rules)} reglas cargadas desde {len(TSV_FILES)} ficheros TSV.")
return all_rules
# --- Load every rule set at import time ---
ALL_RULES = load_all_rules()
def apply_rules(text: str, rules):
"""
Aplica reemplazos:
- si la regla ye una sola palabra (ensin espacios), úsase \\b...\\b
- si la regla tien espacios, fácese un replace direutu
"""
for orig, repl in rules:
if " " in orig:
            # Phrases: direct substring replacement
text = text.replace(orig, repl)
else:
            # Single words: word-boundary match
pattern = rf"\b{re.escape(orig)}\b"
text = re.sub(pattern, repl, text)
return text
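
# Quick illustration of the two replacement modes (hypothetical rules, not
# taken from the TSV files):
#
#   apply_rules("tambien y tambienes", [("tambien", "tamién")])
#   -> "tamién y tambienes"   # \b...\b leaves the longer word untouched
#   apply_rules("pa que veas", [("pa que", "pa qué")])
#   -> "pa qué veas"          # phrase rule: plain substring replace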
def basic_cleanup(text: str) -> str:
"""Llimpieza básica: espacios dobles, etc."""
text = re.sub(r"\s+", " ", text)
return text.strip()
def normalize_text(text: str) -> str:
normalized = basic_cleanup(text)
if ALL_RULES:
normalized = apply_rules(normalized, ALL_RULES)
normalized = basic_cleanup(normalized)
return normalized
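
# End-to-end sketch of the normalizer. The output assumes the hypothetical
# rule ("tambien", "tamién"); real results depend on the loaded TSV rules:
#
#   normalize_text("  Yo  tambien   voi ")
#   -> "Yo tamién voi"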
# =========================
# 3. DEMO FUNCTION
# =========================
def process_text(text: str):
"""
- Normaliza'l testu (ALLA base).
- Tokeniza col modelu SentencePiece.
- Devuelve: testu normalizáu, tokens, IDs.
"""
if init_error is not None:
return f"[ERROR DE INICIALIZACIÓN]\n{init_error}", "", ""
if not text or not text.strip():
return "", "", ""
normalized = normalize_text(text)
pieces = tokenizer.encode(normalized, out_type=str)
ids = tokenizer.encode(normalized, out_type=int)
tokens_str = " ".join(pieces)
ids_str = " ".join(str(i) for i in ids)
return normalized, tokens_str, ids_str
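
# Shape of the tuple handed to Gradio. The pieces and IDs shown here are
# purely illustrative; actual values depend on the trained ast_spm.model:
#
#   process_text("Yo tambien voi")
#   -> ("Yo tamién voi", "▁Yo ▁tamién ▁voi", "41 587 203")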
demo = gr.Interface(
fn=process_text,
inputs=gr.Textbox(
lines=3,
label="Testu n'asturianu (entrada)",
placeholder="Escribi equí un testu n'asturianu..."
),
outputs=[
gr.Textbox(label="Testu normalizáu (ALLA – versión base)", lines=3),
gr.Textbox(label="Tokens (subunidaes)", lines=4),
gr.Textbox(label="IDs (índices nel vocabulariu)", lines=4),
],
title="Normalizador + tokenizador asturianu (SentencePiece)",
description=(
"Primer versión d'una cadena de procesamientu pa la llingua asturiana: "
"normalización básica (ALLA) + tokenización con SentencePiece. "
"Si se xubieren ficheros TSV ('alla_toponimia_asturiana_basica.tsv', "
"'alla_replace_secure.tsv', etc.), aplicaránse les sos regles "
"antes de tokenizar."
),
)
if __name__ == "__main__":
demo.launch()