Spaces:
Sleeping
Sleeping
import csv
import os
import re

import gradio as gr
import sentencepiece as spm
# SentencePiece model file loaded at startup by the tokenizer section below.
MODEL_FILE = "ast_spm.model"

# --- TSV files with ALLA normalization rules (one "orig<TAB>repl" pair per line) ---
TSV_FILES = [
    "alla_replace_secure.tsv",
]
# =========================
# 1. LOAD THE SP MODEL
# =========================
# On any failure we keep running and surface the error through the UI
# instead of crashing the Space at import time.
tokenizer = None
init_error = None
try:
    if os.path.exists(MODEL_FILE):
        tokenizer = spm.SentencePieceProcessor(model_file=MODEL_FILE)
    else:
        init_error = f"Nun s'alcuentra el ficheru de modelu: {MODEL_FILE}"
except Exception as e:
    init_error = f"Fallu cargando el modelu: {e}"
# =========================
# 2. SIMPLE NORMALIZER
# =========================
def load_tsv_rules(file_path):
    """Read (original, replacement) rule pairs from one TSV file.

    Lines that are blank, start with '#', or have fewer than two
    tab-separated fields are skipped. Returns [] when the file is absent.
    """
    if not os.path.exists(file_path):
        return []
    rules = []
    with open(file_path, "r", encoding="utf-8") as handle:
        for raw in handle:
            stripped = raw.strip()
            if not stripped or raw.startswith("#"):
                continue
            fields = stripped.split("\t")
            if len(fields) >= 2:
                rules.append((fields[0], fields[1]))
    return rules
def load_all_rules():
    """Collect normalization rules from every file listed in TSV_FILES.

    Paths are resolved relative to this module so the Space works no
    matter what the current working directory is.
    """
    base_dir = os.path.dirname(__file__)
    all_rules = []
    for file_name in TSV_FILES:
        all_rules.extend(load_tsv_rules(os.path.join(base_dir, file_name)))
    print(f"✅ {len(all_rules)} reglas cargadas desde {len(TSV_FILES)} ficheros TSV.")
    return all_rules


# --- Final load of the whole rule set ---
ALL_RULES = load_all_rules()
def apply_rules(text: str, rules):
    """Apply (orig, repl) replacement rules to *text*.

    - If a rule's source contains a space (a phrase), a plain substring
      replace is used.
    - Single-word sources are matched with \\b word boundaries so that
      substrings of longer words are left untouched.

    The replacement is handed to re.sub as a callable so that any
    backslashes in *repl* are inserted literally; passing it as a string
    would make re.sub interpret it as a template (raising re.error on
    sequences like "\\b", or substituting groups for "\\1").
    """
    for orig, repl in rules:
        if " " in orig:
            # Phrases: direct substring replacement.
            text = text.replace(orig, repl)
        else:
            # Single words: respect word boundaries.
            pattern = rf"\b{re.escape(orig)}\b"
            text = re.sub(pattern, lambda _m, _r=repl: _r, text)
    return text
def basic_cleanup(text: str) -> str:
    """Collapse whitespace runs into single spaces and trim both ends."""
    return re.sub(r"\s+", " ", text).strip()
def normalize_text(text: str) -> str:
    """Run basic cleanup, then the ALLA rule set (when any rules loaded)."""
    cleaned = basic_cleanup(text)
    if not ALL_RULES:
        return cleaned
    # Re-run cleanup because a replacement can introduce double spaces.
    return basic_cleanup(apply_rules(cleaned, ALL_RULES))
# =========================
# 3. DEMO FUNCTION
# =========================
def process_text(text: str):
    """Normalize the input, then tokenize it with SentencePiece.

    Returns a 3-tuple of strings: (normalized text, space-joined token
    pieces, space-joined token ids). Empty input yields three empty
    strings; a failed model init yields an error banner instead.
    """
    if init_error is not None:
        return f"[ERROR DE INICIALIZACIÓN]\n{init_error}", "", ""
    if not text or not text.strip():
        return "", "", ""
    normalized = normalize_text(text)
    pieces = tokenizer.encode(normalized, out_type=str)
    ids = tokenizer.encode(normalized, out_type=int)
    return normalized, " ".join(pieces), " ".join(str(i) for i in ids)
# Gradio UI: one input textbox, three output textboxes (normalized text,
# token pieces, token ids).
demo = gr.Interface(
    fn=process_text,
    inputs=gr.Textbox(
        lines=3,
        label="Testu n'asturianu (entrada)",
        placeholder="Escribi equí un testu n'asturianu...",
    ),
    outputs=[
        gr.Textbox(label="Testu normalizáu (ALLA – versión base)", lines=3),
        gr.Textbox(label="Tokens (subunidaes)", lines=4),
        gr.Textbox(label="IDs (índices nel vocabulariu)", lines=4),
    ],
    title="Normalizador + tokenizador asturianu (SentencePiece)",
    description=(
        "Primer versión d'una cadena de procesamientu pa la llingua asturiana: "
        "normalización básica (ALLA) + tokenización con SentencePiece. "
        "Si se xubieren ficheros TSV ('alla_toponimia_asturiana_basica.tsv', "
        "'alla_replace_secure.tsv', etc.), aplicaránse les sos regles "
        "antes de tokenizar."
    ),
)

if __name__ == "__main__":
    demo.launch()