import os
import re
import csv
import gradio as gr
import sentencepiece as spm
MODEL_FILE = "ast_spm.model"
# --- TSV files with ALLA normalization rules ---
TSV_FILES = [
"alla_replace_secure.tsv",
]
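
# The loader below expects one rule per line with at least two tab-separated
# columns: the form to replace, then its ALLA-normalized replacement. Blank
# lines and lines starting with '#' are skipped. A hypothetical excerpt
# (illustrative only, not taken from the real file):
#
#   # comment lines are ignored
#   tambien<TAB>tamién
#   pa que<TAB>pa qué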
# =========================
# 1. LOAD THE SENTENCEPIECE MODEL
# =========================
tokenizer = None
init_error = None
try:
if not os.path.exists(MODEL_FILE):
init_error = f"Nun s'alcuentra el ficheru de modelu: {MODEL_FILE}"
else:
tokenizer = spm.SentencePieceProcessor(model_file=MODEL_FILE)
except Exception as e:
init_error = f"Fallu cargando el modelu: {e}"
# =========================
# 2. SIMPLE NORMALIZER
# =========================
def load_tsv_rules(file_path):
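    """Read (orig, repl) rule pairs from a tab-separated file.

    Blank lines and lines starting with '#' are skipped; any columns
    beyond the first two are ignored.
    """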
rules = []
if os.path.exists(file_path):
with open(file_path, "r", encoding="utf-8") as f:
for line in f:
if line.strip() and not line.startswith("#"):
parts = line.strip().split("\t")
if len(parts) >= 2:
rules.append((parts[0], parts[1]))
return rules
def load_all_rules():
base_dir = os.path.dirname(__file__)
all_rules = []
for file_name in TSV_FILES:
path = os.path.join(base_dir, file_name)
all_rules += load_tsv_rules(path)
print(f"✅ {len(all_rules)} reglas cargadas desde {len(TSV_FILES)} ficheros TSV.")
return all_rules
# --- Load every rule set at import time ---
ALL_RULES = load_all_rules()
def apply_rules(text: str, rules):
"""
Aplica reemplazos:
- si la regla ye una sola palabra (ensin espacios), úsase \\b...\\b
- si la regla tien espacios, fácese un replace direutu
"""
for orig, repl in rules:
if " " in orig:
            # Phrases: direct substring replacement
text = text.replace(orig, repl)
else:
            # Single words: word-boundary match
pattern = rf"\b{re.escape(orig)}\b"
text = re.sub(pattern, repl, text)
return text
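
# Quick illustration of the two replacement modes (hypothetical rules, not
# taken from the TSV files):
#
#   apply_rules("tambien y tambienes", [("tambien", "tamién")])
#   -> "tamién y tambienes"   # \b...\b leaves the longer word untouched
#   apply_rules("pa que veas", [("pa que", "pa qué")])
#   -> "pa qué veas"          # phrase rule: plain substring replace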
def basic_cleanup(text: str) -> str:
"""Llimpieza básica: espacios dobles, etc."""
text = re.sub(r"\s+", " ", text)
return text.strip()
def normalize_text(text: str) -> str:
normalized = basic_cleanup(text)
if ALL_RULES:
normalized = apply_rules(normalized, ALL_RULES)
normalized = basic_cleanup(normalized)
return normalized
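
# End-to-end sketch of the normalizer. The output assumes the hypothetical
# rule ("tambien", "tamién"); real results depend on the loaded TSV rules:
#
#   normalize_text("  Yo  tambien   voi ")
#   -> "Yo tamién voi"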
# =========================
# 3. DEMO FUNCTION
# =========================
def process_text(text: str):
"""
- Normaliza'l testu (ALLA base).
- Tokeniza col modelu SentencePiece.
- Devuelve: testu normalizáu, tokens, IDs.
"""
if init_error is not None:
return f"[ERROR DE INICIALIZACIÓN]\n{init_error}", "", ""
if not text or not text.strip():
return "", "", ""
normalized = normalize_text(text)
pieces = tokenizer.encode(normalized, out_type=str)
ids = tokenizer.encode(normalized, out_type=int)
tokens_str = " ".join(pieces)
ids_str = " ".join(str(i) for i in ids)
return normalized, tokens_str, ids_str
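
# Shape of the tuple handed to Gradio. The pieces and IDs shown here are
# purely illustrative; actual values depend on the trained ast_spm.model:
#
#   process_text("Yo tambien voi")
#   -> ("Yo tamién voi", "▁Yo ▁tamién ▁voi", "41 587 203")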
demo = gr.Interface(
fn=process_text,
inputs=gr.Textbox(
lines=3,
label="Testu n'asturianu (entrada)",
placeholder="Escribi equí un testu n'asturianu..."
),
outputs=[
gr.Textbox(label="Testu normalizáu (ALLA – versión base)", lines=3),
gr.Textbox(label="Tokens (subunidaes)", lines=4),
gr.Textbox(label="IDs (índices nel vocabulariu)", lines=4),
],
title="Normalizador + tokenizador asturianu (SentencePiece)",
description=(
"Primer versión d'una cadena de procesamientu pa la llingua asturiana: "
"normalización básica (ALLA) + tokenización con SentencePiece. "
"Si se xubieren ficheros TSV ('alla_toponimia_asturiana_basica.tsv', "
"'alla_replace_secure.tsv', etc.), aplicaránse les sos regles "
"antes de tokenizar."
),
)
if __name__ == "__main__":
demo.launch()