Spaces:
Sleeping
Sleeping
File size: 3,754 Bytes
e179439 7291080 e179439 70bb33e e179439 c5de144 e179439 c5de144 e179439 c5de144 e179439 c5de144 e179439 c5de144 e179439 c5de144 e179439 c5de144 e179439 4c34e0b e179439 7566c59 66e8097 7566c59 080e470 7566c59 4b17963 7566c59 166238d 9cafd01 827b9b5 e73a1af 9cafd01 e179439 7291080 c5de144 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, MarianMTModel, MarianTokenizer
import shutil, os
# Free disk space at startup by wiping the Hugging Face / torch model caches;
# ignore_errors covers a fresh environment where the directories don't exist yet.
# NOTE(review): this forces every model to be re-downloaded on each restart —
# presumably a Spaces disk-quota workaround; confirm this is intentional.
shutil.rmtree(os.path.expanduser("~/.cache/huggingface"), ignore_errors=True)
shutil.rmtree(os.path.expanduser("~/.cache/torch"), ignore_errors=True)
# Run on GPU when available, otherwise fall back to CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Dropdown labels shown in the UI; polish() dispatches on the label prefix.
MODEL_OPTIONS = [
"Helsinki-NLP (Tira ondo)", # Round-trip OPUS-MT eu→es→eu (models below are eu-es / es-eu, not en)
"FLAN-T5-base (Google gaizki xamar)"
]
# Lazy model cache: keys "flan" and "eus", populated on first use by the loaders below.
CACHE = {}
# --- FLAN loader (Google-style Euskera correction) ---
def load_flan():
    """Return ``(model, tokenizer)`` for FLAN-T5-base, loading it on first call.

    The pair is memoized in the module-level ``CACHE`` under the key
    ``"flan"`` so subsequent calls are free.
    """
    cached = CACHE.get("flan")
    if cached is None:
        tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")
        model = AutoModelForSeq2SeqLM.from_pretrained(
            "google/flan-t5-base",
            low_cpu_mem_usage=True,
            torch_dtype="auto",
        ).to(DEVICE)
        cached = (model, tokenizer)
        CACHE["flan"] = cached
    return cached
def run_flan(sentence: str) -> str:
    """Correct *sentence* with FLAN-T5 using a Basque instruction prompt.

    Returns the decoded, whitespace-stripped model output.
    """
    model, tokenizer = load_flan()
    encoded = tokenizer(
        f"Euskara zuzen gramatikalki eta idatzi modu naturalean: {sentence}",
        return_tensors="pt",
    ).to(DEVICE)
    # Inference only — no autograd bookkeeping needed.
    with torch.no_grad():
        generated = model.generate(**encoded, max_new_tokens=96, num_beams=4)
    return tokenizer.decode(generated[0], skip_special_tokens=True).strip()
# --- Euskera round-trip loader ---
def load_euskera():
    """Return ``(eu→es model, tokenizer, es→eu model, tokenizer)``, loading once.

    Both OPUS-MT translation models are memoized in ``CACHE`` under ``"eus"``.
    """
    if "eus" not in CACHE:
        loaded = []
        for repo in ("Helsinki-NLP/opus-mt-eu-es", "Helsinki-NLP/opus-mt-es-eu"):
            model = AutoModelForSeq2SeqLM.from_pretrained(repo).to(DEVICE)
            tokenizer = AutoTokenizer.from_pretrained(repo)
            loaded.extend((model, tokenizer))
        CACHE["eus"] = tuple(loaded)
    return CACHE["eus"]
def run_roundtrip(sentence: str) -> str:
    """Polish *sentence* by round-tripping Euskera → Spanish → Euskera.

    Translating through the eu→es and es→eu OPUS-MT models tends to
    normalize grammar as a side effect of translation.

    Returns the decoded, whitespace-stripped Basque output.
    """
    mdl_eu_es, tok_eu_es, mdl_es_eu, tok_es_eu = load_euskera()
    # Fix: wrap generation in no_grad (matching run_flan) — pure inference
    # should not build an autograd graph; it wastes memory/CPU on the free tier.
    with torch.no_grad():
        # Euskera → Spanish
        enc = tok_eu_es(sentence, return_tensors="pt").to(DEVICE)
        es_ids = mdl_eu_es.generate(**enc, max_length=128, num_beams=4)
        spanish = tok_eu_es.decode(es_ids[0], skip_special_tokens=True)
        # Spanish → Euskera
        enc2 = tok_es_eu(spanish, return_tensors="pt").to(DEVICE)
        eu_ids = mdl_es_eu.generate(**enc2, max_length=128, num_beams=4)
        euskera = tok_es_eu.decode(eu_ids[0], skip_special_tokens=True)
    return euskera.strip()
# --- Dispatcher ---
def polish(sentence: str, choice: str) -> str:
    """Dispatch *sentence* to the corrector selected by the dropdown *choice*.

    Blank/whitespace-only input yields an empty string; an unrecognized
    choice yields a fixed error message.
    """
    if not sentence.strip():
        return ""
    # Match on the label prefix so cosmetic label edits keep working.
    if choice.startswith("Helsinki"):
        return run_roundtrip(sentence)
    if choice.startswith("FLAN"):
        return run_flan(sentence)
    return "Unknown option."
# --- Gradio UI ---
# --- Gradio UI ---
# Builds the page: banner image, heading, disclaimer, input box, model
# selector, action button, and output box wired to polish().
with gr.Blocks(title="HizkuntzLagun: AI Euskera Zuzendu (CPU enabled)") as demo:
    # Static banner image at the top of the page.
    gr.Image(
        value="banner.png",
        show_label=False,
        elem_id="banner",
        height=200
    )
    # Fix: heading said "Zuzedu" — typo for "Zuzendu" (as in the window
    # title above and the button label below).
    gr.Markdown("### HizkuntzLagun: AI Euskera Zuzendu\n")
    gr.Markdown(
        """
> ⚡ **Oharra:**
> Tresna honek doako, CPU‑lagunko AI ereduak erabiltzen ditu.
> Azkarra eta eskuragarria izateko diseinatuta dago — ez beti perfektua.
> Zuzenketa azkarrak bai, ez analisi gramatikal sakonak.
> Edozein unetan erabil dezakezu — eguneroko zuzenketa txiki batek saihesten du esaldi traketsen lotsa.
""")
    inp = gr.Textbox(lines=3, label="Idatzi Euskeraz esaldi bat, adibidez Gaur Koldo ikusi nuen.", placeholder="Idatzi Euskeraz esaldi bat...")
    choice = gr.Dropdown(choices=MODEL_OPTIONS, value="Helsinki-NLP (Tira ondo)", label="Metodoa")
    btn = gr.Button("Euskera zuzendu")
    out = gr.Textbox(label="Erantzuna")
    # Button click runs the dispatcher with the text + selected method.
    btn.click(polish, inputs=[inp, choice], outputs=out)
# Standard script entry guard: launch the app only when run directly.
if __name__ == "__main__":
    demo.launch()
|