Thibault Clérice
Demo online !
3801f51
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
# Load model and tokenizer
MODEL_NAME = "comma-project/normalization-byt5-small"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
def normalize_text(text: str) -> str:
"""
Normalize input text using ByT5.
"""
if not text.strip():
return ""
# Tokenize
inputs = tokenizer(
text,
return_tensors="pt",
truncation=True,
padding=True,
max_length=1024,
)
# Generate
with torch.no_grad():
outputs = model.generate(
**inputs,
max_length=1024,
num_beams=2,
early_stopping=True,
)
# Decode
normalized = tokenizer.decode(
outputs[0],
skip_special_tokens=True,
)
return normalized
# Gradio interface
demo = gr.Interface(
fn=normalize_text,
inputs=gr.Textbox(
label="Input Text",
placeholder="Enter text to normalize...",
lines=4,
),
outputs=gr.Textbox(
label="Normalized Text",
lines=4,
),
title="Text Normalization with ByT5",
description="Normalize noisy or non-standard text using the ByT5 model.",
theme="soft",
examples=[
["Scͥbo uobiᷤᷤ ñ pauli ł donati."],
["""⁊ pitie mlt' lelasce
P ities li dist. uai a ton peire
Nelaissier. """, """Uer̃ ab his qͥ ita dissert̃
q̃ri debet. qͥd ꝑ amorem dei. quidq ꝑ amorẽ
boni tẽꝑalis ueluit intellig̾e."""]
],
)
if __name__ == "__main__":
demo.launch()