# File size: 1,662 Bytes (revision 3801f51)
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
# Load model and tokenizer once at import time; normalize_text() reuses
# these module-level instances on every call.
MODEL_NAME = "comma-project/normalization-byt5-small"  # identifier passed to from_pretrained
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
def normalize_text(text: str) -> str:
    """Normalize ``text`` with the seq2seq model loaded at module level.

    The input is tokenized (truncated to at most 1024 tokens), passed to
    ``model.generate`` with a 2-beam search capped at length 1024, and the
    first returned sequence is decoded with special tokens stripped.

    Args:
        text: Raw text to normalize. ``None`` is tolerated because a Gradio
            ``Textbox`` may hand the callback ``None`` instead of ``""``.

    Returns:
        The normalized text, or ``""`` when the input is ``None``, empty, or
        whitespace-only (the model is not invoked in that case).
    """
    # Guard first: there is nothing to normalize, and ``None.strip()``
    # would raise AttributeError.
    if text is None or not text.strip():
        return ""

    # Tokenize
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,  # inputs longer than max_length are silently cut
        padding=True,
        max_length=1024,
    )

    # Generate; this is inference only, so gradient tracking is disabled.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=1024,
            num_beams=2,
            early_stopping=True,
        )

    # Decode the first (and only requested) output sequence.
    normalized = tokenizer.decode(
        outputs[0],
        skip_special_tokens=True,
    )
    return normalized
# Gradio interface
demo = gr.Interface(
    fn=normalize_text,
    inputs=gr.Textbox(
        label="Input Text",
        placeholder="Enter text to normalize...",
        lines=4,
    ),
    outputs=gr.Textbox(
        label="Normalized Text",
        lines=4,
    ),
    title="Text Normalization with ByT5",
    description="Normalize noisy or non-standard text using the ByT5 model.",
    theme="soft",
    # One row per example, one value per row: the interface has a single
    # input component, so each sample text needs its own row.
    examples=[
        ["Scͥbo uobiᷤᷤ ñ pauli ł donati."],
        ["""⁊ pitie mlt' lelasce
P ities li dist. uai a ton peire
Nelaissier. """],
        ["""Uer̃ ab his qͥ ita dissert̃
q̃ri debet. qͥd ꝑ amorem dei. quidq ꝑ amorẽ
boni tẽꝑalis ueluit intellig̾e."""],
    ],
)
if __name__ == "__main__":
    # Start the Gradio server only when this file is executed as a script.
    demo.launch()
|