# Piyazon — "update example" (commit 5324141)
import gradio as gr
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
# --- Configuration ---
# Hugging Face Hub id of the token-classification checkpoint.
MODEL_ID = "piyazon/Uyghur_ASR_Restore_Punctuation"
# Maps model class ids to the punctuation mark appended after a word.
# The string "0" is the sentinel for "no punctuation".
label_map = {
    0: "0",   # No punctuation
    1: ".",   # Period
    2: "،",   # Arabic comma (،)
    3: "؟",   # Arabic question mark (؟)
    4: "-",   # Hyphen
    5: ":",   # Colon
    6: "؛"    # Arabic semicolon (؛)
}
# --- Load Model ---
# Download (or load from cache) the tokenizer and the fine-tuned
# token-classification model. Any failure is reported and re-raised so the
# app does not start with a half-initialized model.
print(f"Loading model from {MODEL_ID}...")
try:
    # fix_mistral_regex opts into the corrected pre-tokenization regex
    # — NOTE(review): presumably required by this checkpoint's tokenizer;
    # confirm against the model card.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, fix_mistral_regex=True)
    model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading model: {e}")
    raise  # bare raise re-raises in place without adding an extra frame
def restore_punctuation(text):
    """Re-insert punctuation marks into raw (unpunctuated) Uyghur text.

    Runs the token-classification model over ``text`` and appends the
    predicted punctuation mark (see module-level ``label_map``) after each
    word.

    Args:
        text: Unpunctuated Uyghur text, e.g. raw ASR output.

    Returns:
        str: The input text with predicted punctuation inserted between
        words; words are separated by single spaces.
    """
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    predictions = torch.argmax(logits, dim=2)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    # Hoisted: set gives O(1) membership tests vs. a list scan per token.
    special_tokens = set(tokenizer.all_special_tokens)

    parts = []           # finished words, joined once at the end (avoids quadratic +=)
    current_word = ""
    current_label = "0"  # "0" == no punctuation after the current word

    for i, token in enumerate(tokens):
        if token in special_tokens:
            continue
        # SentencePiece marks the start of a word with U+2581 ("▁").
        if token.startswith("\u2581"):
            # Flush the previous word together with its predicted punctuation.
            if current_word:
                if current_label != "0":
                    current_word += current_label
                parts.append(current_word)
            # Start the new word (strip the word-boundary marker) and seed its
            # label from this token's own prediction.
            current_word = token.replace("\u2581", "")
            current_label = label_map.get(predictions[i], "0")
        else:
            # Continuation sub-token: merge it into the word. The label of the
            # last sub-token that predicts punctuation wins.
            current_word += token
            pred = label_map.get(predictions[i], "0")
            if pred != "0":
                current_label = pred

    # Flush the final word.
    if current_word:
        if current_label != "0":
            current_word += current_label
        parts.append(current_word)

    return " ".join(parts)
# --- Gradio Interface ---
# Page title shown at the top of the app.
title = "Uyghur ASR Punctuation Restoration"
# Markdown description rendered under the title.
description = """
This model automatically restores punctuation (periods, commas, question marks, etc.) to raw Uyghur text.
It is specifically designed for post-processing **ASR (Speech-to-Text)** outputs which usually lack punctuation.
"""
# Uyghur text examples for users to try (one input column, so one string per row).
examples = [
    ["چىنلىق بىلەن توقۇلمىنىڭ رېئاللىق بىلەن تەسەۋۋۇرنىڭ ماكان بىلەن زاماننىڭ مۇناسىۋىتىنى قانداق بولار"],
    ["ئاتا ئانىلار مەكتەپكە كىرىپ كەلدى"],
    ["ئاشۇنداق ئېھتىماللىقلارنى كۆزدە تۇتۇپ سىز ئۇلارنى تاشلىۋېتىشنى زادىلا خالىمايسىز"],
    ["مەسئۇلىيەت دېگەن سۆز بىرەر ئىش ھەرىكەتنىڭ ئاقىۋىتى ۋە نەتىجىسى ئۈچۈن جاۋابكار بولۇش دېگەنلىكتۇر"],
]
# Build the Gradio UI. Both textboxes are right-to-left and use an
# Arabic-script font suitable for Uyghur.
iface = gr.Interface(
    fn=restore_punctuation,
    inputs=gr.Textbox(
        lines=4,
        placeholder="تىلىسىڭىزنى بۇ يەرگە كىرگۈزۈڭ...",
        label="ئەسلىدىكى تېكىست",
        elem_classes="rtl-text",
        elem_id="input-textbox",
    ),
    outputs=gr.Textbox(
        lines=4,
        label="ئوڭشالغان تېكىست",
        elem_classes="rtl-text",
        # BUG FIX: was "input-textbox", duplicating the input's DOM id
        # (duplicate ids are invalid HTML and break per-element CSS targeting).
        elem_id="output-textbox",
    ),
    title=title,
    description=description,
    examples=examples,
    css="""
    @import url('https://fonts.googleapis.com/css2?family=Noto+Sans+Arabic&display=swap');
    .rtl-text textarea {
    direction: rtl;
    width: 100%;
    font-size: 14px;
    font-family: "Noto Sans Arabic" !important;
    }
    .gallery{
    font-family: "Noto Sans Arabic" !important;
    direction: rtl;
    }
    #input-textbox, #output-textbox{
    font-family: "Noto Sans Arabic" !important;
    direction: rtl;
    }
    """,
    flagging_mode="never",
    theme='JohnSmith9982/small_and_pretty'
)
if __name__ == "__main__":
    # Start the Gradio web server when executed as a script.
    iface.launch()