# Piyazon — "update example" (commit 5324141)
import gradio as gr
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch
# --- Configuration ---
# Hugging Face Hub id of the token-classification checkpoint.
MODEL_ID = "piyazon/Uyghur_ASR_Restore_Punctuation"
# Maps model class ids to the punctuation mark appended after a word.
# The string "0" is the sentinel for "no punctuation".
label_map = {
    0: "0",   # No punctuation
    1: ".",   # Period
    2: "،",   # Arabic comma (،)
    3: "؟",   # Arabic question mark (؟)
    4: "-",   # Hyphen
    5: ":",   # Colon
    6: "؛"    # Arabic semicolon (؛)
}
# --- Load Model ---
# Download (or load from cache) the tokenizer and the fine-tuned
# token-classification model. Any failure is reported and re-raised so the
# app does not start with a half-initialized model.
print(f"Loading model from {MODEL_ID}...")
try:
    # fix_mistral_regex opts into the corrected pre-tokenization regex
    # — NOTE(review): presumably required by this checkpoint's tokenizer;
    # confirm against the model card.
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, fix_mistral_regex=True)
    model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
except Exception as e:
    print(f"Error loading model: {e}")
    raise  # bare raise re-raises in place without adding an extra frame
def restore_punctuation(text):
    """Re-insert punctuation marks into raw (unpunctuated) Uyghur text.

    Runs the token-classification model over ``text`` and appends the
    predicted punctuation mark (see module-level ``label_map``) after each
    word.

    Args:
        text: Unpunctuated Uyghur text, e.g. raw ASR output.

    Returns:
        str: The input text with predicted punctuation inserted between
        words; words are separated by single spaces.
    """
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    predictions = torch.argmax(logits, dim=2)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])

    # Hoisted: set gives O(1) membership tests vs. a list scan per token.
    special_tokens = set(tokenizer.all_special_tokens)

    parts = []           # finished words, joined once at the end (avoids quadratic +=)
    current_word = ""
    current_label = "0"  # "0" == no punctuation after the current word

    for i, token in enumerate(tokens):
        if token in special_tokens:
            continue
        # SentencePiece marks the start of a word with U+2581 ("▁").
        if token.startswith("\u2581"):
            # Flush the previous word together with its predicted punctuation.
            if current_word:
                if current_label != "0":
                    current_word += current_label
                parts.append(current_word)
            # Start the new word (strip the word-boundary marker) and seed its
            # label from this token's own prediction.
            current_word = token.replace("\u2581", "")
            current_label = label_map.get(predictions[i], "0")
        else:
            # Continuation sub-token: merge it into the word. The label of the
            # last sub-token that predicts punctuation wins.
            current_word += token
            pred = label_map.get(predictions[i], "0")
            if pred != "0":
                current_label = pred

    # Flush the final word.
    if current_word:
        if current_label != "0":
            current_word += current_label
        parts.append(current_word)

    return " ".join(parts)
# --- Gradio Interface ---
# Page title shown at the top of the app.
title = "Uyghur ASR Punctuation Restoration"
# Markdown description rendered under the title.
description = """
This model automatically restores punctuation (periods, commas, question marks, etc.) to raw Uyghur text.
It is specifically designed for post-processing **ASR (Speech-to-Text)** outputs which usually lack punctuation.
"""
# Uyghur text examples for users to try (one input column, so one string per row).
examples = [
    ["چىنلىق بىلەن توقۇلمىنىڭ رېئاللىق بىلەن تەسەۋۋۇرنىڭ ماكان بىلەن زاماننىڭ مۇناسىۋىتىنى قانداق بولار"],
    ["ئاتا ئانىلار مەكتەپكە كىرىپ كەلدى"],
    ["ئاشۇنداق ئېھتىماللىقلارنى كۆزدە تۇتۇپ سىز ئۇلارنى تاشلىۋېتىشنى زادىلا خالىمايسىز"],
    ["مەسئۇلىيەت دېگەن سۆز بىرەر ئىش ھەرىكەتنىڭ ئاقىۋىتى ۋە نەتىجىسى ئۈچۈن جاۋابكار بولۇش دېگەنلىكتۇر"],
]
# Build the Gradio UI. Both textboxes are right-to-left and use an
# Arabic-script font suitable for Uyghur.
iface = gr.Interface(
    fn=restore_punctuation,
    inputs=gr.Textbox(
        lines=4,
        placeholder="تىلىسىڭىزنى بۇ يەرگە كىرگۈزۈڭ...",
        label="ئەسلىدىكى تېكىست",
        elem_classes="rtl-text",
        elem_id="input-textbox",
    ),
    outputs=gr.Textbox(
        lines=4,
        label="ئوڭشالغان تېكىست",
        elem_classes="rtl-text",
        # BUG FIX: was "input-textbox", duplicating the input's DOM id
        # (duplicate ids are invalid HTML and break per-element CSS targeting).
        elem_id="output-textbox",
    ),
    title=title,
    description=description,
    examples=examples,
    css="""
    @import url('https://fonts.googleapis.com/css2?family=Noto+Sans+Arabic&display=swap');
    .rtl-text textarea {
    direction: rtl;
    width: 100%;
    font-size: 14px;
    font-family: "Noto Sans Arabic" !important;
    }
    .gallery{
    font-family: "Noto Sans Arabic" !important;
    direction: rtl;
    }
    #input-textbox, #output-textbox{
    font-family: "Noto Sans Arabic" !important;
    direction: rtl;
    }
    """,
    flagging_mode="never",
    theme='JohnSmith9982/small_and_pretty'
)
if __name__ == "__main__":
    # Start the Gradio web server when executed as a script.
    iface.launch()