| import gradio as gr |
| from transformers import AutoTokenizer, AutoModelForTokenClassification |
| import torch |
|
|
| |
# Identifier handed to `from_pretrained` for both the tokenizer and the model.
MODEL_ID = "piyazon/Uyghur_ASR_Restore_Punctuation"

# Predicted class index -> punctuation mark. A mark's position in the tuple
# is its class index. "0" stands for "no punctuation": restore_punctuation
# never appends it to the output.
label_map = dict(
    enumerate(
        (
            "0",
            ".",
            "،",
            "؟",
            "-",
            ":",
            "؛",
        )
    )
)
|
|
| |
# Load the tokenizer and the model once, when the module is executed, so that
# every call to restore_punctuation reuses the same objects.
print(f"Loading model from {MODEL_ID}...")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, fix_mistral_regex=True)
    model = AutoModelForTokenClassification.from_pretrained(MODEL_ID)
except Exception as e:
    # The app cannot work without the model: report the failure, then
    # re-raise. A bare `raise` propagates the active exception with its
    # original traceback intact.
    print(f"Error loading model: {e}")
    raise
|
|
def restore_punctuation(text: str) -> str:
    """Insert predicted punctuation marks into unpunctuated text.

    The text is tokenized, every sub-word token is classified by the
    module-level token-classification ``model``, and the predicted mark is
    appended to the word that the token belongs to. A word starts at each
    token beginning with the ``"\u2581"`` marker; the tokens that follow
    without that marker are continuations of the same word.

    Args:
        text: Raw input text without punctuation.

    Returns:
        The reconstructed words separated by single spaces, each followed by
        its predicted punctuation mark unless the prediction is ``"0"``.
        Returns ``""`` when ``text`` is ``None``, empty or whitespace-only.
    """
    # Nothing to punctuate: skip tokenization and the model forward pass.
    if not text or not text.strip():
        return ""

    # NOTE(review): no truncation is requested here, so the whole input is
    # sent to the model in one sequence — confirm the behaviour wanted for
    # inputs longer than the model can accept.
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits

    # One predicted class index per token of the first (only) sequence.
    predictions = torch.argmax(logits, dim=2)[0].tolist()
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())

    # Evaluated once instead of on every loop iteration; a set gives
    # constant-time membership tests.
    special_tokens = set(tokenizer.all_special_tokens)

    words = []  # finished words, each already carrying its punctuation mark
    current_word = ""
    current_label = "0"

    for token, pred_id in zip(tokens, predictions):
        if token in special_tokens:
            continue

        # Class indices missing from label_map are treated as "no mark".
        label = label_map.get(pred_id, "0")

        if token.startswith("\u2581"):
            # A new word begins: emit the word collected so far.
            if current_word:
                if current_label != "0":
                    current_word += current_label
                words.append(current_word)

            current_word = token.replace("\u2581", "")
            current_label = label
        else:
            # Continuation piece: extend the word. A real mark predicted on a
            # later piece overrides the earlier label; "0" never does.
            current_word += token
            if label != "0":
                current_label = label

    # Emit the last word, which has no following word-start to trigger it.
    if current_word:
        if current_label != "0":
            current_word += current_label
        words.append(current_word)

    return " ".join(words).strip()
|
|
| |
|
|
# Heading shown at the top of the demo page.
title = "Uyghur ASR Punctuation Restoration"

# Markdown blurb shown under the title. The value starts and ends with a
# newline, exactly like the triple-quoted original.
description = (
    "\n"
    "This model automatically restores punctuation (periods, commas, question marks, etc.) to raw Uyghur text.\n"
    "It is specifically designed for post-processing **ASR (Speech-to-Text)** outputs which usually lack punctuation.\n"
)

# Sample inputs offered in the UI. The interface has a single input, so each
# example is a one-element list.
examples = [
    [sentence]
    for sentence in (
        "چىنلىق بىلەن توقۇلمىنىڭ رېئاللىق بىلەن تەسەۋۋۇرنىڭ ماكان بىلەن زاماننىڭ مۇناسىۋىتىنى قانداق بولار",
        "ئاتا ئانىلار مەكتەپكە كىرىپ كەلدى",
        "ئاشۇنداق ئېھتىماللىقلارنى كۆزدە تۇتۇپ سىز ئۇلارنى تاشلىۋېتىشنى زادىلا خالىمايسىز",
        "مەسئۇلىيەت دېگەن سۆز بىرەر ئىش ھەرىكەتنىڭ ئاقىۋىتى ۋە نەتىجىسى ئۈچۈن جاۋابكار بولۇش دېگەنلىكتۇر",
    )
]
|
|
# Gradio UI: a right-to-left input box for the raw text and a right-to-left
# output box for the punctuated result, both wired to restore_punctuation.
iface = gr.Interface(
    fn=restore_punctuation,
    inputs=gr.Textbox(
        lines=4,
        placeholder="تىلىسىڭىزنى بۇ يەرگە كىرگۈزۈڭ...",
        label="ئەسلىدىكى تېكىست",
        elem_classes="rtl-text",
        elem_id="input-textbox",
    ),
    outputs=gr.Textbox(
        lines=4,
        label="ئوڭشالغان تېكىست",
        elem_classes="rtl-text",
        # Distinct id: element ids must be unique within the page. The CSS
        # below targets both ids, so the styling is the same for both boxes.
        elem_id="output-textbox",
    ),
    title=title,
    description=description,
    examples=examples,
    css="""
    @import url('https://fonts.googleapis.com/css2?family=Noto+Sans+Arabic&display=swap');
    .rtl-text textarea {
        direction: rtl;
        width: 100%;
        font-size: 14px;
        font-family: "Noto Sans Arabic" !important;
    }
    .gallery{
        font-family: "Noto Sans Arabic" !important;
        direction: rtl;
    }
    #input-textbox, #output-textbox{
        font-family: "Noto Sans Arabic" !important;
        direction: rtl;
    }
    """,
    flagging_mode="never",
    theme='JohnSmith9982/small_and_pretty'
)


# Start the web server only when the file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()
| |
|
|