Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import torch | |
| from transformers import AutoModelForTokenClassification, AutoTokenizer | |
# Hub repository id handed to both `from_pretrained` calls below.
MODEL_ID = "MostafaMaroof/Naqta"

# Loaded once at import time so every request reuses the same objects.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# `.eval()` returns the module itself, so the chained call keeps `model`
# bound to the loaded network while switching it to inference mode.
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID).eval()

# Label <-> class-index tables taken from the model configuration.
id2label, label2id = model.config.id2label, model.config.label2id

# Display colour for each punctuation label; `colored_html` looks labels up
# here and falls back to a neutral grey for anything not listed.
PUNCT_COLORS = {
    ".": "#ef4444",  # full stop
    "،": "#3b82f6",  # Arabic comma
    "؟": "#a855f7",  # Arabic question mark
    "!": "#f97316",  # exclamation mark
    ":": "#10b981",  # colon
    "؛": "#eab308",  # Arabic semicolon
    "-": "#64748b",  # dash
}
def _predict_words(text, comma_boost=0.8, max_length=384):
    """Predict one punctuation label per whitespace-separated word.

    Args:
        text: Raw input text. ``None`` and whitespace-only input are treated
            as empty.
        comma_boost: Value added to the Arabic-comma logit before the argmax,
            making the model less conservative about commas (the original
            tuning note suggests 0.5-1.2). Ignored when the model has no
            "،" label.
        max_length: Maximum number of tokens per forward pass.

    Returns:
        A ``(words, labels)`` pair of equal-length lists. Each label is a
        value from ``id2label``, or ``"O"`` for a word that received no
        prediction. Empty input yields ``([], [])``.
    """
    # str.split() with no argument already drops leading/trailing whitespace.
    words = (text or "").split()
    if not words:
        return [], []

    # .get() so a checkpoint without an Arabic-comma label does not crash.
    comma_id = label2id.get("،")
    word_labels = ["O"] * len(words)

    # The tokenizer truncates at `max_length`, so a long text is processed in
    # consecutive windows; otherwise every word past the first window would
    # silently keep the "O" label.
    start = 0
    while start < len(words):
        inputs = tokenizer(
            words[start:],
            is_split_into_words=True,
            return_tensors="pt",
            truncation=True,
            max_length=max_length,
        )
        with torch.no_grad():
            logits = model(**inputs).logits
        if comma_id is not None and comma_boost:
            logits[0, :, comma_id] += comma_boost
        pred_ids = logits.argmax(dim=-1)[0].tolist()
        word_ids = inputs.word_ids(batch_index=0)

        last_word_id = -1
        previous_word_id = None
        for pred_id, word_id in zip(pred_ids, word_ids):
            # Skip special tokens (None) and every sub-token after the first:
            # the first sub-token alone decides the label of its word.
            if word_id is None or word_id == previous_word_id:
                continue
            word_labels[start + word_id] = id2label[pred_id]
            previous_word_id = word_id
            last_word_id = word_id

        if last_word_id < 0:
            # No word fitted into this window (e.g. the remaining words
            # produce no tokens); stop rather than loop forever.
            break
        start += last_word_id + 1

    return words, word_labels
def restore_punctuation(text, prediction=None):
    """Return ``text`` with the predicted punctuation appended to its words.

    Args:
        text: Unpunctuated input text.
        prediction: Optional precomputed ``(words, labels)`` pair as returned
            by ``_predict_words``. When given, the model is not run again and
            ``text`` is not consulted.

    Returns:
        The punctuated words joined by single spaces and wrapped in
        U+202B (right-to-left embedding) ... U+202C (pop directional
        formatting), or ``""`` when there are no words.
    """
    words, labels = _predict_words(text) if prediction is None else prediction
    if not words:
        return ""
    pieces = [
        word if label == "O" else word + label
        for word, label in zip(words, labels)
    ]
    return "\u202B" + " ".join(pieces) + "\u202C"


def colored_html(text, prediction=None):
    """Render the punctuated text as HTML with colour-coded punctuation marks.

    Args:
        text: Unpunctuated input text.
        prediction: Optional precomputed ``(words, labels)`` pair as returned
            by ``_predict_words``. When given, the model is not run again and
            ``text`` is not consulted.

    Returns:
        An HTML fragment: a right-to-left ``naqta-output`` container with one
        ``naqta-word`` span per word, or the ``naqta-empty`` placeholder when
        there are no words.
    """
    from html import escape

    words, labels = _predict_words(text) if prediction is None else prediction
    if not words:
        return "<div class='naqta-empty'>اكتب نصاً لرؤية الترقيم الملوّن</div>"

    spans = []
    for word, label in zip(words, labels):
        # Words come straight from user input: escape them so characters such
        # as "<" or "&" are displayed literally instead of parsed as markup.
        safe_word = escape(word)
        if label == "O":
            spans.append(f"<span class='naqta-word'>{safe_word}</span>")
            continue
        color = PUNCT_COLORS.get(label, "#9ca3af")
        spans.append(
            f"<span class='naqta-word'>{safe_word}"
            f"<span class='naqta-mark' style='color:{color}'>{escape(label)}</span>"
            f"</span>"
        )
    body = " ".join(spans)
    return f"<div class='naqta-output' dir='rtl'>{body}</div>"


def run(text):
    """Return ``(plain_text, html)`` for ``text`` from a single model pass."""
    # Predict once and share the result; calling both renderers without it
    # would run the model twice per request.
    prediction = _predict_words(text)
    return restore_punctuation(text, prediction), colored_html(text, prediction)
# Stylesheet for the demo page; passed to `demo.launch(css=...)` in the
# `__main__` guard. The `naqta-*` ids and classes are the ones used by the
# header, legend and footer markup and by the HTML built in `colored_html`.
CUSTOM_CSS = """
.gradio-container { max-width: 1100px !important; margin: auto; }
#naqta-header {
text-align: center;
padding: 28px 16px 8px 16px;
}
#naqta-header h1 {
font-size: 2.6rem;
margin: 0;
background: linear-gradient(90deg,#6366f1,#a855f7,#ec4899);
-webkit-background-clip: text;
background-clip: text;
color: transparent;
font-weight: 800;
letter-spacing: 0.5px;
}
#naqta-header p {
margin-top: 8px;
color: #6b7280;
font-size: 1rem;
}
.naqta-card {
border-radius: 16px;
padding: 8px;
}
.naqta-output {
direction: rtl;
text-align: right;
line-height: 2.4;
font-size: 1.25rem;
padding: 18px 20px;
border-radius: 14px;
background: #0f172a08;
min-height: 120px;
font-family: "Segoe UI", "Tahoma", "Amiri", serif;
}
.naqta-empty {
color: #9ca3af;
text-align: center;
padding: 40px 0;
font-style: italic;
}
.naqta-word {
display: inline-block;
margin: 2px 4px;
padding: 4px 8px;
border-radius: 8px;
background: #ffffff10;
border: 1px solid #ffffff15;
}
.naqta-mark {
font-weight: 800;
margin-right: 2px;
font-size: 1.35rem;
}
#naqta-legend {
display: flex;
flex-wrap: wrap;
gap: 10px;
justify-content: center;
padding: 8px 0 4px 0;
}
.naqta-legend-item {
display: inline-flex;
align-items: center;
gap: 6px;
padding: 4px 10px;
border-radius: 999px;
background: #ffffff10;
border: 1px solid #ffffff20;
font-size: 0.85rem;
}
.naqta-dot {
width: 10px;
height: 10px;
border-radius: 50%;
display: inline-block;
}
#naqta-footer {
text-align: center;
color: #9ca3af;
font-size: 0.85rem;
padding: 12px;
}
"""
# Colour legend shown under the page header: one pill per punctuation mark,
# using the same hex colours as PUNCT_COLORS.
# The "." entry is labelled "نقطة" (full stop); it was previously mislabelled
# "فاصلة", which means comma and clashed with the Arabic-comma entry below it.
LEGEND_HTML = """
<div id='naqta-legend'>
<span class='naqta-legend-item'><span class='naqta-dot' style='background:#ef4444'></span> . نقطة</span>
<span class='naqta-legend-item'><span class='naqta-dot' style='background:#3b82f6'></span> ، فاصلة عربية</span>
<span class='naqta-legend-item'><span class='naqta-dot' style='background:#a855f7'></span> ؟ استفهام</span>
<span class='naqta-legend-item'><span class='naqta-dot' style='background:#f97316'></span> ! تعجب</span>
<span class='naqta-legend-item'><span class='naqta-dot' style='background:#10b981'></span> : نقطتان</span>
<span class='naqta-legend-item'><span class='naqta-dot' style='background:#eab308'></span> ؛ فاصلة منقوطة</span>
<span class='naqta-legend-item'><span class='naqta-dot' style='background:#64748b'></span> - شَرطة</span>
</div>
"""
# Ready-made unpunctuated sentences offered through `gr.Examples`. Each row is
# a one-element list because the examples feed a single input component.
EXAMPLES = [
    [sentence]
    for sentence in (
        "بلغت نسبة النمو الاقتصادي 4.7 بالمئة خلال الربع الثالث من عام 2024 وهو اعلى مستوى منذ خمس سنوات",
        "اذا اردت ان تنجح في حياتك فعليك ان تحدد اهدافك بوضوح وان تعمل بجد واستمرارية ولا تيأس عند اول عقبة تواجهها",
        "يقول المثل العربي من جد وجد ومن زرع حصد وهذا يعني ان النجاح لا يأتي بدون عمل وتعب واجتهاد",
        "يتكون الجهاز الهضمي من عدة اعضاء رئيسية وهي الفم والمريء والمعدة والامعاء الدقيقة والامعاء الغليظة",
        "هل تعلم ان اللغة العربية تحتوي على اكثر من اثني عشر مليون كلمة وهي اغنى لغات العالم",
    )
]
# Page layout and event wiring for the demo.
# NOTE(review): the nesting below is reconstructed from flattened source
# (input + buttons in the first column, both outputs in the second) — confirm
# against the deployed layout.
with gr.Blocks(title="Naqta · Arabic Punctuation Restoration") as demo:
    # Title banner, then the colour legend.
    gr.HTML(
        """
<div id='naqta-header'>
<h1>Naqta · نقطة</h1>
<p>Arabic punctuation restoration powered by XLM-RoBERTa Large</p>
</div>
"""
    )
    gr.HTML(LEGEND_HTML)

    with gr.Row():
        # First column: raw text entry plus the action buttons.
        with gr.Column(scale=1):
            input_text = gr.Textbox(
                label="النص بدون ترقيم",
                lines=8,
                placeholder="اكتب النص العربي هنا بدون علامات ترقيم...",
                rtl=True,
                text_align="right",
                elem_classes=["naqta-card"],
            )
            with gr.Row():
                run_btn = gr.Button("استعادة الترقيم", variant="primary", size="lg")
                clear_btn = gr.Button("مسح", variant="secondary", size="lg")

        # Second column: plain-text result and its colour-coded rendering.
        with gr.Column(scale=1):
            output_text = gr.Textbox(
                label="النص بعد الترقيم",
                lines=8,
                rtl=True,
                text_align="right",
                elem_classes=["naqta-card"],
            )
            output_html = gr.HTML(label="عرض ملوّن")

    gr.Examples(
        examples=EXAMPLES,
        inputs=input_text,
        label="أمثلة جاهزة",
    )

    footer_markup = (
        "<div id='naqta-footer'>"
        "Built with ❤ · Model: "
        "<a href='https://huggingface.co/MostafaMaroof/Naqta' target='_blank'>MostafaMaroof/Naqta</a>"
        "</div>"
    )
    gr.HTML(footer_markup)

    # Clicking the button and submitting the textbox both run `run` and fill
    # the same two outputs; registration order (click, then submit) is kept.
    result_components = [output_text, output_html]
    for register in (run_btn.click, input_text.submit):
        register(fn=run, inputs=input_text, outputs=result_components)

    # Clearing writes an empty string into the input and both outputs.
    clear_btn.click(
        fn=lambda: ("", "", ""),
        outputs=[input_text, *result_components],
    )
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
    soft_theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="purple")
    demo.launch(theme=soft_theme, css=CUSTOM_CSS)