# Naqta-Test / app.py
# MostafaMaroof's picture
# Create app.py
# e302cff verified
import html

import gradio as gr
import torch
from transformers import AutoModelForTokenClassification, AutoTokenizer
# Hugging Face Hub repository holding the punctuation-restoration checkpoint.
MODEL_ID = "MostafaMaroof/Naqta"

# Tokenizer and token-classification model are loaded once, at import time,
# and shared by every request handled by the app.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# ``eval()`` returns the module itself, so the model can be switched to
# evaluation mode in the same expression that loads it.
model = AutoModelForTokenClassification.from_pretrained(MODEL_ID).eval()

# Label maps taken from the checkpoint's config: class index -> label string
# and label string -> class index.
id2label, label2id = model.config.id2label, model.config.label2id
# Colour (CSS hex) used to render each predicted punctuation mark in the
# colour-coded HTML view. Labels missing from this table fall back to a
# neutral grey at the call site.
PUNCT_COLORS = dict(
    (
        (".", "#ef4444"),
        ("،", "#3b82f6"),
        ("؟", "#a855f7"),
        ("!", "#f97316"),
        (":", "#10b981"),
        ("؛", "#eab308"),
        ("-", "#64748b"),
    )
)
def _predict_words(text):
text = text.strip()
if not text:
return [], []
words = text.split()
inputs = tokenizer(
words,
is_split_into_words=True,
return_tensors="pt",
truncation=True,
max_length=384,
)
with torch.no_grad():
logits = model(**inputs).logits
# Boost Arabic comma to reduce conservatism (tune 0.5–1.2 as needed)
logits[0, :, label2id['،']] += 0.8
pred_ids = logits.argmax(dim=-1)[0].tolist()
word_ids = inputs.word_ids(batch_index=0)
word_labels = ["O"] * len(words)
previous_word_id = None
for token_id, word_id in zip(pred_ids, word_ids):
if word_id is None or word_id == previous_word_id:
continue
word_labels[word_id] = id2label[token_id]
previous_word_id = word_id
return words, word_labels
# Placeholder shown in the colour view when there is nothing to render.
_EMPTY_PLACEHOLDER_HTML = (
    "<div class='naqta-empty'>اكتب نصاً لرؤية الترقيم الملوّن</div>"
)


def _format_plain(words, labels):
    """Return the punctuated sentence as plain text for predicted words.

    Each non-"O" label is appended directly to its word and the words are
    joined with single spaces. The result is wrapped in U+202B (right-to-left
    embedding) and U+202C (pop directional formatting). Empty input yields
    an empty string.
    """
    if not words:
        return ""
    joined = " ".join(
        word if label == "O" else word + label
        for word, label in zip(words, labels)
    )
    return "\u202B" + joined + "\u202C"


def _format_colored(words, labels):
    """Return the colour-coded HTML view for predicted words.

    Words come straight from user input, so they are HTML-escaped before
    being embedded; otherwise typed markup would be injected into the page.
    Labels are escaped as well, since they originate from the checkpoint
    config rather than from this file.
    """
    if not words:
        return _EMPTY_PLACEHOLDER_HTML

    spans = []
    for word, label in zip(words, labels):
        safe_word = html.escape(word)
        if label == "O":
            spans.append(f"<span class='naqta-word'>{safe_word}</span>")
            continue
        # Unknown labels fall back to a neutral grey.
        color = PUNCT_COLORS.get(label, "#9ca3af")
        spans.append(
            f"<span class='naqta-word'>{safe_word}"
            f"<span class='naqta-mark' style='color:{color}'>"
            f"{html.escape(label)}</span>"
            f"</span>"
        )
    body = " ".join(spans)
    return f"<div class='naqta-output' dir='rtl'>{body}</div>"


def restore_punctuation(text):
    """Return ``text`` with the predicted punctuation marks inserted.

    Args:
        text: Unpunctuated input text.

    Returns:
        The punctuated text wrapped in right-to-left embedding controls,
        or ``""`` when the input contains no words.
    """
    words, labels = _predict_words(text)
    return _format_plain(words, labels)


def colored_html(text):
    """Return an HTML fragment showing ``text`` with coloured punctuation.

    Args:
        text: Unpunctuated input text.

    Returns:
        A ``<div class='naqta-output'>`` with one span per word, or a
        placeholder ``<div class='naqta-empty'>`` when the input is empty.
    """
    words, labels = _predict_words(text)
    return _format_colored(words, labels)


def run(text):
    """Gradio callback: return ``(plain_text, colored_html)`` for ``text``.

    The model is run once and both views are rendered from the same
    prediction, instead of running inference separately for each output.
    """
    words, labels = _predict_words(text)
    return _format_plain(words, labels), _format_colored(words, labels)
# Stylesheet for the demo page; it is handed to ``demo.launch(css=...)``.
# The selectors correspond to ids/classes emitted elsewhere in this file:
#   #naqta-header / #naqta-footer          - header and footer HTML fragments
#   .naqta-card                            - the input and output text boxes
#   .naqta-output / .naqta-word / .naqta-mark / .naqta-empty
#                                          - the colour-coded result view
#   #naqta-legend / .naqta-legend-item / .naqta-dot
#                                          - the punctuation colour legend
CUSTOM_CSS = """
.gradio-container { max-width: 1100px !important; margin: auto; }
#naqta-header {
text-align: center;
padding: 28px 16px 8px 16px;
}
#naqta-header h1 {
font-size: 2.6rem;
margin: 0;
background: linear-gradient(90deg,#6366f1,#a855f7,#ec4899);
-webkit-background-clip: text;
background-clip: text;
color: transparent;
font-weight: 800;
letter-spacing: 0.5px;
}
#naqta-header p {
margin-top: 8px;
color: #6b7280;
font-size: 1rem;
}
.naqta-card {
border-radius: 16px;
padding: 8px;
}
.naqta-output {
direction: rtl;
text-align: right;
line-height: 2.4;
font-size: 1.25rem;
padding: 18px 20px;
border-radius: 14px;
background: #0f172a08;
min-height: 120px;
font-family: "Segoe UI", "Tahoma", "Amiri", serif;
}
.naqta-empty {
color: #9ca3af;
text-align: center;
padding: 40px 0;
font-style: italic;
}
.naqta-word {
display: inline-block;
margin: 2px 4px;
padding: 4px 8px;
border-radius: 8px;
background: #ffffff10;
border: 1px solid #ffffff15;
}
.naqta-mark {
font-weight: 800;
margin-right: 2px;
font-size: 1.35rem;
}
#naqta-legend {
display: flex;
flex-wrap: wrap;
gap: 10px;
justify-content: center;
padding: 8px 0 4px 0;
}
.naqta-legend-item {
display: inline-flex;
align-items: center;
gap: 6px;
padding: 4px 10px;
border-radius: 999px;
background: #ffffff10;
border: 1px solid #ffffff20;
font-size: 0.85rem;
}
.naqta-dot {
width: 10px;
height: 10px;
border-radius: 50%;
display: inline-block;
}
#naqta-footer {
text-align: center;
color: #9ca3af;
font-size: 0.85rem;
padding: 12px;
}
"""
# Colour legend rendered above the input/output panels: one pill per
# punctuation mark, using the same hex colours as PUNCT_COLORS.
# The full stop is labelled "نقطة" (full stop); it was previously mislabelled
# "فاصلة" (comma), which clashed with the Arabic-comma entry next to it.
LEGEND_HTML = """
<div id='naqta-legend'>
<span class='naqta-legend-item'><span class='naqta-dot' style='background:#ef4444'></span> . نقطة</span>
<span class='naqta-legend-item'><span class='naqta-dot' style='background:#3b82f6'></span> ، فاصلة عربية</span>
<span class='naqta-legend-item'><span class='naqta-dot' style='background:#a855f7'></span> ؟ استفهام</span>
<span class='naqta-legend-item'><span class='naqta-dot' style='background:#f97316'></span> ! تعجب</span>
<span class='naqta-legend-item'><span class='naqta-dot' style='background:#10b981'></span> : نقطتان</span>
<span class='naqta-legend-item'><span class='naqta-dot' style='background:#eab308'></span> ؛ فاصلة منقوطة</span>
<span class='naqta-legend-item'><span class='naqta-dot' style='background:#64748b'></span> - شَرطة</span>
</div>
"""
# Unpunctuated Arabic sample sentences offered as one-click examples.
_EXAMPLE_SENTENCES = (
    "بلغت نسبة النمو الاقتصادي 4.7 بالمئة خلال الربع الثالث من عام 2024 وهو اعلى مستوى منذ خمس سنوات",
    "اذا اردت ان تنجح في حياتك فعليك ان تحدد اهدافك بوضوح وان تعمل بجد واستمرارية ولا تيأس عند اول عقبة تواجهها",
    "يقول المثل العربي من جد وجد ومن زرع حصد وهذا يعني ان النجاح لا يأتي بدون عمل وتعب واجتهاد",
    "يتكون الجهاز الهضمي من عدة اعضاء رئيسية وهي الفم والمريء والمعدة والامعاء الدقيقة والامعاء الغليظة",
    "هل تعلم ان اللغة العربية تحتوي على اكثر من اثني عشر مليون كلمة وهي اغنى لغات العالم",
)

# The examples widget is bound to a single input component, so every row is
# a one-element list holding the sentence for that component.
EXAMPLES = [[sentence] for sentence in _EXAMPLE_SENTENCES]
# Static HTML fragments for the page header and footer.
_HEADER_HTML = (
    "\n"
    "<div id='naqta-header'>\n"
    "<h1>Naqta · نقطة</h1>\n"
    "<p>Arabic punctuation restoration powered by XLM-RoBERTa Large</p>\n"
    "</div>\n"
)
_FOOTER_HTML = "".join(
    (
        "<div id='naqta-footer'>",
        "Built with ❤ · Model: ",
        "<a href='https://huggingface.co/MostafaMaroof/Naqta' target='_blank'>MostafaMaroof/Naqta</a>",
        "</div>",
    )
)


def _reset_fields():
    """Return blank values for the input box, the output box and the HTML view."""
    return "", "", ""


# NOTE(review): the original indentation of this block was not available;
# the nesting below follows the usual two-column layout (input and buttons on
# one side, both outputs on the other) — confirm against the deployed app.
with gr.Blocks(title="Naqta · Arabic Punctuation Restoration") as demo:
    gr.HTML(_HEADER_HTML)
    gr.HTML(LEGEND_HTML)

    with gr.Row():
        # Input side: unpunctuated text plus the action buttons.
        with gr.Column(scale=1):
            input_text = gr.Textbox(
                label="النص بدون ترقيم",
                lines=8,
                placeholder="اكتب النص العربي هنا بدون علامات ترقيم...",
                rtl=True,
                text_align="right",
                elem_classes=["naqta-card"],
            )
            with gr.Row():
                run_btn = gr.Button("استعادة الترقيم", variant="primary", size="lg")
                clear_btn = gr.Button("مسح", variant="secondary", size="lg")

        # Output side: plain punctuated text and the colour-coded view.
        with gr.Column(scale=1):
            output_text = gr.Textbox(
                label="النص بعد الترقيم",
                lines=8,
                rtl=True,
                text_align="right",
                elem_classes=["naqta-card"],
            )
            output_html = gr.HTML(label="عرض ملوّن")

    gr.Examples(
        examples=EXAMPLES,
        inputs=input_text,
        label="أمثلة جاهزة",
    )
    gr.HTML(_FOOTER_HTML)

    # Both the button click and the textbox submit event run the same
    # callback with the same inputs and outputs.
    for register in (run_btn.click, input_text.submit):
        register(fn=run, inputs=input_text, outputs=[output_text, output_html])

    clear_btn.click(
        fn=_reset_fields,
        outputs=[input_text, output_text, output_html],
    )
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    soft_theme = gr.themes.Soft(primary_hue="indigo", secondary_hue="purple")
    demo.launch(theme=soft_theme, css=CUSTOM_CSS)