Spaces:

Abelex
/

sentence

Runtime error

App Files Files Community

Abelex committed on Dec 21, 2025

Commit

ed7df25

verified ·

1 Parent(s): c7d8bdd

Create app.py

Browse files

Files changed (1) hide show

app.py +197 -0

app.py ADDED Viewed

	@@ -0,0 +1,197 @@

+# ===============================
+# Sentence-ChuLo Gradio Demo (HF Spaces Ready)
+# ===============================
+import html
+import os
+import re
+import warnings
+
+import gradio as gr
+import numpy as np
+import torch
+import torch.nn as nn
+from transformers import AutoTokenizer, AutoModel
+# --------------------------------------------------
+# Configuration
+# --------------------------------------------------
+# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Base encoder checkpoint; the tokenizer is loaded from the same name.
+PRETRAINED = "Davlan/afro-xlmr-large"
+# Hub repository the fine-tuned weights are downloaded from.
+HF_MODEL_ID = "Abelex/Sentence-Chunking-Afri_BERTA_amharic_text"
+CHUNK_SIZE = 512  # token length every chunk is padded/truncated to
+MAX_CHUNKS = 8  # number of chunk rows built per document
+CHUNK_DMODEL = 256  # width of the chunk-level transformer
+DROPOUT = 0.1  # dropout of the chunk-level transformer layers
+# ⚠️ MUST match training
+id2label = {
+    0: "Politics",
+    1: "Business",
+    2: "Sports",
+    3: "Technology",
+    4: "Health",
+    5: "Entertainment",
+    6: "Education",
+    7: "Other",
+}
+# Derived from the label table so the classifier width cannot drift from it.
+NUM_LABELS = len(id2label)
+# ========================================================
+# MODEL
+# ========================================================
+class HybridSentenceChuLo(nn.Module):
+    """Chunked document classifier.
+
+    Each chunk is encoded by a pretrained encoder, its tokens are pooled into
+    one vector with a learned attention vector, the chunk vectors are mixed by
+    a small transformer encoder, and the mean of the non-padding chunks is
+    classified.
+    """
+    def __init__(self, pretrained_name, num_labels):
+        """Build the encoder, pooling vector, chunk transformer and head.
+
+        Args:
+            pretrained_name: checkpoint name passed to ``AutoModel.from_pretrained``.
+            num_labels: number of output classes of the final linear layer.
+        """
+        super().__init__()
+        # NOTE(review): trust_remote_code=True lets the Hub repository run its
+        # own Python while loading; confirm this checkpoint really needs it.
+        self.bert = AutoModel.from_pretrained(
+            pretrained_name,
+            trust_remote_code=True
+        )
+        hidden_size = self.bert.config.hidden_size
+        # Project token vectors to the chunk width unless they already match.
+        self.proj = nn.Linear(hidden_size, CHUNK_DMODEL) if hidden_size != CHUNK_DMODEL else nn.Identity()
+        # Learned query vector used to score tokens inside a chunk.
+        self.token_attn_vec = nn.Parameter(torch.randn(CHUNK_DMODEL))
+        encoder_layer = nn.TransformerEncoderLayer(
+            d_model=CHUNK_DMODEL,
+            nhead=8,
+            dim_feedforward=4 * CHUNK_DMODEL,
+            batch_first=True,
+            dropout=DROPOUT
+        )
+        self.chunk_transformer = nn.TransformerEncoder(encoder_layer, num_layers=2)
+        self.classifier = nn.Sequential(
+            nn.LayerNorm(CHUNK_DMODEL),
+            nn.Linear(CHUNK_DMODEL, num_labels)
+        )
+    def forward(self, input_ids, attention_mask):
+        """Return class logits for a batch of chunked documents.
+
+        Args:
+            input_ids: 3-D tensor unpacked as (batch, chunks, tokens).
+            attention_mask: tensor of the same shape; a chunk whose mask sums
+                to zero is treated as padding.
+
+        Returns:
+            The classifier output for each document (one row per batch item).
+        """
+        B, C, T = input_ids.size()
+        # reshape (not view) so non-contiguous inputs are accepted as well.
+        flat_ids = input_ids.reshape(B * C, T)
+        flat_mask = attention_mask.reshape(B * C, T)
+        # A chunk is real when at least one of its tokens is unmasked.
+        real_chunk = flat_mask.sum(dim=1) > 0
+        if real_chunk.any():
+            # Only real chunks go through the encoder. Padding chunks are
+            # masked out as keys below and excluded from the mean, so their
+            # vectors never reach the logits; encoding them is wasted work.
+            real_mask = flat_mask[real_chunk]
+            bert_out = self.bert(input_ids=flat_ids[real_chunk], attention_mask=real_mask)
+            proj = self.proj(bert_out.last_hidden_state)
+            attn_scores = torch.matmul(proj, self.token_attn_vec)
+            # Padding tokens get the lowest representable score before softmax.
+            attn_scores = attn_scores.masked_fill(real_mask == 0, torch.finfo(attn_scores.dtype).min)
+            attn_weights = torch.softmax(attn_scores, dim=1).unsqueeze(-1)
+            pooled = (proj * attn_weights).sum(dim=1)
+            chunk_vecs = pooled.new_zeros(B * C, CHUNK_DMODEL)
+            chunk_vecs[real_chunk] = pooled
+        else:
+            chunk_vecs = self.token_attn_vec.new_zeros(B * C, CHUNK_DMODEL)
+        chunk_vecs = chunk_vecs.view(B, C, CHUNK_DMODEL)
+        key_padding_mask = ~real_chunk.view(B, C)
+        chunk_out = self.chunk_transformer(chunk_vecs, src_key_padding_mask=key_padding_mask)
+        # Mean over real chunks only; the mask follows the activation dtype.
+        valid_mask = (~key_padding_mask).unsqueeze(-1).to(chunk_out.dtype)
+        doc_vec = (chunk_out * valid_mask).sum(dim=1) / valid_mask.sum(dim=1).clamp(min=1e-6)
+        return self.classifier(doc_vec)
+# ========================================================
+# Load tokenizer & model
+# ========================================================
+tokenizer = AutoTokenizer.from_pretrained(PRETRAINED)
+model = HybridSentenceChuLo(
+    pretrained_name=PRETRAINED,
+    num_labels=NUM_LABELS
+).to(DEVICE)
+# Load weights from HF Hub
+# NOTE(review): a .bin checkpoint is unpickled on load; only point this URL
+# at a repository you trust.
+state_dict = torch.hub.load_state_dict_from_url(
+    f"https://huggingface.co/{HF_MODEL_ID}/resolve/main/pytorch_model.bin",
+    map_location=DEVICE
+)
+# strict=False tolerates key differences, so report them instead of silently
+# serving layers that kept their random initialisation.
+_load_result = model.load_state_dict(state_dict, strict=False)
+if _load_result.missing_keys or _load_result.unexpected_keys:
+    warnings.warn(
+        "Checkpoint does not fully match the model: "
+        f"{len(_load_result.missing_keys)} missing key(s) "
+        f"(e.g. {_load_result.missing_keys[:5]}), "
+        f"{len(_load_result.unexpected_keys)} unexpected key(s) "
+        f"(e.g. {_load_result.unexpected_keys[:5]})."
+    )
+model.eval()
+# ========================================================
+# Sentence Utilities
+# ========================================================
+def split_sentences(text):
+    """Split *text* into trimmed, non-empty sentences.
+
+    A boundary is any run of whitespace that directly follows one of the
+    characters ``።``, ``፤``, ``!`` or ``?``; the punctuation stays attached
+    to the sentence it ends.
+    """
+    sentences = []
+    for piece in re.split(r"(?<=[።፤!?])\s+", text):
+        trimmed = piece.strip()
+        if trimmed:
+            sentences.append(trimmed)
+    return sentences
+def select_topk(sentences):
+    """Return the first, middle (index ``len // 2``) and last sentence.
+
+    An empty input gives an empty list. Short inputs repeat entries, e.g. a
+    single sentence is returned three times.
+    """
+    count = len(sentences)
+    if not count:
+        return []
+    # Beginning, middle, end: always three picks, in that order.
+    return [sentences[position] for position in (0, count // 2, -1)]
+def encode_sentence_chunks(sentences):
+    """Tokenize sentences into a fixed-size block of chunk rows.
+
+    Args:
+        sentences: sentences to encode; only the first ``MAX_CHUNKS`` are
+            used so the result always has the same number of rows.
+
+    Returns:
+        ``(input_ids, attention_mask)``, each with ``MAX_CHUNKS`` rows of
+        ``CHUNK_SIZE`` tokens. Rows beyond the given sentences are all zeros
+        in both tensors.
+    """
+    # Cap the input: extra sentences would otherwise produce extra rows.
+    sentences = list(sentences)[:MAX_CHUNKS]
+    if sentences:
+        # One batched call pads/truncates every sentence to CHUNK_SIZE.
+        enc = tokenizer(
+            sentences,
+            max_length=CHUNK_SIZE,
+            padding="max_length",
+            truncation=True,
+            return_tensors="pt"
+        )
+        chunks, masks = enc["input_ids"], enc["attention_mask"]
+    else:
+        chunks = torch.zeros((0, CHUNK_SIZE), dtype=torch.long)
+        masks = torch.zeros((0, CHUNK_SIZE), dtype=torch.long)
+    missing = MAX_CHUNKS - len(sentences)
+    if missing:
+        # All-zero rows carry an all-zero mask, marking them as padding.
+        chunks = torch.cat([chunks, torch.zeros((missing, CHUNK_SIZE), dtype=chunks.dtype)])
+        masks = torch.cat([masks, torch.zeros((missing, CHUNK_SIZE), dtype=masks.dtype)])
+    return chunks, masks
+def build_html(all_sents, selected):
+    """Render every sentence as a paragraph, highlighting the selected ones.
+
+    Args:
+        all_sents: sentences of the document, in display order.
+        selected: sentences to highlight; matching is by string equality.
+
+    Returns:
+        An HTML fragment: one ``<div>`` wrapping one ``<p>`` per sentence.
+    """
+    parts = ["<div style='font-size:15px; line-height:1.6;'>"]
+    for s in all_sents:
+        # The text comes from the user: escape &, < and > (with & first, so
+        # entities typed by the user are shown literally, not interpreted).
+        safe = html.escape(s, quote=False)
+        if s in selected:
+            parts.append(f"<p style='background:#d4edda; padding:4px;'><b>{safe}</b></p>")
+        else:
+            parts.append(f"<p>{safe}</p>")
+    parts.append("</div>")
+    return "".join(parts)
+# ========================================================
+# Prediction
+# ========================================================
+def chulo_predict(text):
+    """Classify *text* and return the three values shown by the UI.
+
+    Returns:
+        A tuple ``(headline, table, html)``: the predicted label line, a list
+        of ``(label, probability)`` pairs for every class, and the document
+        rendered with the selected sentences highlighted. Blank input yields
+        a warning line, an empty table and an empty string.
+    """
+    if not (text and text.strip()):
+        return "⚠️ Please enter Amharic text.", [], ""
+    all_sentences = split_sentences(text)
+    picked = select_topk(all_sentences)
+    chunk_ids, chunk_masks = encode_sentence_chunks(picked)
+    # Add the batch dimension and move both tensors to the model's device.
+    batch_ids = chunk_ids.unsqueeze(0).to(DEVICE)
+    batch_masks = chunk_masks.unsqueeze(0).to(DEVICE)
+    with torch.no_grad():
+        logits = model(input_ids=batch_ids, attention_mask=batch_masks)
+        probs = torch.softmax(logits, dim=-1)[0].cpu().numpy()
+    best_index = int(np.argmax(probs))
+    table = [(id2label[index], float(prob)) for index, prob in enumerate(probs)]
+    headline = f"🏷️ {id2label[best_index]}"
+    return headline, table, build_html(all_sentences, picked)
+# ========================================================
+# Gradio UI (HF Friendly)
+# ========================================================
+# Input: a multi-line box for the news text.
+_news_input = gr.Textbox(lines=8, placeholder="እባክዎ የአማርኛ ዜና ጽሑፍ እዚህ ያስገቡ")
+# Outputs, in the order chulo_predict returns them.
+_result_views = [
+    gr.Textbox(label="Prediction"),
+    gr.Dataframe(headers=["Label", "Probability"], label="Class Probabilities"),
+    gr.HTML(label="Highlighted Document"),
+]
+demo = gr.Interface(
+    fn=chulo_predict,
+    inputs=_news_input,
+    outputs=_result_views,
+    title="Sentence-ChuLo — Amharic News Classification",
+    description="Uses EXACT Beginning–Middle–End sentence selection.",
+)
+# Start the web app as soon as the module is executed.
+demo.launch()