Abelex committed on
Commit
bd815d9
Β·
verified Β·
1 Parent(s): c63b323

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +126 -81
app.py CHANGED
@@ -1,97 +1,142 @@
 
1
  import torch
2
  import gradio as gr
3
- from transformers import AutoTokenizer, AutoModel
 
4
 
5
- # ==================================================
6
- # Configuration
7
- # ==================================================
8
- MODEL_NAME = "Abelex/afro-xlmr-large"
9
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
10
 
11
- # ==================================================
12
- # Load tokenizer & model
13
- # ==================================================
14
- tokenizer = AutoTokenizer.from_pretrained(
15
- MODEL_NAME,
16
- trust_remote_code=True
17
- )
18
 
19
- model = AutoModel.from_pretrained(
20
- MODEL_NAME,
21
- trust_remote_code=True
22
- )
23
 
 
 
 
24
  model.to(DEVICE)
25
  model.eval()
26
 
27
- # ==================================================
28
- # Prediction function (FULLY FIXED)
29
- # ==================================================
30
- def classify_text(text):
31
- # ---- Validation ----
32
- if not text or not text.strip():
33
- return "⚠️ Please enter Amharic text.", None
34
-
35
- # ---- Tokenization ----
36
- inputs = tokenizer(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  text,
38
- return_tensors="pt",
39
  truncation=True,
40
- padding=True,
41
- max_length=1024
42
- ).to(DEVICE)
43
-
44
- # ---- Inference ----
45
- with torch.no_grad():
46
- outputs = model(**inputs)
47
- logits = outputs.logits if hasattr(outputs, "logits") else outputs[0]
48
- probs = torch.softmax(logits, dim=-1)[0]
49
-
50
- # ---- Prediction ----
51
- pred_id = torch.argmax(probs).item()
52
- id2label = getattr(model.config, "id2label", {})
53
- pred_label = id2label.get(pred_id, f"Class {pred_id}")
54
-
55
- scores = {
56
- id2label.get(i, f"Class {i}"): float(probs[i])
57
- for i in range(len(probs))
58
- }
59
-
60
- return f"🏷️ **{pred_label}**", scores
61
-
62
- # ==================================================
63
- # Gradio UI (MINIMAL & STABLE)
64
- # ==================================================
65
- with gr.Blocks(
66
- title="Amharic Text Classification",
67
- theme=gr.themes.Soft()
68
- ) as demo:
69
-
70
- gr.Markdown("## πŸ‡ͺπŸ‡Ή Amharic Text Classification")
71
-
72
- input_text = gr.Textbox(
73
- lines=4,
74
- placeholder="αŠ₯α‰£αŠ­α‹Ž α‹¨αŠ αˆ›αˆ­αŠ› αŒ½αˆ‘α αŠ₯α‹šαˆ… α‹«αˆ΅αŒˆα‰‘...",
75
- show_label=False
76
  )
77
 
78
- classify_btn = gr.Button("Classify", variant="primary")
79
-
80
- output_label = gr.Markdown()
81
- output_scores = gr.JSON(label="Class Probabilities", visible=False)
82
-
83
- classify_btn.click(
84
- fn=classify_text,
85
- inputs=input_text,
86
- outputs=[output_label, output_scores]
87
- )
88
 
89
- gr.Markdown(
90
- "<small>Model: <b>Abelex/afro-xlmr-large</b></small>"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91
  )
 
92
 
93
- # ==================================================
94
- # Launch
95
- # ==================================================
96
- if __name__ == "__main__":
97
- demo.launch()
 
1
import torch
import gradio as gr
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# ----------------------------------------
# 1. Load from Hugging Face Hub
# ----------------------------------------

# Repository of the fine-tuned classifier on the Hub.
# Change this to YOUR pushed model repo  <--- EDIT IF NEEDED
HUB_MODEL_ID = "Abelex/amharic-news-bert-multilingual-cased"

# Prefer GPU when available; everything below runs on this device.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
MAX_LENGTH = 512  # model context window in TOKENS

# Pull the tokenizer and classification model straight from the Hub,
# move the model to the chosen device, and switch to inference mode.
tokenizer = AutoTokenizer.from_pretrained(HUB_MODEL_ID)
model = AutoModelForSequenceClassification.from_pretrained(HUB_MODEL_ID)
model.to(DEVICE)
model.eval()

# id2label keys may be strings in the saved config -- normalize to int keys.
id2label = {int(k): v for k, v in model.config.id2label.items()}
num_labels = len(id2label)
27
+
28
# ----------------------------------------
# Helper: highlight tokens after MAX_LENGTH in red (HTML)
# ----------------------------------------
def highlight_token_overflow(text: str, max_tokens: int = 512) -> str:
    """
    Tokenize the input text and generate HTML where tokens beyond
    `max_tokens` are wrapped in red, showing exactly which tokens fall
    outside the model's context window.

    NOTE(review): the model input also includes special tokens (e.g.
    [CLS]/[SEP]), so slightly fewer than `max_tokens` content tokens
    actually survive truncation -- the red boundary is approximate.
    TODO: confirm against tokenizer.num_special_tokens_to_add().

    Args:
        text: Raw input text (no preprocessing applied).
        max_tokens: Context-window size in tokens (default 512).

    Returns:
        An HTML string visualizing all tokens, or a short italic notice
        when the input is empty or produces no tokens.
    """
    # Local import: the name `html` is used as a local variable below,
    # so the module is imported under an alias-free `escape` name.
    from html import escape

    if not text.strip():
        return "<i>No text provided.</i>"

    # Tokenize WITHOUT truncation so we can see ALL tokens.
    tokens = tokenizer.tokenize(text)
    if not tokens:
        return "<i>No tokens produced by tokenizer.</i>"

    spans = []
    for i, tok in enumerate(tokens):
        # quote=False reproduces the original &/<-/> escaping exactly;
        # stdlib escape replaces the hand-rolled .replace() chain.
        safe_tok = escape(tok, quote=False)

        if i >= max_tokens:
            spans.append(f"<span style='color:red;font-weight:bold;'>{safe_tok}</span>")
        else:
            spans.append(f"<span>{safe_tok}</span>")

    html = " ".join(spans)

    if len(tokens) > max_tokens:
        html += (
            f"<br><br>"
            f"<small style='color:red;'>"
            f"Note: Tokens in <b>red</b> are beyond the model context window "
            f"({max_tokens} tokens) and will be truncated."
            f"</small>"
        )
    else:
        html += (
            f"<br><br>"
            f"<small>Token count: {len(tokens)} (&le; {max_tokens}, no truncation).</small>"
        )

    return html
76
+
77
# ----------------------------------------
# 2. Prediction
# ----------------------------------------
def predict_amharic_news(text):
    """
    Classify a piece of Amharic news text.

    Args:
        text: Raw textbox input. May be None (Gradio passes None when
            the textbox is cleared) or empty/whitespace.

    Returns:
        Tuple of (prediction string, list of (label, probability) rows
        sorted by probability descending, HTML token visualization).
        For empty input: a prompt message, None, and a placeholder HTML.
    """
    # Guard None as well as blank input -- calling .strip() on None
    # would raise AttributeError (regression vs. the previous version).
    if not text or not text.strip():
        return "Please enter text.", None, "<i>No text provided.</i>"

    # For actual model inference: truncate to MAX_LENGTH tokens.
    # A single sequence (batch of 1) needs no padding, so the old
    # padding="max_length" (always padding to 512) is dropped.
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )
    encoded = {k: v.to(DEVICE) for k, v in encoded.items()}

    with torch.no_grad():
        outputs = model(**encoded)
        probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()[0]

    pred_id = int(np.argmax(probs))
    pred_label = id2label.get(pred_id, f"LABEL_{pred_id}")

    # Probability table, highest probability first.
    rows = sorted(
        ((id2label.get(i, f"LABEL_{i}"), float(probs[i])) for i in range(num_labels)),
        key=lambda row: row[1],
        reverse=True,
    )

    # Build HTML showing tokens; tokens beyond MAX_LENGTH are red.
    token_highlight_html = highlight_token_overflow(text, max_tokens=MAX_LENGTH)

    # Three outputs: prediction, probability table, token visualization.
    return f"Predicted Label: {pred_label}", rows, token_highlight_html
116
+
117
# ----------------------------------------
# 3. Gradio Interface
# ----------------------------------------
demo = gr.Interface(
    fn=predict_amharic_news,
    inputs=gr.Textbox(
        lines=5,
        label="Enter Amharic News Text",
        placeholder="αŠ₯α‰£αŠ­α‹Ž α‹¨αŠ αˆ›αˆ­αŠ› α‹œαŠ“ αŒ½αˆ‘α α‹«αˆ΅αŒˆα‰‘..."
    ),
    outputs=[
        gr.Textbox(label="Prediction"),
        gr.Dataframe(
            headers=["Label", "Probability"],
            label="Class Probabilities"
        ),
        gr.HTML(label="Tokenizer view (tokens > 512 are red)")
    ],
    title="Amharic News Classifier",
    # NOTE: the description previously said "XLM-RoBERTa", but
    # HUB_MODEL_ID points at a multilingual BERT checkpoint -- corrected.
    description=(
        "Multilingual BERT model loaded directly from Hugging Face Hub (raw text input, no preprocessing). "
        "Below, tokenizer output shows which tokens are beyond the 512-token context window (in red)."
    )
)

# Guard the launch so importing this module (e.g. from tests or another
# app) does not start the web server as a side effect.
if __name__ == "__main__":
    demo.launch()