import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from huggingface_hub import hf_hub_download
import torch
import numpy as np
import pandas as pd
import json
@st.cache_resource
def load_model():
    """Fetch the classifier, its tokenizer, and label metadata from the HF Hub.

    Cached by Streamlit (`st.cache_resource`), so the download/instantiation
    cost is paid once per server process.

    Returns:
        (tokenizer, model, id2label, categories) where id2label maps int
        class index -> label string and categories maps label -> display name.
    """
    repo_id = "MurDanya/ml-course-article-classifier"
    tokenizer = AutoTokenizer.from_pretrained(repo_id)
    model = AutoModelForSequenceClassification.from_pretrained(repo_id)
    # Label metadata ships as a separate JSON artifact in the same repo.
    labels_path = hf_hub_download(repo_id, "labels.json")
    with open(labels_path) as fh:
        meta = json.load(fh)
    # JSON object keys are strings; model indices are ints, so coerce.
    id2label = {int(key): name for key, name in meta["id2label"].items()}
    return tokenizer, model, id2label, meta["categories"]
def get_top95(labels, probs):
    """Return the smallest high-confidence prediction set.

    Ranks classes by descending probability and keeps the shortest prefix
    whose probabilities sum to at least 0.95 (all classes if the total
    never reaches the threshold).

    Args:
        labels: mapping from int class index to label string.
        probs: 1-D tensor of class probabilities.

    Returns:
        List of (label, probability) pairs, highest probability first.
    """
    order = torch.argsort(probs, descending=True)
    ranked = probs[order]
    running = torch.cumsum(ranked, dim=0)
    # Indices where the running total first meets/exceeds the threshold.
    crossed = torch.nonzero(running >= 0.95).flatten()
    keep = len(ranked) if crossed.numel() == 0 else crossed[0].item() + 1
    names = [labels[j.item()] for j in order[:keep]]
    return list(zip(names, ranked[:keep].tolist()))
# UI
# Streamlit page: user enters a title (and optionally an abstract); on click
# the app classifies the text and shows the smallest set of predicted topics
# covering >= 95% of the probability mass.
st.set_page_config(page_title="Article Topic Classifier")
st.title("Article Topic Classifier")
st.markdown("Enter the **title** and optionally **abstract** of the article.")
title = st.text_input("Title", placeholder="e.g. Neural Networks for Quantum Physics")
abstract = st.text_area("Abstract (optional)", placeholder="e.g. We explore the application of neural nets...")
if st.button("Classify"):
    # NOTE(review): the warning says the title is required, but this condition
    # lets an abstract-only submission through — confirm which is intended.
    if not title and not abstract:
        st.warning("Please enter at least the title.")
    else:
        # Cached via st.cache_resource, so only the first click pays the load cost.
        tokenizer, model, id2label, categories = load_model()
        # Combine title and abstract into one input string; title alone otherwise.
        # (Presumably matches the format used at training time — TODO confirm.)
        text = title + " - " + abstract if abstract else title
        inputs = tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():  # inference only — no gradients needed
            outputs = model(**inputs)
        # Single example: softmax over the first (only) row of logits.
        probs = torch.nn.functional.softmax(outputs.logits[0], dim=-1)
        top_labels = get_top95(id2label, probs)
        # Build one table row per kept label: readable category name,
        # raw label ID, and the probability formatted as a percentage.
        results = []
        for label, prob in top_labels:
            results.append({
                "Category": categories[label],
                "ID": label,
                "Confidence": f"{prob * 100:.1f} %"
            })
        df = pd.DataFrame(results)
        df.index += 1  # 1-based ranking for display
        st.markdown("### Top 95% Predicted Topics")
        st.table(df)