File size: 2,440 Bytes
204159d
 
f0de0e1
204159d
 
af315b6
204159d
 
 
 
6a8a4c3
f0de0e1
 
 
 
 
 
 
 
 
 
204159d
 
 
 
9215f2d
204159d
 
005516a
204159d
 
 
 
 
 
005516a
204159d
 
 
 
 
 
 
 
 
f0de0e1
 
 
204159d
 
 
 
 
 
 
af315b6
204159d
af315b6
 
 
 
 
 
 
2c37fc0
af315b6
2c37fc0
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import streamlit as st
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from huggingface_hub import hf_hub_download
import torch
import numpy as np
import pandas as pd
import json

@st.cache_resource
def load_model():
    """Load the classifier, tokenizer, and label metadata from the HF Hub.

    Cached by Streamlit (`st.cache_resource`) so the model is downloaded and
    instantiated only once per server process, not on every rerun.

    Returns:
        tuple: (tokenizer, model, id2label, categories) where
            id2label maps int class index -> label string, and
            categories maps label string -> human-readable category name
            (inferred from how the UI uses it — confirm against labels.json).
    """
    repo_id = "MurDanya/ml-course-article-classifier"
    model = AutoModelForSequenceClassification.from_pretrained(repo_id)
    tokenizer = AutoTokenizer.from_pretrained(repo_id)

    # labels.json lives alongside the model weights in the same repo.
    file_path = hf_hub_download(repo_id, "labels.json")
    # Explicit UTF-8: JSON is UTF-8 by spec; the platform default encoding
    # (e.g. cp1252 on Windows) would break on non-ASCII label text.
    with open(file_path, encoding="utf-8") as f:
        labels = json.load(f)
    # JSON object keys are always strings; convert back to int class indices.
    id2label = {int(idx): label for idx, label in labels['id2label'].items()}
    categories = labels['categories']

    return tokenizer, model, id2label, categories

def get_top95(labels, probs):
    """Rank classes by probability and keep the smallest prefix covering >= 95%.

    Args:
        labels: mapping from int class index to label string.
        probs: 1-D tensor of class probabilities (assumed to sum to ~1).

    Returns:
        list[tuple[str, float]]: (label, probability) pairs in descending
        probability order, truncated at the first position where the
        cumulative probability reaches 0.95 (or all pairs if it never does).
    """
    ranked_probs, ranked_indices = torch.sort(probs, descending=True)
    ranked_names = [labels[idx.item()] for idx in ranked_indices]

    # Find the first rank at which the cumulative mass crosses the threshold.
    mass = torch.cumsum(ranked_probs, dim=0)
    crossings = torch.nonzero(mass >= 0.95).flatten()
    keep = crossings[0].item() + 1 if crossings.numel() > 0 else ranked_probs.numel()

    return [(name, p) for name, p in zip(ranked_names[:keep], ranked_probs[:keep].tolist())]

# UI — top-level Streamlit script; statement order defines the page layout.
st.set_page_config(page_title="Article Topic Classifier")
st.title("Article Topic Classifier")
st.markdown("Enter the **title** and optionally **abstract** of the article.")

# User inputs: title is expected; abstract is optional extra context.
title = st.text_input("Title", placeholder="e.g. Neural Networks for Quantum Physics")
abstract = st.text_area("Abstract (optional)", placeholder="e.g. We explore the application of neural nets...")

if st.button("Classify"):
    if not title and not abstract:
        st.warning("Please enter at least the title.")
    else:
        # Cached resource — only the first click actually downloads/loads the model.
        tokenizer, model, id2label, categories = load_model()

        # Join title and abstract with " - " when both are present.
        # NOTE(review): an abstract alone also classifies (outer check allows it),
        # even though the warning text asks for at least the title — confirm intent.
        text = title + " - " + abstract if abstract else title
        # truncation=True caps input at the model's maximum sequence length.
        inputs = tokenizer(text, return_tensors="pt", truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
            # Single-example batch: take logits[0], softmax over classes.
            probs = torch.nn.functional.softmax(outputs.logits[0], dim=-1)

        # Smallest set of top classes whose probabilities sum to >= 0.95.
        top_labels = get_top95(id2label, probs)

        results = []
        for label, prob in top_labels:
            results.append({
                "Category": categories[label],  # human-readable category name
                "ID": label,                    # raw label string from id2label
                "Confidence": f"{prob * 100:.1f} %"
            })
        
        # 1-based index so the table ranks results starting at 1.
        df = pd.DataFrame(results)
        df.index += 1
        
        st.markdown("### Top 95% Predicted Topics")
        st.table(df)