Spaces:

Akeb0n0
/

ML2_HFtune_basic

Sleeping

App Files Files Community

Akeb0n0 commited on Apr 5, 2025

Commit

a7e2485

verified ·

1 Parent(s): c685cad

Update app.py

Browse files

Files changed (1) hide show

app.py +98 -2

app.py CHANGED Viewed

@@ -1,4 +1,100 @@
 import streamlit as st
-x = st.slider('Select a value')
-st.write(x, 'squared is', x * x)

 import streamlit as st
+import torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from PIL import Image
+import requests
+from io import BytesIO
+@st.cache_data
+def load_header_image():
+    response = requests.get(
+        "https://upload.wikimedia.org/wikipedia/commons/thumb/b/bc/ArXiv_logo_2022.svg/512px-ArXiv_logo_2022.svg.png"
+    )
+    return Image.open(BytesIO(response.content))
+@st.cache_resource
+def load_model():
+    checkpoint = torch.load('TinyBERT_cls_model.pt', map_location='cpu')
+    model = AutoModelForSequenceClassification.from_pretrained(
+        "huawei-noah/TinyBERT_General_4L_312D",
+        num_labels=len(checkpoint['idx_to_category'])
+    )
+    model.load_state_dict(checkpoint['model_state_dict'])
+    tokenizer = checkpoint['tokenizer']
+    idx_to_category = checkpoint['idx_to_category']
+    return model, tokenizer, idx_to_category
+def predict(title, abstract, model, tokenizer, idx_to_category, threshold=0.95):
+    text = f"{title} /n {abstract}" if abstract else title
+    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0]
+    sorted_probs, sorted_indices = torch.sort(probs, descending=True)
+    results = []
+    cumulative_prob = 0.0
+    for i in range(len(sorted_probs)):
+        if cumulative_prob >= threshold:
+            break
+        prob = sorted_probs[i].item()
+        results.append({
+            "category": idx_to_category[sorted_indices[i].item()],
+            "probability": prob
+        })
+        cumulative_prob += prob
+    return results, cumulative_prob
+def main():
+    model, tokenizer, idx_to_category = load_model()
+    header_img = load_header_image()
+    st.set_page_config(page_title="arXiv Classifier", layout="wide")
+    col1, col2 = st.columns([1, 4])
+    with col1:
+        st.image(header_img, width=100)
+    with col2:
+        st.title("arXiv Article Classifier")
+        st.markdown("Определение тематики научных статей по названию и аннотации")
+    with st.form("input_form"):
+        title = st.text_input("Название статьи*", placeholder="Введите название...")
+        abstract = st.text_area("Аннотация", placeholder="Введите текст аннотации (необязательно)...", height=150)
+        submitted = st.form_submit_button("Классифицировать")
+        if submitted and not title:
+            st.error("Пожалуйста, введите название статьи")
+    if submitted and title:
+        with st.spinner("Анализируем статью..."):
+            results, total_prob = predict(
+                title=title,
+                abstract=abstract,
+                model=model,
+                tokenizer=tokenizer,
+                idx_to_category=idx_to_category
+            )
+        st.success("Результаты классификации:")
+        st.metric("Общая вероятность", f"{total_prob*100:.1f}%")
+        for i, res in enumerate(results, 1):
+            col1, col2 = st.columns([1, 4])
+            with col1:
+                st.metric(f"Топ {i}", f"{res['probability']*100:.1f}%")
+            with col2:
+                st.progress(res['probability'], text=res['category'])
+if __name__ == "__main__":
+    main()