import streamlit as st
from transformers import pipeline, AutoTokenizer
import pandas as pd


@st.cache_resource
def load_model():
    tokenizer = AutoTokenizer.from_pretrained('FedorX8/arxiv-classification-bert-uncased')
    # return_all_scores=True makes the pipeline return a score for every class;
    # in recent transformers releases, top_k=None is the preferred spelling.
    return pipeline(
        task='text-classification',
        model="FedorX8/arxiv-classification-bert-uncased",
        tokenizer=tokenizer,
        return_all_scores=True,
    )


def get_top_p(result, top_p=0.95):
    # Keep the smallest set of highest-scoring classes whose cumulative
    # probability reaches top_p (a nucleus-style cutoff).
    result = sorted(result, key=lambda x: x['score'], reverse=True)
    prob_sum = 0
    classes = []
    probs = []
    for elem in result:
        if prob_sum >= top_p:
            break  # the requested probability mass is already covered
        prob_sum += elem['score']
        probs.append(elem['score'])
        classes.append(elem['label'])
    return classes, probs


st_model = load_model()

st.header('Web interface for arXiv article classification')

# Expandable description of the possible classes
expander = st.expander("Click to read a description of the possible classes")
expander.markdown("""
1. math.AC — Commutative Algebra
2. cs.CV — Computer Vision and Pattern Recognition
3. cs.AI — Artificial Intelligence
4. cs.SY — Systems and Control
5. math.GR — Group Theory
6. cs.CE — Computational Engineering, Finance, and Science
7. cs.PL — Programming Languages
8. cs.IT — Information Theory
9. cs.DS — Data Structures and Algorithms
10. cs.NE — Neural and Evolutionary Computing
11. math.ST — Statistics Theory
""")

query = st.text_input("Enter the text of the paper", value="AI")

if query:
    result = st_model(query)
    classes, probs = get_top_p(result[0])  # the pipeline returns one list of scores per input
    data_dict = {
        "classes": classes,
        "probabilities": probs,
    }
    df = pd.DataFrame(data_dict)
    st.write(df)
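# Usage sketch (assumptions: this file is saved as app.py and the streamlit,
# transformers, torch, and pandas packages are installed):
#
#   streamlit run app.py
#
# Toy illustration of what get_top_p does on a hand-made prediction list:
#
#   get_top_p([{'label': 'cs.AI', 'score': 0.7},
#              {'label': 'cs.CV', 'score': 0.2},
#              {'label': 'math.ST', 'score': 0.1}], top_p=0.8)
#   # -> (['cs.AI', 'cs.CV'], [0.7, 0.2])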