import streamlit as st
from transformers import pipeline, AutoTokenizer
import pandas as pd


@st.cache_resource
def load_model():
    tokenizer = AutoTokenizer.from_pretrained('FedorX8/arxiv-classification-bert-uncased')
    # return_all_scores=True makes the pipeline return a score for every class;
    # in recent transformers releases, top_k=None is the preferred spelling.
    return pipeline(
        task='text-classification',
        model="FedorX8/arxiv-classification-bert-uncased",
        tokenizer=tokenizer,
        return_all_scores=True,
    )


def get_top_p(result, top_p=0.95):
    # Keep the smallest set of highest-scoring classes whose cumulative
    # probability reaches top_p (a nucleus-style cutoff).
    result = sorted(result, key=lambda x: x['score'], reverse=True)
    prob_sum = 0
    classes = []
    probs = []
    for elem in result:
        if prob_sum >= top_p:
            break  # the requested probability mass is already covered
        prob_sum += elem['score']
        probs.append(elem['score'])
        classes.append(elem['label'])
    return classes, probs


st_model = load_model()

st.header('Web interface for arXiv article classification')

# Expandable description of the possible classes
expander = st.expander("Click to read a description of the possible classes")
expander.markdown("""
1. math.AC — Commutative Algebra
2. cs.CV — Computer Vision and Pattern Recognition
3. cs.AI — Artificial Intelligence
4. cs.SY — Systems and Control
5. math.GR — Group Theory
6. cs.CE — Computational Engineering, Finance, and Science
7. cs.PL — Programming Languages
8. cs.IT — Information Theory
9. cs.DS — Data Structures and Algorithms
10. cs.NE — Neural and Evolutionary Computing
11. math.ST — Statistics Theory
""")

query = st.text_input("Enter the text of the paper", value="AI")

if query:
    result = st_model(query)
    classes, probs = get_top_p(result[0])  # the pipeline returns one list of scores per input
    data_dict = {
        "classes": classes,
        "probabilities": probs,
    }
    df = pd.DataFrame(data_dict)
    st.write(df)
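# Usage sketch (assumptions: this file is saved as app.py and the streamlit,
# transformers, torch, and pandas packages are installed):
#
#   streamlit run app.py
#
# Toy illustration of what get_top_p does on a hand-made prediction list:
#
#   get_top_p([{'label': 'cs.AI', 'score': 0.7},
#              {'label': 'cs.CV', 'score': 0.2},
#              {'label': 'math.ST', 'score': 0.1}], top_p=0.8)
#   # -> (['cs.AI', 'cs.CV'], [0.7, 0.2])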