Spaces:

1NEYRON1
/

Topic_classification_for_scientific_articles

Sleeping

App Files Files Community

1NEYRON1 committed on Apr 4, 2025

Commit

72216f4

1 Parent(s): e4b1086

Update app.py

Browse files

Files changed (1) hide show

app.py +161 -27

app.py CHANGED Viewed

@@ -1,32 +1,166 @@
 import streamlit as st
 from transformers import pipeline
 # Загружаем модель (замените на вашу модель, если нужно)
-# Для примера используем zero-shot-classification
 try:
-    classifier = pipeline("zero-shot-classification")
 except OSError as e:
     st.error(f"Ошибка загрузки модели: {e}. Убедитесь, что модель доступна или укажите другую.")
     st.stop()  # Остановка выполнения приложения при ошибке
-# model =
-# tokenizer =
-# topic_classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
-topic_classifier = pipeline("text-classification")
-text = "This is an example sentence for topic classification."
-result = topic_classifier(text)
-print(result)
-def classify_text(title, description, candidate_labels, show_all=False, threshold=0.95):
     """
     Классифицирует текст и возвращает результаты в отсортированном виде.
     Args:
         title (str): Заголовок текста.
         description (str): Краткое описание текста.
-        candidate_labels (list): Список меток-кандидатов.
         show_all (bool): Показывать ли все результаты, независимо от порога.
         threshold (float): Порог суммарной вероятности.
@@ -34,23 +168,28 @@ def classify_text(title, description, candidate_labels, show_all=False, threshol
         list: Отсортированный список результатов классификации.
     """
     text = f"{title} {description}"  # Объединяем заголовок и описание
     try:
         results = topic_classifier(text)
         # results = topic_classifier(text, candidate_labels, multi_label=True)  # multi_label=True для нескольких меток
     except Exception as e:
         st.error(f"Ошибка классификации: {e}")
         return []
-    # Сортируем результаты по убыванию вероятности
-    sorted_results = sorted(zip(results['labels'], results['scores']), key=lambda x: x[1], reverse=True)
     if show_all:
-        return sorted_results
     else:
         cumulative_prob = 0
         filtered_results = []
-        for label, score in sorted_results:
-            filtered_results.append((label, score))
             cumulative_prob += score
             if cumulative_prob >= threshold:
                 break
@@ -64,18 +203,13 @@ st.title("Классификация статей")
 title = st.text_input("Заголовок статьи")
 description = st.text_area("Краткое описание статьи", height=150)
-# Ввод меток-кандидатов (разделенных запятыми)
-default_labels = "политика, экономика, спорт, культура, технологии, наука, происшествия"
-candidate_labels_str = st.text_input("Метки-кандидаты (через запятую)", default_labels)
-candidate_labels = [label.strip() for label in candidate_labels_str.split(",") if label.strip()]
 # Кнопка "Классифицировать"
 if st.button("Классифицировать"):
-    if not title or not description or not candidate_labels:
-        st.warning("Пожалуйста, заполните все поля.")
     else:
         with st.spinner("Идет классификация..."):  # Индикатор загрузки
-            results = classify_text(title, description, candidate_labels)
             if results:
               st.subheader("Результаты классификации (с ограничением по вероятности):")
               for label, score in results:
@@ -90,5 +224,5 @@ if st.button("Классифицировать"):
             else:
                 st.info("Не удалось получить результаты классификации.")
-elif title or description or candidate_labels_str != default_labels: #небольшой костыль, чтобы при старте не было предупреждения
     st.warning("Пожалуйста, заполните все поля.")

 import streamlit as st
 from transformers import pipeline
+id_to_cat = {0: 'Performance',
+ 1: 'Molecular Networks',
+ 2: 'Operating Systems',
+ 3: 'High Energy Astrophysical Phenomena',
+ 4: 'Computational Finance',
+ 5: 'General Finance',
+ 6: 'Astrophysics of Galaxies',
+ 7: 'Portfolio Management',
+ 8: 'Functional Analysis',
+ 9: 'Quantitative Methods',
+ 10: 'Mathematical Software',
+ 11: 'Computation',
+ 12: 'Chemical Physics',
+ 13: 'Information Theory',
+ 14: 'Classical Physics',
+ 15: 'Subcellular Processes',
+ 16: 'Medical Physics',
+ 17: 'Differential Geometry',
+ 18: 'Biomolecules',
+ 19: 'Metric Geometry',
+ 20: 'Cryptography and Security',
+ 21: 'Instrumentation and Methods for Astrophysics',
+ 22: 'General Mathematics',
+ 23: 'Computational Complexity',
+ 24: 'Soft Condensed Matter',
+ 25: 'Analysis of PDEs',
+ 26: 'Human-Computer Interaction',
+ 27: 'Classical Analysis and ODEs',
+ 28: 'Genomics',
+ 29: 'Optimization and Control',
+ 30: 'Applied Physics',
+ 31: 'Computational Engineering, Finance, and Science',
+ 32: 'Quantum Algebra',
+ 33: 'Other Condensed Matter',
+ 34: 'Category Theory',
+ 35: 'Popular Physics',
+ 36: 'General Topology',
+ 37: 'Algebraic Topology',
+ 38: 'Trading and Market Microstructure',
+ 39: 'Numerical Analysis',
+ 40: 'Applications',
+ 41: 'Group Theory',
+ 42: 'Cosmology and Nongalactic Astrophysics',
+ 43: 'Mathematical Physics',
+ 44: 'Econometrics',
+ 45: 'Systems and Control',
+ 46: 'Graphics',
+ 47: 'Data Structures and Algorithms',
+ 48: 'Operator Algebras',
+ 49: 'Number Theory',
+ 50: 'Robotics',
+ 51: 'Nuclear Theory',
+ 52: 'Neural and Evolutionary Computing',
+ 53: 'Multimedia',
+ 54: 'Information Retrieval',
+ 55: 'Image and Video Processing',
+ 56: 'Rings and Algebras',
+ 57: 'Instrumentation and Detectors',
+ 58: 'Social and Information Networks',
+ 59: 'High Energy Physics - Lattice',
+ 60: 'Emerging Technologies',
+ 61: 'Strongly Correlated Electrons',
+ 62: 'Representation Theory',
+ 63: 'Space Physics',
+ 64: 'Risk Management',
+ 65: 'Disordered Systems and Neural Networks',
+ 66: 'Databases',
+ 67: 'Networking and Internet Architecture',
+ 68: 'Computers and Society',
+ 69: 'Hardware Architecture',
+ 70: 'Chaotic Dynamics',
+ 71: 'Mesoscale and Nanoscale Physics',
+ 72: 'Computational Geometry',
+ 73: 'Commutative Algebra',
+ 74: 'Statistics Theory',
+ 75: 'General Literature',
+ 76: 'Physics and Society',
+ 77: 'Geophysics',
+ 78: 'Economics',
+ 79: 'Quantum Physics',
+ 80: 'Symbolic Computation',
+ 81: 'Computational Physics',
+ 82: 'Sound',
+ 83: 'Multiagent Systems',
+ 84: 'Signal Processing',
+ 85: 'Adaptation and Self-Organizing Systems',
+ 86: 'Other Computer Science',
+ 87: 'Other Quantitative Biology',
+ 88: 'Formal Languages and Automata Theory',
+ 89: 'Populations and Evolution',
+ 90: 'Spectral Theory',
+ 91: 'Pattern Formation and Solitons',
+ 92: 'Methodology',
+ 93: 'Biological Physics',
+ 94: 'General Physics',
+ 95: 'Logic in Computer Science',
+ 96: 'Complex Variables',
+ 97: 'Optics',
+ 98: 'Discrete Mathematics',
+ 99: 'History and Overview',
+ 100: 'Programming Languages',
+ 101: 'Audio and Speech Processing',
+ 102: 'Algebraic Geometry',
+ 103: 'Neurons and Cognition',
+ 104: 'High Energy Physics - Phenomenology',
+ 105: 'History and Philosophy of Physics',
+ 106: 'Earth and Planetary Astrophysics',
+ 107: 'Pricing of Securities',
+ 108: 'Distributed, Parallel, and Cluster Computing',
+ 109: 'Tissues and Organs',
+ 110: 'Cellular Automata and Lattice Gases',
+ 111: 'Statistical Finance',
+ 112: 'Materials Science',
+ 113: 'High Energy Physics - Theory',
+ 114: 'Digital Libraries',
+ 115: 'Other Statistics',
+ 116: 'Superconductivity',
+ 117: 'Cell Behavior',
+ 118: 'General Relativity and Quantum Cosmology',
+ 119: 'Dynamical Systems',
+ 120: 'Statistical Mechanics',
+ 121: 'Fluid Dynamics',
+ 122: 'Computer Science and Game Theory',
+ 123: 'Logic',
+ 124: 'Computer Vision and Pattern Recognition',
+ 125: 'Solar and Stellar Astrophysics',
+ 126: 'High Energy Physics - Experiment',
+ 127: 'Software Engineering',
+ 128: 'Combinatorics',
+ 129: 'Data Analysis, Statistics and Probability',
+ 130: 'Machine Learning',
+ 131: 'Probability',
+ 132: 'Atmospheric and Oceanic Physics',
+ 133: 'Geometric Topology',
+ 134: 'Computation and Language',
+ 135: 'Quantum Gases',
+ 136: 'Nuclear Experiment',
+ 137: 'Artificial Intelligence'}
 # Загружаем модель (замените на вашу модель, если нужно)
+model_name = ''
 try:
+    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
+    model = AutoModelForSequenceClassification.from_pretrained(
+        model_name,
+        num_labels=len(id_to_cat),
+        problem_type="multi_label_classification"
+    )
 except OSError as e:
     st.error(f"Ошибка загрузки модели: {e}. Убедитесь, что модель доступна или укажите другую.")
     st.stop()  # Остановка выполнения приложения при ошибке
+def classify_text(title, description, show_all=False, threshold=0.95):
     """
     Классифицирует текст и возвращает результаты в отсортированном виде.
     Args:
         title (str): Заголовок текста.
         description (str): Краткое описание текста.
         show_all (bool): Показывать ли все результаты, независимо от порога.
         threshold (float): Порог суммарной вероятности.
         list: Отсортированный список результатов классификации.
     """
     text = f"{title} {description}"  # Объединяем заголовок и описание
+    topic_classifier = pipeline("text-classification", model=model, tokenizer=tokenizer, top_k = len(id_to_cat))
     try:
         results = topic_classifier(text)
         # results = topic_classifier(text, candidate_labels, multi_label=True)  # multi_label=True для нескольких меток
     except Exception as e:
         st.error(f"Ошибка классификации: {e}")
         return []
+    for i in results[0]:
+        i['label'] = id_to_category[int(i['label'].split('_')[1])]
     if show_all:
+        filtered_results = []
+        for i in results[0]:
+            filtered_results.append((i['label'], i['score']))
+        return filtered_results
     else:
         cumulative_prob = 0
         filtered_results = []
+        for i in results[0]:
+            filtered_results.append((i['label'], i['score']))
             cumulative_prob += score
             if cumulative_prob >= threshold:
                 break
 title = st.text_input("Заголовок статьи")
 description = st.text_area("Краткое описание статьи", height=150)
 # Кнопка "Классифицировать"
 if st.button("Классифицировать"):
+    if not title or not description:
+        st.warning("Пожалуйста, заполните хотя бы одно поле.")
     else:
         with st.spinner("Идет классификация..."):  # Индикатор загрузки
+            results = classify_text(title, description)
             if results:
               st.subheader("Результаты классификации (с ограничением по вероятности):")
               for label, score in results:
             else:
                 st.info("Не удалось получить результаты классификации.")
+elif title or description: #небольшой костыль, чтобы при старте не было предупреждения
     st.warning("Пожалуйста, заполните все поля.")