Spaces:

Zalimannard
/

NLP_Homework_1

Sleeping

App Files Files Community

Kolesnikov Dmitry committed on Nov 15, 2025

Commit

6dcad4a

1 Parent(s): 753b589

fix: Ошибки на трёх страницах

Browse files

Files changed (1) hide show

src/streamlit_app.py +42 -7

src/streamlit_app.py CHANGED Viewed

@@ -497,7 +497,9 @@ def main():
                     # Проекция документов
                     coords = embed_2d(lsa["X_reduced"], method=proj_method)
                     proj_df = pd.DataFrame({"x": coords[:,0], "y": coords[:,1]})
-                    st.plotly_chart(px.scatter(proj_df, x="x", y="y", title=f"Проекция документов ({proj_method.upper()})"), use_container_width=True)
     # ======== Эмбеддинги (ЛР2: Word2Vec/FastText/Doc2Vec + эксперименты) ========
     with main_tabs[2]:
@@ -604,7 +606,8 @@ def main():
             )
             # Создание разметки (упрощенная версия - пользователь должен разметить данные заранее)
-            st.info("💡 Для полноценной работы требуется размеченный датасет. Здесь показана демонстрация на синтетических данных.")
             # Генерация синтетических меток для демонстрации
             if "labels" not in st.session_state or st.session_state.get("task_type") != task_type:
@@ -706,16 +709,48 @@ def main():
                     st.dataframe(st.session_state["classification_results"], use_container_width=True)
                     # Важность признаков
-                    if "vectorizer_classification" in st.session_state:
                         st.subheader("🔍 Важные слова")
                         vectorizer = st.session_state["vectorizer_classification"]
                         if "Logistic Regression" in selected_models:
                             # Создаем простую модель для демонстрации
                             from sklearn.linear_model import LogisticRegression
-                            model = LogisticRegression(max_iter=1000, random_state=42)
-                            model.fit(X_train, y_train)
-                            important_words = get_tfidf_important_words(vectorizer, model, class_idx=0, top_k=20)
-                            st.dataframe(important_words, use_container_width=True)
     # ======== Кластеризация (ЛР4) ========
     with main_tabs[4]:

                     # Проекция документов
                     coords = embed_2d(lsa["X_reduced"], method=proj_method)
                     proj_df = pd.DataFrame({"x": coords[:,0], "y": coords[:,1]})
+                    import plotly.express as px_plot
+                    fig = px_plot.scatter(proj_df, x="x", y="y", title=f"Проекция документов ({proj_method.upper()})")
+                    st.plotly_chart(fig, use_container_width=True)
     # ======== Эмбеддинги (ЛР2: Word2Vec/FastText/Doc2Vec + эксперименты) ========
     with main_tabs[2]:
             )
             # Создание разметки (упрощенная версия - пользователь должен разметить данные заранее)
+            with st.expander("ℹ️ Информация о данных", expanded=False):
+                st.info("💡 Для полноценной работы требуется размеченный датасет. Здесь показана демонстрация на синтетических данных, сгенерированных случайным образом.")
             # Генерация синтетических меток для демонстрации
             if "labels" not in st.session_state or st.session_state.get("task_type") != task_type:
                     st.dataframe(st.session_state["classification_results"], use_container_width=True)
                     # Важность признаков
+                    if "vectorizer_classification" in st.session_state and "X_classification" in st.session_state:
                         st.subheader("🔍 Важные слова")
                         vectorizer = st.session_state["vectorizer_classification"]
                         if "Logistic Regression" in selected_models:
                             # Создаем простую модель для демонстрации
                             from sklearn.linear_model import LogisticRegression
+                            from sklearn.multioutput import MultiOutputClassifier
+                            # Получаем данные из session_state
+                            X_full = st.session_state["X_classification"]
+                            y_full = labels[:len(processed_texts)]
+                            # Для multilabel используем MultiOutputClassifier
+                            if task_type == "Многометочная":
+                                # Проверяем, что y_full - это 2D массив
+                                if len(y_full.shape) == 1:
+                                    y_full = y_full.reshape(-1, 1)
+                                # Используем только часть данных для быстрой демонстрации
+                                X_demo = X_full[:min(100, len(X_full))]
+                                y_demo = y_full[:min(100, len(y_full))]
+                                model = MultiOutputClassifier(LogisticRegression(max_iter=1000, random_state=42))
+                            else:
+                                # Для бинарной и многоклассовой классификации используем обычную модель
+                                # Убеждаемся, что y_full - это 1D массив
+                                if len(y_full.shape) > 1:
+                                    y_full = y_full.flatten() if y_full.shape[1] == 1 else y_full.argmax(axis=1)
+                                # Используем только часть данных для быстрой демонстрации
+                                X_demo = X_full[:min(100, len(X_full))]
+                                y_demo = y_full[:min(100, len(y_full))]
+                                model = LogisticRegression(max_iter=1000, random_state=42)
+                            try:
+                                model.fit(X_demo, y_demo)
+                                # Для multilabel берем первый классификатор
+                                if task_type == "Многометочная":
+                                    base_model = model.estimators_[0] if hasattr(model, 'estimators_') and len(model.estimators_) > 0 else model
+                                else:
+                                    base_model = model
+                                important_words = get_tfidf_important_words(vectorizer, base_model, class_idx=0, top_k=20)
+                                st.dataframe(important_words, use_container_width=True)
+                            except Exception as e:
+                                st.warning(f"Не удалось показать важные слова: {e}")
     # ======== Кластеризация (ЛР4) ========
     with main_tabs[4]: