Spaces:

Rulga
/

status-law-gbot

Running

App Files Files Community

Rulga committed on Mar 26

Commit

da8386d

1 Parent(s): 8f0f58a

Add ChatAnalyzer class for analyzing chat history and extracting insights

Browse files

Files changed (3) hide show

src/analytics/__init__.py +1 -0
src/analytics/chat_analyzer.py +190 -0
web/training_interface.py +63 -103

src/analytics/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ # Empty init file

src/analytics/chat_analyzer.py ADDED Viewed

	@@ -0,0 +1,190 @@

import json
import logging
import re
from collections import Counter
from datetime import datetime
from typing import Any, Dict, Iterator, List, Optional, Tuple

logger = logging.getLogger(__name__)


class ChatAnalyzer:
    """Chat history analyzer.

    Produces a summary report, question/answer pairs, frequency statistics,
    simple satisfaction metrics and a JSONL training export from stored
    conversations.

    A conversation is read either as a dict holding a ``"messages"`` list or
    as a bare list of messages; each message is expected to be a dict with
    ``"role"`` and ``"content"`` keys.
    NOTE(review): this schema is inferred from how the methods below index the
    data - confirm against ``DatasetManager.load_chat_history``.
    """

    # Stored lower-case on purpose: answers are lower-cased before matching,
    # so an indicator containing an upper-case letter could never match.
    _FAILURE_INDICATORS = (
        "don't know",
        "cannot answer",
        "unable to answer",
        "i don't have information",
        "no data available",
    )

    # Compiled once; used to normalise questions before counting them.
    _PUNCTUATION = re.compile(r'[^\w\s]')
    _WHITESPACE = re.compile(r'\s+')

    def __init__(self):
        # Optional in-memory conversations. Nothing in this class fills it;
        # when non-empty it is used instead of loading from DatasetManager.
        self.history = []

    # ------------------------------------------------------------------
    # Data access helpers
    # ------------------------------------------------------------------
    def get_chat_data(self) -> List[Dict[str, Any]]:
        """
        Return the conversations to analyse
        Returns:
            List of conversation dicts, each exposing a "messages" list.
            ``self.history`` is used when it is non-empty; otherwise the
            history is loaded through DatasetManager. An empty list is
            returned when loading fails or nothing is stored.
        """
        raw_history = self.history
        if not raw_history:
            try:
                # Imported lazily, as in analyze_chats, so that this module
                # can be imported without the knowledge-base package.
                from src.knowledge_base.dataset import DatasetManager
                success, raw_history = DatasetManager().load_chat_history()
            except Exception:
                # Analysis is best-effort: record the failure, report no data.
                logger.exception("Failed to load chat history")
                return []
            if not success or not raw_history:
                return []
        return [
            chat if isinstance(chat, dict) else {"messages": self._messages_of(chat)}
            for chat in raw_history
        ]

    @staticmethod
    def _messages_of(chat: Any) -> List[Any]:
        """Return the message list of a conversation given as a dict or a list."""
        messages = chat.get("messages", []) if isinstance(chat, dict) else chat
        if isinstance(messages, (list, tuple)):
            return list(messages)
        return []

    @staticmethod
    def _text(message: Dict[str, Any]) -> str:
        """Return the stripped "content" of a message, or "" when it is not a string."""
        content = message.get("content")
        return content.strip() if isinstance(content, str) else ""

    def _iter_qa_turns(self, chat_data: List[Dict[str, Any]]) -> Iterator[Tuple[str, str]]:
        """Yield (question, answer) for every user message directly followed by an assistant message."""
        for chat in chat_data:
            messages = self._messages_of(chat)
            for current, following in zip(messages, messages[1:]):
                if not (isinstance(current, dict) and isinstance(following, dict)):
                    continue
                if current.get("role") == "user" and following.get("role") == "assistant":
                    yield self._text(current), self._text(following)

    @staticmethod
    def _to_epoch_seconds(value: Any) -> Optional[float]:
        """Convert a numeric or ISO-8601 timestamp to seconds; None when not convertible."""
        if isinstance(value, bool):
            return None
        if isinstance(value, (int, float)):
            return float(value)
        if isinstance(value, str):
            try:
                return datetime.fromisoformat(value).timestamp()
            except ValueError:
                return None
        return None

    def _conversation_duration(self, messages: List[Any]) -> float:
        """Return the seconds between the earliest and latest message timestamp, or 0.0."""
        # NOTE(review): assumes messages may carry a "timestamp" field (epoch
        # seconds or ISO-8601 text) - confirm against the stored format.
        stamps = [
            stamp
            for stamp in (
                self._to_epoch_seconds(message.get("timestamp"))
                for message in messages
                if isinstance(message, dict)
            )
            if stamp is not None
        ]
        if len(stamps) < 2:
            return 0.0
        return max(stamps) - min(stamps)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def analyze_chats(self) -> str:
        """
        Analyzes chat history and returns a report
        Returns:
            Markdown report with conversation and message counts, or a
            plain-text message when the history is missing or an error occurs.
        """
        try:
            from src.knowledge_base.dataset import DatasetManager
            dataset = DatasetManager()
            success, history = dataset.load_chat_history()
            if not success:
                return "Failed to load chat history"
            if not history:
                return "No chat history available for analysis"
            # Basic analysis
            total_chats = len(history)
            # Count messages, not dict keys: len() of a conversation dict
            # would return the number of its fields.
            total_messages = sum(len(self._messages_of(chat)) for chat in history)
            avg_messages = total_messages / total_chats if total_chats > 0 else 0
            report = f"""
### Chat Analysis Report
- Total conversations: {total_chats}
- Total messages: {total_messages}
- Average messages per conversation: {avg_messages:.1f}
            """
            return report
        except Exception as e:
            # UI-facing boundary: report the problem as text instead of raising.
            return f"Error during analysis: {str(e)}"

    def extract_question_answer_pairs(self, min_question_length: int = 10) -> List[Dict[str, str]]:
        """
        Extract question-answer pairs from chat history
        Args:
            min_question_length: Minimum question length to include in the sample
        Returns:
            List of question-answer pairs in format [{"question": "...", "answer": "..."}]
        """
        return [
            {"question": question, "answer": answer}
            for question, answer in self._iter_qa_turns(self.get_chat_data())
            # Drop very short questions and turns with an empty answer.
            if len(question) >= min_question_length and answer
        ]

    def analyze_common_questions(self, top_n: int = 10) -> List[Tuple[str, int]]:
        """
        Analysis of most frequently asked questions
        Args:
            top_n: Number of most popular questions to return
        Returns:
            List of tuples (question, count); questions are lower-cased with
            punctuation removed and whitespace collapsed
        """
        normalized_questions = []
        for pair in self.extract_question_answer_pairs():
            # Lower-case, replace punctuation by spaces, collapse whitespace so
            # that differently punctuated variants are counted together.
            text = self._PUNCTUATION.sub(' ', pair["question"].lower())
            normalized_questions.append(self._WHITESPACE.sub(' ', text).strip())
        return Counter(normalized_questions).most_common(top_n)

    def analyze_user_satisfaction(self) -> Dict[str, Any]:
        """
        Analysis of user satisfaction based on chat history
        Returns:
            Dictionary with satisfaction metrics
        """
        chat_data = self.get_chat_data()
        # Initialize metrics
        metrics = {
            "total_conversations": len(chat_data),
            "avg_messages_per_conversation": 0,
            "avg_conversation_duration": 0,  # in seconds
            "follow_up_questions_rate": 0,   # percentage of dialogs with follow-up questions
        }
        if not chat_data:
            return metrics
        total_messages = 0
        conversations_with_followups = 0
        total_duration = 0.0
        for chat in chat_data:
            messages = self._messages_of(chat)
            total_messages += len(messages)
            user_turns = sum(
                1 for message in messages
                if isinstance(message, dict) and message.get("role") == "user"
            )
            # A second user message counts as a follow-up question.
            if user_turns > 1:
                conversations_with_followups += 1
            total_duration += self._conversation_duration(messages)
        # Calculate averages
        metrics["avg_messages_per_conversation"] = total_messages / len(chat_data)
        metrics["follow_up_questions_rate"] = conversations_with_followups / len(chat_data) * 100
        # Calculate average duration if data exists
        if total_duration > 0:
            metrics["avg_conversation_duration"] = total_duration / len(chat_data)
        return metrics

    def extract_failed_questions(self) -> List[str]:
        """
        Extract questions that the bot failed to answer satisfactorily
        Returns:
            List of questions that need improvement
        """
        return [
            question
            for question, answer in self._iter_qa_turns(self.get_chat_data())
            # Case-insensitive substring match against the failure phrases.
            if any(indicator in answer.lower() for indicator in self._FAILURE_INDICATORS)
        ]

    def export_training_data(self, output_file: str) -> Tuple[bool, str]:
        """
        Export training data in JSONL format
        Args:
            output_file: Path to output file
        Returns:
            (success, message)
        """
        try:
            qa_pairs = self.extract_question_answer_pairs()
            if not qa_pairs:
                return False, "Not enough data for export"
            with open(output_file, "w", encoding="utf-8") as f:
                for pair in qa_pairs:
                    training_example = {
                        "messages": [
                            {"role": "user", "content": pair["question"]},
                            {"role": "assistant", "content": pair["answer"]}
                        ]
                    }
                    # One JSON object per line; keep non-ASCII text readable.
                    f.write(json.dumps(training_example, ensure_ascii=False) + "\n")
            return True, f"Training data successfully exported to {output_file}. Exported {len(qa_pairs)} examples."
        except Exception as e:
            # Callers expect a (success, message) tuple rather than an exception.
            return False, f"Error exporting training data: {str(e)}"

web/training_interface.py CHANGED Viewed

@@ -1,5 +1,5 @@
 """
-Веб-интерфейс для управления моделями и запуска дообучения
 """
 import os
@@ -14,93 +14,57 @@ from src.training.fine_tuner import FineTuner, finetune_from_chat_history
 from src.training.model_manager import ModelManager
 from config.settings import MODEL_PATH, TRAINING_OUTPUT_DIR
-# Инициализация менеджеров
 model_manager = ModelManager()
 chat_analyzer = ChatAnalyzer()
 def get_models_df():
     """
-    Получение датафрейма с моделями из реестра
     Returns:
-        pandas.DataFrame: Датафрейм с моделями
     """
     models = model_manager.list_models()
     if not models:
         return pd.DataFrame(columns=["model_id", "version", "description", "is_active", "registration_date"])
-    # Создаем датафрейм
     df = pd.DataFrame(models)
-    # Выбираем нужные колонки
     columns = ["model_id", "version", "description", "is_active", "registration_date"]
     df = df[columns]
-    # Сортируем по model_id и registration_date
     df = df.sort_values(by=["model_id", "registration_date"], ascending=[True, False])
     return df
 def generate_chat_analysis():
-    """
-    Генерация аналитического отчета по истории чатов
-    Returns:
-        str: HTML-отчет
-    """
-    report = chat_analyzer.generate_analytics_report()
-    if not report or report.get("total_conversations", 0) == 0:
-        return "### Нет данных для анализа\nИстория чатов пуста или не может быть загружена."
-    # Формируем HTML-отчет
-    html = f"""
-    ### Аналитический отчет по истории чатов
-    #### Основные метрики
-    - **Всего диалогов:** {report['total_conversations']}
-    - **Пар вопрос-ответ для обучения:** {report['qa_pairs_count']}
-    - **Вопросы без ответов:** {report['failed_questions_count']}
-    #### Метрики удовлетворенности
-    - **Среднее число сообщений в диалоге:** {report['satisfaction_metrics']['avg_messages_per_conversation']:.2f}
-    - **Процент диалогов с дополнительными вопросами:** {report['satisfaction_metrics']['follow_up_questions_rate']:.2f}%
-    """
-    # Популярные вопросы
-    if report.get('common_questions'):
-        html += "\n\n#### Популярные вопросы\n"
-        for i, (question, count) in enumerate(report['common_questions'][:10], 1):
-            html += f"{i}. \"{question}\" ({count} раз)\n"
-    # Вопросы без ответов
-    if report.get('failed_questions'):
-        html += "\n\n#### Примеры вопросов без ответов\n"
-        for i, question in enumerate(report['failed_questions'][:5], 1):
-            html += f"{i}. \"{question}\"\n"
-    return html
 def register_model_action(model_id, version, source, description, set_active):
     """
-    Действие регистрации модели
     Args:
-        model_id: Идентификатор модели
-        version: Версия модели
-        source: Источник модели
-        description: Описание модели
-        set_active: Установить как активную
     Returns:
-        str: Результат операции
     """
-    # Проверка входных данных
     if not model_id or not version or not source:
-        return "Ошибка: все поля обязательны для заполнения"
-    # Регистрация модели
     success, message = model_manager.register_model(
         model_id=model_id,
         version=version,
@@ -110,40 +74,40 @@ def register_model_action(model_id, version, source, description, set_active):
     )
     if not success:
-        return f"Ошибка: {message}"
-    # Если установлена опция загрузки модели, загружаем её
     if source.startswith("hf://"):
         success, download_message = model_manager.download_model(model_id, version)
         if not success:
-            return f"Модель зарегистрирована, но не загружена: {download_message}"
         message += f"\n{download_message}"
     return message
 def import_local_model_action(source_path, model_id, version, description, set_active):
     """
-    Действие импорта локальной модели
     Args:
-        source_path: Путь к директории с моделью
-        model_id: Идентификатор модели
-        version: Версия модели
-        description: Описание модели
-        set_active: Установить как активную
     Returns:
-        str: Результат операции
     """
-    # Проверка входных данных
     if not source_path or not model_id or not version:
-        return "Ошибка: все поля обязательны для заполнения"
-    # Проверка существования директории
     if not os.path.exists(source_path):
-        return f"Ошибка: директория {source_path} не существует"
-    # Импорт модели
     success, message = model_manager.import_local_model(
         source_path=source_path,
         model_id=model_id,
@@ -156,68 +120,64 @@ def import_local_model_action(source_path, model_id, version, description, set_a
 def set_active_model_action(model_row_index, models_df):
     """
-    Действие установки активной модели
     Args:
-        model_row_index: Индекс строки модели в датафрейме
-        models_df: Датафрейм с моделями
     Returns:
-        str: Результат операции
     """
     try:
-        # Получаем информацию о выбранной модели
         model_row = models_df.iloc[model_row_index]
         model_id = model_row["model_id"]
         version = model_row["version"]
-        # Устанавливаем как активную
         success, message = model_manager.set_active_model(model_id, version)
         return message
     except Exception as e:
-        return f"Ошибка: {str(e)}"
 def delete_model_action(model_row_index, models_df):
     """
-    Действие удаления модели
     Args:
-        model_row_index: Индекс строки модели в датафрейме
-        models_df: Датафрейм с моделями
     Returns:
-        str: Результат операции
     """
     try:
-        # Получаем информацию о выбранной модели
         model_row = models_df.iloc[model_row_index]
         model_id = model_row["model_id"]
         version = model_row["version"]
-        # Удаляем модель
         success, message = model_manager.delete_model(model_id, version)
         return message
     except Exception as e:
-        return f"Ошибка: {str(e)}"
-def start_finetune_action(
-    epochs,
-    batch_size,
-    learning_rate,
-    base_model_id,
-    new_model_id,
-    new_version,
-    description,
-    set_active
-):
-    """
-    Действие запуска дообучения модели
-    Args:
-        epochs: Количество эпох обучения
-        batch_size: Размер батча
-        learning_rate: Скорость обучения
-        base_model_id: Ид
-        """

 """
+Web interface for model management and training
 """
 import os
 from src.training.model_manager import ModelManager
 from config.settings import MODEL_PATH, TRAINING_OUTPUT_DIR
+# Initialize managers
 model_manager = ModelManager()
 chat_analyzer = ChatAnalyzer()
 def get_models_df():
     """
     Build a table of the models reported by the registry.

     Returns:
         pandas.DataFrame: One row per entry of ``model_manager.list_models()``,
         limited to the columns listed below and ordered by ``model_id``
         ascending, then ``registration_date`` descending. When the registry
         reports no models, an empty frame with the same columns is returned.
     """
     wanted_columns = ["model_id", "version", "description", "is_active", "registration_date"]
     registry_entries = model_manager.list_models()
     # Nothing registered: hand back an empty frame that still has the columns.
     if not registry_entries:
         return pd.DataFrame(columns=wanted_columns)
     table = pd.DataFrame(registry_entries)[wanted_columns]
     return table.sort_values(by=["model_id", "registration_date"], ascending=[True, False])
 def generate_chat_analysis():
     """Return the chat-history report produced by the module-level ``chat_analyzer``."""
     analysis_report = chat_analyzer.analyze_chats()
     return analysis_report
 def register_model_action(model_id, version, source, description, set_active):
     """
+    Model registration action
     Args:
+        model_id: Model identifier
+        version: Model version
+        source: Model source
+        description: Model description
+        set_active: Set as active
     Returns:
+        str: Operation result
     """
+    # Input validation
     if not model_id or not version or not source:
+        return "Error: all fields are required"
+    # Register model
     success, message = model_manager.register_model(
         model_id=model_id,
         version=version,
     )
     if not success:
+        return f"Error: {message}"
+    # If model download option is set, download it
     if source.startswith("hf://"):
         success, download_message = model_manager.download_model(model_id, version)
         if not success:
+            return f"Model registered but not downloaded: {download_message}"
         message += f"\n{download_message}"
     return message
 def import_local_model_action(source_path, model_id, version, description, set_active):
     """
+    Local model import action
     Args:
+        source_path: Path to model directory
+        model_id: Model identifier
+        version: Model version
+        description: Model description
+        set_active: Set as active
     Returns:
+        str: Operation result
     """
+    # Input validation
     if not source_path or not model_id or not version:
+        return "Error: all fields are required"
+    # Check directory existence
     if not os.path.exists(source_path):
+        return f"Error: directory {source_path} does not exist"
+    # Import model
     success, message = model_manager.import_local_model(
         source_path=source_path,
         model_id=model_id,
 def set_active_model_action(model_row_index, models_df):
     """
     Mark the model shown in the selected table row as active.

     Args:
         model_row_index: Positional index of the row in ``models_df``
         models_df: Dataframe with models (read via ``iloc``; must provide
             the "model_id" and "version" columns)
     Returns:
         str: Message returned by ``model_manager.set_active_model``, or
         "Error: ..." when any step raises
     """
     try:
         # Resolve the row first, then pass its identifiers to the manager.
         selected = models_df.iloc[model_row_index]
         _, outcome_message = model_manager.set_active_model(
             selected["model_id"],
             selected["version"],
         )
     except Exception as exc:
         return f"Error: {str(exc)}"
     return outcome_message
 def delete_model_action(model_row_index, models_df):
     """
     Delete the model shown in the selected table row.

     Args:
         model_row_index: Positional index of the row in ``models_df``
         models_df: Dataframe with models (read via ``iloc``; must provide
             the "model_id" and "version" columns)
     Returns:
         str: Message returned by ``model_manager.delete_model``, or
         "Error: ..." when any step raises
     """
     try:
         # Resolve the row first, then pass its identifiers to the manager.
         selected = models_df.iloc[model_row_index]
         _, outcome_message = model_manager.delete_model(
             selected["model_id"],
             selected["version"],
         )
     except Exception as exc:
         return f"Error: {str(exc)}"
     return outcome_message
def start_finetune_action(epochs, batch_size, learning_rate):
    """
    Start model fine-tuning

    Args:
        epochs: Passed to ``FineTuner.train`` as ``num_train_epochs``
        batch_size: Passed as ``per_device_train_batch_size``
        learning_rate: Passed as ``learning_rate``
    Returns:
        str: "Training completed: ..." or "Training failed: ..." depending on
        the success flag returned by ``train``, or "Error starting training:
        ..." when importing, constructing or running the tuner raises
    """
    try:
        # Imported here so that an import failure is reported as text too.
        from src.training.fine_tuner import FineTuner
        training_options = {
            "num_train_epochs": epochs,
            "per_device_train_batch_size": batch_size,
            "learning_rate": learning_rate,
        }
        succeeded, details = FineTuner().train(**training_options)
        outcome = "completed" if succeeded else "failed"
        return f"Training {outcome}: {details}"
    except Exception as exc:
        return f"Error starting training: {str(exc)}"