Rulga committed on
Commit
da8386d
·
1 Parent(s): 8f0f58a

Add ChatAnalyzer class for analyzing chat history and extracting insights

Browse files
src/analytics/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Empty init file
src/analytics/chat_analyzer.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import re
from collections import Counter
from datetime import datetime
from typing import Any, Dict, Iterator, List, Optional, Tuple

# Compiled once: used to normalise questions before counting duplicates.
_PUNCTUATION_RE = re.compile(r"[^\w\s]")
_WHITESPACE_RE = re.compile(r"\s+")


class ChatAnalyzer:
    """Chat history analyzer.

    Conversations are taken from ``self.history`` when a caller has populated
    it; otherwise they are loaded through ``DatasetManager.load_chat_history()``
    on every call (nothing is cached, so reports reflect the current dataset).

    A conversation is expected to be a dict with a ``"messages"`` list whose
    items are dicts carrying ``"role"`` and ``"content"``.
    NOTE(review): the exact schema returned by ``DatasetManager`` is not
    visible from this file - confirm against the dataset module.
    """

    # Lower-case phrases; answers are lower-cased before matching, so any
    # upper-case character here would make the phrase unmatchable.
    _FAILURE_INDICATORS = (
        "don't know",
        "cannot answer",
        "unable to answer",
        "i don't have information",
        "no data available",
    )

    def __init__(self):
        self.history = []

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _load_history(self) -> Tuple[bool, Any]:
        """Return ``(success, history)``, preferring a populated ``self.history``."""
        if self.history:
            return True, self.history
        # Kept as a function-scope import, as in the original code, so this
        # module stays importable without the dataset package being loaded.
        from src.knowledge_base.dataset import DatasetManager

        return DatasetManager().load_chat_history()

    @staticmethod
    def _messages_of(chat: Any) -> List[Any]:
        """Return the message list of one conversation.

        Dict conversations expose it under ``"messages"``. Anything else is
        treated as the message sequence itself, which is what the original
        ``len(chat)`` count in ``analyze_chats`` implied.
        """
        if isinstance(chat, dict):
            return chat.get("messages") or []
        return chat

    @staticmethod
    def _message_text(message: Dict[str, Any]) -> str:
        """Return the stripped text of a message ('' when content is missing/None)."""
        return str(message.get("content") or "").strip()

    def _iter_exchanges(self, chat_data: List[Any]) -> Iterator[Tuple[str, str]]:
        """Yield ``(question, answer)`` for each user message directly followed by an assistant message."""
        for chat in chat_data:
            messages = self._messages_of(chat)
            for current, following in zip(messages, messages[1:]):
                if current.get("role") == "user" and following.get("role") == "assistant":
                    yield self._message_text(current), self._message_text(following)

    @staticmethod
    def _parse_timestamp(value: Any) -> Optional[float]:
        """Convert a timestamp to epoch seconds, or ``None`` if it is unusable."""
        if isinstance(value, bool):
            return None
        if isinstance(value, (int, float)):
            return float(value)
        if isinstance(value, str):
            try:
                return datetime.fromisoformat(value).timestamp()
            except ValueError:
                return None
        return None

    def _conversation_duration(self, messages: List[Any]) -> float:
        """Return seconds between the earliest and latest message timestamp.

        Assumes messages may carry a ``"timestamp"`` field holding epoch
        seconds or an ISO-8601 string - TODO confirm against the stored chat
        schema. Conversations with fewer than two usable timestamps yield 0.
        """
        stamps = []
        for message in messages:
            if not isinstance(message, dict):
                continue
            parsed = self._parse_timestamp(message.get("timestamp"))
            if parsed is not None:
                stamps.append(parsed)
        if len(stamps) < 2:
            return 0.0
        return max(stamps) - min(stamps)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def get_chat_data(self) -> List[Any]:
        """
        Return the conversations to analyse

        Returns:
            List of conversations; empty when loading failed or nothing is stored
        """
        success, history = self._load_history()
        if not success or not history:
            return []
        return history

    def analyze_chats(self) -> str:
        """
        Analyzes chat history and returns a report

        Returns:
            Markdown report, or a plain message when history is missing,
            cannot be loaded, or an error occurs
        """
        try:
            success, history = self._load_history()

            if not success:
                return "Failed to load chat history"

            if not history:
                return "No chat history available for analysis"

            total_chats = len(history)
            # Count messages, not dict keys, for dict-shaped conversations.
            total_messages = sum(len(self._messages_of(chat)) for chat in history)
            avg_messages = total_messages / total_chats

            # Built line by line so the Markdown is not indented by the
            # source layout (indented lines would render as a code block).
            return "\n".join([
                "",
                "### Chat Analysis Report",
                "",
                f"- Total conversations: {total_chats}",
                f"- Total messages: {total_messages}",
                f"- Average messages per conversation: {avg_messages:.1f}",
                "",
            ])

        except Exception as e:
            # The result is returned as display text by the caller in this
            # file, so errors are reported as text rather than raised.
            return f"Error during analysis: {str(e)}"

    def extract_question_answer_pairs(self, min_question_length: int = 10) -> List[Dict[str, str]]:
        """
        Extract question-answer pairs from chat history

        Args:
            min_question_length: Minimum question length to include in the sample

        Returns:
            List of question-answer pairs in format [{"question": "...", "answer": "..."}]
        """
        return [
            {"question": question, "answer": answer}
            for question, answer in self._iter_exchanges(self.get_chat_data())
            if len(question) >= min_question_length and answer
        ]

    def analyze_common_questions(self, top_n: int = 10) -> List[Tuple[str, int]]:
        """
        Analysis of most frequently asked questions

        Questions are lower-cased, punctuation is replaced by spaces and runs
        of whitespace are collapsed before counting, so trivially different
        spellings are grouped together.

        Args:
            top_n: Number of most popular questions to return

        Returns:
            List of tuples (question, count)
        """
        normalized = (
            _WHITESPACE_RE.sub(" ", _PUNCTUATION_RE.sub(" ", pair["question"].lower())).strip()
            for pair in self.extract_question_answer_pairs()
        )
        return Counter(normalized).most_common(top_n)

    def analyze_user_satisfaction(self) -> Dict[str, Any]:
        """
        Analysis of user satisfaction based on chat history

        Returns:
            Dictionary with satisfaction metrics:
            total_conversations, avg_messages_per_conversation,
            avg_conversation_duration (seconds, 0 when no timestamps are
            available) and follow_up_questions_rate (percentage of
            conversations containing more than one user message)
        """
        chat_data = self.get_chat_data()

        metrics = {
            "total_conversations": len(chat_data),
            "avg_messages_per_conversation": 0,
            "avg_conversation_duration": 0,  # in seconds
            "follow_up_questions_rate": 0,  # percentage of dialogs with follow-up questions
        }

        if not chat_data:
            return metrics

        total_messages = 0
        conversations_with_followups = 0
        total_duration = 0.0

        for chat in chat_data:
            messages = self._messages_of(chat)
            total_messages += len(messages)

            # A second user message means the user came back with a follow-up.
            user_turns = sum(
                1 for m in messages
                if isinstance(m, dict) and m.get("role") == "user"
            )
            if user_turns > 1:
                conversations_with_followups += 1

            total_duration += self._conversation_duration(messages)

        conversation_count = len(chat_data)
        metrics["avg_messages_per_conversation"] = total_messages / conversation_count
        metrics["follow_up_questions_rate"] = conversations_with_followups / conversation_count * 100

        # Calculate average duration if data exists
        if total_duration > 0:
            metrics["avg_conversation_duration"] = total_duration / conversation_count

        return metrics

    def extract_failed_questions(self) -> List[str]:
        """
        Extract questions that the bot failed to answer satisfactorily

        An answer counts as a failure when, compared case-insensitively, it
        contains one of the phrases in ``_FAILURE_INDICATORS``.

        Returns:
            List of questions that need improvement
        """
        failed_questions = []
        for question, answer in self._iter_exchanges(self.get_chat_data()):
            lowered = answer.lower()
            if any(indicator in lowered for indicator in self._FAILURE_INDICATORS):
                failed_questions.append(question)
        return failed_questions

    def export_training_data(self, output_file: str) -> Tuple[bool, str]:
        """
        Export training data in JSONL format

        Each line is one JSON object of the form
        {"messages": [{"role": "user", ...}, {"role": "assistant", ...}]}.

        Args:
            output_file: Path to output file

        Returns:
            (success, message)
        """
        try:
            qa_pairs = self.extract_question_answer_pairs()

            if not qa_pairs:
                return False, "Not enough data for export"

            with open(output_file, "w", encoding="utf-8") as f:
                for pair in qa_pairs:
                    training_example = {
                        "messages": [
                            {"role": "user", "content": pair["question"]},
                            {"role": "assistant", "content": pair["answer"]},
                        ]
                    }
                    f.write(json.dumps(training_example, ensure_ascii=False) + "\n")

            return True, f"Training data successfully exported to {output_file}. Exported {len(qa_pairs)} examples."
        except Exception as e:
            # Callers receive a (success, message) tuple instead of an exception.
            return False, f"Error exporting training data: {str(e)}"
web/training_interface.py CHANGED
@@ -1,5 +1,5 @@
1
  """
2
- Веб-интерфейс для управления моделями и запуска дообучения
3
  """
4
 
5
  import os
@@ -14,93 +14,57 @@ from src.training.fine_tuner import FineTuner, finetune_from_chat_history
14
  from src.training.model_manager import ModelManager
15
  from config.settings import MODEL_PATH, TRAINING_OUTPUT_DIR
16
 
17
- # Инициализация менеджеров
18
  model_manager = ModelManager()
19
  chat_analyzer = ChatAnalyzer()
20
 
21
  def get_models_df():
22
  """
23
- Получение датафрейма с моделями из реестра
24
 
25
  Returns:
26
- pandas.DataFrame: Датафрейм с моделями
27
  """
28
  models = model_manager.list_models()
29
 
30
  if not models:
31
  return pd.DataFrame(columns=["model_id", "version", "description", "is_active", "registration_date"])
32
 
33
- # Создаем датафрейм
34
  df = pd.DataFrame(models)
35
 
36
- # Выбираем нужные колонки
37
  columns = ["model_id", "version", "description", "is_active", "registration_date"]
38
  df = df[columns]
39
 
40
- # Сортируем по model_id и registration_date
41
  df = df.sort_values(by=["model_id", "registration_date"], ascending=[True, False])
42
 
43
  return df
44
 
45
  def generate_chat_analysis():
46
- """
47
- Генерация аналитического отчета по истории чатов
48
-
49
- Returns:
50
- str: HTML-отчет
51
- """
52
- report = chat_analyzer.generate_analytics_report()
53
-
54
- if not report or report.get("total_conversations", 0) == 0:
55
- return "### Нет данных для анализа\nИстория чатов пуста или не может быть загружена."
56
-
57
- # Формируем HTML-отчет
58
- html = f"""
59
- ### Аналитический отчет по истории чатов
60
-
61
- #### Основные метрики
62
- - **Всего диалогов:** {report['total_conversations']}
63
- - **Пар вопрос-ответ для обучения:** {report['qa_pairs_count']}
64
- - **Вопросы без ответов:** {report['failed_questions_count']}
65
-
66
- #### Метрики удовлетворенности
67
- - **Среднее число сообщений в диалоге:** {report['satisfaction_metrics']['avg_messages_per_conversation']:.2f}
68
- - **Процент диалогов с дополнительными вопросами:** {report['satisfaction_metrics']['follow_up_questions_rate']:.2f}%
69
- """
70
-
71
- # Популярные вопросы
72
- if report.get('common_questions'):
73
- html += "\n\n#### Популярные вопросы\n"
74
- for i, (question, count) in enumerate(report['common_questions'][:10], 1):
75
- html += f"{i}. \"{question}\" ({count} раз)\n"
76
-
77
- # Вопросы без ответов
78
- if report.get('failed_questions'):
79
- html += "\n\n#### Примеры вопросов без ответов\n"
80
- for i, question in enumerate(report['failed_questions'][:5], 1):
81
- html += f"{i}. \"{question}\"\n"
82
-
83
- return html
84
 
85
  def register_model_action(model_id, version, source, description, set_active):
86
  """
87
- Действие регистрации модели
88
 
89
  Args:
90
- model_id: Идентификатор модели
91
- version: Версия модели
92
- source: Источник модели
93
- description: Описание модели
94
- set_active: Установить как активную
95
 
96
  Returns:
97
- str: Результат операции
98
  """
99
- # Проверка входных данных
100
  if not model_id or not version or not source:
101
- return "Ошибка: все поля обязательны для заполнения"
102
 
103
- # Регистрация модели
104
  success, message = model_manager.register_model(
105
  model_id=model_id,
106
  version=version,
@@ -110,40 +74,40 @@ def register_model_action(model_id, version, source, description, set_active):
110
  )
111
 
112
  if not success:
113
- return f"Ошибка: {message}"
114
 
115
- # Если установлена опция загрузки модели, загружаем её
116
  if source.startswith("hf://"):
117
  success, download_message = model_manager.download_model(model_id, version)
118
  if not success:
119
- return f"Модель зарегистрирована, но не загружена: {download_message}"
120
  message += f"\n{download_message}"
121
 
122
  return message
123
 
124
  def import_local_model_action(source_path, model_id, version, description, set_active):
125
  """
126
- Действие импорта локальной модели
127
 
128
  Args:
129
- source_path: Путь к директории с моделью
130
- model_id: Идентификатор модели
131
- version: Версия модели
132
- description: Описание модели
133
- set_active: Установить как активную
134
 
135
  Returns:
136
- str: Результат операции
137
  """
138
- # Проверка входных данных
139
  if not source_path or not model_id or not version:
140
- return "Ошибка: все поля обязательны для заполнения"
141
 
142
- # Проверка существования директории
143
  if not os.path.exists(source_path):
144
- return f"Ошибка: директория {source_path} не существует"
145
 
146
- # Импорт модели
147
  success, message = model_manager.import_local_model(
148
  source_path=source_path,
149
  model_id=model_id,
@@ -156,68 +120,64 @@ def import_local_model_action(source_path, model_id, version, description, set_a
156
 
157
  def set_active_model_action(model_row_index, models_df):
158
  """
159
- Действие установки активной модели
160
 
161
  Args:
162
- model_row_index: Индекс строки модели в датафрейме
163
- models_df: Датафрейм с моделями
164
 
165
  Returns:
166
- str: Результат операции
167
  """
168
  try:
169
- # Получаем информацию о выбранной модели
170
  model_row = models_df.iloc[model_row_index]
171
  model_id = model_row["model_id"]
172
  version = model_row["version"]
173
 
174
- # Устанавливаем как активную
175
  success, message = model_manager.set_active_model(model_id, version)
176
 
177
  return message
178
  except Exception as e:
179
- return f"Ошибка: {str(e)}"
180
 
181
  def delete_model_action(model_row_index, models_df):
182
  """
183
- Действие удаления модели
184
 
185
  Args:
186
- model_row_index: Индекс строки модели в датафрейме
187
- models_df: Датафрейм с моделями
188
 
189
  Returns:
190
- str: Результат операции
191
  """
192
  try:
193
- # Получаем информацию о выбранной модели
194
  model_row = models_df.iloc[model_row_index]
195
  model_id = model_row["model_id"]
196
  version = model_row["version"]
197
 
198
- # Удаляем модель
199
  success, message = model_manager.delete_model(model_id, version)
200
 
201
  return message
202
  except Exception as e:
203
- return f"Ошибка: {str(e)}"
204
 
205
- def start_finetune_action(
206
- epochs,
207
- batch_size,
208
- learning_rate,
209
- base_model_id,
210
- new_model_id,
211
- new_version,
212
- description,
213
- set_active
214
- ):
215
- """
216
- Действие запуска дообучения модели
217
-
218
- Args:
219
- epochs: Количество эпох обучения
220
- batch_size: Размер батча
221
- learning_rate: Скорость обучения
222
- base_model_id: Ид
223
- """
 
1
  """
2
+ Web interface for model management and training
3
  """
4
 
5
  import os
 
14
  from src.training.model_manager import ModelManager
15
  from config.settings import MODEL_PATH, TRAINING_OUTPUT_DIR
16
 
17
+ # Initialize managers
18
  model_manager = ModelManager()
19
  chat_analyzer = ChatAnalyzer()
20
 
21
  def get_models_df():
22
  """
23
+ Get dataframe with models from registry
24
 
25
  Returns:
26
+ pandas.DataFrame: Dataframe with models
27
  """
28
  models = model_manager.list_models()
29
 
30
  if not models:
31
  return pd.DataFrame(columns=["model_id", "version", "description", "is_active", "registration_date"])
32
 
33
+ # Create dataframe
34
  df = pd.DataFrame(models)
35
 
36
+ # Select required columns
37
  columns = ["model_id", "version", "description", "is_active", "registration_date"]
38
  df = df[columns]
39
 
40
+ # Sort by model_id and registration_date
41
  df = df.sort_values(by=["model_id", "registration_date"], ascending=[True, False])
42
 
43
  return df
44
 
45
  def generate_chat_analysis():
46
+ """Generate analysis of chat history"""
47
+ return chat_analyzer.analyze_chats()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
49
  def register_model_action(model_id, version, source, description, set_active):
50
  """
51
+ Model registration action
52
 
53
  Args:
54
+ model_id: Model identifier
55
+ version: Model version
56
+ source: Model source
57
+ description: Model description
58
+ set_active: Set as active
59
 
60
  Returns:
61
+ str: Operation result
62
  """
63
+ # Input validation
64
  if not model_id or not version or not source:
65
+ return "Error: all fields are required"
66
 
67
+ # Register model
68
  success, message = model_manager.register_model(
69
  model_id=model_id,
70
  version=version,
 
74
  )
75
 
76
  if not success:
77
+ return f"Error: {message}"
78
 
79
+ # If model download option is set, download it
80
  if source.startswith("hf://"):
81
  success, download_message = model_manager.download_model(model_id, version)
82
  if not success:
83
+ return f"Model registered but not downloaded: {download_message}"
84
  message += f"\n{download_message}"
85
 
86
  return message
87
 
88
  def import_local_model_action(source_path, model_id, version, description, set_active):
89
  """
90
+ Local model import action
91
 
92
  Args:
93
+ source_path: Path to model directory
94
+ model_id: Model identifier
95
+ version: Model version
96
+ description: Model description
97
+ set_active: Set as active
98
 
99
  Returns:
100
+ str: Operation result
101
  """
102
+ # Input validation
103
  if not source_path or not model_id or not version:
104
+ return "Error: all fields are required"
105
 
106
+ # Check directory existence
107
  if not os.path.exists(source_path):
108
+ return f"Error: directory {source_path} does not exist"
109
 
110
+ # Import model
111
  success, message = model_manager.import_local_model(
112
  source_path=source_path,
113
  model_id=model_id,
 
120
 
121
  def set_active_model_action(model_row_index, models_df):
122
  """
123
+ Set active model action
124
 
125
  Args:
126
+ model_row_index: Model row index in dataframe
127
+ models_df: Dataframe with models
128
 
129
  Returns:
130
+ str: Operation result
131
  """
132
  try:
133
+ # Get selected model information
134
  model_row = models_df.iloc[model_row_index]
135
  model_id = model_row["model_id"]
136
  version = model_row["version"]
137
 
138
+ # Set as active
139
  success, message = model_manager.set_active_model(model_id, version)
140
 
141
  return message
142
  except Exception as e:
143
+ return f"Error: {str(e)}"
144
 
145
  def delete_model_action(model_row_index, models_df):
146
  """
147
+ Delete model action
148
 
149
  Args:
150
+ model_row_index: Model row index in dataframe
151
+ models_df: Dataframe with models
152
 
153
  Returns:
154
+ str: Operation result
155
  """
156
  try:
157
+ # Get selected model information
158
  model_row = models_df.iloc[model_row_index]
159
  model_id = model_row["model_id"]
160
  version = model_row["version"]
161
 
162
+ # Delete model
163
  success, message = model_manager.delete_model(model_id, version)
164
 
165
  return message
166
  except Exception as e:
167
+ return f"Error: {str(e)}"
168
 
169
def start_finetune_action(epochs, batch_size, learning_rate):
    """
    Start model fine-tuning

    Args:
        epochs: Passed to the tuner as ``num_train_epochs``
        batch_size: Passed to the tuner as ``per_device_train_batch_size``
        learning_rate: Passed to the tuner as ``learning_rate``

    Returns:
        str: ``"Training completed: ..."`` or ``"Training failed: ..."`` based
        on the tuner's result, or ``"Error starting training: ..."`` when
        importing, constructing or running the tuner raises
    """
    try:
        from src.training.fine_tuner import FineTuner

        training_options = {
            "num_train_epochs": epochs,
            "per_device_train_batch_size": batch_size,
            "learning_rate": learning_rate,
        }
        succeeded, details = FineTuner().train(**training_options)
    except Exception as e:
        return f"Error starting training: {str(e)}"
    return f"Training {'completed' if succeeded else 'failed'}: {details}"