Rulga committed
Commit 75bf67b · 1 Parent(s): 9a1d867

Enhance evaluation interface by adding force reload option, improving data refresh handling, and updating QA pairs display logic

app.py CHANGED
@@ -1106,18 +1106,26 @@ with gr.Blocks() as demo:
         with gr.Column(scale=1):
             evaluation_status = gr.Textbox(label="Evaluation Status", interactive=False)
             refresh_status_btn = gr.Button("Refresh Status")
+
+            # Add status message for data refresh
+            refresh_data_status = gr.Textbox(
+                label="Refresh Status",
+                interactive=False,
+                visible=True
+            )
 
         with gr.Column(scale=1):
             evaluation_report = gr.HTML(label="Evaluation Report")
             refresh_report_btn = gr.Button("Generate Report")
 
-    # QA pairs table section
-    show_evaluated = gr.Checkbox(label="Show Already Evaluated Pairs", value=False)
-    qa_table = gr.DataFrame(
-        get_qa_pairs_dataframe(chat_evaluator),
-        interactive=True,
-        wrap=True
-    )
+    # QA pairs table section
+    show_evaluated = gr.Checkbox(label="Show Already Evaluated Pairs", value=False)
+    import pandas as pd
+    qa_table = gr.DataFrame(
+        pd.DataFrame(columns=["Conversation ID", "Question", "Timestamp", "Evaluated"]),
+        interactive=True,
+        wrap=True
+    )
 
     # Conversation selection section
     gr.Markdown("### Select Conversation to Evaluate")
@@ -1160,9 +1168,9 @@ with gr.Blocks() as demo:
 
     # Event handlers for Chat Evaluation
     refresh_status_btn.click(
-        fn=lambda: get_evaluation_status(chat_evaluator),
+        fn=lambda: get_evaluation_status(chat_evaluator, force_reload=True),
         inputs=[],
-        outputs=[evaluation_status]
+        outputs=[evaluation_status, qa_table, refresh_data_status]
     )
 
     refresh_report_btn.click(
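
The rewired click handler above works because `get_evaluation_status` now returns a three-element tuple that Gradio maps positionally onto the three `outputs` components. A minimal sketch of that contract, assuming standard Gradio Blocks wiring; `fake_status` is an illustrative stand-in for the real `get_evaluation_status(chat_evaluator, force_reload=True)` call:

import gradio as gr
import pandas as pd

def fake_status():
    # Stand-in for get_evaluation_status(chat_evaluator, force_reload=True):
    # it must return one value per component listed in outputs, in order.
    table = pd.DataFrame(columns=["Conversation ID", "Question", "Timestamp", "Evaluated"])
    return "Total QA Pairs: 0", table, "Data refreshed successfully"

with gr.Blocks() as demo:
    status_box = gr.Textbox(label="Evaluation Status", interactive=False)
    qa_table = gr.DataFrame(interactive=True, wrap=True)
    refresh_msg = gr.Textbox(label="Refresh Status", interactive=False)
    refresh_btn = gr.Button("Refresh Status")
    # Positional mapping: 1st return value -> status_box, 2nd -> qa_table, 3rd -> refresh_msg.
    refresh_btn.click(fn=fake_status, inputs=[], outputs=[status_box, qa_table, refresh_msg])

demo.launch()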
src/analytics/chat_evaluator.py CHANGED
@@ -23,7 +23,7 @@ from config.settings import (
 class ChatEvaluator:
     def __init__(self, hf_token: str = None, dataset_id: str = None):
         """
-        Initialize chat evaluator
+        Initialize chat evaluator with lazy loading
 
         Args:
             hf_token: Hugging Face token
@@ -33,10 +33,15 @@ class ChatEvaluator:
         self.dataset_id = dataset_id or DATASET_ID
         self.api = HfApi(token=self.hf_token)
 
-        # Используем пути из settings
+        # Using paths from settings
         self.chat_history_path = DATASET_CHAT_HISTORY_PATH
         self.annotations_path = DATASET_ANNOTATIONS_PATH
 
+        # Cache for chat histories and QA pairs
+        self._chat_histories = None
+        self._qa_pairs = None
+        self._annotations = None
+
         # Ensure directories exist in dataset
         try:
             self._ensure_dataset_structure()
@@ -69,11 +74,27 @@ class ChatEvaluator:
             logger.error(f"Error ensuring dataset structure: {e}")
             raise
 
-
-    def get_chat_history(self) -> List[Dict[str, Any]]:
+    def reset_cache(self):
+        """
+        Reset the cache to force reload of data
+        """
+        self._chat_histories = None
+        self._qa_pairs = None
+        self._annotations = None
+        logger.info("Chat evaluator cache has been reset")
+
+    def get_chat_history(self, force_reload=False) -> List[Dict[str, Any]]:
         """
         Get all chat histories from the dataset
+
+        Args:
+            force_reload: If True, ignore cache and reload from dataset
         """
+        # Return cached data if available and not forcing reload
+        if self._chat_histories is not None and not force_reload:
+            logger.debug("Returning cached chat histories")
+            return self._chat_histories
+
         try:
             # Get list of all files in chat history directory
             files = self.api.list_repo_files(self.dataset_id, repo_type="dataset")
@@ -81,7 +102,7 @@ class ChatEvaluator:
             # Filter for chat history files
             chat_path = f"{self.chat_history_path}/"
             chat_files = [f for f in files if f.startswith(chat_path) and f.endswith('.json')]
-            logger.debug(f"Found {len(chat_files)} chat files")  # Более компактный лог
+            logger.debug(f"Found {len(chat_files)} chat files")  # More compact log
 
             histories = []
             for file in chat_files:
@@ -102,6 +123,8 @@ class ChatEvaluator:
                     logger.error(f"Error processing chat file {file}: {e}")
                     continue
 
+            # Cache the results
+            self._chat_histories = histories
             return histories
 
         except Exception as e:
@@ -133,20 +156,26 @@ class ChatEvaluator:
         logger.debug(f"Extracted {len(qa_pairs)} QA pairs")
         return qa_pairs
 
-    def get_qa_pairs_for_evaluation(self, limit: int = 50) -> List[Dict[str, Any]]:
+    def get_qa_pairs_for_evaluation(self, limit: int = 50, force_reload=False) -> List[Dict[str, Any]]:
         """
         Extract question-answer pairs for evaluation
 
         Args:
             limit: Maximum number of pairs to return
+            force_reload: If True, force reload from dataset
 
         Returns:
             List of QA pairs with metadata
         """
-        chat_data = self.get_chat_history()
+        # Return cached data if available and not forcing reload
+        if self._qa_pairs is not None and not force_reload:
+            logger.debug("Returning cached QA pairs")
+            return self._qa_pairs[:limit]  # Respect the limit parameter
+
+        chat_data = self.get_chat_history(force_reload=force_reload)
         qa_pairs = []
 
-        print(f"Debug - Processing {len(chat_data)} chat histories")  # Debug print
+        logger.debug(f"Processing {len(chat_data)} chat histories")
 
         for chat in chat_data:
             conversation_id = chat.get("conversation_id", "unknown")
@@ -170,24 +199,26 @@ class ChatEvaluator:
                         "question_timestamp": messages[i].get("timestamp", ""),
                         "answer_timestamp": messages[i+1].get("timestamp", "")
                     })
-
-                    # Check if we've reached the limit
-                    if len(qa_pairs) >= limit:
-                        print(f"Debug - Reached limit of {limit} QA pairs")  # Debug print
-                        return qa_pairs
 
-        print(f"Debug - Extracted {len(qa_pairs)} QA pairs")  # Debug print
-        return qa_pairs
+        # Cache the results
+        self._qa_pairs = qa_pairs
+
+        logger.debug(f"Extracted {len(qa_pairs)} QA pairs")
+        # Return up to the limit
+        return qa_pairs[:limit]
 
-    def get_evaluation_status(self) -> Dict[str, int]:
+    def get_evaluation_status(self, force_reload=False) -> Dict[str, int]:
         """
         Get status of evaluated QA pairs
 
+        Args:
+            force_reload: If True, force reload from dataset
+
         Returns:
             Dictionary with counts of evaluated and unevaluated QA pairs
         """
-        all_pairs = self.get_qa_pairs_for_evaluation(limit=1000)  # Get a large sample
-        evaluated_pairs = self.get_annotations()
+        all_pairs = self.get_qa_pairs_for_evaluation(limit=1000, force_reload=force_reload)  # Get a large sample
+        evaluated_pairs = self.get_annotations(force_reload=force_reload)
 
         # Count evaluated conversation IDs
         evaluated_ids = set(item.get("conversation_id") for item in evaluated_pairs)
@@ -246,16 +277,27 @@ class ChatEvaluator:
                 repo_type="dataset"
             )
 
+            # Reset annotations cache
+            self._annotations = None
+
             return True, "Annotation saved successfully"
 
         except Exception as e:
             logger.error(f"Error saving annotation: {e}")
             return False, f"Failed to save annotation: {str(e)}"
 
-    def get_annotations(self) -> List[Dict[str, Any]]:
+    def get_annotations(self, force_reload=False) -> List[Dict[str, Any]]:
         """
         Get all saved annotations from dataset
+
+        Args:
+            force_reload: If True, force reload from dataset
         """
+        # Return cached data if available and not forcing reload
+        if self._annotations is not None and not force_reload:
+            logger.debug("Returning cached annotations")
+            return self._annotations
+
         try:
             annotations = []
             files = self.api.list_repo_files(self.dataset_id, repo_type="dataset")
@@ -277,25 +319,36 @@ class ChatEvaluator:
 
             # Sort by timestamp (newest first)
             annotations.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
+
+            # Cache the results
+            self._annotations = annotations
+
             return annotations
 
         except Exception as e:
             logger.error(f"Error getting annotations: {e}")
             return []
 
-    def get_annotation_by_conversation_id(self, conversation_id: str) -> Optional[Dict[str, Any]]:
+    def get_annotation_by_conversation_id(self, conversation_id: str, force_reload=False) -> Optional[Dict[str, Any]]:
         """
         Get annotation for a specific conversation
 
         Args:
             conversation_id: Conversation ID to look for
+            force_reload: If True, force reload from dataset
 
         Returns:
             Annotation object or None if not found
         """
+        # If we have cached annotations and not forcing reload, look there first
+        if self._annotations is not None and not force_reload:
+            for annotation in self._annotations:
+                if annotation.get("conversation_id") == conversation_id:
+                    return annotation
+
         try:
-            # Use DATASET_ANNOTATIONS_PATH to build the path
-            filename = f"{DATASET_ANNOTATIONS_PATH}/annotation_{conversation_id}.json"
+            # Try direct file access
+            filename = f"{self.annotations_path}/annotation_{conversation_id}.json"
 
             # Download and parse annotation file
             content = self.api.hf_hub_download(
@@ -404,21 +457,4 @@ class ChatEvaluator:
         improved_count = sum(1 for a in annotations if a.get("original_answer") != a.get("improved_answer"))
         metrics["improvement_rate"] = (improved_count / len(annotations)) * 100
 
-        return metrics
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        return metrics
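The caching added throughout ChatEvaluator is plain memoization on instance attributes, with None meaning "not loaded yet". A stripped-down sketch of the same pattern, independent of the Hugging Face API (class and method names here are illustrative):

from typing import Any, Dict, List, Optional

class CachedLoader:
    def __init__(self):
        # None means "not loaded yet"; an empty list is still a valid cached result.
        self._items: Optional[List[Dict[str, Any]]] = None

    def reset_cache(self) -> None:
        # Dropping the reference forces the next getter call to reload.
        self._items = None

    def get_items(self, force_reload: bool = False) -> List[Dict[str, Any]]:
        if self._items is not None and not force_reload:
            return self._items  # cache hit
        self._items = self._expensive_load()  # cache miss or forced reload
        return self._items

    def _expensive_load(self) -> List[Dict[str, Any]]:
        # Stand-in for the remote work (list_repo_files plus per-file downloads).
        return [{"conversation_id": "demo"}]

One design choice visible in the diff: the annotation-saving path invalidates only self._annotations, which suffices because saving an annotation cannot change the cached chat histories or QA pairs.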
web/evaluation_interface.py CHANGED
@@ -9,78 +9,92 @@ import json
 import os
 from typing import Dict, Any, List, Tuple
 
-def get_evaluation_status(evaluator: ChatEvaluator) -> str:
+def get_evaluation_status(evaluator, force_reload=False):
     """
-    Format evaluation status for display
+    Get evaluation status as formatted string and refresh QA data
 
     Args:
         evaluator: ChatEvaluator instance
+        force_reload: If True, force reload data from dataset
 
     Returns:
-        Formatted markdown string with status information
+        Status message, updated QA table and refresh message
     """
-    status = evaluator.get_evaluation_status()
-
-    status_md = f"""
-    ## Evaluation Status
-
-    - **Total QA Pairs:** {status['total_qa_pairs']}
-    - **Evaluated Pairs:** {status['evaluated_pairs']} ({status['evaluated_pairs']/max(1, status['total_qa_pairs'])*100:.1f}%)
-    - **Unevaluated Pairs:** {status['unevaluated_pairs']}
-    - **Evaluated Conversations:** {status['evaluated_conversations']}
-    """
-
-    return status_md
+    try:
+        # First, reset cache if forcing reload
+        if force_reload:
+            evaluator.reset_cache()
+
+        # Get status data
+        status = evaluator.get_evaluation_status(force_reload=force_reload)
+
+        # Get updated QA table
+        qa_table = get_qa_pairs_dataframe(evaluator, show_evaluated=False, force_reload=force_reload)
+
+        status_message = f"""
+        Total QA Pairs: {status['total_qa_pairs']}
+        Evaluated Pairs: {status['evaluated_pairs']}
+        Unevaluated Pairs: {status['unevaluated_pairs']}
+        Evaluated Conversations: {status['evaluated_conversations']}
+        """
+
+        refresh_message = "Data refreshed successfully" if force_reload else ""
+
+        return status_message, qa_table, refresh_message
+    except Exception as e:
+        logger.error(f"Error getting evaluation status: {e}")
+
+        # Import pandas here to avoid circular imports
+        import pandas as pd
+        empty_df = pd.DataFrame(columns=["Conversation ID", "Question", "Timestamp", "Evaluated"])
+
+        return f"Error getting status: {str(e)}", empty_df, f"Error: {str(e)}"
 
-def get_qa_pairs_dataframe(evaluator: ChatEvaluator, show_evaluated: bool = False, limit: int = 50) -> pd.DataFrame:
+def get_qa_pairs_dataframe(evaluator, show_evaluated=False, force_reload=False):
     """
-    Get QA pairs as a pandas DataFrame for display
+    Get QA pairs as DataFrame for the evaluation interface
 
     Args:
         evaluator: ChatEvaluator instance
-        show_evaluated: Whether to show already evaluated pairs
-        limit: Maximum number of pairs to return
+        show_evaluated: If True, include already evaluated pairs
+        force_reload: If True, force reload from dataset
 
     Returns:
         DataFrame with QA pairs
     """
-    qa_pairs = evaluator.get_qa_pairs_for_evaluation(limit=200)  # Get more than needed for filtering
-    annotations = evaluator.get_annotations()
-
-    # Create set of evaluated conversation IDs
-    evaluated_ids = set(a.get("conversation_id") for a in annotations)
-
-    # Filter QA pairs based on show_evaluated parameter
-    if not show_evaluated:
-        qa_pairs = [pair for pair in qa_pairs if pair.get("conversation_id") not in evaluated_ids]
-
-    # Limit the results
-    qa_pairs = qa_pairs[:limit]
-
-    # Create DataFrame
-    if qa_pairs:
-        df = pd.DataFrame(qa_pairs)
-
-        # Add "Evaluated" column
-        df["evaluated"] = df["conversation_id"].apply(lambda x: "Yes" if x in evaluated_ids else "No")
+    try:
+        # Get QA pairs with potential force reload
+        qa_pairs = evaluator.get_qa_pairs_for_evaluation(limit=100, force_reload=force_reload)
 
-        # Select and rename columns for display
-        display_df = df[["conversation_id", "question", "original_answer", "evaluated"]].copy()
-        display_df = display_df.rename(columns={
-            "conversation_id": "ID",
-            "question": "Question",
-            "original_answer": "Answer",
-            "evaluated": "Evaluated"
-        })
+        # Get annotations
+        annotations = evaluator.get_annotations(force_reload=force_reload)
+        evaluated_ids = {a.get("conversation_id") for a in annotations}
 
-        # Truncate long text for better display
-        display_df["Question"] = display_df["Question"].apply(lambda x: (x[:150] + "...") if len(x) > 150 else x)
-        display_df["Answer"] = display_df["Answer"].apply(lambda x: (x[:150] + "...") if len(x) > 150 else x)
+        # Filter out already evaluated pairs if needed
+        if not show_evaluated:
+            qa_pairs = [qa for qa in qa_pairs if qa["conversation_id"] not in evaluated_ids]
 
-        return display_df
-
-    # Return empty DataFrame if no pairs
-    return pd.DataFrame(columns=["ID", "Question", "Answer", "Evaluated"])
+        # Convert to DataFrame
+        if qa_pairs:
+            import pandas as pd
+
+            df = pd.DataFrame([
+                {
+                    "Conversation ID": qa["conversation_id"],
+                    "Question": qa["question"][:50] + "..." if len(qa["question"]) > 50 else qa["question"],
+                    "Timestamp": qa.get("timestamp", ""),
+                    "Evaluated": "Yes" if qa["conversation_id"] in evaluated_ids else "No"
+                }
+                for qa in qa_pairs
+            ])
+            return df
+        else:
+            import pandas as pd
+            return pd.DataFrame(columns=["Conversation ID", "Question", "Timestamp", "Evaluated"])
+    except Exception as e:
+        logger.error(f"Error getting QA pairs dataframe: {e}")
+        import pandas as pd
+        return pd.DataFrame(columns=["Conversation ID", "Question", "Timestamp", "Evaluated"])
 
 def load_qa_pair_for_evaluation(conversation_id: str, evaluator: ChatEvaluator) -> Tuple[str, str, str, int, int, int, int, int, str]:
     """