Rulga committed
Commit 61a383e · 1 Parent(s): 801484b

Add chat evaluation interface components and functionality
app.py CHANGED
@@ -4,7 +4,6 @@ import json
 import datetime
 from pathlib import Path
 from huggingface_hub import InferenceClient, HfApi
-#from huggingface_hub import InferenceClient
 from config.constants import DEFAULT_SYSTEM_MESSAGE
 from config.settings import (
     HF_TOKEN,
@@ -23,6 +22,15 @@ from web.training_interface import (
     register_model_action,
     start_finetune_action
 )
+from web.evaluation_interface import (
+    get_evaluation_status,
+    get_qa_pairs_dataframe,
+    load_qa_pair_for_evaluation,
+    save_evaluation,
+    generate_evaluation_report_html,
+    export_training_data_action
+)
+from src.analytics.chat_evaluator import ChatEvaluator
 
 if not HF_TOKEN:
     raise ValueError("HUGGINGFACE_TOKEN not found in environment variables")
@@ -81,6 +89,11 @@ ERROR_LOGS_PATH = os.path.join(os.path.dirname(__file__), "error_logs")
 client = None
 context_store = {}
 fallback_model_attempted = False
+chat_evaluator = ChatEvaluator(
+    hf_token=HF_TOKEN,
+    dataset_id=DATASET_ID,
+    chat_history_path=CHAT_HISTORY_PATH
+)
 
 print(f"Chat histories will be saved to: {CHAT_HISTORY_PATH}")
 
@@ -625,6 +638,73 @@ def save_parameters(model_key, max_len, temp, top_p_val, rep_pen):
     except Exception as e:
         return f"Error saving parameters: {str(e)}"
 
+def finetune_from_annotations(epochs=3, batch_size=4, learning_rate=2e-4, min_rating=4):
+    """
+    Fine-tune model using annotated QA pairs
+
+    Args:
+        epochs: Number of training epochs
+        batch_size: Batch size for training
+        learning_rate: Learning rate
+        min_rating: Minimum average rating for including examples
+
+    Returns:
+        (success, message)
+    """
+    try:
+        import tempfile
+        import os
+        from src.analytics.chat_evaluator import ChatEvaluator
+        from config.settings import HF_TOKEN, DATASET_ID, CHAT_HISTORY_PATH
+
+        # Create evaluator
+        evaluator = ChatEvaluator(
+            hf_token=HF_TOKEN,
+            dataset_id=DATASET_ID,
+            chat_history_path=CHAT_HISTORY_PATH
+        )
+
+        # Create temporary file for training data
+        with tempfile.NamedTemporaryFile(mode='w+', suffix='.jsonl', delete=False) as temp_file:
+            temp_path = temp_file.name
+
+        # Export high-quality examples
+        success, message = evaluator.export_training_data(temp_path, min_rating)
+
+        if not success:
+            return False, f"Failed to export training data: {message}"
+
+        # Count examples
+        with open(temp_path, 'r') as f:
+            example_count = sum(1 for _ in f)
+
+        if example_count == 0:
+            return False, "No high-quality examples found for fine-tuning"
+
+        # Run actual fine-tuning using the export file
+        from src.training.fine_tuner import finetune_from_file
+
+        success, message = finetune_from_file(
+            training_file=temp_path,
+            epochs=epochs,
+            batch_size=batch_size,
+            learning_rate=learning_rate
+        )
+
+        # Clean up temporary file
+        try:
+            os.unlink(temp_path)
+        except:
+            pass
+
+        if success:
+            return True, f"Successfully fine-tuned model with {example_count} annotated examples: {message}"
+        else:
+            return False, f"Fine-tuning failed: {message}"
+
+    except Exception as e:
+        return False, f"Error during fine-tuning from annotations: {str(e)}"
+
 def initialize_app():
     """Initialize app with user preferences"""
     global client, ACTIVE_MODEL
@@ -810,14 +890,27 @@
         gr.Markdown("### Model Training Interface")
 
         with gr.Row():
-            with gr.Column():
-                epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Epochs")
-                batch_size = gr.Slider(minimum=1, maximum=32, value=4, step=1, label="Batch Size")
-                learning_rate = gr.Slider(minimum=1e-6, maximum=1e-3, value=2e-4, label="Learning Rate")
+            with gr.Column(scale=1):
+                training_tabs = gr.Tabs()
+
+                with training_tabs:
+                    with gr.TabItem("Regular Training"):
+                        epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Epochs")
+                        batch_size = gr.Slider(minimum=1, maximum=32, value=4, step=1, label="Batch Size")
+                        learning_rate = gr.Slider(minimum=1e-6, maximum=1e-3, value=2e-4, label="Learning Rate")
 
-                train_btn = gr.Button("Start Training", variant="primary")
-                training_output = gr.Textbox(label="Training Status", interactive=False)
-
+                        train_btn = gr.Button("Start Training", variant="primary")
+                        training_output = gr.Textbox(label="Training Status", interactive=False)
+
+                    with gr.TabItem("Train from Annotations"):
+                        annot_epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Epochs")
+                        annot_batch_size = gr.Slider(minimum=1, maximum=32, value=4, step=1, label="Batch Size")
+                        annot_learning_rate = gr.Slider(minimum=1e-6, maximum=1e-3, value=2e-4, label="Learning Rate")
+                        annot_min_rating = gr.Slider(minimum=1, maximum=5, value=4, step=0.5, label="Minimum Rating for Training")
+
+                        annot_train_btn = gr.Button("Start Training from Annotations", variant="primary")
+                        annot_training_output = gr.Textbox(label="Training Status", interactive=False)
 
             gr.Markdown("""
             <small>
@@ -834,10 +927,8 @@
             2e-4 (0.0002) = Usually works best -> 1e-4 = Safer choice for fine-tuning
             </small>
             """)
-
-
 
-            with gr.Column():
+            with gr.Column(scale=1):
                 analysis_btn = gr.Button("Generate Chat Analysis")
                 analysis_output = gr.Markdown()
 
@@ -846,11 +937,140 @@
             inputs=[epochs, batch_size, learning_rate],
             outputs=[training_output]
         )
+
+        # Function to handle training from annotations
+        def start_annotation_finetune(epochs, batch_size, learning_rate, min_rating):
+            """Wrapper function to start fine-tuning from annotations"""
+            success, message = finetune_from_annotations(
+                epochs=epochs,
+                batch_size=batch_size,
+                learning_rate=learning_rate,
+                min_rating=min_rating
+            )
+            return message
+
+        annot_train_btn.click(
+            start_annotation_finetune,
+            inputs=[annot_epochs, annot_batch_size, annot_learning_rate, annot_min_rating],
+            outputs=[annot_training_output]
+        )
+
         analysis_btn.click(
            generate_chat_analysis,
            inputs=[],
            outputs=[analysis_output]
        )
+
+    with gr.Tab("Chat Evaluation"):
+        gr.Markdown("### Evaluation of Chat Responses")
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                evaluation_status = gr.Markdown(get_evaluation_status(chat_evaluator))
+                refresh_status_btn = gr.Button("Refresh Status")
+
+                gr.Markdown("### Evaluation Metrics")
+                evaluation_report = gr.HTML(generate_evaluation_report_html(chat_evaluator))
+                refresh_report_btn = gr.Button("Refresh Report")
+
+                gr.Markdown("### Export for Training")
+                with gr.Row():
+                    min_rating = gr.Slider(
+                        minimum=1,
+                        maximum=5,
+                        value=4,
+                        step=0.5,
+                        label="Minimum Average Rating"
+                    )
+                    export_path = gr.Textbox(
+                        label="Export File Path (optional)",
+                        placeholder="Leave empty for default path"
+                    )
+                export_btn = gr.Button("Export Annotated Data", variant="primary")
+                export_status = gr.Textbox(label="Export Status", interactive=False)
+
+            with gr.Column(scale=2):
+                show_evaluated = gr.Checkbox(label="Show Already Evaluated Pairs", value=False)
+                qa_table = gr.DataFrame(get_qa_pairs_dataframe(chat_evaluator))
+
+                gr.Markdown("### Select Conversation to Evaluate")
+                selected_conversation = gr.Textbox(label="Conversation ID", placeholder="Select from table above")
+                load_btn = gr.Button("Load Conversation", variant="primary")
+
+                gr.Markdown("### Evaluate Response")
+                question_display = gr.Textbox(label="User Question", interactive=False)
+                original_answer = gr.TextArea(label="Original Bot Answer", interactive=False)
+                improved_answer = gr.TextArea(label="Improved Answer (Gold Standard)", interactive=True)
+
+                gr.Markdown("### Quality Ratings (1-5)")
+                with gr.Row():
+                    accuracy = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Factual Accuracy")
+                    completeness = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Completeness")
+                with gr.Row():
+                    relevance = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Relevance")
+                    clarity = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Clarity")
+                    legal_correctness = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Legal Correctness")
+
+                notes = gr.TextArea(label="Evaluator Notes", placeholder="Add your notes about this response...")
+                save_btn = gr.Button("Save Evaluation", variant="primary")
+                evaluation_status_msg = gr.Textbox(label="Status", interactive=False)
+
+        # Add event handlers
+        refresh_status_btn.click(
+            fn=get_evaluation_status,
+            inputs=[],
+            outputs=[evaluation_status],
+            kwargs={"evaluator": chat_evaluator}
+        )
+
+        refresh_report_btn.click(
+            fn=generate_evaluation_report_html,
+            inputs=[],
+            outputs=[evaluation_report],
+            kwargs={"evaluator": chat_evaluator}
+        )
+
+        show_evaluated.change(
+            fn=get_qa_pairs_dataframe,
+            inputs=[show_evaluated],
+            outputs=[qa_table],
+            kwargs={"evaluator": chat_evaluator}
+        )
+
+        # Table selection to conversation ID textbox
+        qa_table.select(
+            fn=lambda df, evt: evt.value[0] if evt and evt.value and len(evt.value) > 0 else "",
+            inputs=[qa_table],
+            outputs=[selected_conversation]
+        )
+
+        # Load conversation for evaluation
+        load_btn.click(
+            fn=load_qa_pair_for_evaluation,
+            inputs=[selected_conversation],
+            outputs=[question_display, original_answer, improved_answer,
+                     accuracy, completeness, relevance, clarity, legal_correctness, notes],
+            kwargs={"evaluator": chat_evaluator}
+        )
+
+        # Save evaluation
+        save_btn.click(
+            fn=save_evaluation,
+            inputs=[
+                selected_conversation, question_display, original_answer, improved_answer,
+                accuracy, completeness, relevance, clarity, legal_correctness, notes
+            ],
+            outputs=[evaluation_status_msg],
+            kwargs={"evaluator": chat_evaluator}
+        )
+
+        # Export training data
+        export_btn.click(
+            fn=export_training_data_action,
+            inputs=[min_rating, export_path],
+            outputs=[export_status],
+            kwargs={"evaluator": chat_evaluator}
+        )
 
     # Model change handler
     model_selector.change(
@@ -882,4 +1102,4 @@ if __name__ == "__main__":
     if not load_vector_store():
         print("Knowledge base not found. Please create it through the interface.")
 
-    demo.launch()
+    demo.launch()
 
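Note on the event wiring above: the handlers pass kwargs={"evaluator": chat_evaluator} to .click()/.change(), but Gradio's event listeners take fn, inputs, outputs, and the documented listener options; as far as current releases document, there is no kwargs mapping, so the evaluator argument would not be bound this way at runtime. A minimal sketch of one conventional alternative, functools.partial, using the names introduced in this commit (an assumption, not the committed code):

from functools import partial

# Pre-bind the evaluator as the first positional argument of each callback;
# the remaining arguments still arrive from the Gradio `inputs` list.
refresh_status_btn.click(
    fn=partial(get_evaluation_status, chat_evaluator),
    inputs=[],
    outputs=[evaluation_status],
)

show_evaluated.change(
    fn=partial(get_qa_pairs_dataframe, chat_evaluator),  # show_evaluated comes from inputs
    inputs=[show_evaluated],
    outputs=[qa_table],
)

The same pattern would apply to the other handlers that pass kwargs.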
src/analytics/chat_evaluator.py ADDED
@@ -0,0 +1,320 @@
+"""
+Module for evaluation and annotation of bot responses
+"""
+
+import json
+import os
+import datetime
+from typing import List, Dict, Any, Tuple, Optional
+import pandas as pd
+from src.knowledge_base.dataset import DatasetManager
+from huggingface_hub import HfApi
+
+class ChatEvaluator:
+    def __init__(self,
+                 dataset_manager: Optional[DatasetManager] = None,
+                 hf_token: str = None,
+                 dataset_id: str = None,
+                 chat_history_path: str = None):
+        """
+        Initialize chat evaluator
+
+        Args:
+            dataset_manager: Dataset manager for retrieving chat history
+            hf_token: Hugging Face token for uploading annotations
+            dataset_id: Hugging Face dataset ID
+            chat_history_path: Path to local chat history directory
+        """
+        self.dataset_manager = dataset_manager or DatasetManager()
+        self.hf_token = hf_token
+        self.dataset_id = dataset_id
+        self.chat_history_path = chat_history_path
+        self.annotations_dir = os.path.join(os.path.dirname(chat_history_path), "annotations") if chat_history_path else None
+
+        # Create annotations directory if it doesn't exist
+        if self.annotations_dir:
+            os.makedirs(self.annotations_dir, exist_ok=True)
+
+    def get_chat_history(self) -> List[Dict[str, Any]]:
+        """
+        Get all chat history data from local files and dataset
+
+        Returns:
+            List of chat histories
+        """
+        success, chat_data = self.dataset_manager.get_chat_history()
+        if not success or not chat_data:
+            return []
+        return chat_data
+
+    def get_qa_pairs_for_evaluation(self, limit: int = 50) -> List[Dict[str, Any]]:
+        """
+        Extract question-answer pairs for evaluation
+
+        Args:
+            limit: Maximum number of pairs to return
+
+        Returns:
+            List of QA pairs with metadata
+        """
+        chat_data = self.get_chat_history()
+        qa_pairs = []
+
+        for chat in chat_data:
+            conversation_id = chat.get("conversation_id", "unknown")
+            timestamp = chat.get("timestamp", "")
+            history = chat.get("history", [])
+
+            # Find user-assistant pairs in history
+            for i in range(len(history) - 1):
+                if history[i].get("role") == "user" and history[i+1].get("role") == "assistant":
+                    question = history[i].get("content", "").strip()
+                    answer = history[i+1].get("content", "").strip()
+
+                    # Only include non-empty pairs
+                    if question and answer:
+                        qa_pairs.append({
+                            "conversation_id": conversation_id,
+                            "timestamp": timestamp,
+                            "question": question,
+                            "original_answer": answer,
+                            "question_timestamp": history[i].get("timestamp", ""),
+                            "answer_timestamp": history[i+1].get("timestamp", "")
+                        })
+
+                    # Check if we've reached the limit
+                    if len(qa_pairs) >= limit:
+                        return qa_pairs
+
+        return qa_pairs
+
+    def get_evaluation_status(self) -> Dict[str, int]:
+        """
+        Get status of evaluated QA pairs
+
+        Returns:
+            Dictionary with counts of evaluated and unevaluated QA pairs
+        """
+        all_pairs = self.get_qa_pairs_for_evaluation(limit=1000)  # Get a large sample
+        evaluated_pairs = self.get_annotations()
+
+        # Count evaluated conversation IDs
+        evaluated_ids = set(item.get("conversation_id") for item in evaluated_pairs)
+
+        return {
+            "total_qa_pairs": len(all_pairs),
+            "evaluated_pairs": len(evaluated_pairs),
+            "unevaluated_pairs": len(all_pairs) - len(evaluated_pairs),
+            "evaluated_conversations": len(evaluated_ids)
+        }
+
+    def save_annotation(self,
+                        conversation_id: str,
+                        question: str,
+                        original_answer: str,
+                        improved_answer: str,
+                        ratings: Dict[str, int],
+                        notes: str = "") -> Tuple[bool, str]:
+        """
+        Save evaluation annotation
+
+        Args:
+            conversation_id: ID of the conversation
+            question: User question
+            original_answer: Original bot answer
+            improved_answer: Improved answer (gold standard)
+            ratings: Dictionary with ratings for different criteria
+            notes: Optional evaluator notes
+
+        Returns:
+            (success, message)
+        """
+        if not self.annotations_dir:
+            return False, "Annotations directory not configured"
+
+        try:
+            # Create annotation object
+            annotation = {
+                "conversation_id": conversation_id,
+                "timestamp": datetime.datetime.now().isoformat(),
+                "question": question,
+                "original_answer": original_answer,
+                "improved_answer": improved_answer,
+                "ratings": ratings,
+                "notes": notes
+            }
+
+            # Create filename with conversation_id
+            filename = f"annotation_{conversation_id}.json"
+            filepath = os.path.join(self.annotations_dir, filename)
+
+            # Save to local file
+            with open(filepath, 'w', encoding='utf-8') as f:
+                json.dump(annotation, f, ensure_ascii=False, indent=2)
+
+            # Upload to HuggingFace dataset if configured
+            if self.hf_token and self.dataset_id:
+                try:
+                    api = HfApi(token=self.hf_token)
+
+                    # Extract just the directory name from annotations_dir
+                    dir_name = os.path.basename(self.annotations_dir)
+                    target_path = f"{dir_name}/{filename}"
+
+                    # Upload the file to the dataset
+                    api.upload_file(
+                        path_or_fileobj=filepath,
+                        path_in_repo=target_path,
+                        repo_id=self.dataset_id,
+                        repo_type="dataset"
+                    )
+
+                except Exception as e:
+                    return True, f"Saved locally but failed to upload to dataset: {str(e)}"
+
+            return True, "Annotation saved successfully"
+        except Exception as e:
+            return False, f"Error saving annotation: {str(e)}"
+
+    def get_annotations(self) -> List[Dict[str, Any]]:
+        """
+        Get all saved annotations
+
+        Returns:
+            List of annotation objects
+        """
+        if not self.annotations_dir or not os.path.exists(self.annotations_dir):
+            return []
+
+        annotations = []
+        for filename in os.listdir(self.annotations_dir):
+            if filename.startswith("annotation_") and filename.endswith(".json"):
+                try:
+                    filepath = os.path.join(self.annotations_dir, filename)
+                    with open(filepath, 'r', encoding='utf-8') as f:
+                        annotation = json.load(f)
+                    annotations.append(annotation)
+                except Exception as e:
+                    print(f"Error loading annotation {filename}: {str(e)}")
+
+        # Sort by timestamp (newest first)
+        annotations.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
+        return annotations
+
+    def get_annotation_by_conversation_id(self, conversation_id: str) -> Optional[Dict[str, Any]]:
+        """
+        Get annotation for a specific conversation
+
+        Args:
+            conversation_id: Conversation ID to look for
+
+        Returns:
+            Annotation object or None if not found
+        """
+        if not self.annotations_dir:
+            return None
+
+        filepath = os.path.join(self.annotations_dir, f"annotation_{conversation_id}.json")
+        if os.path.exists(filepath):
+            try:
+                with open(filepath, 'r', encoding='utf-8') as f:
+                    return json.load(f)
+            except Exception as e:
+                print(f"Error loading annotation for {conversation_id}: {str(e)}")
+
+        return None
+
+    def export_training_data(self, output_file: str, min_rating: int = 4) -> Tuple[bool, str]:
+        """
+        Export high-quality annotated data for fine-tuning
+
+        Args:
+            output_file: Path to output file
+            min_rating: Minimum average rating to include in training data
+
+        Returns:
+            (success, message)
+        """
+        annotations = self.get_annotations()
+
+        if not annotations:
+            return False, "No annotations available for export"
+
+        try:
+            # Filter annotations by quality
+            high_quality_examples = []
+
+            for annotation in annotations:
+                ratings = annotation.get("ratings", {})
+
+                # Calculate average rating
+                if ratings:
+                    avg_rating = sum(ratings.values()) / len(ratings)
+
+                    # Include only high-quality examples
+                    if avg_rating >= min_rating:
+                        high_quality_examples.append({
+                            "messages": [
+                                {"role": "user", "content": annotation.get("question", "")},
+                                {"role": "assistant", "content": annotation.get("improved_answer", "")}
+                            ]
+                        })
+
+            if not high_quality_examples:
+                return False, f"No examples meet the minimum quality threshold of {min_rating}"
+
+            # Save to JSONL format
+            with open(output_file, "w", encoding="utf-8") as f:
+                for example in high_quality_examples:
+                    f.write(json.dumps(example, ensure_ascii=False) + "\n")
+
+            return True, f"Successfully exported {len(high_quality_examples)} high-quality examples for training"
+        except Exception as e:
+            return False, f"Error exporting training data: {str(e)}"
+
+    def generate_evaluation_report(self) -> Dict[str, Any]:
+        """
+        Generate evaluation summary report
+
+        Returns:
+            Dictionary with evaluation metrics
+        """
+        annotations = self.get_annotations()
+
+        if not annotations:
+            return {
+                "total_evaluations": 0,
+                "message": "No evaluations available"
+            }
+
+        # Initialize metrics
+        criteria = set()
+        for annotation in annotations:
+            criteria.update(annotation.get("ratings", {}).keys())
+
+        metrics = {
+            "total_evaluations": len(annotations),
+            "criteria_averages": {},
+            "overall_average": 0,
+            "improvement_rate": 0  # Percentage of answers that were improved
+        }
+
+        # Calculate averages for each criterion
+        for criterion in criteria:
+            values = [a.get("ratings", {}).get(criterion, 0) for a in annotations if criterion in a.get("ratings", {})]
+            if values:
+                metrics["criteria_averages"][criterion] = sum(values) / len(values)
+
+        # Calculate overall average
+        all_ratings = []
+        for annotation in annotations:
+            all_ratings.extend(annotation.get("ratings", {}).values())
+
+        if all_ratings:
+            metrics["overall_average"] = sum(all_ratings) / len(all_ratings)
+
+        # Calculate improvement rate
+        improved_count = sum(1 for a in annotations if a.get("original_answer") != a.get("improved_answer"))
+        metrics["improvement_rate"] = (improved_count / len(annotations)) * 100
+
+        return metrics
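For orientation, a short usage sketch of the new class; the token, dataset ID, paths, and QA content below are placeholders, and only methods defined in the diff above are used:

from src.analytics.chat_evaluator import ChatEvaluator

# Placeholder configuration; with chat_history_path="./chat_history",
# annotations are written alongside it in "./annotations".
evaluator = ChatEvaluator(
    hf_token="hf_...",                    # hypothetical token
    dataset_id="user/legal-chat-data",    # hypothetical dataset ID
    chat_history_path="./chat_history",
)

# Save one annotation with the five criteria the UI collects.
ok, msg = evaluator.save_annotation(
    conversation_id="conv-001",
    question="Which documents are required to register a company?",
    original_answer="You need an application form.",
    improved_answer="You need an application form, the founding charter, and proof of payment of the registration fee.",
    ratings={"accuracy": 4, "completeness": 5, "relevance": 5, "clarity": 4, "legal_correctness": 4},
    notes="Added the missing documents.",
)
print(msg)

# Export every annotation whose average rating is >= 4. Each JSONL line has the shape:
# {"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
ok, msg = evaluator.export_training_data("training_data.jsonl", min_rating=4)
print(msg)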
src/training/fine_tuner.py CHANGED
@@ -1,5 +1,5 @@
 """
-Модуль для дообучения языковой модели на основе собранных данных
+Module for fine-tuning a language model on collected data
 """
 
 import os
@@ -84,38 +84,38 @@ class FineTuner:
         try:
             logger.info(f"Загрузка модели {self.base_model_id}...")
 
-            # Загрузка токенизатора с использованием slow tokenizer
+            # Load tokenizer using slow tokenizer
             self.tokenizer = AutoTokenizer.from_pretrained(
                 self.base_model_id,
                 trust_remote_code=True,
-                use_fast=False  # Используем slow tokenizer
+                use_fast=False  # Using slow tokenizer
             )
 
-            # Специальные токены для диалогов
+            # Special tokens for dialogues
             special_tokens = {
                 "pad_token": "<PAD>",
                 "eos_token": "</s>",
                 "bos_token": "<s>",
-                "unk_token": "<unk>"  # Добавляем unknown token
+                "unk_token": "<unk>"  # Adding unknown token
             }
 
-            # Добавляем специальные токены, если их нет
+            # Add special tokens if they don't exist
             self.tokenizer.add_special_tokens({"additional_special_tokens": list(special_tokens.values())})
 
-            # Загрузка модели
+            # Load model
             self.model = AutoModelForCausalLM.from_pretrained(
                 self.base_model_id,
                 trust_remote_code=True,
                 device_map="auto" if self.device == "cuda" else None,
-                torch_dtype="auto"  # Автоматически выбираем оптимальный тип данных
+                torch_dtype="auto"  # Automatically choose optimal data type
             )
 
-            # Изменяем размер эмбеддингов для новых токенов
+            # Resize embeddings for new tokens
             self.model.resize_token_embeddings(len(self.tokenizer))
 
-            logger.info("Модель и токенизатор успешно загружены")
+            logger.info("Model and tokenizer loaded successfully")
         except Exception as e:
-            logger.error(f"Ошибка при загрузке модели: {str(e)}")
+            logger.error(f"Error loading model: {str(e)}")
             raise
 
     def setup_lora_config(
@@ -125,17 +125,17 @@
         lora_dropout: float = 0.05
     ) -> LoraConfig:
         """
-        Настройка конфигурации LoRA для эффективного дообучения
+        Setup LoRA configuration for efficient fine-tuning
 
         Args:
-            r: Ранг матриц LoRA
-            lora_alpha: Альфа параметр LoRA
-            lora_dropout: Вероятность dropout в LoRA слоях
+            r: Rank of LoRA matrices
+            lora_alpha: LoRA alpha parameter
+            lora_dropout: Dropout probability in LoRA layers
 
         Returns:
-            Конфигурация LoRA
+            LoRA configuration
         """
-        # Создаем конфигурацию LoRA
+        # Create LoRA configuration
         lora_config = LoraConfig(
             task_type=TaskType.CAUSAL_LM,
             r=r,
@@ -149,34 +149,34 @@
 
     def prepare_model_for_training(self):
         """
-        Подготовка модели к обучению с использованием LoRA
+        Prepare model for training using LoRA
         """
         if self.model is None:
             self.load_model_and_tokenizer()
 
-        # Настройка LoRA
+        # Setup LoRA
         lora_config = self.setup_lora_config()
 
-        # Применяем LoRA к модели
+        # Apply LoRA to model
         self.model = get_peft_model(self.model, lora_config)
 
-        # Вывод информации о параметрах
+        # Output parameter information
         trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
         all_params = sum(p.numel() for p in self.model.parameters())
-        logger.info(f"Обучаемых параметров: {trainable_params:,} из {all_params:,} ({trainable_params/all_params:.2%})")
+        logger.info(f"Trainable parameters: {trainable_params:,} of {all_params:,} ({trainable_params/all_params:.2%})")
 
     def tokenize_dataset(self, dataset):
         """
-        Токенизация датасета для обучения
+        Tokenize dataset for training
 
         Args:
-            dataset: Датасет для токенизации
+            dataset: Dataset to tokenize
 
         Returns:
-            Токенизированный датасет
+            Tokenized dataset
         """
         def tokenize_function(examples):
-            # Форматируем диалоги в единую строку
+            # Format dialogues into single string
             texts = []
             for dialog in examples["messages"]:
                 text = ""
@@ -187,7 +187,7 @@
                     text += f"Assistant: {message['content']}\n"
                 texts.append(text)
 
-            # Токенизируем тексты
+            # Tokenize texts
             tokenized = self.tokenizer(
                 texts,
                 padding="max_length",
@@ -198,7 +198,7 @@
 
             return tokenized
 
-        # Применяем функцию токенизации
+        # Apply tokenization function
         tokenized_dataset = dataset.map(
             tokenize_function,
             batched=True,
@@ -207,6 +207,75 @@
 
         return tokenized_dataset
 
+    # Add this method to the class in fine_tuner.py, or to the module's functions:
+
+    def finetune_from_annotations(epochs=3, batch_size=4, learning_rate=2e-4, min_rating=4):
+        """
+        Fine-tune model using annotated QA pairs
+
+        Args:
+            epochs: Number of training epochs
+            batch_size: Batch size for training
+            learning_rate: Learning rate
+            min_rating: Minimum average rating for including examples
+
+        Returns:
+            (success, message)
+        """
+        try:
+            import tempfile
+            import os
+            from src.analytics.chat_evaluator import ChatEvaluator
+            from config.settings import HF_TOKEN, DATASET_ID, CHAT_HISTORY_PATH
+
+            # Create evaluator
+            evaluator = ChatEvaluator(
+                hf_token=HF_TOKEN,
+                dataset_id=DATASET_ID,
+                chat_history_path=CHAT_HISTORY_PATH
+            )
+
+            # Create temporary file for training data
+            with tempfile.NamedTemporaryFile(mode='w+', suffix='.jsonl', delete=False) as temp_file:
+                temp_path = temp_file.name
+
+            # Export high-quality examples
+            success, message = evaluator.export_training_data(temp_path, min_rating)
+
+            if not success:
+                return False, f"Failed to export training data: {message}"
+
+            # Count examples
+            with open(temp_path, 'r') as f:
+                example_count = sum(1 for _ in f)
+
+            if example_count == 0:
+                return False, "No high-quality examples found for fine-tuning"
+
+            # Run actual fine-tuning using the export file
+            from src.training.fine_tuner import finetune_from_file
+
+            success, message = finetune_from_file(
+                training_file=temp_path,
+                epochs=epochs,
+                batch_size=batch_size,
+                learning_rate=learning_rate
+            )
+
+            # Clean up temporary file
+            try:
+                os.unlink(temp_path)
+            except:
+                pass
+
+            if success:
+                return True, f"Successfully fine-tuned model with {example_count} annotated examples: {message}"
+            else:
+                return False, f"Fine-tuning failed: {message}"
+
+        except Exception as e:
+            return False, f"Error during fine-tuning from annotations: {str(e)}"
+
     def train(
         self,
         training_data_path: Optional[str] = None,
@@ -218,49 +287,49 @@
         save_strategy: str = "epoch"
     ) -> Tuple[bool, str]:
         """
-        Запуск процесса дообучения модели
+        Start model fine-tuning process
 
         Args:
-            training_data_path: Путь к данным для обучения (если None, данные будут подготовлены автоматически)
-            num_train_epochs: Количество эпох обучения
-            per_device_train_batch_size: Размер батча на устройство
-            gradient_accumulation_steps: Количество шагов накопления градиента
-            learning_rate: Скорость обучения
-            logging_steps: Частота логирования
-            save_strategy: Стратегия сохранения модели
+            training_data_path: Path to training data (if None, data will be prepared automatically)
+            num_train_epochs: Number of training epochs
+            per_device_train_batch_size: Batch size per device
+            gradient_accumulation_steps: Number of gradient accumulation steps
+            learning_rate: Learning rate
+            logging_steps: Logging frequency
+            save_strategy: Model saving strategy
 
         Returns:
-            (успех, сообщение)
+            (success, message)
         """
         try:
-            # Подготовка данных для обучения, если не указан путь
+            # Prepare training data if path not specified
             if training_data_path is None:
                 training_data_path = self.prepare_training_data()
                 temp_data = True
             else:
                 temp_data = False
 
-            # Загрузка модели и токенизатора, если не загружены
+            # Load model and tokenizer if not loaded
             if self.model is None or self.tokenizer is None:
                 self.load_model_and_tokenizer()
 
-            # Подготовка модели для обучения
+            # Prepare model for training
             self.prepare_model_for_training()
 
-            # Загрузка датасета
+            # Load dataset
             dataset = load_dataset("json", data_files=training_data_path, split="train")
-            logger.info(f"Загружено {len(dataset)} примеров из {training_data_path}")
+            logger.info(f"Loaded {len(dataset)} examples from {training_data_path}")
 
-            # Токенизация датасета
+            # Tokenize dataset
             tokenized_dataset = self.tokenize_dataset(dataset)
 
-            # Создание колатора данных
+            # Create data collator
             data_collator = DataCollatorForLanguageModeling(
                 tokenizer=self.tokenizer,
                 mlm=False
             )
 
-            # Настройка аргументов обучения
+            # Setup training arguments
             training_args = TrainingArguments(
                 output_dir=self.output_dir,
                 num_train_epochs=num_train_epochs,
@@ -278,7 +347,7 @@
                 load_best_model_at_end=True
             )
 
-            # Создание тренера
+            # Create trainer
             trainer = Trainer(
                 model=self.model,
                 args=training_args,
@@ -287,23 +356,23 @@
                 tokenizer=self.tokenizer
             )
 
-            # Запуск обучения
-            logger.info("Начало обучения модели...")
+            # Start training
+            logger.info("Starting model training...")
             trainer.train()
 
-            # Сохранение модели
-            logger.info(f"Сохранение обученной модели в {self.output_dir}")
+            # Save model
+            logger.info(f"Saving trained model to {self.output_dir}")
             trainer.save_model(self.output_dir)
             self.tokenizer.save_pretrained(self.output_dir)
 
-            # Удаляем временный файл, если он был создан
+            # Remove temporary file if created
             if temp_data and os.path.exists(training_data_path):
                 os.remove(training_data_path)
 
-            return True, f"Модель успешно обучена и сохранена в {self.output_dir}"
+            return True, f"Model successfully trained and saved to {self.output_dir}"
         except Exception as e:
-            logger.error(f"Ошибка в процессе обучения: {str(e)}")
-            return False, f"Ошибка в процессе обучения: {str(e)}"
+            logger.error(f"Error during training: {str(e)}")
+            return False, f"Error during training: {str(e)}"
 
     def upload_model_to_hub(
         self,
@@ -312,24 +381,24 @@
         token: Optional[str] = None
     ) -> Tuple[bool, str]:
         """
-        Загрузка обученной модели на Hugging Face Hub
+        Upload trained model to Hugging Face Hub
 
         Args:
-            repo_id: Идентификатор репозитория на Hugging Face Hub
-            private: Флаг приватности репозитория
-            token: Токен доступа к Hugging Face Hub
+            repo_id: Repository ID on Hugging Face Hub
+            private: Repository privacy flag
+            token: Hugging Face Hub access token
 
         Returns:
-            (успех, сообщение)
+            (success, message)
         """
         try:
             if not os.path.exists(os.path.join(self.output_dir, "pytorch_model.bin")):
-                return False, "Обученная модель не найдена. Сначала выполните обучение."
+                return False, "Trained model not found. Please train the model first."
 
-            # Инициализация API
+            # Initialize API
             api = HfApi(token=token)
 
-            # Загрузка модели на Hub
+            # Upload model to Hub
             api.create_repo(repo_id=repo_id, private=private, repo_type="model", exist_ok=True)
             api.upload_folder(
                 folder_path=self.output_dir,
@@ -337,35 +406,35 @@
                 repo_type="model"
             )
 
-            return True, f"Модель успешно загружена на Hugging Face Hub: {repo_id}"
+            return True, f"Model successfully uploaded to Hugging Face Hub: {repo_id}"
         except Exception as e:
-            return False, f"Ошибка при загрузке модели на Hub: {str(e)}"
+            return False, f"Error uploading model to Hub: {str(e)}"
 
 def finetune_from_chat_history(epochs: int = 3) -> Tuple[bool, str]:
     """
-    Функция для запуска процесса дообучения на основе истории чатов
+    Function to start fine-tuning process based on chat history
 
     Args:
-        epochs: Количество эпох обучения
+        epochs: Number of training epochs
 
     Returns:
-        (успех, сообщение)
+        (success, message)
     """
-    # Анализ чатов и подготовка данных
+    # Analyze chats and prepare data
    analyzer = ChatAnalyzer()
    report = analyzer.generate_analytics_report()
 
-    # Проверка наличия достаточного количества данных
+    # Check if there's enough data
    if report["qa_pairs_count"] < 10:
-        return False, f"Недостаточно данных для дообучения. Найдено всего {report['qa_pairs_count']} пар вопрос-ответ."
+        return False, f"Insufficient data for fine-tuning. Only {report['qa_pairs_count']} QA pairs found."
 
-    # Создание и запуск процесса дообучения
+    # Create and start fine-tuning process
    tuner = FineTuner()
    success, message = tuner.train(num_train_epochs=epochs)
 
    return success, message
 
 if __name__ == "__main__":
-    # Пример использования
+    # Usage example
    success, message = finetune_from_chat_history()
    print(message)
 
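Both copies of finetune_from_annotations import finetune_from_file from src.training.fine_tuner, but no function of that name is defined anywhere in this commit. A plausible shape for the missing helper, sketched here as an assumption (it feeds an existing JSONL file straight into FineTuner.train, whose signature appears in the hunk above):

from typing import Tuple

def finetune_from_file(training_file: str,
                       epochs: int = 3,
                       batch_size: int = 4,
                       learning_rate: float = 2e-4) -> Tuple[bool, str]:
    """Hypothetical wrapper: train on a prepared JSONL file of chat-format examples."""
    tuner = FineTuner()
    # Passing training_data_path skips the automatic data-preparation branch in train().
    return tuner.train(
        training_data_path=training_file,
        num_train_epochs=epochs,
        per_device_train_batch_size=batch_size,
        learning_rate=learning_rate,
    )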
web/evaluation_interface.py ADDED
@@ -0,0 +1,240 @@
+"""
+Interface components for chat evaluation
+"""
+
+import gradio as gr
+import pandas as pd
+from src.analytics.chat_evaluator import ChatEvaluator
+import json
+import os
+from typing import Dict, Any, List, Tuple
+
+def get_evaluation_status(evaluator: ChatEvaluator) -> str:
+    """
+    Format evaluation status for display
+
+    Args:
+        evaluator: ChatEvaluator instance
+
+    Returns:
+        Formatted markdown string with status information
+    """
+    status = evaluator.get_evaluation_status()
+
+    status_md = f"""
+    ## Evaluation Status
+
+    - **Total QA Pairs:** {status['total_qa_pairs']}
+    - **Evaluated Pairs:** {status['evaluated_pairs']} ({status['evaluated_pairs']/max(1, status['total_qa_pairs'])*100:.1f}%)
+    - **Unevaluated Pairs:** {status['unevaluated_pairs']}
+    - **Evaluated Conversations:** {status['evaluated_conversations']}
+    """
+
+    return status_md
+
+def get_qa_pairs_dataframe(evaluator: ChatEvaluator, show_evaluated: bool = False, limit: int = 50) -> pd.DataFrame:
+    """
+    Get QA pairs as a pandas DataFrame for display
+
+    Args:
+        evaluator: ChatEvaluator instance
+        show_evaluated: Whether to show already evaluated pairs
+        limit: Maximum number of pairs to return
+
+    Returns:
+        DataFrame with QA pairs
+    """
+    qa_pairs = evaluator.get_qa_pairs_for_evaluation(limit=200)  # Get more than needed for filtering
+    annotations = evaluator.get_annotations()
+
+    # Create set of evaluated conversation IDs
+    evaluated_ids = set(a.get("conversation_id") for a in annotations)
+
+    # Filter QA pairs based on show_evaluated parameter
+    if not show_evaluated:
+        qa_pairs = [pair for pair in qa_pairs if pair.get("conversation_id") not in evaluated_ids]
+
+    # Limit the results
+    qa_pairs = qa_pairs[:limit]
+
+    # Create DataFrame
+    if qa_pairs:
+        df = pd.DataFrame(qa_pairs)
+
+        # Add "Evaluated" column
+        df["evaluated"] = df["conversation_id"].apply(lambda x: "Yes" if x in evaluated_ids else "No")
+
+        # Select and rename columns for display
+        display_df = df[["conversation_id", "question", "original_answer", "evaluated"]].copy()
+        display_df = display_df.rename(columns={
+            "conversation_id": "ID",
+            "question": "Question",
+            "original_answer": "Answer",
+            "evaluated": "Evaluated"
+        })
+
+        # Truncate long text for better display
+        display_df["Question"] = display_df["Question"].apply(lambda x: (x[:150] + "...") if len(x) > 150 else x)
+        display_df["Answer"] = display_df["Answer"].apply(lambda x: (x[:150] + "...") if len(x) > 150 else x)
+
+        return display_df
+
+    # Return empty DataFrame if no pairs
+    return pd.DataFrame(columns=["ID", "Question", "Answer", "Evaluated"])
+
+def load_qa_pair_for_evaluation(evaluator: ChatEvaluator, conversation_id: str) -> Tuple[str, str, str, Dict, str]:
+    """
+    Load a QA pair for evaluation
+
+    Args:
+        evaluator: ChatEvaluator instance
+        conversation_id: ID of the conversation to load
+
+    Returns:
+        Tuple of (question, original_answer, improved_answer, existing_ratings, notes)
+    """
+    # Get all QA pairs
+    qa_pairs = evaluator.get_qa_pairs_for_evaluation(limit=1000)
+
+    # Find the requested pair
+    for pair in qa_pairs:
+        if pair.get("conversation_id") == conversation_id:
+            question = pair.get("question", "")
+            original_answer = pair.get("original_answer", "")
+
+            # Check if there's an existing annotation
+            annotation = evaluator.get_annotation_by_conversation_id(conversation_id)
+
+            if annotation:
+                existing_ratings = annotation.get("ratings", {})
+                improved_answer = annotation.get("improved_answer", original_answer)
+                notes = annotation.get("notes", "")
+                return question, original_answer, improved_answer, existing_ratings, notes
+
+            return question, original_answer, original_answer, {}, ""
+
+    return "", "", "", {}, ""
+
+def save_evaluation(
+    evaluator: ChatEvaluator,
+    conversation_id: str,
+    question: str,
+    original_answer: str,
+    improved_answer: str,
+    accuracy: int,
+    completeness: int,
+    relevance: int,
+    clarity: int,
+    legal_correctness: int,
+    notes: str
+) -> str:
+    """
+    Save evaluation to file and dataset
+
+    Args:
+        evaluator: ChatEvaluator instance
+        conversation_id: ID of the conversation
+        question: User question
+        original_answer: Original bot answer
+        improved_answer: Improved answer
+        accuracy: Rating for factual accuracy (1-5)
+        completeness: Rating for completeness (1-5)
+        relevance: Rating for relevance (1-5)
+        clarity: Rating for clarity (1-5)
+        legal_correctness: Rating for legal correctness (1-5)
+        notes: Evaluator notes
+
+    Returns:
+        Status message
+    """
+    # Create ratings dictionary
+    ratings = {
+        "accuracy": accuracy,
+        "completeness": completeness,
+        "relevance": relevance,
+        "clarity": clarity,
+        "legal_correctness": legal_correctness
+    }
+
+    # Save annotation
+    success, message = evaluator.save_annotation(
+        conversation_id=conversation_id,
+        question=question,
+        original_answer=original_answer,
+        improved_answer=improved_answer,
+        ratings=ratings,
+        notes=notes
+    )
+
+    return message
+
+def generate_evaluation_report_html(evaluator: ChatEvaluator) -> str:
+    """
+    Generate HTML report of evaluation metrics
+
+    Args:
+        evaluator: ChatEvaluator instance
+
+    Returns:
+        HTML string with report
+    """
+    report = evaluator.generate_evaluation_report()
+
+    if report["total_evaluations"] == 0:
+        return "<p>No evaluations available yet.</p>"
+
+    # Format criteria averages
+    criteria_html = ""
+    for criterion, avg in report["criteria_averages"].items():
+        # Calculate stars representation (1-5)
+        stars = "★" * int(avg) + "☆" * (5 - int(avg))
+        criteria_html += f"""
+        <tr>
+            <td>{criterion.capitalize()}</td>
+            <td>{avg:.2f}/5.0</td>
+            <td>{stars}</td>
+        </tr>
+        """
+
+    # Overall stars representation
+    overall_stars = "★" * int(report["overall_average"]) + "☆" * (5 - int(report["overall_average"]))
+
+    html = f"""
+    <div style="padding: 15px; border: 1px solid #ccc; border-radius: 5px; margin-top: 10px;">
+        <h3>Evaluation Report</h3>
+
+        <p><strong>Total Evaluations:</strong> {report["total_evaluations"]}</p>
+        <p><strong>Overall Average Rating:</strong> {report["overall_average"]:.2f}/5.0 {overall_stars}</p>
+        <p><strong>Improvement Rate:</strong> {report["improvement_rate"]:.1f}% of responses were improved</p>
+
+        <h4>Criteria Ratings:</h4>
+        <table style="width: 100%; border-collapse: collapse;">
+            <tr>
+                <th style="text-align: left; padding: 8px; border-bottom: 1px solid #ddd;">Criterion</th>
+                <th style="text-align: left; padding: 8px; border-bottom: 1px solid #ddd;">Average Score</th>
+                <th style="text-align: left; padding: 8px; border-bottom: 1px solid #ddd;">Rating</th>
+            </tr>
+            {criteria_html}
+        </table>
+    </div>
+    """
+
+    return html
+
+def export_training_data_action(evaluator: ChatEvaluator, min_rating: int, output_file: str) -> str:
+    """
+    Action for exporting training data
+
+    Args:
+        evaluator: ChatEvaluator instance
+        min_rating: Minimum average rating (1-5)
+        output_file: Output file path
+
+    Returns:
+        Status message
+    """
+    if not output_file:
+        output_file = os.path.join(os.path.dirname(evaluator.annotations_dir), "training_data.jsonl")
+
+    success, message = evaluator.export_training_data(output_file, min_rating)
+    return message
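One signature mismatch worth noting: load_qa_pair_for_evaluation returns a five-element tuple whose fourth element is the ratings dict, while the load_btn.click wiring in app.py lists nine outputs (three text fields, five rating sliders, notes). A small adapter that flattens the ratings into the order the sliders are wired, sketched as an assumption rather than part of the commit:

from functools import partial

def load_for_ui(evaluator, conversation_id):
    """Hypothetical adapter: unpack the ratings dict into individual slider values."""
    question, original, improved, ratings, notes = load_qa_pair_for_evaluation(
        evaluator, conversation_id
    )
    return (
        question, original, improved,
        ratings.get("accuracy", 3),        # defaults mirror the sliders' value=3
        ratings.get("completeness", 3),
        ratings.get("relevance", 3),
        ratings.get("clarity", 3),
        ratings.get("legal_correctness", 3),
        notes,
    )

# load_btn.click(fn=partial(load_for_ui, chat_evaluator), inputs=[selected_conversation],
#                outputs=[question_display, original_answer, improved_answer, accuracy,
#                         completeness, relevance, clarity, legal_correctness, notes])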