Add chat evaluation interface components and functionality
Files changed:
- app.py +232 -12
- src/analytics/chat_evaluator.py +320 -0
- src/training/fine_tuner.py +141 -72
- web/evaluation_interface.py +240 -0
app.py
CHANGED
@@ -4,7 +4,6 @@ import json
 import datetime
 from pathlib import Path
 from huggingface_hub import InferenceClient, HfApi
-#from huggingface_hub import InferenceClient
 from config.constants import DEFAULT_SYSTEM_MESSAGE
 from config.settings import (
     HF_TOKEN,

@@ -23,6 +22,15 @@ from web.training_interface import (
     register_model_action,
     start_finetune_action
 )
+from web.evaluation_interface import (
+    get_evaluation_status,
+    get_qa_pairs_dataframe,
+    load_qa_pair_for_evaluation,
+    save_evaluation,
+    generate_evaluation_report_html,
+    export_training_data_action
+)
+from src.analytics.chat_evaluator import ChatEvaluator

 if not HF_TOKEN:
     raise ValueError("HUGGINGFACE_TOKEN not found in environment variables")

@@ -81,6 +89,11 @@ ERROR_LOGS_PATH = os.path.join(os.path.dirname(__file__), "error_logs")
 client = None
 context_store = {}
 fallback_model_attempted = False
+chat_evaluator = ChatEvaluator(
+    hf_token=HF_TOKEN,
+    dataset_id=DATASET_ID,
+    chat_history_path=CHAT_HISTORY_PATH
+)

 print(f"Chat histories will be saved to: {CHAT_HISTORY_PATH}")

@@ -625,6 +638,73 @@ def save_parameters(model_key, max_len, temp, top_p_val, rep_pen):
     except Exception as e:
         return f"Error saving parameters: {str(e)}"

+def finetune_from_annotations(epochs=3, batch_size=4, learning_rate=2e-4, min_rating=4):
+    """
+    Fine-tune the model using annotated QA pairs
+
+    Args:
+        epochs: Number of training epochs
+        batch_size: Batch size for training
+        learning_rate: Learning rate
+        min_rating: Minimum average rating for including examples
+
+    Returns:
+        (success, message)
+    """
+    try:
+        import tempfile
+        import os
+        from src.analytics.chat_evaluator import ChatEvaluator
+        from config.settings import HF_TOKEN, DATASET_ID, CHAT_HISTORY_PATH
+
+        # Create evaluator
+        evaluator = ChatEvaluator(
+            hf_token=HF_TOKEN,
+            dataset_id=DATASET_ID,
+            chat_history_path=CHAT_HISTORY_PATH
+        )
+
+        # Create temporary file for training data
+        with tempfile.NamedTemporaryFile(mode='w+', suffix='.jsonl', delete=False) as temp_file:
+            temp_path = temp_file.name
+
+        # Export high-quality examples
+        success, message = evaluator.export_training_data(temp_path, min_rating)
+
+        if not success:
+            return False, f"Failed to export training data: {message}"
+
+        # Count examples
+        with open(temp_path, 'r') as f:
+            example_count = sum(1 for _ in f)
+
+        if example_count == 0:
+            return False, "No high-quality examples found for fine-tuning"
+
+        # Run the actual fine-tuning on the exported file
+        from src.training.fine_tuner import finetune_from_file
+
+        success, message = finetune_from_file(
+            training_file=temp_path,
+            epochs=epochs,
+            batch_size=batch_size,
+            learning_rate=learning_rate
+        )
+
+        # Clean up temporary file
+        try:
+            os.unlink(temp_path)
+        except OSError:
+            pass
+
+        if success:
+            return True, f"Successfully fine-tuned model with {example_count} annotated examples: {message}"
+        else:
+            return False, f"Fine-tuning failed: {message}"
+
+    except Exception as e:
+        return False, f"Error during fine-tuning from annotations: {str(e)}"
+
 def initialize_app():
     """Initialize app with user preferences"""
     global client, ACTIVE_MODEL

@@ -810,14 +890,27 @@ with gr.Blocks() as demo:
     gr.Markdown("### Model Training Interface")

     with gr.Row():
-        with gr.Column():
-            epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Epochs")
-            batch_size = gr.Slider(minimum=1, maximum=32, value=4, step=1, label="Batch Size")
-            learning_rate = gr.Slider(minimum=1e-6, maximum=1e-3, value=2e-4, label="Learning Rate")
-
-            train_btn = gr.Button("Start Training", variant="primary")
-            training_output = gr.Textbox(label="Training Status", interactive=False)
-
+        with gr.Column(scale=1):
+            training_tabs = gr.Tabs()
+
+            with training_tabs:
+                with gr.TabItem("Regular Training"):
+                    epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Epochs")
+                    batch_size = gr.Slider(minimum=1, maximum=32, value=4, step=1, label="Batch Size")
+                    learning_rate = gr.Slider(minimum=1e-6, maximum=1e-3, value=2e-4, label="Learning Rate")
+
+                    train_btn = gr.Button("Start Training", variant="primary")
+                    training_output = gr.Textbox(label="Training Status", interactive=False)
+
+                with gr.TabItem("Train from Annotations"):
+                    annot_epochs = gr.Slider(minimum=1, maximum=10, value=3, step=1, label="Number of Epochs")
+                    annot_batch_size = gr.Slider(minimum=1, maximum=32, value=4, step=1, label="Batch Size")
+                    annot_learning_rate = gr.Slider(minimum=1e-6, maximum=1e-3, value=2e-4, label="Learning Rate")
+                    annot_min_rating = gr.Slider(minimum=1, maximum=5, value=4, step=0.5, label="Minimum Rating for Training")
+
+                    annot_train_btn = gr.Button("Start Training from Annotations", variant="primary")
+                    annot_training_output = gr.Textbox(label="Training Status", interactive=False)

             gr.Markdown("""
             <small>

@@ -834,10 +927,8 @@ with gr.Blocks() as demo:
             2e-4 (0.0002) = Usually works best -> 1e-4 = Safer choice for fine-tuning
             </small>
             """)
-
-

-        with gr.Column():
+        with gr.Column(scale=1):
             analysis_btn = gr.Button("Generate Chat Analysis")
             analysis_output = gr.Markdown()

@@ -846,11 +937,140 @@ with gr.Blocks() as demo:
         inputs=[epochs, batch_size, learning_rate],
         outputs=[training_output]
     )
+
+    # Wrapper to start fine-tuning from annotations
+    def start_annotation_finetune(epochs, batch_size, learning_rate, min_rating):
+        """Run finetune_from_annotations and return its status message"""
+        success, message = finetune_from_annotations(
+            epochs=epochs,
+            batch_size=batch_size,
+            learning_rate=learning_rate,
+            min_rating=min_rating
+        )
+        return message
+
+    annot_train_btn.click(
+        start_annotation_finetune,
+        inputs=[annot_epochs, annot_batch_size, annot_learning_rate, annot_min_rating],
+        outputs=[annot_training_output]
+    )
+
     analysis_btn.click(
         generate_chat_analysis,
         inputs=[],
         outputs=[analysis_output]
     )
+
+    with gr.Tab("Chat Evaluation"):
+        gr.Markdown("### Evaluation of Chat Responses")
+
+        with gr.Row():
+            with gr.Column(scale=1):
+                evaluation_status = gr.Markdown(get_evaluation_status(chat_evaluator))
+                refresh_status_btn = gr.Button("Refresh Status")
+
+                gr.Markdown("### Evaluation Metrics")
+                evaluation_report = gr.HTML(generate_evaluation_report_html(chat_evaluator))
+                refresh_report_btn = gr.Button("Refresh Report")
+
+                gr.Markdown("### Export for Training")
+                with gr.Row():
+                    min_rating = gr.Slider(
+                        minimum=1,
+                        maximum=5,
+                        value=4,
+                        step=0.5,
+                        label="Minimum Average Rating"
+                    )
+                    export_path = gr.Textbox(
+                        label="Export File Path (optional)",
+                        placeholder="Leave empty for default path"
+                    )
+                export_btn = gr.Button("Export Annotated Data", variant="primary")
+                export_status = gr.Textbox(label="Export Status", interactive=False)
+
+            with gr.Column(scale=2):
+                show_evaluated = gr.Checkbox(label="Show Already Evaluated Pairs", value=False)
+                qa_table = gr.DataFrame(get_qa_pairs_dataframe(chat_evaluator))
+
+                gr.Markdown("### Select Conversation to Evaluate")
+                selected_conversation = gr.Textbox(label="Conversation ID", placeholder="Select from table above")
+                load_btn = gr.Button("Load Conversation", variant="primary")
+
+                gr.Markdown("### Evaluate Response")
+                question_display = gr.Textbox(label="User Question", interactive=False)
+                original_answer = gr.TextArea(label="Original Bot Answer", interactive=False)
+                improved_answer = gr.TextArea(label="Improved Answer (Gold Standard)", interactive=True)
+
+                gr.Markdown("### Quality Ratings (1-5)")
+                with gr.Row():
+                    accuracy = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Factual Accuracy")
+                    completeness = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Completeness")
+                with gr.Row():
+                    relevance = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Relevance")
+                    clarity = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Clarity")
+                    legal_correctness = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Legal Correctness")
+
+                notes = gr.TextArea(label="Evaluator Notes", placeholder="Add your notes about this response...")
+                save_btn = gr.Button("Save Evaluation", variant="primary")
+                evaluation_status_msg = gr.Textbox(label="Status", interactive=False)
+
+        # Event handlers. The shared evaluator is bound via closures, since
+        # Gradio event listeners do not forward extra keyword arguments to fn.
+        refresh_status_btn.click(
+            fn=lambda: get_evaluation_status(chat_evaluator),
+            inputs=[],
+            outputs=[evaluation_status]
+        )
+
+        refresh_report_btn.click(
+            fn=lambda: generate_evaluation_report_html(chat_evaluator),
+            inputs=[],
+            outputs=[evaluation_report]
+        )
+
+        show_evaluated.change(
+            fn=lambda show: get_qa_pairs_dataframe(chat_evaluator, show),
+            inputs=[show_evaluated],
+            outputs=[qa_table]
+        )
+
+        # Copy the conversation ID of the selected table row into the textbox
+        def select_conversation(df, evt: gr.SelectData):
+            # SelectData.index is (row, col); take the ID column of that row
+            return str(df.iloc[evt.index[0]]["ID"]) if evt is not None else ""
+
+        qa_table.select(
+            fn=select_conversation,
+            inputs=[qa_table],
+            outputs=[selected_conversation]
+        )
+
+        # Load conversation for evaluation
+        load_btn.click(
+            fn=lambda conv_id: load_qa_pair_for_evaluation(chat_evaluator, conv_id),
+            inputs=[selected_conversation],
+            outputs=[question_display, original_answer, improved_answer,
+                     accuracy, completeness, relevance, clarity, legal_correctness, notes]
+        )
+
+        # Save evaluation
+        save_btn.click(
+            fn=lambda *args: save_evaluation(chat_evaluator, *args),
+            inputs=[
+                selected_conversation, question_display, original_answer, improved_answer,
+                accuracy, completeness, relevance, clarity, legal_correctness, notes
+            ],
+            outputs=[evaluation_status_msg]
+        )
+
+        # Export training data
+        export_btn.click(
+            fn=lambda rating, path: export_training_data_action(chat_evaluator, rating, path),
+            inputs=[min_rating, export_path],
+            outputs=[export_status]
+        )

     # Model change handler
     model_selector.change(

@@ -882,4 +1102,4 @@ if __name__ == "__main__":
     if not load_vector_store():
         print("Knowledge base not found. Please create it through the interface.")

-    demo.launch()
+    demo.launch()
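The evaluation handlers above bind the shared chat_evaluator through closures, because Gradio event listeners pass only the values of the declared inputs components to fn. An equivalent way to pre-bind the evaluator, sketched here assuming the same helper signatures as in web/evaluation_interface.py, is functools.partial:

    from functools import partial

    # partial pre-binds the evaluator; Gradio then supplies the remaining
    # positional arguments from the declared `inputs` components.
    refresh_status_btn.click(
        fn=partial(get_evaluation_status, chat_evaluator),
        inputs=[],
        outputs=[evaluation_status],
    )

    show_evaluated.change(
        fn=partial(get_qa_pairs_dataframe, chat_evaluator),
        inputs=[show_evaluated],
        outputs=[qa_table],
    )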
src/analytics/chat_evaluator.py
ADDED
@@ -0,0 +1,320 @@
"""
Module for evaluation and annotation of bot responses
"""

import json
import os
import datetime
from typing import List, Dict, Any, Tuple, Optional
import pandas as pd
from src.knowledge_base.dataset import DatasetManager
from huggingface_hub import HfApi

class ChatEvaluator:
    def __init__(self,
                 dataset_manager: Optional[DatasetManager] = None,
                 hf_token: str = None,
                 dataset_id: str = None,
                 chat_history_path: str = None):
        """
        Initialize chat evaluator

        Args:
            dataset_manager: Dataset manager for retrieving chat history
            hf_token: Hugging Face token for uploading annotations
            dataset_id: Hugging Face dataset ID
            chat_history_path: Path to local chat history directory
        """
        self.dataset_manager = dataset_manager or DatasetManager()
        self.hf_token = hf_token
        self.dataset_id = dataset_id
        self.chat_history_path = chat_history_path
        self.annotations_dir = os.path.join(os.path.dirname(chat_history_path), "annotations") if chat_history_path else None

        # Create annotations directory if it doesn't exist
        if self.annotations_dir:
            os.makedirs(self.annotations_dir, exist_ok=True)

    def get_chat_history(self) -> List[Dict[str, Any]]:
        """
        Get all chat history data from local files and dataset

        Returns:
            List of chat histories
        """
        success, chat_data = self.dataset_manager.get_chat_history()
        if not success or not chat_data:
            return []
        return chat_data

    def get_qa_pairs_for_evaluation(self, limit: int = 50) -> List[Dict[str, Any]]:
        """
        Extract question-answer pairs for evaluation

        Args:
            limit: Maximum number of pairs to return

        Returns:
            List of QA pairs with metadata
        """
        chat_data = self.get_chat_history()
        qa_pairs = []

        for chat in chat_data:
            conversation_id = chat.get("conversation_id", "unknown")
            timestamp = chat.get("timestamp", "")
            history = chat.get("history", [])

            # Find user-assistant pairs in history
            for i in range(len(history) - 1):
                if history[i].get("role") == "user" and history[i+1].get("role") == "assistant":
                    question = history[i].get("content", "").strip()
                    answer = history[i+1].get("content", "").strip()

                    # Only include non-empty pairs
                    if question and answer:
                        qa_pairs.append({
                            "conversation_id": conversation_id,
                            "timestamp": timestamp,
                            "question": question,
                            "original_answer": answer,
                            "question_timestamp": history[i].get("timestamp", ""),
                            "answer_timestamp": history[i+1].get("timestamp", "")
                        })

                    # Check if we've reached the limit
                    if len(qa_pairs) >= limit:
                        return qa_pairs

        return qa_pairs

    def get_evaluation_status(self) -> Dict[str, int]:
        """
        Get status of evaluated QA pairs

        Returns:
            Dictionary with counts of evaluated and unevaluated QA pairs
        """
        all_pairs = self.get_qa_pairs_for_evaluation(limit=1000)  # Get a large sample
        evaluated_pairs = self.get_annotations()

        # Count evaluated conversation IDs
        evaluated_ids = set(item.get("conversation_id") for item in evaluated_pairs)

        return {
            "total_qa_pairs": len(all_pairs),
            "evaluated_pairs": len(evaluated_pairs),
            "unevaluated_pairs": len(all_pairs) - len(evaluated_pairs),
            "evaluated_conversations": len(evaluated_ids)
        }

    def save_annotation(self,
                        conversation_id: str,
                        question: str,
                        original_answer: str,
                        improved_answer: str,
                        ratings: Dict[str, int],
                        notes: str = "") -> Tuple[bool, str]:
        """
        Save evaluation annotation

        Args:
            conversation_id: ID of the conversation
            question: User question
            original_answer: Original bot answer
            improved_answer: Improved answer (gold standard)
            ratings: Dictionary with ratings for different criteria
            notes: Optional evaluator notes

        Returns:
            (success, message)
        """
        if not self.annotations_dir:
            return False, "Annotations directory not configured"

        try:
            # Create annotation object
            annotation = {
                "conversation_id": conversation_id,
                "timestamp": datetime.datetime.now().isoformat(),
                "question": question,
                "original_answer": original_answer,
                "improved_answer": improved_answer,
                "ratings": ratings,
                "notes": notes
            }

            # Create filename with conversation_id
            filename = f"annotation_{conversation_id}.json"
            filepath = os.path.join(self.annotations_dir, filename)

            # Save to local file
            with open(filepath, 'w', encoding='utf-8') as f:
                json.dump(annotation, f, ensure_ascii=False, indent=2)

            # Upload to HuggingFace dataset if configured
            if self.hf_token and self.dataset_id:
                try:
                    api = HfApi(token=self.hf_token)

                    # Extract just the directory name from annotations_dir
                    dir_name = os.path.basename(self.annotations_dir)
                    target_path = f"{dir_name}/{filename}"

                    # Upload the file to the dataset
                    api.upload_file(
                        path_or_fileobj=filepath,
                        path_in_repo=target_path,
                        repo_id=self.dataset_id,
                        repo_type="dataset"
                    )

                except Exception as e:
                    return True, f"Saved locally but failed to upload to dataset: {str(e)}"

            return True, "Annotation saved successfully"
        except Exception as e:
            return False, f"Error saving annotation: {str(e)}"

    def get_annotations(self) -> List[Dict[str, Any]]:
        """
        Get all saved annotations

        Returns:
            List of annotation objects
        """
        if not self.annotations_dir or not os.path.exists(self.annotations_dir):
            return []

        annotations = []
        for filename in os.listdir(self.annotations_dir):
            if filename.startswith("annotation_") and filename.endswith(".json"):
                try:
                    filepath = os.path.join(self.annotations_dir, filename)
                    with open(filepath, 'r', encoding='utf-8') as f:
                        annotation = json.load(f)
                        annotations.append(annotation)
                except Exception as e:
                    print(f"Error loading annotation {filename}: {str(e)}")

        # Sort by timestamp (newest first)
        annotations.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
        return annotations

    def get_annotation_by_conversation_id(self, conversation_id: str) -> Optional[Dict[str, Any]]:
        """
        Get annotation for a specific conversation

        Args:
            conversation_id: Conversation ID to look for

        Returns:
            Annotation object or None if not found
        """
        if not self.annotations_dir:
            return None

        filepath = os.path.join(self.annotations_dir, f"annotation_{conversation_id}.json")
        if os.path.exists(filepath):
            try:
                with open(filepath, 'r', encoding='utf-8') as f:
                    return json.load(f)
            except Exception as e:
                print(f"Error loading annotation for {conversation_id}: {str(e)}")

        return None

    def export_training_data(self, output_file: str, min_rating: int = 4) -> Tuple[bool, str]:
        """
        Export high-quality annotated data for fine-tuning

        Args:
            output_file: Path to output file
            min_rating: Minimum average rating to include in training data

        Returns:
            (success, message)
        """
        annotations = self.get_annotations()

        if not annotations:
            return False, "No annotations available for export"

        try:
            # Filter annotations by quality
            high_quality_examples = []

            for annotation in annotations:
                ratings = annotation.get("ratings", {})

                # Calculate average rating
                if ratings:
                    avg_rating = sum(ratings.values()) / len(ratings)

                    # Include only high-quality examples
                    if avg_rating >= min_rating:
                        high_quality_examples.append({
                            "messages": [
                                {"role": "user", "content": annotation.get("question", "")},
                                {"role": "assistant", "content": annotation.get("improved_answer", "")}
                            ]
                        })

            if not high_quality_examples:
                return False, f"No examples meet the minimum quality threshold of {min_rating}"

            # Save to JSONL format
            with open(output_file, "w", encoding="utf-8") as f:
                for example in high_quality_examples:
                    f.write(json.dumps(example, ensure_ascii=False) + "\n")

            return True, f"Successfully exported {len(high_quality_examples)} high-quality examples for training"
        except Exception as e:
            return False, f"Error exporting training data: {str(e)}"

    def generate_evaluation_report(self) -> Dict[str, Any]:
        """
        Generate evaluation summary report

        Returns:
            Dictionary with evaluation metrics
        """
        annotations = self.get_annotations()

        if not annotations:
            return {
                "total_evaluations": 0,
                "message": "No evaluations available"
            }

        # Initialize metrics
        criteria = set()
        for annotation in annotations:
            criteria.update(annotation.get("ratings", {}).keys())

        metrics = {
            "total_evaluations": len(annotations),
            "criteria_averages": {},
            "overall_average": 0,
            "improvement_rate": 0  # Percentage of answers that were improved
        }

        # Calculate averages for each criterion
        for criterion in criteria:
            values = [a.get("ratings", {}).get(criterion, 0) for a in annotations if criterion in a.get("ratings", {})]
            if values:
                metrics["criteria_averages"][criterion] = sum(values) / len(values)

        # Calculate overall average
        all_ratings = []
        for annotation in annotations:
            all_ratings.extend(annotation.get("ratings", {}).values())

        if all_ratings:
            metrics["overall_average"] = sum(all_ratings) / len(all_ratings)

        # Calculate improvement rate
        improved_count = sum(1 for a in annotations if a.get("original_answer") != a.get("improved_answer"))
        metrics["improvement_rate"] = (improved_count / len(annotations)) * 100

        return metrics
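export_training_data writes one JSON object per line, in the chat-style "messages" format that instruction-tuning pipelines commonly consume. A sample exported line (the content values here are illustrative):

    {"messages": [{"role": "user", "content": "What notice period applies?"}, {"role": "assistant", "content": "Under the standard terms, ..."}]}

Ratings are averaged as sum(ratings.values()) / len(ratings), so with the five criteria used by the evaluation tab a pair needs a combined score of at least 20 out of 25 to clear the default min_rating of 4.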
src/training/fine_tuner.py
CHANGED
This change rewrites the file's docstrings, comments, and log messages in English and adds a finetune_from_annotations helper:

@@ -1,5 +1,5 @@
 """
+Module for fine-tuning a language model on collected data
 """

 import os

@@ -84,38 +84,38 @@ class FineTuner:
         try:
             logger.info(f"Loading model {self.base_model_id}...")

+            # Load the tokenizer
             self.tokenizer = AutoTokenizer.from_pretrained(
                 self.base_model_id,
                 trust_remote_code=True,
+                use_fast=False  # Using slow tokenizer
             )

+            # Special tokens for dialogues
             special_tokens = {
                 "pad_token": "<PAD>",
                 "eos_token": "</s>",
                 "bos_token": "<s>",
+                "unk_token": "<unk>"  # Adding unknown token
             }

+            # Add special tokens if they don't exist
             self.tokenizer.add_special_tokens({"additional_special_tokens": list(special_tokens.values())})

+            # Load model
             self.model = AutoModelForCausalLM.from_pretrained(
                 self.base_model_id,
                 trust_remote_code=True,
                 device_map="auto" if self.device == "cuda" else None,
+                torch_dtype="auto"  # Automatically choose optimal data type
             )

+            # Resize embeddings for new tokens
             self.model.resize_token_embeddings(len(self.tokenizer))

+            logger.info("Model and tokenizer loaded successfully")
         except Exception as e:
+            logger.error(f"Error loading model: {str(e)}")
             raise

     def setup_lora_config(

@@ -125,17 +125,17 @@ class FineTuner:
         lora_dropout: float = 0.05
     ) -> LoraConfig:
         """
+        Setup LoRA configuration for efficient fine-tuning

         Args:
+            r: Rank of LoRA matrices
+            lora_alpha: LoRA alpha parameter
+            lora_dropout: Dropout probability in LoRA layers

         Returns:
+            LoRA configuration
         """
+        # Create LoRA configuration
         lora_config = LoraConfig(
             task_type=TaskType.CAUSAL_LM,
             r=r,

@@ -149,34 +149,34 @@ class FineTuner:

     def prepare_model_for_training(self):
         """
+        Prepare model for training using LoRA
         """
         if self.model is None:
             self.load_model_and_tokenizer()

+        # Setup LoRA
         lora_config = self.setup_lora_config()

+        # Apply LoRA to model
        self.model = get_peft_model(self.model, lora_config)

+        # Output parameter information
         trainable_params = sum(p.numel() for p in self.model.parameters() if p.requires_grad)
         all_params = sum(p.numel() for p in self.model.parameters())
+        logger.info(f"Trainable parameters: {trainable_params:,} of {all_params:,} ({trainable_params/all_params:.2%})")

     def tokenize_dataset(self, dataset):
         """
+        Tokenize dataset for training

         Args:
+            dataset: Dataset to tokenize

         Returns:
+            Tokenized dataset
         """
         def tokenize_function(examples):
+            # Format dialogues into a single string
             texts = []
             for dialog in examples["messages"]:
                 text = ""

@@ -187,7 +187,7 @@ class FineTuner:
                     text += f"Assistant: {message['content']}\n"
                 texts.append(text)

+            # Tokenize texts
             tokenized = self.tokenizer(
                 texts,
                 padding="max_length",

@@ -198,7 +198,7 @@ class FineTuner:

             return tokenized

+        # Apply tokenization function
         tokenized_dataset = dataset.map(
             tokenize_function,
             batched=True,

@@ -207,6 +207,75 @@ class FineTuner:

         return tokenized_dataset

+    # Add this method to the class in fine_tuner.py, or as a module-level function:
+
+    def finetune_from_annotations(epochs=3, batch_size=4, learning_rate=2e-4, min_rating=4):
+        """
+        Fine-tune the model using annotated QA pairs
+
+        Args:
+            epochs: Number of training epochs
+            batch_size: Batch size for training
+            learning_rate: Learning rate
+            min_rating: Minimum average rating for including examples
+
+        Returns:
+            (success, message)
+        """
+        try:
+            import tempfile
+            import os
+            from src.analytics.chat_evaluator import ChatEvaluator
+            from config.settings import HF_TOKEN, DATASET_ID, CHAT_HISTORY_PATH
+
+            # Create evaluator
+            evaluator = ChatEvaluator(
+                hf_token=HF_TOKEN,
+                dataset_id=DATASET_ID,
+                chat_history_path=CHAT_HISTORY_PATH
+            )
+
+            # Create temporary file for training data
+            with tempfile.NamedTemporaryFile(mode='w+', suffix='.jsonl', delete=False) as temp_file:
+                temp_path = temp_file.name
+
+            # Export high-quality examples
+            success, message = evaluator.export_training_data(temp_path, min_rating)
+
+            if not success:
+                return False, f"Failed to export training data: {message}"
+
+            # Count examples
+            with open(temp_path, 'r') as f:
+                example_count = sum(1 for _ in f)
+
+            if example_count == 0:
+                return False, "No high-quality examples found for fine-tuning"
+
+            # Run the actual fine-tuning on the exported file
+            from src.training.fine_tuner import finetune_from_file
+
+            success, message = finetune_from_file(
+                training_file=temp_path,
+                epochs=epochs,
+                batch_size=batch_size,
+                learning_rate=learning_rate
+            )
+
+            # Clean up temporary file
+            try:
+                os.unlink(temp_path)
+            except OSError:
+                pass
+
+            if success:
+                return True, f"Successfully fine-tuned model with {example_count} annotated examples: {message}"
+            else:
+                return False, f"Fine-tuning failed: {message}"
+
+        except Exception as e:
+            return False, f"Error during fine-tuning from annotations: {str(e)}"
+
     def train(
         self,
         training_data_path: Optional[str] = None,

@@ -218,49 +287,49 @@ class FineTuner:
         save_strategy: str = "epoch"
     ) -> Tuple[bool, str]:
         """
+        Start model fine-tuning process

         Args:
+            training_data_path: Path to training data (if None, data will be prepared automatically)
+            num_train_epochs: Number of training epochs
+            per_device_train_batch_size: Batch size per device
+            gradient_accumulation_steps: Number of gradient accumulation steps
+            learning_rate: Learning rate
+            logging_steps: Logging frequency
+            save_strategy: Model saving strategy

         Returns:
+            (success, message)
         """
         try:
+            # Prepare training data if path not specified
             if training_data_path is None:
                 training_data_path = self.prepare_training_data()
                 temp_data = True
             else:
                 temp_data = False

+            # Load model and tokenizer if not loaded
             if self.model is None or self.tokenizer is None:
                 self.load_model_and_tokenizer()

+            # Prepare model for training
             self.prepare_model_for_training()

+            # Load dataset
             dataset = load_dataset("json", data_files=training_data_path, split="train")
+            logger.info(f"Loaded {len(dataset)} examples from {training_data_path}")

+            # Tokenize dataset
             tokenized_dataset = self.tokenize_dataset(dataset)

+            # Create data collator
             data_collator = DataCollatorForLanguageModeling(
                 tokenizer=self.tokenizer,
                 mlm=False
             )

+            # Setup training arguments
             training_args = TrainingArguments(
                 output_dir=self.output_dir,
                 num_train_epochs=num_train_epochs,

@@ -278,7 +347,7 @@ class FineTuner:
                 load_best_model_at_end=True
             )

+            # Create trainer
             trainer = Trainer(
                 model=self.model,
                 args=training_args,

@@ -287,23 +356,23 @@ class FineTuner:
                 tokenizer=self.tokenizer
             )

+            # Start training
+            logger.info("Starting model training...")
             trainer.train()

+            # Save model
+            logger.info(f"Saving trained model to {self.output_dir}")
             trainer.save_model(self.output_dir)
             self.tokenizer.save_pretrained(self.output_dir)

+            # Remove temporary file if created
             if temp_data and os.path.exists(training_data_path):
                 os.remove(training_data_path)

+            return True, f"Model successfully trained and saved to {self.output_dir}"
         except Exception as e:
+            logger.error(f"Error during training: {str(e)}")
+            return False, f"Error during training: {str(e)}"

     def upload_model_to_hub(
         self,

@@ -312,24 +381,24 @@ class FineTuner:
         token: Optional[str] = None
     ) -> Tuple[bool, str]:
         """
+        Upload trained model to Hugging Face Hub

         Args:
+            repo_id: Repository ID on Hugging Face Hub
+            private: Repository privacy flag
+            token: Hugging Face Hub access token

         Returns:
+            (success, message)
         """
         try:
             if not os.path.exists(os.path.join(self.output_dir, "pytorch_model.bin")):
+                return False, "Trained model not found. Please train the model first."

+            # Initialize API
             api = HfApi(token=token)

+            # Upload model to Hub
             api.create_repo(repo_id=repo_id, private=private, repo_type="model", exist_ok=True)
             api.upload_folder(
                 folder_path=self.output_dir,

@@ -337,35 +406,35 @@ class FineTuner:
                 repo_type="model"
             )

+            return True, f"Model successfully uploaded to Hugging Face Hub: {repo_id}"
         except Exception as e:
+            return False, f"Error uploading model to Hub: {str(e)}"

 def finetune_from_chat_history(epochs: int = 3) -> Tuple[bool, str]:
     """
+    Start the fine-tuning process based on chat history

     Args:
+        epochs: Number of training epochs

     Returns:
+        (success, message)
     """
+    # Analyze chats and prepare data
     analyzer = ChatAnalyzer()
     report = analyzer.generate_analytics_report()

+    # Check if there's enough data
     if report["qa_pairs_count"] < 10:
+        return False, f"Insufficient data for fine-tuning. Only {report['qa_pairs_count']} QA pairs found."

+    # Create and start the fine-tuning process
     tuner = FineTuner()
     success, message = tuner.train(num_train_epochs=epochs)

     return success, message

 if __name__ == "__main__":
+    # Usage example
     success, message = finetune_from_chat_history()
     print(message)
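Both copies of finetune_from_annotations import finetune_from_file from this module, but no such function appears anywhere in the diff, so as committed the annotation-driven path will raise an ImportError that the surrounding try/except surfaces as a failure message. A minimal sketch of the missing helper, assuming it only needs to wrap FineTuner.train with an already-exported JSONL file (the name and signature come from the call site, not from any shown definition):

    def finetune_from_file(training_file: str,
                           epochs: int = 3,
                           batch_size: int = 4,
                           learning_rate: float = 2e-4) -> Tuple[bool, str]:
        """Hypothetical wrapper: train on an existing JSONL file instead of
        preparing data from chat history."""
        tuner = FineTuner()
        return tuner.train(
            training_data_path=training_file,
            num_train_epochs=epochs,
            per_device_train_batch_size=batch_size,
            learning_rate=learning_rate
        )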
web/evaluation_interface.py
ADDED
@@ -0,0 +1,240 @@
"""
Interface components for chat evaluation
"""

import gradio as gr
import pandas as pd
from src.analytics.chat_evaluator import ChatEvaluator
import json
import os
from typing import Dict, Any, List, Tuple

def get_evaluation_status(evaluator: ChatEvaluator) -> str:
    """
    Format evaluation status for display

    Args:
        evaluator: ChatEvaluator instance

    Returns:
        Formatted markdown string with status information
    """
    status = evaluator.get_evaluation_status()

    status_md = f"""
## Evaluation Status

- **Total QA Pairs:** {status['total_qa_pairs']}
- **Evaluated Pairs:** {status['evaluated_pairs']} ({status['evaluated_pairs']/max(1, status['total_qa_pairs'])*100:.1f}%)
- **Unevaluated Pairs:** {status['unevaluated_pairs']}
- **Evaluated Conversations:** {status['evaluated_conversations']}
"""

    return status_md

def get_qa_pairs_dataframe(evaluator: ChatEvaluator, show_evaluated: bool = False, limit: int = 50) -> pd.DataFrame:
    """
    Get QA pairs as a pandas DataFrame for display

    Args:
        evaluator: ChatEvaluator instance
        show_evaluated: Whether to show already evaluated pairs
        limit: Maximum number of pairs to return

    Returns:
        DataFrame with QA pairs
    """
    qa_pairs = evaluator.get_qa_pairs_for_evaluation(limit=200)  # Get more than needed for filtering
    annotations = evaluator.get_annotations()

    # Create set of evaluated conversation IDs
    evaluated_ids = set(a.get("conversation_id") for a in annotations)

    # Filter QA pairs based on show_evaluated parameter
    if not show_evaluated:
        qa_pairs = [pair for pair in qa_pairs if pair.get("conversation_id") not in evaluated_ids]

    # Limit the results
    qa_pairs = qa_pairs[:limit]

    # Create DataFrame
    if qa_pairs:
        df = pd.DataFrame(qa_pairs)

        # Add "Evaluated" column
        df["evaluated"] = df["conversation_id"].apply(lambda x: "Yes" if x in evaluated_ids else "No")

        # Select and rename columns for display
        display_df = df[["conversation_id", "question", "original_answer", "evaluated"]].copy()
        display_df = display_df.rename(columns={
            "conversation_id": "ID",
            "question": "Question",
            "original_answer": "Answer",
            "evaluated": "Evaluated"
        })

        # Truncate long text for better display
        display_df["Question"] = display_df["Question"].apply(lambda x: (x[:150] + "...") if len(x) > 150 else x)
        display_df["Answer"] = display_df["Answer"].apply(lambda x: (x[:150] + "...") if len(x) > 150 else x)

        return display_df

    # Return empty DataFrame if no pairs
    return pd.DataFrame(columns=["ID", "Question", "Answer", "Evaluated"])

def load_qa_pair_for_evaluation(evaluator: ChatEvaluator, conversation_id: str) -> Tuple:
    """
    Load a QA pair for evaluation

    Args:
        evaluator: ChatEvaluator instance
        conversation_id: ID of the conversation to load

    Returns:
        Tuple of (question, original_answer, improved_answer, accuracy,
        completeness, relevance, clarity, legal_correctness, notes),
        matching the nine output components wired up in app.py
    """
    # Get all QA pairs
    qa_pairs = evaluator.get_qa_pairs_for_evaluation(limit=1000)

    # Find the requested pair
    for pair in qa_pairs:
        if pair.get("conversation_id") == conversation_id:
            question = pair.get("question", "")
            original_answer = pair.get("original_answer", "")

            # Check if there's an existing annotation
            annotation = evaluator.get_annotation_by_conversation_id(conversation_id)

            if annotation:
                ratings = annotation.get("ratings", {})
                improved_answer = annotation.get("improved_answer", original_answer)
                notes = annotation.get("notes", "")
            else:
                ratings = {}
                improved_answer = original_answer
                notes = ""

            # Unpack the ratings into the individual slider values (default 3)
            return (question, original_answer, improved_answer,
                    ratings.get("accuracy", 3), ratings.get("completeness", 3),
                    ratings.get("relevance", 3), ratings.get("clarity", 3),
                    ratings.get("legal_correctness", 3), notes)

    return "", "", "", 3, 3, 3, 3, 3, ""

def save_evaluation(
    evaluator: ChatEvaluator,
    conversation_id: str,
    question: str,
    original_answer: str,
    improved_answer: str,
    accuracy: int,
    completeness: int,
    relevance: int,
    clarity: int,
    legal_correctness: int,
    notes: str
) -> str:
    """
    Save evaluation to file and dataset

    Args:
        evaluator: ChatEvaluator instance
        conversation_id: ID of the conversation
        question: User question
        original_answer: Original bot answer
        improved_answer: Improved answer
        accuracy: Rating for factual accuracy (1-5)
        completeness: Rating for completeness (1-5)
        relevance: Rating for relevance (1-5)
        clarity: Rating for clarity (1-5)
        legal_correctness: Rating for legal correctness (1-5)
        notes: Evaluator notes

    Returns:
        Status message
    """
    # Create ratings dictionary
    ratings = {
        "accuracy": accuracy,
        "completeness": completeness,
        "relevance": relevance,
        "clarity": clarity,
        "legal_correctness": legal_correctness
    }

    # Save annotation
    success, message = evaluator.save_annotation(
        conversation_id=conversation_id,
        question=question,
        original_answer=original_answer,
        improved_answer=improved_answer,
        ratings=ratings,
        notes=notes
    )

    return message

def generate_evaluation_report_html(evaluator: ChatEvaluator) -> str:
    """
    Generate HTML report of evaluation metrics

    Args:
        evaluator: ChatEvaluator instance

    Returns:
        HTML string with report
    """
    report = evaluator.generate_evaluation_report()

    if report["total_evaluations"] == 0:
        return "<p>No evaluations available yet.</p>"

    # Format criteria averages
    criteria_html = ""
    for criterion, avg in report["criteria_averages"].items():
        # Calculate stars representation (1-5)
        stars = "★" * int(avg) + "☆" * (5 - int(avg))
        criteria_html += f"""
        <tr>
            <td>{criterion.capitalize()}</td>
            <td>{avg:.2f}/5.0</td>
            <td>{stars}</td>
        </tr>
        """

    # Overall stars representation
    overall_stars = "★" * int(report["overall_average"]) + "☆" * (5 - int(report["overall_average"]))

    html = f"""
    <div style="padding: 15px; border: 1px solid #ccc; border-radius: 5px; margin-top: 10px;">
        <h3>Evaluation Report</h3>

        <p><strong>Total Evaluations:</strong> {report["total_evaluations"]}</p>
        <p><strong>Overall Average Rating:</strong> {report["overall_average"]:.2f}/5.0 {overall_stars}</p>
        <p><strong>Improvement Rate:</strong> {report["improvement_rate"]:.1f}% of responses were improved</p>

        <h4>Criteria Ratings:</h4>
        <table style="width: 100%; border-collapse: collapse;">
            <tr>
                <th style="text-align: left; padding: 8px; border-bottom: 1px solid #ddd;">Criterion</th>
                <th style="text-align: left; padding: 8px; border-bottom: 1px solid #ddd;">Average Score</th>
                <th style="text-align: left; padding: 8px; border-bottom: 1px solid #ddd;">Rating</th>
            </tr>
            {criteria_html}
        </table>
    </div>
    """

    return html

def export_training_data_action(evaluator: ChatEvaluator, min_rating: int, output_file: str) -> str:
    """
    Action for exporting training data

    Args:
        evaluator: ChatEvaluator instance
        min_rating: Minimum average rating (1-5)
        output_file: Output file path

    Returns:
        Status message
    """
    if not output_file:
        output_file = os.path.join(os.path.dirname(evaluator.annotations_dir), "training_data.jsonl")

    success, message = evaluator.export_training_data(output_file, min_rating)
    return message
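A short sketch of how these pieces compose outside the UI; the conversation ID and text values below are illustrative, and the config imports mirror what app.py uses:

    from config.settings import HF_TOKEN, DATASET_ID, CHAT_HISTORY_PATH
    from src.analytics.chat_evaluator import ChatEvaluator
    from web.evaluation_interface import (
        get_evaluation_status, save_evaluation, export_training_data_action
    )

    evaluator = ChatEvaluator(hf_token=HF_TOKEN, dataset_id=DATASET_ID,
                              chat_history_path=CHAT_HISTORY_PATH)

    # Markdown summary of how much of the chat history has been evaluated
    print(get_evaluation_status(evaluator))

    # Rate one conversation and store a gold-standard answer
    print(save_evaluation(evaluator, "conv-001", "What notice period applies?",
                          "Original bot answer", "Improved answer",
                          accuracy=5, completeness=4, relevance=5,
                          clarity=4, legal_correctness=5,
                          notes="Tightened the citation."))

    # Export everything rated at least 4 on average as JSONL for fine-tuning
    print(export_training_data_action(evaluator, 4, "training_data.jsonl"))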