Spaces:
Sleeping
Sleeping
Refactor chat history retrieval; remove local file reading and debug print statements for cleaner code
Browse files- app.py +115 -128
- src/analytics/chat_evaluator.py +7 -31
app.py
CHANGED
|
@@ -935,143 +935,130 @@ with gr.Blocks() as demo:
|
|
| 935 |
outputs=[analysis_output]
|
| 936 |
)
|
| 937 |
|
| 938 |
-
|
| 939 |
-
|
| 940 |
-
|
|
|
|
|
|
|
|
|
|
| 941 |
with gr.Row():
|
| 942 |
-
with gr.Column(scale=
|
| 943 |
-
|
| 944 |
-
|
| 945 |
-
|
| 946 |
-
|
| 947 |
-
|
| 948 |
-
|
| 949 |
-
"editable": False,
|
| 950 |
-
},
|
| 951 |
-
"Question": {
|
| 952 |
-
"editable": False,
|
| 953 |
-
},
|
| 954 |
-
"Answer": {
|
| 955 |
-
"editable": False,
|
| 956 |
-
},
|
| 957 |
-
"Evaluated": {
|
| 958 |
-
"editable": False,
|
| 959 |
-
}
|
| 960 |
-
}
|
| 961 |
-
)
|
| 962 |
-
|
| 963 |
-
gr.Markdown("### Select Conversation to Evaluate")
|
| 964 |
-
selected_conversation = gr.Textbox(
|
| 965 |
-
label="Conversation ID",
|
| 966 |
-
placeholder="Select from table above",
|
| 967 |
-
interactive=False
|
| 968 |
-
)
|
| 969 |
-
|
| 970 |
-
# Define event handlers outside of the UI definition
|
| 971 |
-
def on_table_select(evt: gr.SelectData):
|
| 972 |
-
if evt.value:
|
| 973 |
-
return evt.value[0] # Return the ID from the first column
|
| 974 |
-
return ""
|
| 975 |
-
|
| 976 |
-
def on_show_evaluated_change(show: bool):
|
| 977 |
-
return get_qa_pairs_dataframe(chat_evaluator, show_evaluated=show)
|
| 978 |
-
|
| 979 |
-
# Connect event handlers
|
| 980 |
-
qa_table.select(
|
| 981 |
-
fn=on_table_select,
|
| 982 |
-
outputs=selected_conversation
|
| 983 |
-
)
|
| 984 |
-
|
| 985 |
-
show_evaluated.change(
|
| 986 |
-
fn=on_show_evaluated_change,
|
| 987 |
-
inputs=show_evaluated,
|
| 988 |
-
outputs=qa_table
|
| 989 |
-
)
|
| 990 |
-
|
| 991 |
-
gr.Markdown("### Evaluate Response")
|
| 992 |
-
question_display = gr.Textbox(label="User Question", interactive=False)
|
| 993 |
-
original_answer = gr.TextArea(label="Original Bot Answer", interactive=False)
|
| 994 |
-
improved_answer = gr.TextArea(label="Improved Answer (Gold Standard)", interactive=True)
|
| 995 |
-
|
| 996 |
-
gr.Markdown("### Quality Ratings (1-5)")
|
| 997 |
-
with gr.Row():
|
| 998 |
-
accuracy = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Factual Accuracy")
|
| 999 |
-
completeness = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Completeness")
|
| 1000 |
-
with gr.Row():
|
| 1001 |
-
relevance = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Relevance")
|
| 1002 |
-
clarity = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Clarity")
|
| 1003 |
-
legal_correctness = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Legal Correctness")
|
| 1004 |
-
|
| 1005 |
-
notes = gr.TextArea(label="Evaluator Notes", placeholder="Add your notes about this response...")
|
| 1006 |
-
save_btn = gr.Button("Save Evaluation", variant="primary")
|
| 1007 |
-
evaluation_status_msg = gr.Textbox(label="Status", interactive=False)
|
| 1008 |
|
| 1009 |
-
#
|
| 1010 |
-
|
| 1011 |
-
|
| 1012 |
-
|
| 1013 |
-
|
|
|
|
| 1014 |
)
|
| 1015 |
|
| 1016 |
-
|
| 1017 |
-
|
| 1018 |
-
|
| 1019 |
-
|
| 1020 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1021 |
|
| 1022 |
-
|
| 1023 |
-
|
| 1024 |
-
|
| 1025 |
-
|
| 1026 |
-
)
|
| 1027 |
|
| 1028 |
-
#
|
| 1029 |
-
|
| 1030 |
-
|
| 1031 |
-
|
| 1032 |
-
|
| 1033 |
-
)
|
| 1034 |
-
|
| 1035 |
-
|
| 1036 |
-
|
| 1037 |
-
if evt and hasattr(evt, 'data'):
|
| 1038 |
-
conversation_id = evt.data[0] # Get ID from first column
|
| 1039 |
-
# Add logic for deleting the pair from database
|
| 1040 |
-
return get_qa_pairs_dataframe(chat_evaluator) # Update table
|
| 1041 |
-
return None
|
| 1042 |
-
|
| 1043 |
-
qa_table.delete(
|
| 1044 |
-
fn=delete_qa_pair,
|
| 1045 |
-
inputs=[qa_table],
|
| 1046 |
-
outputs=[qa_table]
|
| 1047 |
-
)
|
| 1048 |
-
|
| 1049 |
-
# Load conversation for evaluation
|
| 1050 |
-
load_btn.click(
|
| 1051 |
-
fn=lambda x: load_qa_pair_for_evaluation(conversation_id=x, evaluator=chat_evaluator),
|
| 1052 |
-
inputs=[selected_conversation],
|
| 1053 |
-
outputs=[question_display, original_answer, improved_answer,
|
| 1054 |
-
accuracy, completeness, relevance, clarity, legal_correctness, notes]
|
| 1055 |
-
)
|
| 1056 |
|
| 1057 |
-
#
|
| 1058 |
-
|
| 1059 |
-
|
| 1060 |
-
|
| 1061 |
-
inputs=[
|
| 1062 |
-
selected_conversation, question_display, original_answer, improved_answer,
|
| 1063 |
-
accuracy, completeness, relevance, clarity, legal_correctness, notes
|
| 1064 |
-
],
|
| 1065 |
-
outputs=[evaluation_status_msg]
|
| 1066 |
-
)
|
| 1067 |
|
| 1068 |
-
#
|
| 1069 |
-
|
| 1070 |
-
|
| 1071 |
-
|
| 1072 |
-
|
| 1073 |
-
)
|
|
|
|
| 1074 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1075 |
# Model change handler
|
| 1076 |
model_selector.change(
|
| 1077 |
fn=change_model,
|
|
|
|
| 935 |
outputs=[analysis_output]
|
| 936 |
)
|
| 937 |
|
| 938 |
+
with gr.Tab("Chat Evaluation"):
|
| 939 |
+
gr.Markdown("### Evaluation of Chat Responses")
|
| 940 |
+
|
| 941 |
+
with gr.Row():
|
| 942 |
+
with gr.Column(scale=2):
|
| 943 |
+
# Status and reports section
|
| 944 |
with gr.Row():
|
| 945 |
+
with gr.Column(scale=1):
|
| 946 |
+
evaluation_status = gr.Textbox(label="Evaluation Status", interactive=False)
|
| 947 |
+
refresh_status_btn = gr.Button("Refresh Status")
|
| 948 |
+
|
| 949 |
+
with gr.Column(scale=1):
|
| 950 |
+
evaluation_report = gr.HTML(label="Evaluation Report")
|
| 951 |
+
refresh_report_btn = gr.Button("Generate Report")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 952 |
|
| 953 |
+
# QA pairs table section
|
| 954 |
+
show_evaluated = gr.Checkbox(label="Show Already Evaluated Pairs", value=False)
|
| 955 |
+
qa_table = gr.DataFrame(
|
| 956 |
+
get_qa_pairs_dataframe(chat_evaluator),
|
| 957 |
+
interactive=False
|
| 958 |
+
# Removed column_config for compatibility
|
| 959 |
)
|
| 960 |
|
| 961 |
+
# Conversation selection section
|
| 962 |
+
gr.Markdown("### Select Conversation to Evaluate")
|
| 963 |
+
with gr.Row():
|
| 964 |
+
selected_conversation = gr.Textbox(
|
| 965 |
+
label="Conversation ID",
|
| 966 |
+
placeholder="Select from table above",
|
| 967 |
+
interactive=True
|
| 968 |
+
)
|
| 969 |
+
load_btn = gr.Button("Load Conversation")
|
| 970 |
|
| 971 |
+
# Conversation content section
|
| 972 |
+
gr.Markdown("### Evaluate Response")
|
| 973 |
+
question_display = gr.Textbox(label="User Question", interactive=False)
|
| 974 |
+
original_answer = gr.TextArea(label="Original Bot Answer", interactive=False)
|
| 975 |
+
improved_answer = gr.TextArea(label="Improved Answer (Gold Standard)", interactive=True)
|
| 976 |
|
| 977 |
+
# Ratings section
|
| 978 |
+
gr.Markdown("### Quality Ratings (1-5)")
|
| 979 |
+
with gr.Row():
|
| 980 |
+
accuracy = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Factual Accuracy")
|
| 981 |
+
completeness = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Completeness")
|
| 982 |
+
with gr.Row():
|
| 983 |
+
relevance = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Relevance")
|
| 984 |
+
clarity = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Clarity")
|
| 985 |
+
legal_correctness = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Legal Correctness")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 986 |
|
| 987 |
+
# Notes and save section
|
| 988 |
+
notes = gr.TextArea(label="Evaluator Notes", placeholder="Add your notes about this response...")
|
| 989 |
+
save_btn = gr.Button("Save Evaluation", variant="primary")
|
| 990 |
+
evaluation_status_msg = gr.Textbox(label="Status", interactive=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 991 |
|
| 992 |
+
# Data export section
|
| 993 |
+
gr.Markdown("### Export Evaluation Data")
|
| 994 |
+
with gr.Row():
|
| 995 |
+
min_rating = gr.Slider(minimum=1, maximum=5, value=4, step=0.5, label="Minimum Rating for Export")
|
| 996 |
+
export_path = gr.Textbox(label="Export File Path", value="training_data.jsonl")
|
| 997 |
+
export_btn = gr.Button("Export Training Data")
|
| 998 |
+
export_status = gr.Textbox(label="Export Status", interactive=False)
|
| 999 |
|
| 1000 |
+
# Event handlers
|
| 1001 |
+
# Status update
|
| 1002 |
+
refresh_status_btn.click(
|
| 1003 |
+
fn=lambda: get_evaluation_status(chat_evaluator),
|
| 1004 |
+
inputs=[],
|
| 1005 |
+
outputs=[evaluation_status]
|
| 1006 |
+
)
|
| 1007 |
+
|
| 1008 |
+
# Report generation
|
| 1009 |
+
refresh_report_btn.click(
|
| 1010 |
+
fn=lambda: generate_evaluation_report_html(chat_evaluator),
|
| 1011 |
+
inputs=[],
|
| 1012 |
+
outputs=[evaluation_report]
|
| 1013 |
+
)
|
| 1014 |
+
|
| 1015 |
+
# Toggle evaluated pairs display
|
| 1016 |
+
show_evaluated.change(
|
| 1017 |
+
fn=lambda x: get_qa_pairs_dataframe(chat_evaluator, x),
|
| 1018 |
+
inputs=[show_evaluated],
|
| 1019 |
+
outputs=[qa_table]
|
| 1020 |
+
)
|
| 1021 |
+
|
| 1022 |
+
# Table row selection function
|
| 1023 |
+
def on_table_select(evt):
|
| 1024 |
+
try:
|
| 1025 |
+
return evt.value[0] if evt and hasattr(evt, 'value') and len(evt.value) > 0 else ""
|
| 1026 |
+
except Exception as e:
|
| 1027 |
+
print(f"Error selecting table row: {str(e)}")
|
| 1028 |
+
return ""
|
| 1029 |
+
|
| 1030 |
+
# Table row selection handler
|
| 1031 |
+
qa_table.select(
|
| 1032 |
+
fn=on_table_select,
|
| 1033 |
+
outputs=[selected_conversation]
|
| 1034 |
+
)
|
| 1035 |
+
|
| 1036 |
+
# Load pair for evaluation
|
| 1037 |
+
load_btn.click(
|
| 1038 |
+
fn=lambda x: load_qa_pair_for_evaluation(conversation_id=x, evaluator=chat_evaluator),
|
| 1039 |
+
inputs=[selected_conversation],
|
| 1040 |
+
outputs=[question_display, original_answer, improved_answer,
|
| 1041 |
+
accuracy, completeness, relevance, clarity, legal_correctness, notes]
|
| 1042 |
+
)
|
| 1043 |
+
|
| 1044 |
+
# Save evaluation
|
| 1045 |
+
save_btn.click(
|
| 1046 |
+
fn=lambda conv_id, q, orig_a, imp_a, acc, comp, rel, clar, legal, notes:
|
| 1047 |
+
save_evaluation(conv_id, q, orig_a, imp_a, acc, comp, rel, clar, legal, notes, evaluator=chat_evaluator),
|
| 1048 |
+
inputs=[
|
| 1049 |
+
selected_conversation, question_display, original_answer, improved_answer,
|
| 1050 |
+
accuracy, completeness, relevance, clarity, legal_correctness, notes
|
| 1051 |
+
],
|
| 1052 |
+
outputs=[evaluation_status_msg]
|
| 1053 |
+
)
|
| 1054 |
+
|
| 1055 |
+
# Export training data
|
| 1056 |
+
export_btn.click(
|
| 1057 |
+
fn=lambda min_r, path: export_training_data_action(min_r, path, chat_evaluator),
|
| 1058 |
+
inputs=[min_rating, export_path],
|
| 1059 |
+
outputs=[export_status]
|
| 1060 |
+
)
|
| 1061 |
+
|
| 1062 |
# Model change handler
|
| 1063 |
model_selector.change(
|
| 1064 |
fn=change_model,
|
src/analytics/chat_evaluator.py
CHANGED
|
@@ -23,7 +23,6 @@ class ChatEvaluator:
|
|
| 23 |
dataset_manager: Dataset manager for retrieving chat history
|
| 24 |
hf_token: Hugging Face token for uploading annotations
|
| 25 |
dataset_id: Hugging Face dataset ID
|
| 26 |
-
chat_history_path: Path to local chat history directory
|
| 27 |
"""
|
| 28 |
self.dataset_manager = dataset_manager or DatasetManager()
|
| 29 |
self.hf_token = hf_token
|
|
@@ -37,36 +36,11 @@ class ChatEvaluator:
|
|
| 37 |
|
| 38 |
def get_chat_history(self) -> List[Dict[str, Any]]:
|
| 39 |
"""
|
| 40 |
-
Get all chat history data from
|
| 41 |
"""
|
| 42 |
success, chat_data = self.dataset_manager.get_chat_history()
|
| 43 |
-
|
| 44 |
-
# Добавим отладочную информацию
|
| 45 |
-
print(f"Debug - Chat history fetch success: {success}")
|
| 46 |
-
print(f"Debug - Number of chat records: {len(chat_data) if chat_data else 0}")
|
| 47 |
-
|
| 48 |
if not success or not chat_data:
|
| 49 |
-
|
| 50 |
-
local_data = self._read_local_chat_history()
|
| 51 |
-
print(f"Debug - Local chat records found: {len(local_data)}")
|
| 52 |
-
return local_data
|
| 53 |
-
return chat_data
|
| 54 |
-
|
| 55 |
-
def _read_local_chat_history(self) -> List[Dict[str, Any]]:
|
| 56 |
-
"""
|
| 57 |
-
Read chat history from local files
|
| 58 |
-
"""
|
| 59 |
-
chat_data = []
|
| 60 |
-
if os.path.exists(self.chat_history_path):
|
| 61 |
-
for filename in os.listdir(self.chat_history_path):
|
| 62 |
-
if filename.endswith('.json'):
|
| 63 |
-
try:
|
| 64 |
-
filepath = os.path.join(self.chat_history_path, filename)
|
| 65 |
-
with open(filepath, 'r', encoding='utf-8') as f:
|
| 66 |
-
data = json.load(f)
|
| 67 |
-
chat_data.append(data)
|
| 68 |
-
except Exception as e:
|
| 69 |
-
print(f"Error reading chat file {filename}: {str(e)}")
|
| 70 |
return chat_data
|
| 71 |
|
| 72 |
def get_qa_pairs_for_evaluation(self, limit: int = 50) -> List[Dict[str, Any]]:
|
|
@@ -87,9 +61,7 @@ class ChatEvaluator:
|
|
| 87 |
for chat in chat_data:
|
| 88 |
conversation_id = chat.get("conversation_id", "unknown")
|
| 89 |
timestamp = chat.get("timestamp", "")
|
| 90 |
-
messages = chat.get("messages", [])
|
| 91 |
-
|
| 92 |
-
print(f"Debug - Chat {conversation_id} has {len(messages)} messages") # Debug print
|
| 93 |
|
| 94 |
# Find user-assistant pairs in messages
|
| 95 |
for i in range(len(messages) - 1):
|
|
@@ -349,3 +321,7 @@ class ChatEvaluator:
|
|
| 349 |
return metrics
|
| 350 |
|
| 351 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
dataset_manager: Dataset manager for retrieving chat history
|
| 24 |
hf_token: Hugging Face token for uploading annotations
|
| 25 |
dataset_id: Hugging Face dataset ID
|
|
|
|
| 26 |
"""
|
| 27 |
self.dataset_manager = dataset_manager or DatasetManager()
|
| 28 |
self.hf_token = hf_token
|
|
|
|
| 36 |
|
| 37 |
def get_chat_history(self) -> List[Dict[str, Any]]:
|
| 38 |
"""
|
| 39 |
+
Get all chat history data from dataset
|
| 40 |
"""
|
| 41 |
success, chat_data = self.dataset_manager.get_chat_history()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
if not success or not chat_data:
|
| 43 |
+
return []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
return chat_data
|
| 45 |
|
| 46 |
def get_qa_pairs_for_evaluation(self, limit: int = 50) -> List[Dict[str, Any]]:
|
|
|
|
| 61 |
for chat in chat_data:
|
| 62 |
conversation_id = chat.get("conversation_id", "unknown")
|
| 63 |
timestamp = chat.get("timestamp", "")
|
| 64 |
+
messages = chat.get("messages", [])
|
|
|
|
|
|
|
| 65 |
|
| 66 |
# Find user-assistant pairs in messages
|
| 67 |
for i in range(len(messages) - 1):
|
|
|
|
| 321 |
return metrics
|
| 322 |
|
| 323 |
|
| 324 |
+
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
|