# status-law-gbot/web/evaluation_interface.py
"""
Interface components for chat evaluation
"""
import logging
import os
from typing import Tuple

import pandas as pd

from src.analytics.chat_evaluator import ChatEvaluator

# Module-level logger used by the error handlers below
logger = logging.getLogger(__name__)

def get_evaluation_status(evaluator, force_reload=False):
"""
Get evaluation status as formatted string and refresh QA data
Args:
evaluator: ChatEvaluator instance
force_reload: If True, force reload data from dataset
Returns:
Status message, updated QA table and refresh message
"""
try:
# First, reset cache if forcing reload
if force_reload:
evaluator.reset_cache()
# Get status data
status = evaluator.get_evaluation_status(force_reload=force_reload)
# Get updated QA table
qa_table = get_qa_pairs_dataframe(evaluator, show_evaluated=False, force_reload=force_reload)
status_message = f"""
Total QA Pairs: {status['total_qa_pairs']}
Evaluated Pairs: {status['evaluated_pairs']}
Unevaluated Pairs: {status['unevaluated_pairs']}
Evaluated Conversations: {status['evaluated_conversations']}
"""
refresh_message = "Data refreshed successfully" if force_reload else ""
return status_message, qa_table, refresh_message
except Exception as e:
logger.error(f"Error getting evaluation status: {e}")
empty_df = pd.DataFrame(columns=["Conversation ID", "Question", "Answer", "Evaluated"])
return f"Error getting status: {str(e)}", empty_df, f"Error: {str(e)}"
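
# Usage sketch (illustrative; not part of the original module). Wiring this
# handler to a Gradio refresh button -- the names `refresh_btn`, `status_box`,
# `qa_table`, and `refresh_msg` are assumed placeholder components:
#
#     refresh_btn.click(
#         fn=lambda: get_evaluation_status(evaluator, force_reload=True),
#         outputs=[status_box, qa_table, refresh_msg],
#     )
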
def get_qa_pairs_dataframe(evaluator, show_evaluated=False, force_reload=False):
"""
Get QA pairs as DataFrame for the evaluation interface
Args:
evaluator: ChatEvaluator instance
show_evaluated: If True, show only evaluated pairs. If False, show all pairs
force_reload: If True, force reload from dataset
Returns:
DataFrame with QA pairs
"""
try:
# Get QA pairs
qa_pairs = evaluator.get_qa_pairs_for_evaluation(limit=100, force_reload=force_reload)
# Get annotations
annotations = evaluator.get_annotations(force_reload=force_reload)
evaluated_ids = {a.get("conversation_id") for a in annotations}
# Filter pairs based on show_evaluated flag
if show_evaluated:
# Show only evaluated pairs
qa_pairs = [qa for qa in qa_pairs if qa["conversation_id"] in evaluated_ids]
        # Convert to a DataFrame for display, truncating long questions/answers
        if not qa_pairs:
            return pd.DataFrame(columns=["Conversation ID", "Question", "Answer", "Evaluated"])
        return pd.DataFrame([
            {
                "Conversation ID": qa["conversation_id"],
                "Question": qa["question"][:50] + "..." if len(qa["question"]) > 50 else qa["question"],
                "Answer": qa["original_answer"][:100] + "..." if len(qa["original_answer"]) > 100 else qa["original_answer"],
                "Evaluated": "Yes" if qa["conversation_id"] in evaluated_ids else "No",
            }
            for qa in qa_pairs
        ])
except Exception as e:
logger.error(f"Error getting QA pairs dataframe: {e}")
return pd.DataFrame(columns=["Conversation ID", "Question", "Answer", "Evaluated"])
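
# Example (illustrative): list only the pairs that already have annotations,
# bypassing any cached data:
#
#     evaluated_df = get_qa_pairs_dataframe(evaluator, show_evaluated=True, force_reload=True)
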
def load_qa_pair_for_evaluation(conversation_id: str, evaluator: ChatEvaluator) -> Tuple[str, str, str, int, int, int, int, int, str]:
"""
Load a QA pair for evaluation
Args:
conversation_id: ID of the conversation to load
evaluator: ChatEvaluator instance
Returns:
Tuple of (question, original_answer, improved_answer, accuracy, completeness,
relevance, clarity, legal_correctness, notes)
"""
# Get all QA pairs
qa_pairs = evaluator.get_qa_pairs_for_evaluation(limit=1000)
# Get existing annotation if any
annotation = evaluator.get_annotation(conversation_id)
if annotation:
        ratings = annotation.get("ratings", {})
        return (
            annotation.get("question", ""),
            annotation.get("original_answer", ""),
            annotation.get("improved_answer", ""),
            ratings.get("accuracy", 1),
            ratings.get("completeness", 1),
            ratings.get("relevance", 1),
            ratings.get("clarity", 1),
            ratings.get("legal_correctness", 1),
            annotation.get("notes", ""),
        )
# If no annotation exists, find the conversation in QA pairs
for qa_pair in qa_pairs:
if qa_pair.get("conversation_id") == conversation_id:
            return (
                qa_pair.get("question", ""),
                qa_pair.get("original_answer", ""),
                "",              # no improved answer yet
                1, 1, 1, 1, 1,   # default ratings for the five criteria
                "",              # no notes yet
            )
return "", "", "", 1, 1, 1, 1, 1, "" # Return empty values if not found
def save_evaluation(
conversation_id: str,
question: str,
original_answer: str,
improved_answer: str,
accuracy: int,
completeness: int,
relevance: int,
clarity: int,
legal_correctness: int,
notes: str,
evaluator: ChatEvaluator
) -> str:
"""
Save evaluation to file and dataset
Args:
evaluator: ChatEvaluator instance
conversation_id: ID of the conversation
question: User question
original_answer: Original bot answer
improved_answer: Improved answer
accuracy: Rating for factual accuracy (1-5)
completeness: Rating for completeness (1-5)
relevance: Rating for relevance (1-5)
clarity: Rating for clarity (1-5)
legal_correctness: Rating for legal correctness (1-5)
notes: Evaluator notes
Returns:
Status message
"""
# Create ratings dictionary
ratings = {
"accuracy": accuracy,
"completeness": completeness,
"relevance": relevance,
"clarity": clarity,
"legal_correctness": legal_correctness
}
# Save annotation
success, message = evaluator.save_annotation(
conversation_id=conversation_id,
question=question,
original_answer=original_answer,
improved_answer=improved_answer,
ratings=ratings,
notes=notes
)
return message
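
# Example wiring (illustrative; component names are assumed placeholders).
# `evaluator` is bound via keyword so the ten form components can be
# forwarded positionally in signature order:
#
#     save_btn.click(
#         fn=lambda *args: save_evaluation(*args, evaluator=evaluator),
#         inputs=[conv_id_box, question_box, original_box, improved_box,
#                 accuracy_slider, completeness_slider, relevance_slider,
#                 clarity_slider, legal_slider, notes_box],
#         outputs=[save_status_box],
#     )
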
def generate_evaluation_report_html(evaluator: ChatEvaluator) -> str:
"""
Generate HTML report of evaluation metrics
Args:
evaluator: ChatEvaluator instance
Returns:
HTML string with report
"""
report = evaluator.generate_evaluation_report()
if report["total_evaluations"] == 0:
return "<p>No evaluations available yet.</p>"
# Format criteria averages
criteria_html = ""
for criterion, avg in report["criteria_averages"].items():
# Calculate stars representation (1-5)
stars = "β˜…" * int(avg) + "β˜†" * (5 - int(avg))
criteria_html += f"""
<tr>
<td>{criterion.capitalize()}</td>
<td>{avg:.2f}/5.0</td>
<td>{stars}</td>
</tr>
"""
# Overall stars representation
overall_stars = "β˜…" * int(report["overall_average"]) + "β˜†" * (5 - int(report["overall_average"]))
html = f"""
<div style="padding: 15px; border: 1px solid #ccc; border-radius: 5px; margin-top: 10px;">
<h3>Evaluation Report</h3>
<p><strong>Total Evaluations:</strong> {report["total_evaluations"]}</p>
<p><strong>Overall Average Rating:</strong> {report["overall_average"]:.2f}/5.0 {overall_stars}</p>
<p><strong>Improvement Rate:</strong> {report["improvement_rate"]:.1f}% of responses were improved</p>
<h4>Criteria Ratings:</h4>
<table style="width: 100%; border-collapse: collapse;">
<tr>
<th style="text-align: left; padding: 8px; border-bottom: 1px solid #ddd;">Criterion</th>
<th style="text-align: left; padding: 8px; border-bottom: 1px solid #ddd;">Average Score</th>
<th style="text-align: left; padding: 8px; border-bottom: 1px solid #ddd;">Rating</th>
</tr>
{criteria_html}
</table>
</div>
"""
return html
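
# Example (illustrative): render the report into a `gr.HTML` component:
#
#     report_btn.click(
#         fn=lambda: generate_evaluation_report_html(evaluator),
#         outputs=[report_html],
#     )
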
def export_training_data_action(evaluator: ChatEvaluator, min_rating: int, output_file: str) -> str:
"""
Action for exporting training data
Args:
evaluator: ChatEvaluator instance
min_rating: Minimum average rating (1-5)
output_file: Output file path
Returns:
Status message
"""
if not output_file:
output_file = os.path.join(os.path.dirname(evaluator.annotations_dir), "training_data.jsonl")
success, message = evaluator.export_training_data(output_file, min_rating)
return message
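

if __name__ == "__main__":
    # Minimal smoke test (a sketch): assumes ChatEvaluator can be constructed
    # with no arguments, which may not match the real constructor's signature.
    logging.basicConfig(level=logging.INFO)
    _evaluator = ChatEvaluator()
    status, _qa_df, _msg = get_evaluation_status(_evaluator, force_reload=True)
    print(status)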