# status-law-gbot/web/evaluation_interface.py
"""
Interface components for chat evaluation
"""
import os
from typing import Dict, Tuple

import gradio as gr
import pandas as pd

from src.analytics.chat_evaluator import ChatEvaluator
def get_evaluation_status(evaluator: ChatEvaluator) -> str:
"""
Format evaluation status for display
Args:
evaluator: ChatEvaluator instance
Returns:
Formatted markdown string with status information
"""
status = evaluator.get_evaluation_status()
status_md = f"""
## Evaluation Status
- **Total QA Pairs:** {status['total_qa_pairs']}
- **Evaluated Pairs:** {status['evaluated_pairs']} ({status['evaluated_pairs']/max(1, status['total_qa_pairs'])*100:.1f}%)
- **Unevaluated Pairs:** {status['unevaluated_pairs']}
- **Evaluated Conversations:** {status['evaluated_conversations']}
"""
return status_md
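
# Illustrative only: the status dict returned by
# ChatEvaluator.get_evaluation_status() is assumed to carry at least the four
# keys read above, e.g.
#   {"total_qa_pairs": 120, "evaluated_pairs": 45,
#    "unevaluated_pairs": 75, "evaluated_conversations": 40}
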
def get_qa_pairs_dataframe(evaluator: ChatEvaluator, show_evaluated: bool = False, limit: int = 50) -> pd.DataFrame:
"""
Get QA pairs as a pandas DataFrame for display
Args:
evaluator: ChatEvaluator instance
show_evaluated: Whether to show already evaluated pairs
limit: Maximum number of pairs to return
Returns:
DataFrame with QA pairs
"""
    qa_pairs = evaluator.get_qa_pairs_for_evaluation(limit=max(200, limit * 2))  # fetch headroom for filtering
    annotations = evaluator.get_annotations()
    # Collect the conversation IDs that already have annotations
    evaluated_ids = {a.get("conversation_id") for a in annotations}
# Filter QA pairs based on show_evaluated parameter
if not show_evaluated:
qa_pairs = [pair for pair in qa_pairs if pair.get("conversation_id") not in evaluated_ids]
# Limit the results
qa_pairs = qa_pairs[:limit]
# Create DataFrame
if qa_pairs:
df = pd.DataFrame(qa_pairs)
# Add "Evaluated" column
df["evaluated"] = df["conversation_id"].apply(lambda x: "Yes" if x in evaluated_ids else "No")
# Select and rename columns for display
display_df = df[["conversation_id", "question", "original_answer", "evaluated"]].copy()
display_df = display_df.rename(columns={
"conversation_id": "ID",
"question": "Question",
"original_answer": "Answer",
"evaluated": "Evaluated"
})
# Truncate long text for better display
display_df["Question"] = display_df["Question"].apply(lambda x: (x[:150] + "...") if len(x) > 150 else x)
display_df["Answer"] = display_df["Answer"].apply(lambda x: (x[:150] + "...") if len(x) > 150 else x)
return display_df
# Return empty DataFrame if no pairs
return pd.DataFrame(columns=["ID", "Question", "Answer", "Evaluated"])
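
# Usage sketch (hypothetical wiring; gr.Dataframe is a real Gradio component,
# but nothing in this module requires it):
#   table = gr.Dataframe(value=get_qa_pairs_dataframe(evaluator), interactive=False)
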
def load_qa_pair_for_evaluation(evaluator: ChatEvaluator, conversation_id: str) -> Tuple[str, str, str, Dict, str]:
    """
    Load a QA pair for evaluation
    Args:
        evaluator: ChatEvaluator instance
        conversation_id: ID of the conversation to load
    Returns:
        Tuple of (question, original_answer, improved_answer, existing_ratings, notes)
    """
# Get all QA pairs
qa_pairs = evaluator.get_qa_pairs_for_evaluation(limit=1000)
# Find the requested pair
for pair in qa_pairs:
if pair.get("conversation_id") == conversation_id:
question = pair.get("question", "")
original_answer = pair.get("original_answer", "")
# Check if there's an existing annotation
annotation = evaluator.get_annotation_by_conversation_id(conversation_id)
if annotation:
existing_ratings = annotation.get("ratings", {})
improved_answer = annotation.get("improved_answer", original_answer)
notes = annotation.get("notes", "")
return question, original_answer, improved_answer, existing_ratings, notes
return question, original_answer, original_answer, {}, ""
return "", "", "", {}, ""
def save_evaluation(
evaluator: ChatEvaluator,
conversation_id: str,
question: str,
original_answer: str,
improved_answer: str,
accuracy: int,
completeness: int,
relevance: int,
clarity: int,
legal_correctness: int,
notes: str
) -> str:
"""
Save evaluation to file and dataset
Args:
evaluator: ChatEvaluator instance
conversation_id: ID of the conversation
question: User question
original_answer: Original bot answer
improved_answer: Improved answer
accuracy: Rating for factual accuracy (1-5)
completeness: Rating for completeness (1-5)
relevance: Rating for relevance (1-5)
clarity: Rating for clarity (1-5)
legal_correctness: Rating for legal correctness (1-5)
notes: Evaluator notes
Returns:
Status message
"""
# Create ratings dictionary
ratings = {
"accuracy": accuracy,
"completeness": completeness,
"relevance": relevance,
"clarity": clarity,
"legal_correctness": legal_correctness
}
# Save annotation
success, message = evaluator.save_annotation(
conversation_id=conversation_id,
question=question,
original_answer=original_answer,
improved_answer=improved_answer,
ratings=ratings,
notes=notes
)
return message
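
# Example call (values illustrative; the five integers are the 1-5 scores
# collected from the evaluation form):
#   msg = save_evaluation(evaluator, "conv-123", question, original, improved,
#                         4, 5, 4, 3, 5, "Cited the wrong article")
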
def generate_evaluation_report_html(evaluator: ChatEvaluator) -> str:
"""
Generate HTML report of evaluation metrics
Args:
evaluator: ChatEvaluator instance
Returns:
HTML string with report
"""
report = evaluator.generate_evaluation_report()
if report["total_evaluations"] == 0:
return "<p>No evaluations available yet.</p>"
# Format criteria averages
criteria_html = ""
for criterion, avg in report["criteria_averages"].items():
            # Stars representation: floor of the average, out of 5
            stars = "β˜…" * int(avg) + "β˜†" * (5 - int(avg))
criteria_html += f"""
<tr>
<td>{criterion.capitalize()}</td>
<td>{avg:.2f}/5.0</td>
<td>{stars}</td>
</tr>
"""
# Overall stars representation
overall_stars = "β˜…" * int(report["overall_average"]) + "β˜†" * (5 - int(report["overall_average"]))
html = f"""
<div style="padding: 15px; border: 1px solid #ccc; border-radius: 5px; margin-top: 10px;">
<h3>Evaluation Report</h3>
<p><strong>Total Evaluations:</strong> {report["total_evaluations"]}</p>
<p><strong>Overall Average Rating:</strong> {report["overall_average"]:.2f}/5.0 {overall_stars}</p>
<p><strong>Improvement Rate:</strong> {report["improvement_rate"]:.1f}% of responses were improved</p>
<h4>Criteria Ratings:</h4>
<table style="width: 100%; border-collapse: collapse;">
<tr>
<th style="text-align: left; padding: 8px; border-bottom: 1px solid #ddd;">Criterion</th>
<th style="text-align: left; padding: 8px; border-bottom: 1px solid #ddd;">Average Score</th>
<th style="text-align: left; padding: 8px; border-bottom: 1px solid #ddd;">Rating</th>
</tr>
{criteria_html}
</table>
</div>
"""
return html
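
# Illustrative only: generate_evaluation_report() is assumed to return at
# least the keys read above, e.g.
#   {"total_evaluations": 45, "overall_average": 4.1,
#    "improvement_rate": 62.2, "criteria_averages": {"accuracy": 4.3, ...}}
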
def export_training_data_action(evaluator: ChatEvaluator, min_rating: int, output_file: str) -> str:
"""
Action for exporting training data
Args:
evaluator: ChatEvaluator instance
min_rating: Minimum average rating (1-5)
output_file: Output file path
Returns:
Status message
"""
if not output_file:
output_file = os.path.join(os.path.dirname(evaluator.annotations_dir), "training_data.jsonl")
success, message = evaluator.export_training_data(output_file, min_rating)
return message
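
if __name__ == "__main__":
    # Minimal standalone sketch, assuming ChatEvaluator() can be constructed
    # with no arguments (its real constructor may differ). Wires the helpers
    # above into a bare-bones Gradio page for manual inspection.
    evaluator = ChatEvaluator()

    with gr.Blocks() as demo:
        gr.Markdown(get_evaluation_status(evaluator))
        gr.Dataframe(value=get_qa_pairs_dataframe(evaluator), interactive=False)
        report_html = gr.HTML()
        gr.Button("Generate report").click(
            fn=lambda: generate_evaluation_report_html(evaluator),
            outputs=report_html,
        )

    demo.launch()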