Spaces:

curtizz
/

show_eval_result

Sleeping

App Files Files Community

show_eval_result / app.py

curtizz

Update app.py

94d72ef verified 7 months ago

raw

history blame contribute delete

31.9 kB

	import gradio as gr
	import json
	import pandas as pd
	import html
	import logging

	# Configure the root logger to emit INFO and above, then create this
	# module's logger; the report renderer below logs through `logger`.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Name of the metric whose score drives row colouring and the statistics.
	_COVERAGE_METRIC = 'Information Coverage (GEval)'

	# Coverage levels indexed by rank (0 = worst, 2 = best); see _coverage_level().
	_COVERAGE_LEVELS = ('low', 'medium', 'high')

	# Human-readable label logged for each row CSS class.
	_ROW_CLASS_LABELS = {
	    'high-coverage': 'High Information Coverage',
	    'send-to-human': 'Send to Human',
	    'medium-coverage': 'Medium Information Coverage',
	    'low-coverage': 'Low Information Coverage',
	}

	# Column headings of the summary table, in display order.
	_TABLE_COLUMNS = (
	    '#', 'User ID', 'Question', 'Confidence',
	    'Send to Human', 'Time Spent', 'Eval Scores', 'Details',
	)

	# Static part of the report: stylesheet, colour legend and the Escape-key
	# handler that closes any open popup (popups are CSS-only checkbox toggles).
	_REPORT_HEADER_HTML = """
	<style>
	table {
	    width: 100%;
	    border-collapse: collapse;
	    margin-bottom: 20px;
	    font-family: Arial, sans-serif;
	}
	th, td {
	    border: 1px solid #e0e0e0;
	    padding: 12px;
	    text-align: left;
	    font-size: 14px;
	    vertical-align: top;
	}
	th {
	    background-color: #4CAF50;
	    color: white;
	    font-weight: bold;
	}
	tr:nth-child(even) {
	    background-color: #f9f9f9;
	}
	.send-to-human {
	    background-color: #ffcccc !important;
	}
	.low-validity {
	    background-color: #fff2cc !important;
	}
	.high-coverage {
	    background-color: #1e8449 !important;
	    color: white;
	}
	.medium-coverage {
	    background-color: #a9dfbf !important;
	}
	.low-coverage {
	    background-color: #e0e0e0 !important;
	}
	.expandable {
	    cursor: pointer;
	    color: white;
	    font-weight: bold;
	    text-decoration: none;
	    display: inline-block;
	    padding: 8px;
	    transition: all 0.2s;
	    border-radius: 4px;
	    border: none;
	    position: relative;
	    text-align: center;
	}
	.expandable:hover {
	    filter: brightness(110%);
	    box-shadow: 0 2px 4px rgba(0,0,0,0.2);
	}
	.details {
	    display: none;
	    padding: 20px;
	    background-color: #ffffff;
	    border: 1px solid #e0e0e0;
	    border-radius: 5px;
	    margin-top: 10px;
	    box-shadow: 0 4px 8px rgba(0,0,0,0.2);
	    position: fixed;
	    z-index: 1000;
	    width: 80%;
	    min-width: 600px;
	    max-width: 1200px;
	    height: auto;
	    min-height: 400px;
	    max-height: 85vh;
	    overflow-y: auto;
	    left: 50%;
	    top: 50%;
	    transform: translate(-50%, -50%);
	}
	.human-message-popup {
	    display: none;
	    padding: 20px;
	    border-radius: 5px;
	    box-shadow: 0 4px 8px rgba(0,0,0,0.2);
	    position: fixed;
	    z-index: 1000;
	    width: 70%;
	    min-width: 500px;
	    max-width: 1000px;
	    height: auto;
	    min-height: 200px;
	    max-height: 80vh;
	    overflow-y: auto;
	    left: 50%;
	    top: 50%;
	    transform: translate(-50%, -50%);
	    background-color: #fff9f9;
	    border: 1px solid #d32f2f;
	}
	input[type="checkbox"] {
	    display: none !important;
	    appearance: none;
	    -webkit-appearance: none;
	    -moz-appearance: none;
	}
	input[type="checkbox"]:checked ~ .details {
	    display: block;
	}
	input[type="checkbox"]:checked ~ .human-message-popup {
	    display: block;
	}
	input[type="checkbox"]:checked + .expandable::after {
	    content: " (Close)";
	}
	.details strong {
	    color: #333;
	    font-size: 16px;
	    display: block;
	    margin-bottom: 5px;
	}
	.details p {
	    margin: 10px 0;
	    line-height: 1.5;
	}
	.json-viewer {
	    background-color: #f5f5f5;
	    padding: 10px;
	    border-radius: 5px;
	    font-family: monospace;
	    font-size: 13px;
	    overflow-x: auto;
	    white-space: pre-wrap;
	}
	pre {
	    white-space: pre-wrap;
	    word-wrap: break-word;
	    margin: 0;
	}
	.color-legend {
	    margin: 20px 0;
	    padding: 15px;
	    border: 1px solid #e0e0e0;
	    border-radius: 5px;
	    background-color: #f9f9f9;
	}
	.legend-item {
	    display: flex;
	    align-items: center;
	    margin-bottom: 10px;
	}
	.color-box {
	    width: 20px;
	    height: 20px;
	    margin-right: 10px;
	    border: 1px solid #ccc;
	}
	.red-box {
	    background-color: #ffcccc;
	}
	.yellow-box {
	    background-color: #fff2cc;
	}
	.green-box {
	    background-color: #1e8449;
	}
	.detail-container {
	    position: relative;
	}
	.close-details {
	    position: absolute;
	    top: 5px;
	    right: 5px;
	    cursor: pointer;
	    background-color: #f44336;
	    color: white;
	    border: none;
	    border-radius: 50%;
	    width: 24px;
	    height: 24px;
	    display: flex;
	    align-items: center;
	    justify-content: center;
	    font-weight: bold;
	}
	.overlay {
	    display: none;
	    position: fixed;
	    top: 0;
	    left: 0;
	    width: 100%;
	    height: 100%;
	    background-color: rgba(0,0,0,0.5);
	    z-index: 900;
	}
	input[type="checkbox"]:checked ~ .overlay {
	    display: block;
	}

	/* Column width adjustments */
	table th:nth-child(1),
	table td:nth-child(1) {
	    width: 5%;
	    text-align: center;
	    font-weight: bold;
	}
	table th:nth-child(2),
	table td:nth-child(2) {
	    width: 15%;
	    white-space: nowrap;
	    overflow: hidden;
	    text-overflow: ellipsis;
	}
	table th:nth-child(3),
	table td:nth-child(3) {
	    width: 25%;
	    max-width: 350px;
	    white-space: nowrap;
	    overflow: hidden;
	    text-overflow: ellipsis;
	}
	table th:nth-child(4),
	table td:nth-child(4),
	table th:nth-child(5),
	table td:nth-child(5),
	table th:nth-child(6),
	table td:nth-child(6) {
	    width: 8%;
	    min-width: 70px;
	    text-align: center;
	}
	table th:nth-child(7),
	table td:nth-child(7) {
	    width: 21%;
	}
	table th:nth-child(8),
	table td:nth-child(8) {
	    width: 20%;
	    text-align: center;
	}
	/* Add tooltips for truncated content */
	table td:nth-child(2),
	table td:nth-child(3) {
	    position: relative;
	}
	table td:nth-child(2):hover::after,
	table td:nth-child(3):hover::after {
	    content: attr(title);
	    position: absolute;
	    left: 0;
	    top: 100%;
	    z-index: 500;
	    background-color: #333;
	    color: #fff;
	    padding: 5px 10px;
	    border-radius: 4px;
	    white-space: pre-wrap;
	    max-width: 400px;
	    box-shadow: 0 2px 5px rgba(0,0,0,0.2);
	}
	</style>

	<div class="color-legend">
	<h3>Row Color Legend</h3>
	<div class="legend-item">
	<div class="color-box" style="background-color: #1e8449; color: white;"></div>
	<div>Dark Green: High Information Coverage - Information Coverage score is at least 0.8 (highest priority)</div>
	</div>
	<div class="legend-item">
	<div class="color-box red-box"></div>
	<div>Red: "Send to Human" is true but with Information Coverage below 0.8</div>
	</div>
	<div class="legend-item">
	<div class="color-box" style="background-color: #a9dfbf;"></div>
	<div>Shallow Green: Medium Information Coverage - Information Coverage score is between 0.5 and 0.8</div>
	</div>
	<div class="legend-item">
	<div class="color-box" style="background-color: #e0e0e0;"></div>
	<div>Light Gray: Low Information Coverage - Information Coverage score is below 0.5</div>
	</div>
	</div>

	<script>
	document.addEventListener('keydown', function(event) {
	    if (event.key === 'Escape') {
	        // Find all checked checkboxes and uncheck them
	        document.querySelectorAll('input[type="checkbox"]:checked').forEach(function(checkbox) {
	            checkbox.checked = false;
	        });
	    }
	});
	</script>
	"""


	def _escape(value):
	    """Return ``str(value)`` with HTML special characters and quotes escaped."""
	    return html.escape(str(value))


	def _pretty(value):
	    """Return dicts and lists as indented JSON text, anything else via ``str()``."""
	    if isinstance(value, (dict, list)):
	        try:
	            return json.dumps(value, indent=2, ensure_ascii=False)
	        except (TypeError, ValueError):
	            # Not JSON-serialisable: fall back to the plain representation.
	            pass
	    return str(value)


	def _format_tools_list(tools):
	    """Return *tools* as an HTML ``<ul>`` with one ``<li>`` per tool."""
	    if not isinstance(tools, list) or not tools:
	        return "<span style='color: #888;'>No tools used</span>"

	    items = []
	    for tool in tools:
	        if isinstance(tool, dict):
	            # Dict tools show their name plus the full JSON payload.
	            name = _escape(tool.get('name', 'Unknown Tool'))
	            details = _escape(json.dumps(tool, indent=2))
	            items.append(
	                "<li style='margin-bottom: 8px;'>"
	                f"<strong>{name}</strong>"
	                "<pre style='margin: 5px 0 0 10px; padding: 5px; background-color: #f8f8f8; "
	                "border-radius: 3px; font-size: 12px; max-height: 150px; overflow-y: auto;'>"
	                f"{details}</pre>"
	                "</li>"
	            )
	        else:
	            # Simple string or other type.
	            items.append(f"<li style='margin-bottom: 5px;'>{_escape(tool)}</li>")
	    return "<ul style='margin: 0; padding-left: 20px;'>" + "".join(items) + "</ul>"


	def _format_eval_scores(metrics):
	    """Return ``name: score`` lines (numeric scores to 3 decimals) joined by ``<br>``."""
	    if not metrics:
	        return "N/A"
	    lines = []
	    for metric in metrics:
	        score = metric.get('score', 'N/A')
	        if isinstance(score, (int, float)):
	            score = format(float(score), '.3f')
	        # Names and scores come from the uploaded file, so both are escaped.
	        lines.append(f"{_escape(metric.get('name', 'Unknown'))}: {_escape(score)}")
	    return "<br>".join(lines)


	def _format_metrics_json(metrics):
	    """Return *metrics* as indented JSON with numeric scores shown to 3 decimals."""
	    if not metrics:
	        return "N/A"
	    formatted = []
	    for metric in metrics:
	        entry = dict(metric)  # copy so the caller's data is not modified
	        if isinstance(entry.get('score'), (int, float)):
	            entry['score'] = format(float(entry['score']), '.3f')
	        formatted.append(entry)
	    return json.dumps(formatted, indent=2, ensure_ascii=False)


	def _coverage_level(metrics):
	    """Return ``'high'``, ``'medium'``, ``'low'`` or ``None`` for a row's metrics.

	    Only metrics named ``Information Coverage (GEval)`` with a score that
	    converts to ``float`` are considered: >= 0.8 is high, >= 0.5 is medium,
	    anything else is low.  If several such metrics exist the best level wins.
	    ``None`` means no usable coverage score was found.
	    """
	    best_rank = None
	    for metric in metrics:
	        if metric.get('name') != _COVERAGE_METRIC:
	            continue
	        raw_score = metric.get('score')
	        if raw_score is None:
	            continue
	        try:
	            score = float(raw_score)
	        except (ValueError, TypeError) as exc:
	            logger.warning("Could not convert %s to float: %s", raw_score, exc)
	            continue
	        if score >= 0.8:
	            rank = 2
	        elif score >= 0.5:
	            rank = 1
	        else:
	            rank = 0
	        if best_rank is None or rank > best_rank:
	            best_rank = rank
	    return None if best_rank is None else _COVERAGE_LEVELS[best_rank]


	def _row_css_class(send_to_human, coverage_level):
	    """Return the row CSS class: high coverage > send-to-human > medium > low."""
	    if coverage_level == 'high':
	        return 'high-coverage'
	    if send_to_human:
	        return 'send-to-human'
	    if coverage_level:
	        return f"{coverage_level}-coverage"
	    return ''


	def _build_row(idx, item):
	    """Return the derived per-row values needed for statistics and rendering."""
	    raw_metrics = item.get('evaluation_metrics', [])
	    # Keep only well-formed (dict) metric entries so a single malformed entry
	    # cannot abort the whole report.
	    if isinstance(raw_metrics, list):
	        metrics = [metric for metric in raw_metrics if isinstance(metric, dict)]
	    else:
	        metrics = []

	    send_to_human = item.get('send_to_human', 'N/A') is True
	    css_class = _row_css_class(send_to_human, _coverage_level(metrics))
	    if css_class:
	        logger.info("Row %s marked as '%s'", idx, _ROW_CLASS_LABELS[css_class])

	    time_spent = item.get('time_spent', 'N/A')
	    if isinstance(time_spent, (int, float)):
	        time_spent = f"{time_spent:.2f}s"

	    return {
	        'index': idx,
	        'item': item,
	        'metrics': metrics,
	        'send_to_human': send_to_human,
	        'css_class': css_class,
	        'time_spent': time_spent,
	        # Already escaped: used verbatim in two popups.
	        'human_message': _escape(_pretty(item.get('call_human_message', 'N/A'))),
	    }


	def _stat_box(style, label, count, total):
	    """Return one coloured statistics box showing *count* as a share of *total*."""
	    percent = (count / total * 100) if total > 0 else 0
	    return (
	        f'<div style="{style}">'
	        f"<strong>{label}:</strong> {percent:.1f}% ({count} of {total} rows)"
	        "</div>"
	    )


	def _render_statistics(rows):
	    """Return the "Row Type Statistics" panel for the prepared *rows*."""
	    total = len(rows)

	    def count_class(css_class):
	        return sum(1 for row in rows if row['css_class'] == css_class)

	    box = "padding: 10px; border-radius: 5px; min-width: 200px;"
	    boxes = [
	        _stat_box(f"background-color: #1e8449; color: white; {box}",
	                  "High Coverage", count_class('high-coverage'), total),
	        _stat_box(f"background-color: #ffcccc; {box}",
	                  "Send to Human (&lt;0.8 coverage)", count_class('send-to-human'), total),
	        _stat_box(f"background-color: #a9dfbf; {box}",
	                  "Medium Coverage", count_class('medium-coverage'), total),
	        _stat_box(f"background-color: #e0e0e0; {box}",
	                  "Low Coverage", count_class('low-coverage'), total),
	        # Every send-to-human row, including those coloured as high coverage.
	        _stat_box(f"background-color: #f8d7da; {box} border: 1px dashed #721c24;",
	                  "All Send to Human", sum(1 for row in rows if row['send_to_human']), total),
	    ]
	    return (
	        '<div style="margin-bottom: 20px; padding: 15px; border: 1px solid #e0e0e0; '
	        'border-radius: 5px; background-color: #f9f9f9;">'
	        "<h3>Row Type Statistics</h3>"
	        '<div style="display: flex; flex-wrap: wrap; gap: 15px;">'
	        + "".join(boxes)
	        + "</div></div>"
	    )


	def _detail_popup(toggle_id, label, color, body):
	    """Return a button plus its CSS-only popup (hidden checkbox, label, overlay)."""
	    close = f"document.getElementById('{toggle_id}').checked = false;"
	    return (
	        '<div style="flex: 1;">'
	        # The label must directly follow the checkbox: the stylesheet relies on
	        # the adjacent-sibling selector to append " (Close)" when checked.
	        f"<input type='checkbox' id='{toggle_id}'>"
	        f"<label for='{toggle_id}' class='expandable' style=\"width: 100%; box-sizing: border-box; "
	        f"margin: 0; padding: 5px 2px; font-size: 12px; background-color: {color};\">{label}</label>"
	        f"<div class='overlay' onclick=\"{close}\"></div>"
	        "<div class='details' onclick=\"event.stopPropagation();\">"
	        f"<button class=\"close-details\" onclick=\"{close}\">×</button>"
	        f"{body}"
	        "</div>"
	        "</div>"
	    )


	def _send_to_human_cell(row):
	    """Return the "Send to Human" cell, with a message popup when it is true."""
	    if not row['send_to_human']:
	        return "<td><span style='color: #555;'>False</span></td>"

	    toggle_id = f"message_toggle_{row['index']}"
	    close = f"document.getElementById('{toggle_id}').checked = false;"
	    return (
	        "<td>"
	        "<span style='font-weight: bold; color: #d32f2f;'>True</span>"
	        f"<input type='checkbox' id='{toggle_id}'>"
	        f"<label for='{toggle_id}' class='expandable' style='margin-top: 5px; background-color: #d32f2f; "
	        "color: white; border: none; border-radius: 4px; padding: 5px 10px; cursor: pointer; "
	        "font-size: 12px; display: block; width: calc(100% - 16px); text-align: center;'>Show Message</label>"
	        f"<div class='overlay' onclick=\"{close}\"></div>"
	        "<div class='human-message-popup' onclick=\"event.stopPropagation();\">"
	        f"<button class=\"close-details\" onclick=\"{close}\">×</button>"
	        "<strong style='color: #d32f2f; font-size: 16px; margin-bottom: 10px;'>Call Human Message:</strong>"
	        # No whitespace around the message: this div uses white-space: pre-wrap.
	        "<div style='font-size: 14px; color: #000; white-space: pre-wrap; overflow-x: auto; "
	        "background-color: #f5f5f5; padding: 10px; border-radius: 5px; border: 1px solid #ddd;'>"
	        f"{row['human_message']}"
	        "</div>"
	        "</div>"
	        "</td>"
	    )


	def _render_row(row):
	    """Return the ``<tr>`` for one prepared row, including its three popups."""
	    item = row['item']
	    idx = row['index']

	    user_id = _escape(item.get('user_id', 'N/A'))
	    question = _escape(item.get('question', 'N/A'))
	    confidence = _escape(item.get('confidence_score', 'N/A'))
	    info_to_check = _escape(item.get('information_to_check', 'N/A'))

	    context = item.get('retrieval_context', 'N/A')
	    context_text = _escape(_pretty(context)) if context else "N/A"

	    human_block = ""
	    if row['send_to_human']:
	        human_block = (
	            "<strong style='color: #d32f2f;'>Call Human Message:</strong>"
	            f"<p style='color: #d32f2f; white-space: pre-wrap;'>{row['human_message']}</p>"
	        )

	    details_body = (
	        f"<strong>Question:</strong><p>{question}</p>"
	        f"<strong>Ground Truth:</strong><p>{_escape(item.get('ground_truth', 'N/A'))}</p>"
	        f"<strong>Response:</strong><p>{_escape(item.get('chat_response', 'N/A'))}</p>"
	        f"<strong>Source:</strong><p>{_escape(item.get('source', 'N/A'))}</p>"
	        "<strong>Tools:</strong>"
	        '<div style="margin-top: 5px; margin-bottom: 10px;">'
	        f"{_format_tools_list(item.get('tools', []))}"
	        "</div>"
	        f"{human_block}"
	        f"<strong>Information to Check:</strong><p>{info_to_check}</p>"
	    )
	    metrics_body = (
	        "<strong>Information to Check:</strong>"
	        '<p style="margin-bottom: 15px; padding: 8px; background-color: #f5f5f5; '
	        f'border-left: 4px solid #2196F3; border-radius: 3px;">{info_to_check}</p>'
	        "<strong>Evaluation Metrics:</strong>"
	        f"<div class='json-viewer'><pre>{_escape(_format_metrics_json(row['metrics']))}</pre></div>"
	    )
	    context_body = (
	        "<strong>Retrieval Context:</strong>"
	        f"<div class='json-viewer'><pre>{context_text}</pre></div>"
	    )

	    class_attr = f" class='{row['css_class']}'" if row['css_class'] else ""
	    return "".join([
	        f"<tr{class_attr}>",
	        f"<td style='text-align: center;'>{idx + 1}</td>",
	        # title attributes feed the CSS hover tooltip for truncated cells.
	        f"<td title=\"{user_id}\">{user_id}</td>",
	        f"<td title=\"{question}\">{question}</td>",
	        f"<td>{confidence}</td>",
	        _send_to_human_cell(row),
	        f"<td>{_escape(row['time_spent'])}</td>",
	        f"<td>{_format_eval_scores(row['metrics'])}</td>",
	        '<td class="detail-container">',
	        '<div style="display: flex; flex-direction: row; justify-content: space-around; gap: 3px;">',
	        _detail_popup(f"toggle_{idx}", "Details", "#4CAF50", details_body),
	        _detail_popup(f"metrics_toggle_{idx}", "Metrics", "#2196F3", metrics_body),
	        _detail_popup(f"context_toggle_{idx}", "Context", "#FF9800", context_body),
	        "</div>",
	        "</td>",
	        "</tr>",
	    ])


	def load_and_display_json(file):
	    """Render an uploaded evaluation-results JSON file as an HTML report.

	    The file must contain a JSON list of objects.  The report consists of a
	    stylesheet and colour legend, a statistics panel, and a table with one row
	    per object; each row offers Details / Metrics / Context popups.

	    All values taken from the file are HTML-escaped before being inserted
	    into the markup.

	    Args:
	        file: An object whose ``name`` attribute is the path of the JSON file,
	            a plain path string, or ``None`` (e.g. when the upload widget has
	            been cleared).

	    Returns:
	        The report as an HTML string; an empty string when *file* is ``None``;
	        or a message starting with ``"Error"`` when the file cannot be read,
	        parsed, or is not a list of objects.
	    """
	    if file is None:
	        return ""

	    try:
	        path = getattr(file, 'name', file)
	        with open(path, 'r', encoding='utf-8') as f:
	            data = json.load(f)

	        # Validate that the data is a list of dictionaries.
	        if not isinstance(data, list) or not all(isinstance(item, dict) for item in data):
	            return "Error: JSON file must contain a list of dictionaries."

	        # Classify each row once so the statistics and row colours always agree.
	        rows = [_build_row(idx, item) for idx, item in enumerate(data)]

	        parts = [
	            _REPORT_HEADER_HTML,
	            _render_statistics(rows),
	            "<table>",
	            "<tr>" + "".join(f"<th>{col}</th>" for col in _TABLE_COLUMNS) + "</tr>",
	        ]
	        parts.extend(_render_row(row) for row in rows)
	        parts.append("</table>")
	        return "".join(parts)
	    except Exception as e:
	        # UI boundary: show the failure in the output pane instead of raising,
	        # but keep the traceback in the logs.
	        logger.exception("Failed to render evaluation results")
	        return f"Error processing JSON file: {html.escape(str(e))}"

	# Assemble the page: heading, usage hint, colour legend, upload widget and
	# the HTML pane that receives the rendered report.
	with gr.Blocks() as demo:
	    gr.Markdown(value="# JSON Data Visualization")
	    gr.Markdown(value="Upload a JSON file containing a list of dictionaries to visualize the data.")

	    with gr.Accordion(label="Row Color Legend", open=True):
	        gr.Markdown(value="""
	* Dark Green rows: High Information Coverage - Information Coverage score is at least 0.8 (highest priority)
	* Red rows: "Send to Human" is true but with Information Coverage below 0.8
	* Shallow Green rows: Medium Information Coverage - Information Coverage score is between 0.5 and 0.8
	* Light Gray rows: Low Information Coverage - Information Coverage score is below 0.5

	The statistics section also includes "All Send to Human" count (including those with high coverage).
	""")

	    file_input = gr.File(label="Upload JSON File", file_types=[".json"])
	    output = gr.HTML(label="Data Visualization")

	    # Re-render the report every time the uploaded file changes.
	    file_input.change(fn=load_and_display_json, inputs=file_input, outputs=output)

	# Start serving the app.
	demo.launch()