"""Gradio app: upload a JSON evaluation log (a list of dicts) and render it
as a color-coded HTML table with per-row detail panels and summary statistics.

NOTE(review): this file was recovered from a whitespace-mangled source in which
the original HTML template tags were stripped.  All markup below (CSS classes,
<table>/<details> structure) is a reconstruction of the described behavior
(legend, statistics block, colored rows, per-row popups) — confirm the visual
layout against the running app.
"""

import gradio as gr
import json
import pandas as pd
import html
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Metric name used for row coloring / statistics (must match the JSON exactly).
_COVERAGE_METRIC = 'Information Coverage (GEval)'


def _format_tools_list(tools):
    """Render the tools list as HTML, one tool per line.

    BUG FIX: the original built an empty string and returned it unchanged, so
    the tools were never actually displayed.
    """
    if not tools or not isinstance(tools, list):
        return "No tools used"
    return "<br>".join(html.escape(str(tool)) for tool in tools)


def _coverage_score(metrics):
    """Return the Information Coverage score as a float, or None if absent/unparseable.

    Shared by the statistics pass and the rendering pass so the two can never
    disagree (the original duplicated this extraction in both places).
    """
    if not metrics:
        return None
    score = None
    for metric in metrics:
        if metric.get('name') == _COVERAGE_METRIC:
            try:
                score = float(metric.get('score', 0))
            except (ValueError, TypeError):
                pass  # malformed score: keep looking / leave as None
    return score


def _row_class(row):
    """CSS class for a row. Priority: high coverage > send-to-human > medium > low."""
    score = _coverage_score(row['Evaluation Metrics'])
    if score is not None and score >= 0.8:
        return "high-coverage"
    if row['Send to Human'] is True:
        return "send-to-human"
    if score is not None:
        return "medium-coverage" if score >= 0.5 else "low-coverage"
    return ""


def _detail_section(title, body, escape=True):
    """One labeled section inside a row's <details> panel."""
    text = html.escape(str(body)) if escape else str(body)
    return f"<p><b>{title}:</b></p><pre>{text}</pre>"


def load_and_display_json(file):
    """Load an uploaded JSON file (a list of dicts) and return an HTML report.

    Returns the HTML string on success, or a plain error-message string on any
    failure (invalid JSON, wrong top-level shape, unreadable file).
    """
    try:
        # Read the uploaded JSON file (gradio passes a tempfile-like object).
        with open(file.name, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Validate that the data is a list of dictionaries.
        if not isinstance(data, list) or not all(isinstance(item, dict) for item in data):
            return "Error: JSON file must contain a list of dictionaries."

        # ---- Build one row dict per record, handling missing keys -----------
        table_data = []
        for idx, item in enumerate(data):
            eval_metrics = item.get('evaluation_metrics', [])
            # One "name: score" per line, scores shown with 3 decimal places.
            if eval_metrics:
                eval_scores = "<br>".join(
                    "{}: {}".format(
                        m.get('name', 'Unknown'),
                        format(float(m['score']), '.3f')
                        if isinstance(m.get('score'), (int, float))
                        else m.get('score', 'N/A'),
                    )
                    for m in eval_metrics
                )
            else:
                eval_scores = "N/A"

            # Format time_spent if numeric, otherwise pass it through as-is.
            time_spent = item.get('time_spent', 'N/A')
            formatted_time = (
                f"{time_spent:.2f}s" if isinstance(time_spent, (int, float)) else time_spent
            )

            table_data.append({
                'Index': idx,
                'User ID': item.get('user_id', 'N/A'),
                'Question': item.get('question', 'N/A'),
                'Confidence': item.get('confidence_score', 'N/A'),
                'Send to Human': item.get('send_to_human', 'N/A'),
                'Call Human Message': item.get('call_human_message', 'N/A'),
                'Time Spent': formatted_time,
                'Eval Scores': eval_scores,
                'Response': item.get('chat_response', 'N/A'),
                'Source': item.get('source', 'N/A'),
                'Tools': item.get('tools', []),  # kept as a list for _format_tools_list
                'Retrieval Context': item.get('retrieval_context', 'N/A'),
                'Ground Truth': item.get('ground_truth', 'N/A'),
                'Evaluation Metrics': eval_metrics,
                'Information to Check': item.get('information_to_check', 'N/A'),
            })

        df = pd.DataFrame(table_data)

        # ---- Styles + legend (reconstructed markup) -------------------------
        html_output = """
<style>
  .eval-table { border-collapse: collapse; width: 100%; }
  .eval-table th, .eval-table td { border: 1px solid #ccc; padding: 4px 8px; vertical-align: top; }
  .high-coverage   { background-color: #1b5e20; color: #fff; }
  .send-to-human   { background-color: #c62828; color: #fff; }
  .medium-coverage { background-color: #a5d6a7; }
  .low-coverage    { background-color: #e0e0e0; }
</style>
<h3>Row Color Legend</h3>
<ul>
  <li><b>Dark Green</b>: High Information Coverage - Information Coverage score is at least 0.8 (highest priority)</li>
  <li><b>Red</b>: "Send to Human" is true but with Information Coverage below 0.8</li>
  <li><b>Shallow Green</b>: Medium Information Coverage - Information Coverage score is between 0.5 and 0.8</li>
  <li><b>Light Gray</b>: Low Information Coverage - Information Coverage score is below 0.5</li>
</ul>
"""

        # ---- Statistics pass (same priority logic as row coloring) ----------
        total_rows = len(df)
        send_to_human_count = 0      # Send to Human with coverage below 0.8
        all_send_to_human_count = 0  # all Send to Human, regardless of coverage
        low_coverage_count = 0
        medium_coverage_count = 0
        high_coverage_count = 0

        for _, row in df.iterrows():
            is_send_to_human = row['Send to Human'] is True
            if is_send_to_human:
                all_send_to_human_count += 1
            score = _coverage_score(row['Evaluation Metrics'])
            if score is not None and score >= 0.8:
                high_coverage_count += 1
            elif is_send_to_human:
                send_to_human_count += 1
            elif score is not None:
                if score >= 0.5:
                    medium_coverage_count += 1
                else:
                    low_coverage_count += 1

        def pct(count):
            # Guard against division by zero on an empty file.
            return (count / total_rows * 100) if total_rows > 0 else 0

        html_output += """
<h3>Row Type Statistics</h3>
<ul>
  <li>High Coverage: {:.1f}% ({} of {} rows)</li>
  <li>Send to Human (&lt;0.8 coverage): {:.1f}% ({} of {} rows)</li>
  <li>Medium Coverage: {:.1f}% ({} of {} rows)</li>
  <li>Low Coverage: {:.1f}% ({} of {} rows)</li>
  <li>All Send to Human: {:.1f}% ({} of {} rows)</li>
</ul>
""".format(
            pct(high_coverage_count), high_coverage_count, total_rows,
            pct(send_to_human_count), send_to_human_count, total_rows,
            pct(medium_coverage_count), medium_coverage_count, total_rows,
            pct(low_coverage_count), low_coverage_count, total_rows,
            pct(all_send_to_human_count), all_send_to_human_count, total_rows,
        )

        # ---- Table ----------------------------------------------------------
        headers = ['#', 'User ID', 'Question', 'Confidence', 'Send to Human',
                   'Time Spent', 'Eval Scores', 'Details']
        html_output += "<table class='eval-table'><tr>"
        html_output += "".join(f"<th>{col}</th>" for col in headers)
        html_output += "</tr>"

        for _, row in df.iterrows():
            row_class = _row_class(row)
            if row_class:
                logger.info("Row %s marked as %r", row['Index'], row_class)
            class_attr = f" class='{row_class}'" if row_class else ""

            html_output += f"<tr{class_attr}>"
            html_output += f"<td>{row['Index'] + 1}</td>"
            html_output += f"<td>{html.escape(str(row['User ID']))}</td>"
            html_output += f"<td>{html.escape(str(row['Question']))}</td>"
            html_output += f"<td>{row['Confidence']}</td>"

            # Send-to-Human cell: show the (pretty-printed) call-human message.
            formatted_message = ""
            if row['Send to Human'] is True:
                call_human_message = row['Call Human Message']
                try:
                    if isinstance(call_human_message, dict):
                        formatted_message = json.dumps(
                            call_human_message, indent=2, ensure_ascii=False)
                    else:
                        formatted_message = str(call_human_message)
                # FIX: original used a bare `except:`; narrow to Exception so
                # KeyboardInterrupt/SystemExit are not swallowed.
                except Exception:
                    formatted_message = str(call_human_message)
                html_output += (
                    "<td>True<br><b>Call Human Message:</b>"
                    f"<pre>{html.escape(formatted_message)}</pre></td>"
                )
            else:
                html_output += "<td>False</td>"

            html_output += f"<td>{row['Time Spent']}</td>"
            html_output += f"<td>{row['Eval Scores']}</td>"

            # Pretty-print retrieval context / evaluation metrics for the panel.
            retrieval_context_json = (
                html.escape(str(row['Retrieval Context']))
                if row['Retrieval Context'] else "N/A"
            )
            if row['Evaluation Metrics']:
                formatted_metrics = []
                for metric in row['Evaluation Metrics']:
                    metric_copy = dict(metric)  # don't mutate the source data
                    if 'score' in metric_copy and isinstance(metric_copy['score'], (int, float)):
                        metric_copy['score'] = format(float(metric_copy['score']), '.3f')
                    formatted_metrics.append(metric_copy)
                eval_metrics_json = json.dumps(formatted_metrics, indent=2, ensure_ascii=False)
            else:
                eval_metrics_json = "N/A"

            # Detail panel.  NOTE(review): the original emitted "Information to
            # Check" twice back-to-back — assumed accidental, emitted once here.
            details = [
                _detail_section("Question", row['Question']),
                _detail_section("Ground Truth", row['Ground Truth']),
                _detail_section("Response", row['Response']),
                _detail_section("Source", row['Source']),
                _detail_section("Tools", _format_tools_list(row['Tools']), escape=False),
            ]
            if row['Send to Human'] is True:
                details.append(_detail_section("Call Human Message", formatted_message))
            details.extend([
                _detail_section("Information to Check", row['Information to Check']),
                _detail_section("Evaluation Metrics", eval_metrics_json),
                _detail_section("Retrieval Context", retrieval_context_json, escape=False),
            ])
            html_output += (
                "<td><details><summary>Details</summary>"
                + "".join(details)
                + "</details></td>"
            )
            html_output += "</tr>"

        html_output += "</table>"
        return html_output

    except Exception as e:
        # Top-level boundary: surface any failure as a user-visible message.
        return f"Error processing JSON file: {str(e)}"


# ---- Gradio interface -------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("# JSON Data Visualization")
    gr.Markdown("Upload a JSON file containing a list of dictionaries to visualize the data.")

    with gr.Accordion("Row Color Legend", open=True):
        gr.Markdown("""
* **Dark Green rows**: High Information Coverage - Information Coverage score is at least 0.8 (highest priority)
* **Red rows**: "Send to Human" is true but with Information Coverage below 0.8
* **Shallow Green rows**: Medium Information Coverage - Information Coverage score is between 0.5 and 0.8
* **Light Gray rows**: Low Information Coverage - Information Coverage score is below 0.5

The statistics section also includes "All Send to Human" count (including those with high coverage).
""")

    file_input = gr.File(label="Upload JSON File", file_types=[".json"])
    output = gr.HTML(label="Data Visualization")
    file_input.change(load_and_display_json, inputs=file_input, outputs=output)

# Launch only when run as a script (original launched on import as well).
if __name__ == "__main__":
    demo.launch()