import datetime import os from typing import Any, Dict, Optional, Tuple import gradio as gr import pandas as pd from loguru import logger from src.config import HISTORY_PATH from src.data_manager import get_random_example, load_models from src.judge import JudgeManager from src.ui import UI # Global state for evaluations eval1: Optional[Dict[str, Any]] = None eval2: Optional[Dict[str, Any]] = None selected_judges: list = [] current_test_type: str = "grounding" # Add more detailed logging logger.info("EvalArena starting up") # Check if benchmarks directory exists if os.path.exists("benchmarks") and os.path.isdir("benchmarks"): benchmark_dirs = [d for d in os.listdir("benchmarks") if os.path.isdir(os.path.join("benchmarks", d))] logger.info(f"Found benchmark directories: {benchmark_dirs}") # Log CSV files in each directory for d in benchmark_dirs: dir_path = os.path.join("benchmarks", d) files = [f for f in os.listdir(dir_path) if f.endswith("-judges-metrics.csv")] logger.info(f"Benchmark directory '{d}' contains files: {files}") else: logger.warning("Benchmarks directory not found or not accessible") def format_leaderboard_for_display(df: pd.DataFrame) -> pd.DataFrame: """Format the leaderboard dataframe for display in the UI. This ensures consistent display across environments like Huggingface Spaces.""" # Create a copy of the dataframe with only the columns we want to display display_df = pd.DataFrame() display_df["Judge Name"] = df["judge_name"] display_df["ELO Score"] = df["elo_score"] display_df["Wins"] = df["wins"] display_df["Losses"] = df["losses"] display_df["Total Evaluations"] = df["total_evaluations"] return display_df def load_benchmark_data(benchmark_type: str, dataset_name: str) -> Tuple[pd.DataFrame, str]: """Load benchmark data for the selected benchmark type and dataset. Args: benchmark_type: The type of benchmark (e.g., 'prompt-injections') dataset_name: The name of the dataset (e.g., 'allenai-wildjailbreak') Returns: Tuple containing: - DataFrame formatted for display - Markdown string with benchmark information """ # Create empty dataframe with the expected columns empty_df = pd.DataFrame( columns=["Judge Name", "F1 Score", "Balanced Accuracy", "Avg Latency (s)", "Correct", "Total"] ) # Handle case when None or empty values are passed if not benchmark_type or not dataset_name: logger.warning(f"Invalid benchmark parameters: type={benchmark_type}, dataset={dataset_name}") return empty_df, "Please select both a benchmark type and dataset" try: # Construct the path to the benchmark metrics file metrics_file = os.path.join("benchmarks", benchmark_type, f"{dataset_name}-judges-metrics.csv") logger.info(f"Loading benchmark from {metrics_file}") if not os.path.exists(metrics_file): error_message = f"Error: Could not find metrics file at {metrics_file}" logger.error(error_message) return empty_df, error_message # Load the CSV file df = pd.read_csv(metrics_file) logger.info(f"Loaded benchmark with {len(df)} rows") # Check if the file has the required columns required_columns = ["judge_name", "f1", "bacc", "avg_latency", "correct", "count"] missing_columns = [col for col in required_columns if col not in df.columns] if missing_columns: error_message = f"Error: CSV file missing required columns: {', '.join(missing_columns)}" logger.error(error_message) return empty_df, error_message # Format the dataframe for display display_df = pd.DataFrame() display_df["Judge Name"] = df["judge_name"] display_df["F1 Score"] = df["f1"].round(3) display_df["Balanced Accuracy"] = df["bacc"].round(3) display_df["Avg Latency (s)"] = df["avg_latency"].round(2) display_df["Correct"] = df["correct"] display_df["Total"] = df["count"] # Sort by balanced accuracy descending display_df = display_df.sort_values("Balanced Accuracy", ascending=False) # Generate information about the benchmark total_samples = df["count"].iloc[0] if not df.empty else 0 info_md = f""" # Benchmark: {dataset_name} **Type**: {benchmark_type} **Total Samples**: {total_samples} This table shows how different AI judge models performed on this benchmark. Higher F1 score and balanced accuracy indicate better performance. """ return display_df, info_md except pd.errors.EmptyDataError: error_message = "Error: The CSV file is empty" logger.error(error_message) return empty_df, error_message except pd.errors.ParserError: error_message = "Error: Could not parse the CSV file - it may be corrupted or not in CSV format" logger.error(error_message) return empty_df, error_message except Exception as e: error_message = f"Error loading benchmark data: {str(e)}" logger.error(error_message) return empty_df, error_message def initialize(): """Initialize the application.""" # Load models from file judges = load_models() logger.info(f"Loaded {len(judges)} judges") # Initialize judge manager judge_manager = JudgeManager(judges) # Set default test type default_test_type = "grounding" global current_test_type current_test_type = default_test_type # Create UI ui = UI( refresh_fn=lambda test_type: refresh_example(test_type, judge_manager), submit_fn=lambda text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type: submit_example( text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type, judge_manager, ), evaluate1_fn=lambda text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type: get_evaluation1( text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type, judge_manager, ), evaluate2_fn=lambda text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type: get_evaluation2( text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type, judge_manager, ), winner1_fn=lambda: select_winner("Evaluation 1", judge_manager), winner2_fn=lambda: select_winner("Evaluation 2", judge_manager), both_correct_fn=lambda: handle_both_correct(judge_manager), both_incorrect_fn=lambda: handle_both_incorrect(judge_manager), refresh_leaderboard_fn=lambda: format_leaderboard_for_display( judge_manager.leaderboard_df, ), leaderboard_df=format_leaderboard_for_display( judge_manager.leaderboard_df, ), load_benchmark_fn=load_benchmark_data, ) return ui.create_interface() def refresh_example(test_type: str, judge_manager: JudgeManager) -> Tuple: """Get a random example for the given test type.""" try: # Get example from the dataset logger.info(f"Getting example for test type: {test_type}") example = get_random_example(test_type) # Default values for all return fields input_text = "" output_text = "" text_input = "" claim_input = "" single_text_input = "" policy_input = "" policy_output = "" policy_assertion = "" # Populate fields based on test type if test_type == "grounding": text_input = example["text"] claim_input = example["claim"] elif test_type in ["prompt_injections", "safety"]: single_text_input = example["text"] elif test_type == "policy": policy_input = example["input"] policy_output = example["output"] policy_assertion = example["assertion"] else: # Legacy format input_text = example.get("text", f"Sample input for {test_type}") output_text = example.get("claim", f"Sample output for {test_type}") return ( input_text, output_text, text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, ) except Exception as e: logger.error(f"Error getting example: {e}") # Return empty strings for all fields return ( "", "", "", "", "", "", "", "", ) def submit_example( text_input: str, claim_input: str, single_text_input: str, policy_input: str, policy_output: str, policy_assertion: str, test_type: str, judge_manager: JudgeManager, ) -> Tuple: """Prepare for evaluation and select random judges.""" global selected_judges, current_test_type, eval1, eval2 try: logger.info(f"Preparing evaluation for test type: {test_type}") current_test_type = test_type # Reset evaluations eval1 = None eval2 = None # Select random judges selected_judges = judge_manager.pick_random_judges() if len(selected_judges) < 2: return ( "Error: Not enough judges available", "Error: Not enough judges available", None, None, None, None, None, None, None, gr.update(visible=False), ) # Format inputs for Qualifire evaluation input_text, output_text = format_inputs_for_evaluation( text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type, ) # Get a single Qualifire evaluation to be shared by both judges try: qualifire_result, time_elapsed = judge_manager.evaluate_with_qualifire( input_text, output_text, test_type, as_raw=True, # Get raw result to share between judges ) logger.info("Completed Qualifire evaluation") # Store the Qualifire result for both judges to use judge_manager.shared_qualifire_result = qualifire_result judge_manager.shared_qualifire_time = time_elapsed except Exception as e: logger.error(f"Error during Qualifire evaluation: {str(e)}") # Continue even if Qualifire fails - judges can still work without it # Show loading messages while evaluations are in progress status_text = "Evaluations starting... Both judges will evaluate in parallel." return ( "Loading evaluation 1...", "Loading evaluation 2...", gr.update(value=text_input), gr.update(value=claim_input), gr.update(value=single_text_input), gr.update(value=policy_input), gr.update(value=policy_output), gr.update(value=policy_assertion), gr.update(value=test_type), gr.update(visible=True, value=status_text), ) except Exception as e: logger.error(f"Error preparing evaluation: {e}") return ( f"Error: {str(e)}", f"Error: {str(e)}", gr.update(value=text_input), gr.update(value=claim_input), gr.update(value=single_text_input), gr.update(value=policy_input), gr.update(value=policy_output), gr.update(value=policy_assertion), gr.update(value=test_type), gr.update(visible=False), ) def get_evaluation1( text_input: str, claim_input: str, single_text_input: str, policy_input: str, policy_output: str, policy_assertion: str, test_type: str, judge_manager: JudgeManager, ) -> Tuple[str, Any]: """Get evaluation from the first judge.""" global eval1, selected_judges try: if not selected_judges or len(selected_judges) < 1: return "No judges selected", gr.update(visible=False) logger.info(f"Starting evaluation 1 with judge {selected_judges[0]['name']}") # Format inputs based on test type input_text, output_text = format_inputs_for_evaluation( text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type, ) # Get evaluation from the first judge eval1 = judge_manager.get_evaluation( selected_judges[0], input_text, output_text, test_type, use_shared_result=True, ) logger.info("Completed evaluation 1") # Display the evaluation (time is already included in the evaluation) display_eval = eval1["display_evaluation"] # Make the selection button visible once the evaluation is ready return display_eval, gr.update(visible=True) except Exception as e: logger.error(f"Error getting evaluation 1: {e}") return f"Error: {str(e)}", gr.update(visible=False) def get_evaluation2( text_input: str, claim_input: str, single_text_input: str, policy_input: str, policy_output: str, policy_assertion: str, test_type: str, judge_manager: JudgeManager, ) -> Tuple[str, Any, Any]: """Get evaluation from the second judge.""" global eval2, selected_judges try: if not selected_judges or len(selected_judges) < 2: return ( "No judges selected", gr.update( visible=False, ), gr.update( visible=False, ), ) logger.info( f"Starting evaluation 2 with judge {selected_judges[1]['name']}", ) # Format inputs based on test type input_text, output_text = format_inputs_for_evaluation( text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type, ) # Get evaluation from the second judge eval2 = judge_manager.get_evaluation( selected_judges[1], input_text, output_text, test_type, use_shared_result=True, ) logger.info("Completed evaluation 2") # Display the evaluation (time is already included in the evaluation) display_eval = eval2["display_evaluation"] return ( display_eval, gr.update(visible=True), gr.update(visible=True), ) except Exception as e: logger.error(f"Error getting evaluation 2: {e}") return ( f"Error: {str(e)}", gr.update( visible=False, ), gr.update( visible=False, ), ) def format_inputs_for_evaluation( text_input: str, claim_input: str, single_text_input: str, policy_input: str, policy_output: str, policy_assertion: str, test_type: str, ) -> Tuple[str, str]: """Format inputs based on test type to be compatible with the evaluation function.""" if test_type == "grounding": input_text = text_input output_text = claim_input elif test_type in ["prompt_injections", "safety"]: input_text = single_text_input output_text = "" elif test_type == "policy": input_text = f"Input: {policy_input}\nAssertion: {policy_assertion}" output_text = policy_output else: # Default fallback - this should not happen with the UI constraints input_text = text_input or single_text_input or policy_input output_text = claim_input or policy_output return input_text, output_text def save_to_history( input_text: str, output_text: str, judge1_id: str, judge1_name: str, judge1_evaluation: str, judge1_time: float, judge2_id: str, judge2_name: str, judge2_evaluation: str, judge2_time: float, winner_id: str, ) -> None: """Save the evaluation results to history CSV file.""" try: # Create a new row for the history history_row = { "timestamp": datetime.datetime.now().isoformat(), "input": input_text, "output": output_text, "judge1_id": judge1_id, "judge1_name": judge1_name, "judge1_evaluation": judge1_evaluation, "judge1_time": judge1_time, "judge2_id": judge2_id, "judge2_name": judge2_name, "judge2_evaluation": judge2_evaluation, "judge2_time": judge2_time, "winner_id": winner_id, } # Try to load existing history try: history_df = pd.read_csv(HISTORY_PATH) except (FileNotFoundError, pd.errors.EmptyDataError): # Create a new history dataframe if file doesn't exist or is empty history_df = pd.DataFrame(columns=list(history_row.keys())) # Append the new row history_df = pd.concat( [history_df, pd.DataFrame([history_row])], ignore_index=True, ) # Save to CSV history_df.to_csv(HISTORY_PATH, index=False) logger.info("Saved evaluation to history") except Exception as e: logger.error(f"Error saving to history: {e}") def select_winner(choice: str, judge_manager: JudgeManager) -> str: """Select a winner from the evaluations.""" global eval1, eval2, current_test_type try: if not eval1 or not eval2: return "Error: No evaluations available" # Get the input and output text that was evaluated input_text, output_text = "", "" if "input_text" in eval1 and "output_text" in eval1: input_text = eval1.get("input_text", "") output_text = eval1.get("output_text", "") if choice == "Evaluation 1": winner_eval = eval1 loser_eval = eval2 winner_id = eval1["judge"]["id"] else: winner_eval = eval2 loser_eval = eval1 winner_id = eval2["judge"]["id"] # Update leaderboard updated_board = judge_manager.update_leaderboard( winner_eval["judge"]["id"], loser_eval["judge"]["id"], result_type="win", ) # Save to history save_to_history( input_text=input_text, output_text=output_text, judge1_id=eval1["judge"]["id"], judge1_name=eval1["judge"]["name"], judge1_evaluation=eval1["anonymous_evaluation"], judge1_time=eval1["elapsed_time"], judge2_id=eval2["judge"]["id"], judge2_name=eval2["judge"]["name"], judge2_evaluation=eval2["anonymous_evaluation"], judge2_time=eval2["elapsed_time"], winner_id=winner_id, ) # Construct result message with revealed judges' names result_message = f"You selected: {choice}\n\n" result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n" result_message += ( f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n" ) # Get the winner's new ELO score winner_mask = updated_board["judge_id"] == winner_id winner_elo = updated_board[winner_mask]["elo_score"].values[0] result_message += f"Winner: {winner_eval['judge']['name']} " result_message += f"(New ELO: {winner_elo:.2f})\n" result_message += f"Test Type: {current_test_type}\n" return result_message except Exception as e: logger.error(f"Error selecting winner: {e}") return f"Error: {str(e)}" def handle_both_correct(judge_manager: JudgeManager) -> str: """Handle case where both evaluations are correct.""" global eval1, eval2, current_test_type try: if not eval1 or not eval2: return "Error: No evaluations available" # Get the input and output text that was evaluated input_text, output_text = "", "" if "input_text" in eval1 and "output_text" in eval1: input_text = eval1.get("input_text", "") output_text = eval1.get("output_text", "") # Update leaderboard for both judges updated_board = judge_manager.update_leaderboard( eval1["judge"]["id"], eval2["judge"]["id"], result_type="both_correct", ) # Save to history with both as winners save_to_history( input_text=input_text, output_text=output_text, judge1_id=eval1["judge"]["id"], judge1_name=eval1["judge"]["name"], judge1_evaluation=eval1["anonymous_evaluation"], judge1_time=eval1["elapsed_time"], judge2_id=eval2["judge"]["id"], judge2_name=eval2["judge"]["name"], judge2_evaluation=eval2["anonymous_evaluation"], judge2_time=eval2["elapsed_time"], winner_id="both", ) # Construct result message with revealed judges' names result_message = "You selected: Both Correct\n\n" result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n" result_message += ( f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n" ) # Get the new ELO scores judge1_mask = updated_board["judge_id"] == eval1["judge"]["id"] judge2_mask = updated_board["judge_id"] == eval2["judge"]["id"] judge1_elo = updated_board[judge1_mask]["elo_score"].values[0] judge2_elo = updated_board[judge2_mask]["elo_score"].values[0] result_message += "\nBoth judges performed well!\n" result_message += f"{eval1['judge']['name']} new ELO: {judge1_elo:.2f}\n" result_message += f"{eval2['judge']['name']} new ELO: {judge2_elo:.2f}\n" result_message += f"Test Type: {current_test_type}\n" return result_message except Exception as e: logger.error(f"Error handling both correct: {e}") return f"Error: {str(e)}" def handle_both_incorrect(judge_manager: JudgeManager) -> str: """Handle case where both evaluations are incorrect.""" global eval1, eval2, current_test_type try: if not eval1 or not eval2: return "Error: No evaluations available" # Get the input and output text that was evaluated input_text, output_text = "", "" if "input_text" in eval1 and "output_text" in eval1: input_text = eval1.get("input_text", "") output_text = eval1.get("output_text", "") # Update leaderboard for both judges updated_board = judge_manager.update_leaderboard( eval1["judge"]["id"], eval2["judge"]["id"], result_type="both_incorrect", ) # Save to history with neither as winner save_to_history( input_text=input_text, output_text=output_text, judge1_id=eval1["judge"]["id"], judge1_name=eval1["judge"]["name"], judge1_evaluation=eval1["anonymous_evaluation"], judge1_time=eval1["elapsed_time"], judge2_id=eval2["judge"]["id"], judge2_name=eval2["judge"]["name"], judge2_evaluation=eval2["anonymous_evaluation"], judge2_time=eval2["elapsed_time"], winner_id="none", ) # Construct result message with revealed judges' names result_message = "You selected: Both Incorrect\n\n" result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n" result_message += ( f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n" ) # Get the new ELO scores judge1_mask = updated_board["judge_id"] == eval1["judge"]["id"] judge2_mask = updated_board["judge_id"] == eval2["judge"]["id"] judge1_elo = updated_board[judge1_mask]["elo_score"].values[0] judge2_elo = updated_board[judge2_mask]["elo_score"].values[0] result_message += "\nBoth judges need improvement.\n" result_message += f"{eval1['judge']['name']} new ELO: {judge1_elo:.2f}\n" result_message += f"{eval2['judge']['name']} new ELO: {judge2_elo:.2f}\n" result_message += f"Test Type: {current_test_type}\n" return result_message except Exception as e: logger.error(f"Error handling both incorrect: {e}") return f"Error: {str(e)}" def main(): """Initialize the application.""" demo = initialize() demo.launch(server_name="0.0.0.0") if __name__ == "__main__": main()