Spaces:

lofitolstoy
/

ABxJudge

Runtime error

App Files Files Community

ABxJudge / app.py

lofitolstoy

Update app.py

5172c19 verified about 1 year ago

raw

history blame contribute delete

138 kB

	pip install tenacity

	import sys
	import os
	import venv
	import subprocess
	import signal # Added for CLI stop handling

	# --- Venv Setup ---
	# Determine if we need to set up or reactivate the virtual environment

	VENV_DIR = ".venv"
	REQUIRED_PACKAGES = [
	"gradio",
	"pandas",
	"requests",
	"tenacity",
	"Pillow", # For image handling (needed for dummy image in CLI test)
	]

	def ensure_venv():
	"""Checks for venv, creates/installs if needed, and re-executes if not active."""
	venv_path = os.path.abspath(VENV_DIR)
	# Check if the current Python executable is from the target venv
	is_in_venv = sys.prefix == venv_path
	venv_exists = os.path.isdir(venv_path)

	if is_in_venv:
	# Already running in the correct venv, proceed
	print(f"Running inside the '{VENV_DIR}' virtual environment.")
	return True # Indicate we are ready to proceed

	print(f"Not running inside the target '{VENV_DIR}' virtual environment.")

	if not venv_exists:
	print(f"Creating virtual environment in '{venv_path}'...")
	try:
	venv.create(venv_path, with_pip=True)
	print("Virtual environment created successfully.")
	except Exception as e:
	print(f"Error creating virtual environment: {e}", file=sys.stderr)
	sys.exit(1) # Exit if creation fails

	# Determine the Python executable path within the venv
	if sys.platform == "win32":
	python_executable = os.path.join(venv_path, "Scripts", "python.exe")
	pip_executable = os.path.join(venv_path, "Scripts", "pip.exe")
	else:
	python_executable = os.path.join(venv_path, "bin", "python")
	pip_executable = os.path.join(venv_path, "bin", "pip")

	if not os.path.exists(python_executable):
	print(f"Error: Python executable not found at '{python_executable}'. Venv creation might have failed.", file=sys.stderr)
	sys.exit(1)
	if not os.path.exists(pip_executable):
	print(f"Error: Pip executable not found at '{pip_executable}'. Venv creation might have failed.", file=sys.stderr)
	sys.exit(1)


	# Install requirements into the venv using pip from the venv
	print(f"Installing/checking required packages in '{venv_path}'...")
	install_command = [pip_executable, "install"] + REQUIRED_PACKAGES
	try:
	# Run pip install, capture output for clarity/debugging
	result = subprocess.run(install_command, check=True, capture_output=True, text=True, encoding='utf-8')
	print("Packages installed/verified successfully.")
	# print(result.stdout) # Uncomment to see pip output
	if result.stderr:
	# Show pip's stderr for warnings etc.
	print("--- pip stderr ---\n", result.stderr, "\n--- end pip stderr ---")
	except subprocess.CalledProcessError as e:
	print(f"Error installing packages using command: {' '.join(e.cmd)}", file=sys.stderr)
	print(f"Pip stdout:\n{e.stdout}", file=sys.stderr)
	print(f"Pip stderr:\n{e.stderr}", file=sys.stderr)
	sys.exit(1) # Exit if installation fails
	except Exception as e:
	print(f"An unexpected error occurred during package installation: {e}", file=sys.stderr)
	sys.exit(1)


	# Re-execute the script using the venv's Python interpreter
	print(f"\nRestarting script using Python from '{venv_path}'...\n{'='*20}\n")
	script_path = os.path.abspath(sys.argv[0])
	# os.execv replaces the current process, inheriting stdio etc.
	# Arguments must include the executable name as argv[0] for the new process
	try:
	os.execv(python_executable, [python_executable, script_path] + sys.argv[1:])
	# If execv is successful, this line is never reached
	except OSError as e:
	print(f"Error restarting script with '{python_executable}': {e}", file=sys.stderr)
	# Fallback attempt with subprocess if execv fails (less ideal)
	print("Attempting restart with subprocess as fallback...")
	try:
	subprocess.check_call([python_executable, script_path] + sys.argv[1:])
	sys.exit(0) # Exit cleanly if subprocess worked
	except Exception as sub_e:
	print(f"Subprocess restart also failed: {sub_e}", file=sys.stderr)
	sys.exit(1)
	except Exception as e:
	print(f"Unexpected error during script restart: {e}", file=sys.stderr)
	sys.exit(1)

	# This should not be reached if re-execution happens
	return False # Indicate re-execution was attempted

	# --- Original Script Imports (ensure they are accessible after venv check) ---
	# It's generally okay to keep imports here, as the script restarts if not in venv
	import gradio as gr
	import json
	import logging
	import time
	import pandas as pd
	# import os # Already imported above
	import re
	import requests
	from dataclasses import dataclass, field
	from typing import Any, Dict, List, Optional, Tuple, Union
	from tenacity import retry, stop_after_attempt, wait_exponential
	import csv
	import io
	import tempfile # Added for JSONL download
	from urllib.parse import urlparse
	import base64
	import mimetypes
	# import signal # Already imported above

	# Configure logging
	logging.basicConfig(
	level=logging.INFO,
	format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
	)
	logger = logging.getLogger("model_tester")

	@dataclass
	class ModelEndpoint:
	"""Simple model endpoint configuration."""
	name: str
	api_url: str
	api_key: Optional[str] # API key can be optional (e.g., for local Ollama)
	model_id: str
	max_tokens: int = 1024
	temperature: float = 0.0
	file_upload_method: str = "JSON (Embedded Data)" # Options: "JSON (Embedded Data)", "Multipart Form Data"

	def to_dict(self):
	"""Convert to dictionary."""
	return {
	"name": self.name,
	"api_url": self.api_url,
	"model_id": self.model_id,
	"max_tokens": self.max_tokens,
	"temperature": self.temperature,
	}

	@dataclass
	class TestCase:
	"""Test case containing a key to query and actual value for evaluation."""
	key: str # The input/prompt for the model
	value: str # The reference value/ground truth
	image_path_or_url: Optional[str] = None # Path or URL to an image for multimodal input
	id: Optional[str] = None # Unique ID for the test case

	@dataclass
	class ModelResponse:
	"""Model response for a test case."""
	test_id: str
	model_name: str
	output: str
	latency: float

	@dataclass
	class EvaluationResult:
	"""Evaluation result from the LM judge."""
	test_id: str
	champion_output: str
	challenger_output: str
	winner: str # "MODEL_A_WINS", "MODEL_B_WINS", or "TIE" (extracted from reasoning)
	confidence: float # Extracted confidence score (e.g., 4/5 -> 0.8)
	reasoning: str # Full response from the judge model

	# Global preprocessing settings (can be updated through UI)
	PREPROCESS_ENABLED = True
	MAX_LENGTH = 8000
	REMOVE_SPECIAL_CHARS = True
	NORMALIZE_WHITESPACE = True

	# CSV preprocessing settings (specific to CSV format)
	DETECT_DELIMITER = True
	FIX_QUOTES = True
	REMOVE_CONTROL_CHARS = True
	NORMALIZE_NEWLINES = True
	SKIP_BAD_LINES = True
	SHOW_SAMPLE = True # Show sample data after loading & preprocessing

	# Global flag to signal stopping the test run
	STOP_REQUESTED = False

	# --- Text Preprocessing Function ---
	def preprocess_text(text, max_length=None, remove_special_chars=None, normalize_whitespace=None):
	"""
	Preprocess text (key or value) before using in prompts or comparisons.
	- Truncate to prevent token limits
	- Remove problematic characters
	- Normalize whitespace
	"""
	# Use global settings if not specified
	if max_length is None: max_length = MAX_LENGTH
	if remove_special_chars is None: remove_special_chars = REMOVE_SPECIAL_CHARS
	if normalize_whitespace is None: normalize_whitespace = NORMALIZE_WHITESPACE

	# Skip preprocessing if disabled globally
	if not PREPROCESS_ENABLED:
	return str(text) if text is not None else ""

	if text is None: return ""
	text = str(text) # Ensure it's a string

	# Truncate
	if len(text) > max_length:
	text = text[:max_length] + "... [truncated]"

	if remove_special_chars:
	# Remove control characters and other potentially problematic characters
	text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
	# Remove any XML/HTML-like tags that might interfere
	text = re.sub(r'<[^>]+>', '', text)

	if normalize_whitespace:
	# Normalize whitespace (multiple spaces, tabs, newlines to single space)
	text = re.sub(r'\s+', ' ', text)
	# But preserve paragraph breaks for readability (optional, maybe confusing)
	# text = re.sub(r'\n\s*\n', '\n\n', text)
	text = text.strip()

	return text

	# --- Model Runner Class ---
	class ModelRunner:
	"""Handles model API calls."""

	def __init__(self, endpoint: ModelEndpoint, prompt_template: str):
	self.endpoint = endpoint
	self.prompt_template = prompt_template

	def _load_and_encode_file(self, file_path_or_url: str) -> Tuple[Optional[str], Optional[str]]:
	"""Loads file from path/URL, base64 encodes it, and returns (base64_string, mime_type) or (None, None)."""
	try:
	file_bytes = None
	if urlparse(file_path_or_url).scheme in ['http', 'https']:
	logger.info(f"Downloading file from URL: {file_path_or_url}")
	response = requests.get(file_path_or_url, stream=True, timeout=20) # Increased timeout
	response.raise_for_status()
	file_bytes = response.content
	logger.info(f"Successfully downloaded {len(file_bytes)} bytes from URL.")
	# Try to get mime type from headers first
	mime_type = response.headers.get('content-type')
	else:
	logger.info(f"Reading file from local path: {file_path_or_url}")
	if not os.path.exists(file_path_or_url):
	raise FileNotFoundError(f"File not found at path: {file_path_or_url}")
	with open(file_path_or_url, "rb") as f:
	file_bytes = f.read()
	mime_type, _ = mimetypes.guess_type(file_path_or_url)

	if not file_bytes:
	raise ValueError("Failed to load file bytes.")

	# Use application/octet-stream as a generic default if type cannot be guessed
	mime_type = mime_type or 'application/octet-stream'
	base64_data = base64.b64encode(file_bytes).decode('utf-8')
	logger.info(f"Successfully encoded file to base64. Mime type: {mime_type}")
	logger.info(f"Successfully loaded and encoded file from {file_path_or_url[:50]}... Type: {mime_type}, Size: {len(base64_data)} chars base64")
	return base64_data, mime_type
	except FileNotFoundError:
	logger.error(f"File not found: {file_path_or_url}")
	return None, None
	except requests.exceptions.RequestException as e:
	logger.error(f"Failed to download file from URL {file_path_or_url}: {e}")
	return None, None
	except Exception as e:
	logger.error(f"Failed to load or encode file {file_path_or_url}: {e}")
	return None, None

	def _prepare_base64_data(self, file_path_or_url: str) -> Tuple[Optional[str], Optional[str]]:
	"""
	Loads file from path/URL, determines mime type, base64 encodes it.
	Returns (base64_string, mime_type) or (None, None) on error.
	(This is essentially the same logic as the original _load_and_encode_file)
	"""
	# Re-using the logic from _load_and_encode_file for now.
	# Consider consolidating later if _load_and_encode_file is removed.
	return self._load_and_encode_file(file_path_or_url)

	def _prepare_local_file_path(self, file_path_or_url: str) -> Optional[str]:
	"""
	Ensures a local file path exists for the given input.
	If input is a URL, downloads it to a temporary file.
	Returns the local path or None on error. Tracks temp files for cleanup.
	"""
	try:
	parsed_url = urlparse(file_path_or_url)
	if parsed_url.scheme in ['http', 'https']:
	logger.info(f"Downloading URL for multipart upload: {file_path_or_url}")
	response = requests.get(file_path_or_url, stream=True, timeout=30) # Increased timeout for downloads
	response.raise_for_status()

	# Create a temporary file
	# Suffix might help identify the file type, but not strictly necessary
	suffix = os.path.splitext(parsed_url.path)[1]
	temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix) # Keep file after close
	with temp_file:
	for chunk in response.iter_content(chunk_size=8192):
	temp_file.write(chunk)
	local_path = temp_file.name
	logger.info(f"Downloaded URL to temporary file: {local_path}")
	# Track temporary files for cleanup (needs instance variable)
	if not hasattr(self, '_temp_files'):
	self._temp_files = []
	self._temp_files.append(local_path)
	return local_path
	else:
	# It's already a local path, verify it exists
	if os.path.exists(file_path_or_url):
	logger.info(f"Using existing local file path for multipart: {file_path_or_url}")
	return file_path_or_url
	else:
	logger.error(f"Local file path not found: {file_path_or_url}")
	return None
	except requests.exceptions.RequestException as e:
	logger.error(f"Failed to download URL {file_path_or_url} for multipart: {e}")
	return None
	except Exception as e:
	logger.error(f"Error preparing local file path {file_path_or_url}: {e}")
	return None

	def _cleanup_temp_files(self):
	"""Removes any temporary files created during URL downloads."""
	if hasattr(self, '_temp_files'):
	for temp_path in self._temp_files:
	try:
	os.remove(temp_path)
	logger.info(f"Cleaned up temporary file: {temp_path}")
	except OSError as e:
	logger.warning(f"Failed to clean up temporary file {temp_path}: {e}")
	self._temp_files = [] # Reset the list


	@retry(
	stop=stop_after_attempt(3),
	wait=wait_exponential(multiplier=1, min=2, max=10),
	)
	def generate(
	self,
	test_case: TestCase,
	# Allow passing pre-loaded data, e.g., from the judge
	base64_data: Optional[str] = None,
	mime_type_override: Optional[str] = None
	) -> ModelResponse:
	"""Call the model API with the test case, potentially including file data based on endpoint configuration."""
	start_time = time.time()
	# Variables to hold prepared file data based on method
	base64_data_loaded = None
	mime_type_loaded = None
	local_file_path_for_multipart = None # Path to file (potentially temporary) for multipart upload

	logger.info(f"Inside generate for model '{self.endpoint.name}', test_id: {test_case.id}. File path/URL from test case: {test_case.image_path_or_url}")
	try:
	# Determine the required upload method early
	upload_method = self.endpoint.file_upload_method
	logger.info(f"Using file upload method: '{upload_method}' for endpoint '{self.endpoint.name}'")

	# Preprocess the input key using the global settings
	preprocessed_key = preprocess_text(test_case.key)

	# Format prompt using the preprocessed key
	prompt = ""
	try:
	# For judge prompts, the "key" is already the full prompt
	if test_case.id and test_case.id.startswith("judge"):
	prompt = preprocessed_key # Use directly, assume already preprocessed if needed
	else:
	# Use simple replacement first, escaping existing braces in the key
	safe_key = preprocessed_key.replace("{", "{{").replace("}", "}}")
	prompt = self.prompt_template.replace("{key}", safe_key)
	except Exception as e:
	logger.warning(f"Error formatting prompt template with replace: {str(e)}. Falling back.")
	try:
	prompt = self.prompt_template.format(key=preprocessed_key)
	except Exception as e2:
	logger.error(f"Error formatting prompt template: {str(e2)}. Using concatenation.")
	prompt = f"{self.prompt_template}\n\nINPUT: {preprocessed_key}"

	# --- File Handling Logic (Prepare based on selected method) ---
	if test_case.image_path_or_url:
	logger.info(f"Test case {test_case.id} includes file reference: {test_case.image_path_or_url}. Preparing based on method '{upload_method}'.")
	if upload_method == "JSON (Embedded Data)":
	# Use existing logic for now, will move to _prepare_base64_data helper later
	# Prioritize pre-loaded data if provided (e.g., by judge)
	if base64_data:
	logger.info(f"Using pre-loaded base64 data for JSON method (test case {test_case.id}). Mime type override: {mime_type_override}")
	base64_data_loaded = base64_data
	mime_type_loaded = mime_type_override or 'application/octet-stream' # Use override or default
	else:
	logger.info(f"Loading file for JSON method: {test_case.image_path_or_url}")
	base64_data_loaded, mime_type_loaded = self._load_and_encode_file(test_case.image_path_or_url)
	logger.info(f"Result from _load_and_encode_file - Has data: {bool(base64_data_loaded)}, Mime type: {mime_type_loaded}")
	if base64_data_loaded is None:
	# Handle file loading failure
	logger.error(f"Failed to load file for JSON method (test case {test_case.id}). Returning error.")
	return ModelResponse(test_id=test_case.id or "unknown", model_name=self.endpoint.name, output=f"Error: Failed to load file {test_case.image_path_or_url} for JSON method", latency=time.time() - start_time)
	elif upload_method == "Multipart Form Data":
	# Call the actual helper function
	local_file_path_for_multipart = self._prepare_local_file_path(test_case.image_path_or_url)
	if local_file_path_for_multipart is None:
	logger.error(f"Failed to prepare local file for Multipart method (test case {test_case.id}). Returning error.")
	return ModelResponse(test_id=test_case.id or "unknown", model_name=self.endpoint.name, output=f"Error: Failed to prepare file {test_case.image_path_or_url} for Multipart method", latency=time.time() - start_time)
	else:
	logger.error(f"Unknown file_upload_method configured: {upload_method}")
	return ModelResponse(test_id=test_case.id or "unknown", model_name=self.endpoint.name, output=f"Error: Invalid file_upload_method '{upload_method}'", latency=time.time() - start_time)
	# else: No file involved in this test case, proceed with text-only call

	# --- API Call Routing ---
	response_text = ""
	try:
	if upload_method == "JSON (Embedded Data)":
	logger.info("Routing to _call_json_api.")
	# Call the new JSON API wrapper function
	response_text = self._call_json_api(prompt, base64_data_loaded, mime_type_loaded)

	elif upload_method == "Multipart Form Data":
	logger.info("Routing to _call_multipart_api.")
	# Call the new Multipart API function
	response_text = self._call_multipart_api(prompt, local_file_path_for_multipart)

	else:
	# Should have been caught during file prep, but defensive check
	logger.error(f"Invalid file_upload_method '{upload_method}' reached API call routing.")
	response_text = f"Error: Invalid file upload method configuration '{upload_method}'."

	# Exception handling remains the same, but applies to the new call structure
	except requests.exceptions.RequestException as req_err:
	logger.error(f"API request failed for {self.endpoint.name}: {req_err}")
	if hasattr(req_err, 'response') and req_err.response is not None:
	logger.error(f"Response status: {req_err.response.status_code}, Response text: {req_err.response.text[:500]}")
	response_text = f"Error: API request failed. Details: {str(req_err)}"
	except (KeyError, IndexError, TypeError, json.JSONDecodeError, ValueError) as parse_err:
	logger.error(f"Failed to parse response or invalid response structure from {self.endpoint.name}: {parse_err}")
	response_text = f"Error: Failed to parse API response. Details: {str(parse_err)}"
	except Exception as e:
	logger.error(f"Unexpected error calling API for {self.endpoint.name}: {str(e)}", exc_info=True)
	response_text = f"Error: An unexpected error occurred. Details: {str(e)}"

	end_time = time.time()
	# Clean up temporary files regardless of success or failure
	self._cleanup_temp_files()

	return ModelResponse(
	test_id=test_case.id or "unknown", # Ensure test_id is never None
	model_name=self.endpoint.name,
	output=str(response_text), # Ensure output is always string
	latency=end_time - start_time,
	)
	except Exception as e:
	logger.error(f"Unexpected error in generate method for {self.endpoint.name}: {str(e)}", exc_info=True)
	# Re-raise to trigger tenacity retry
	raise
	finally:
	# Ensure cleanup happens even if retry fails or other unexpected errors occur
	self._cleanup_temp_files()

	def _prepare_headers(self, is_json_request=True):
	"""Prepares common headers. Adjusts Content-Type based on request type."""
	# Start with common headers
	headers = {}
	# Only add Authorization header if api_key is present and not empty
	if self.endpoint.api_key and self.endpoint.api_key.strip():
	# Check for specific API key types if needed (e.g., Anthropic uses x-api-key)
	if "anthropic" in self.endpoint.api_url.lower():
	headers["x-api-key"] = self.endpoint.api_key
	headers["anthropic-version"] = "2023-06-01" # Required header
	elif "generativelanguage.googleapis.com" in self.endpoint.api_url.lower():
	# Gemini API key is usually in the URL, not header
	pass
	else:
	# Default to Bearer token for OpenAI compatible and others
	headers["Authorization"] = f"Bearer {self.endpoint.api_key}"

	# Set Content-Type based on whether it's a JSON request or not (e.g., multipart)
	if is_json_request:
	headers["Content-Type"] = "application/json"
	# For multipart/form-data, requests library handles Content-Type automatically when 'files' param is used.

	# Add OpenRouter specific headers if applicable
	# Only add Authorization header if api_key is present and not empty
	if self.endpoint.api_key and self.endpoint.api_key.strip():
	headers["Authorization"] = f"Bearer {self.endpoint.api_key}"

	# Add OpenRouter specific headers if applicable
	if self.endpoint.api_url and "openrouter.ai" in self.endpoint.api_url.lower():
	# These might be optional now, but good practice
	headers["HTTP-Referer"] = "http://localhost" # Can be anything, localhost is common
	headers["X-Title"] = "Model A/B Testing Tool"
	return headers

	# --- New API Call Functions ---

	def _call_json_api(self, prompt: str, base64_data: Optional[str], mime_type: Optional[str]) -> str:
	"""
	Wrapper for JSON-based API calls. Detects API type and calls the appropriate formatter.
	"""
	logger.info(f"Executing JSON API call for endpoint: {self.endpoint.name}")
	headers = self._prepare_headers(is_json_request=True) # Ensure JSON content type

	# Determine API type based on URL (similar to previous logic)
	api_url_lower = self.endpoint.api_url.lower() if self.endpoint.api_url else ""
	is_openai_compatible = "/v1/chat/completions" in api_url_lower or \
	"openai" in api_url_lower or \
	"openrouter.ai" in api_url_lower or \
	"mistral" in api_url_lower or \
	"together.ai" in api_url_lower or \
	"groq.com" in api_url_lower or \
	"fireworks.ai" in api_url_lower or \
	"deepinfra.com" in api_url_lower or \
	"lmstudio.ai" in api_url_lower or \
	":1234/v1" in api_url_lower
	is_anthropic_compatible = "/v1/messages" in api_url_lower or "anthropic" in api_url_lower
	is_gemini = "generativelanguage.googleapis.com" in api_url_lower
	is_ollama = ("/api/generate" in api_url_lower and \
	("localhost:11434" in api_url_lower or "127.0.0.1:11434" in api_url_lower)) or \
	("ollama" in api_url_lower and "/api/generate" in api_url_lower)

	payload = {}
	api_url_to_call = self.endpoint.api_url # Default URL

	# Call appropriate formatting function
	if is_openai_compatible:
	payload = self._format_openai_json(prompt, base64_data, mime_type)
	elif is_anthropic_compatible:
	payload = self._format_anthropic_json(prompt, base64_data, mime_type)
	elif is_gemini:
	payload = self._format_gemini_json(prompt, base64_data, mime_type)
	# Gemini API key goes in URL parameter
	if self.endpoint.api_key:
	api_url_to_call = f"{self.endpoint.api_url}?key={self.endpoint.api_key}"
	else:
	raise ValueError("Gemini API key is required but not provided.")
	elif is_ollama:
	payload = self._format_ollama_json(prompt, base64_data)
	else:
	# Fallback: Use generic API call logic (which currently assumes OpenAI text-only)
	logger.warning(f"Could not determine specific JSON API type for {self.endpoint.api_url}. Using generic fallback.")
	# The generic call handles its own request formatting and execution
	return self._call_generic_api(prompt) # Return directly

	# Make the actual request
	try:
	payload_size_kb = len(json.dumps(payload)) / 1024
	logger.info(f"Sending JSON request to {api_url_to_call}. Payload size: {payload_size_kb:.2f} KB")
	if payload_size_kb > 4000: logger.warning(f"Payload size ({payload_size_kb:.2f} KB) is large.")

	response = requests.post(api_url_to_call, headers=headers, json=payload, timeout=180)
	response.raise_for_status()
	result = response.json()

	# Parse response based on API type (This parsing logic should ideally move too)
	# TODO: Move response parsing into dedicated functions or handle within formatters?
	if is_openai_compatible:
	if not result.get("choices") or not result["choices"][0].get("message"): raise ValueError("Invalid OpenAI response format")
	content = result["choices"][0]["message"].get("content")
	return content if content is not None else f"Error: Response content was null (Finish Reason: {result['choices'][0].get('finish_reason')})"
	elif is_anthropic_compatible:
	if not result.get("content"): raise ValueError("Invalid Anthropic response format")
	text_content = next((block.get("text", "") for block in result.get("content", []) if block.get("type") == "text"), "")
	return text_content if text_content else "[No text content found in response]"
	elif is_gemini:
	if not result.get("candidates"): raise ValueError("Invalid Gemini response format")
	candidate = result["candidates"][0]
	if not candidate.get("content") or not candidate["content"].get("parts"): raise ValueError("Invalid Gemini response format")
	text_response = "".join(part["text"] for part in candidate["content"]["parts"] if "text" in part)
	return text_response if text_response else "[No text content found in response]"
	elif is_ollama:
	if "response" in result: return result["response"]
	elif "error" in result: raise ValueError(f"Ollama API Error: {result['error']}")
	else: raise ValueError("Invalid Ollama response format")
	else:
	# Should not be reached if generic fallback worked
	raise ValueError("Unhandled API type in JSON response parsing.")

	except requests.exceptions.RequestException as e:
	logger.error(f"JSON API request failed: {str(e)}")
	if hasattr(e, 'response') and e.response is not None: logger.error(f"Response content: {e.response.text[:500]}")
	raise # Re-raise to be caught by the main generate method's handler
	except (KeyError, IndexError, ValueError, json.JSONDecodeError) as e:
	logger.error(f"Failed to parse JSON API response: {str(e)}")
	logger.error(f"Full response: {result if 'result' in locals() else 'Response not available'}")
	raise # Re-raise

	def _call_multipart_api(self, prompt: str, local_file_path: Optional[str]) -> str:
	"""
	Handles API calls using multipart/form-data.
	(Placeholder - Needs implementation based on target API, e.g., Whisper)
	"""
	logger.info(f"Executing Multipart API call for endpoint: {self.endpoint.name}")
	if not local_file_path:
	return "Error: No local file path provided for multipart upload."

	# Prepare headers (Requests handles Content-Type for multipart)
	headers = self._prepare_headers(is_json_request=False)

	# Prepare data and files dictionary - THIS IS HIGHLY API-SPECIFIC
	# Example for OpenAI Whisper:
	data = {'model': self.endpoint.model_id}
	if prompt: # Whisper uses 'prompt' for context/hints
	data['prompt'] = prompt
	# Add other potential fields like 'language', 'response_format' based on API

	files = {}
	try:
	# Use a context manager to ensure the file is closed
	with open(local_file_path, 'rb') as f:
	files['file'] = (os.path.basename(local_file_path), f)
	logger.info(f"Preparing multipart request with file: {os.path.basename(local_file_path)}")

	# Make the request
	response = requests.post(
	self.endpoint.api_url,
	headers=headers,
	data=data,
	files=files,
	timeout=180 # Timeout for upload + processing
	)
	response.raise_for_status()
	result = response.json()

	# Parse the response - AGAIN, API-SPECIFIC
	# Example for Whisper:
	if 'text' in result:
	return result['text']
	else:
	raise ValueError(f"Unexpected response format from multipart API: {result}")

	except requests.exceptions.RequestException as e:
	logger.error(f"Multipart API request failed: {str(e)}")
	if hasattr(e, 'response') and e.response is not None: logger.error(f"Response content: {e.response.text[:500]}")
	raise
	except (KeyError, ValueError, json.JSONDecodeError) as e:
	logger.error(f"Failed to parse Multipart API response: {str(e)}")
	logger.error(f"Full response: {result if 'result' in locals() else 'Response not available'}")
	raise
	except FileNotFoundError:
	logger.error(f"File not found for multipart upload: {local_file_path}")
	return f"Error: File not found at {local_file_path}"
	except Exception as e:
	logger.error(f"Unexpected error during multipart call: {e}", exc_info=True)
	raise

	# --- JSON Formatting Functions (Placeholders) ---

	def _format_openai_json(self, prompt: str, base64_data: Optional[str], mime_type: Optional[str]) -> Dict[str, Any]:
	"""Formats the payload for OpenAI-compatible chat completion APIs."""
	logger.debug("Formatting payload for OpenAI JSON")
	messages = []
	if base64_data:
	mime_type = mime_type or 'image/jpeg' # Default mime type
	messages.append({
	"role": "user",
	"content": [
	{"type": "text", "text": prompt},
	{"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{base64_data}"}}
	]
	})
	else:
	messages.append({"role": "user", "content": prompt})
	return {
	"model": self.endpoint.model_id,
	"messages": messages,
	"max_tokens": self.endpoint.max_tokens,
	"temperature": self.endpoint.temperature,
	}

	def _format_anthropic_json(self, prompt: str, base64_data: Optional[str], mime_type: Optional[str]) -> Dict[str, Any]:
	"""Formats the payload for Anthropic messages API."""
	logger.debug("Formatting payload for Anthropic JSON")
	content = [{"type": "text", "text": prompt}]
	if base64_data:
	mime_type = mime_type or 'image/jpeg'
	supported_mime_types = ['image/jpeg', 'image/png', 'image/gif', 'image/webp']
	if mime_type not in supported_mime_types:
	logger.warning(f"MIME type '{mime_type}' may not be directly supported by Claude.")
	content.append({
	"type": "image",
	"source": {"type": "base64", "media_type": mime_type, "data": base64_data}
	})
	return {
	"model": self.endpoint.model_id,
	"messages": [{"role": "user", "content": content}],
	"max_tokens": self.endpoint.max_tokens,
	"temperature": self.endpoint.temperature,
	}

	def _format_gemini_json(self, prompt: str, base64_data: Optional[str], mime_type: Optional[str]) -> Dict[str, Any]:
	"""Formats the payload for Google Gemini API."""
	logger.debug("Formatting payload for Gemini JSON")
	parts = [{"text": prompt}]
	if base64_data:
	mime_type = mime_type or 'application/octet-stream' # Gemini supports various types
	parts.append({"inline_data": {"mime_type": mime_type, "data": base64_data}})
	return {
	"contents": [{"parts": parts}],
	"generationConfig": {
	"temperature": self.endpoint.temperature,
	"maxOutputTokens": self.endpoint.max_tokens,
	}
	}

	def _format_ollama_json(self, prompt: str, base64_data: Optional[str]) -> Dict[str, Any]:
	"""Formats the payload for Ollama generate API."""
	logger.debug("Formatting payload for Ollama JSON")
	data = {
	"model": self.endpoint.model_id,
	"prompt": prompt,
	"stream": False,
	}
	if base64_data:
	data["images"] = [base64_data] # Ollama expects a list
	return data


	class LMJudge:
	"""Uses a language model to judge between champion and challenger outputs."""

	DEFAULT_EVALUATION_PROMPT = """
	# Model Response Evaluation

	You are evaluating two AI model responses based on the input query, potentially an accompanying image, and potentially a reference value.

	## Input Query

	{key}

	{image_context_section}
	{reference_section}

	## Model A (Champion: {champion_name}) Response

	{champion_output}

	## Model B (Challenger: {challenger_name}) Response

	{challenger_output}

	## Evaluation Instructions
	Compare Model A and Model B based on the Input Query{reference_value_instruction}. Consider:
	1. Relevance and accuracy in addressing the Input Query.
	{reference_value_criteria}
	{clarity_criteria_number}. Clarity, conciseness, and quality of the response.
	{overall_criteria_number}. Overall usefulness.

	## Required Response Format
	You MUST start your response with a clear verdict and confidence rating:

	VERDICT: [Choose ONE: MODEL_A_WINS, MODEL_B_WINS, or TIE]
	CONFIDENCE: [Number]/5 (where 1=low confidence, 5=high confidence)

	Then provide a detailed explanation of your reasoning. Be explicit about which model performed better and why, or why they were tied. Include specific examples from each response that influenced your decision.

	Example format:
	VERDICT: MODEL_A_WINS
	CONFIDENCE: 4/5

	[Your detailed reasoning here...]
	"""

	def __init__(
	self,
	endpoint: ModelEndpoint,
	evaluation_prompt_template: str = DEFAULT_EVALUATION_PROMPT,
	):
	self.endpoint = endpoint
	self.evaluation_prompt_template = evaluation_prompt_template
	# The judge runner uses a simple placeholder template, as the full prompt
	# is formatted within the evaluate method before being passed as the 'key'.
	# Judge model runner needs access to the file loading method.
	self.model_runner = ModelRunner(endpoint, "{key}") # Pass-through template for prompt

	def evaluate(
	self,
	test_case: TestCase,
	champion_response: ModelResponse,
	challenger_response: ModelResponse
	) -> EvaluationResult:
	"""Evaluate champion vs challenger outputs using a dynamically built prompt."""
	# Preprocess all inputs to ensure they're clean strings
	# Use the same preprocess_text function for consistency
	# Note: We don't pass the image to the judge, only the text inputs/outputs.
	preprocessed_key = preprocess_text(test_case.key)
	preprocessed_value = preprocess_text(test_case.value) # Preprocess reference value too
	preprocessed_champion = preprocess_text(champion_response.output)
	preprocessed_challenger = preprocess_text(challenger_response.output)

	# Prepare context for the evaluation prompt template
	has_reference = bool(preprocessed_value)
	reference_section_text = f"\n## Reference Value\n\n{preprocessed_value}\n" if has_reference else "\n## Reference Value\nN/A"
	reference_value_instruction_text = ' and Reference Value' if has_reference else ''
	reference_value_criteria_text = '2. Factual correctness compared to the Reference Value (if provided).' if has_reference else ''
	clarity_criteria_number_text = '3' if has_reference else '2'
	overall_criteria_number_text = '4' if has_reference else '3'

	# Add image context section if an image was provided in the original test case
	has_image = bool(test_case.image_path_or_url)
	image_context_section_text = "\n## Input Image\nAn image was provided with the input query. Consider it as context when evaluating the responses.\n" if has_image else ""

	# Format the evaluation prompt using the template and context
	try:
	evaluation_prompt = self.evaluation_prompt_template.format(
	key=preprocessed_key,
	image_context_section=image_context_section_text, # Added image context
	reference_section=reference_section_text,
	champion_name=champion_response.model_name,
	champion_output=preprocessed_champion,
	challenger_name=challenger_response.model_name,
	challenger_output=preprocessed_challenger,
	reference_value_instruction=reference_value_instruction_text,
	reference_value_criteria=reference_value_criteria_text,
	clarity_criteria_number=clarity_criteria_number_text,
	overall_criteria_number=overall_criteria_number_text
	)
	except KeyError as e:
	logger.error(f"Missing key in judge prompt template: {e}. Using default prompt structure.")
	# Fallback to a basic structure if formatting fails
	evaluation_prompt = f"Evaluate Model A vs Model B.\nInput: {preprocessed_key}\nRef: {preprocessed_value}\nA: {preprocessed_champion}\nB: {preprocessed_challenger}\nFormat: VERDICT: [MODEL_A_WINS/MODEL_B_WINS/TIE]\nCONFIDENCE: [1-5]/5\nReasoning: ..."
	except Exception as e:
	logger.error(f"Error formatting judge prompt template: {e}. Using basic prompt.")
	evaluation_prompt = f"Evaluate Model A vs Model B.\nInput: {preprocessed_key}\nRef: {preprocessed_value}\nA: {preprocessed_champion}\nB: {preprocessed_challenger}\nFormat: VERDICT: [MODEL_A_WINS/MODEL_B_WINS/TIE]\nCONFIDENCE: [1-5]/5\nReasoning: ..."

	# Log the prompt for debugging (truncated)
	logger.info(f"Using Judge evaluation prompt (truncated): {evaluation_prompt[:500]}...")

	# Create a TestCase specifically for the judge call.
	# Crucially, pass the original image_path_or_url from the test_case.
	# The judge's model_runner.generate method will handle loading the file
	# based on the judge's endpoint.file_upload_method configuration.
	judge_test_case = TestCase(
	key=evaluation_prompt,
	value="", # No value needed for judge call itself
	image_path_or_url=test_case.image_path_or_url, # Pass original file reference
	id=f"judge-{test_case.id or 'unknown'}"
	)

	# Call the judge's generate method. It will handle file loading internally.
	# We no longer need to pass base64_data or mime_type_override here.
	judge_response_obj = self.model_runner.generate(
	test_case=judge_test_case
	)

	# Log the response for debugging (truncated)
	logger.info(f"Judge raw response (truncated): {judge_response_obj.output[:500]}...")

	# Parse the judge's decision from the raw output string
	parsed_result = self.parse_judge_response(judge_response_obj.output)

	return EvaluationResult(
	test_id=test_case.id or "unknown",
	champion_output=champion_response.output, # Store original, not preprocessed
	challenger_output=challenger_response.output, # Store original, not preprocessed
	winner=parsed_result["winner"],
	confidence=parsed_result["confidence"],
	reasoning=judge_response_obj.output, # Store the full raw response as reasoning
	)

	def parse_judge_response(self, response_text: str) -> Dict[str, Any]:
	"""
	Parse the judge's raw response string to extract verdict and confidence.
	Uses more flexible regex patterns to handle various response formats.
	"""
	verdict = "UNDETERMINED"
	confidence = 0.0

	# Log the first part of the response for debugging
	logger.debug(f"Parsing judge response (first 100 chars): {response_text[:100]}")

	# 1. Extract VERDICT (Case-insensitive search for the explicit line)
	verdict_match = re.search(r"^\sVERDICT:\s(MODEL_A_WINS\|MODEL_B_WINS\|TIE)\s*$", response_text, re.IGNORECASE \| re.MULTILINE)
	if verdict_match:
	verdict = verdict_match.group(1).upper()
	logger.info(f"Parsed VERDICT line: {verdict}")
	else:
	# Fallback: Look for bracketed verdicts (common LLM-as-judge pattern)
	bracket_match = re.search(r"\[\[\s(MODEL_A_WINS\|MODEL_B_WINS\|TIE)\s\]\]", response_text, re.IGNORECASE)
	if bracket_match:
	verdict = bracket_match.group(1).upper()
	logger.info(f"Parsed bracketed verdict: {verdict}")
	else:
	# Fallback: Look for simpler A/B/TIE in brackets
	simple_bracket_match = re.search(r"\[\[\s([AB]\|TIE)\s\]\]", response_text, re.IGNORECASE)
	if simple_bracket_match:
	verdict_text = simple_bracket_match.group(1).upper()
	if verdict_text == "A": verdict = "MODEL_A_WINS"
	elif verdict_text == "B": verdict = "MODEL_B_WINS"
	else: verdict = "TIE"
	logger.info(f"Parsed simple bracketed verdict: {verdict}")


	# 2. Extract CONFIDENCE (Case-insensitive search for the explicit line)
	confidence_match = re.search(r"^\sCONFIDENCE:\s(\d(?:\.\d)?)\s/\s5\s*$", response_text, re.IGNORECASE \| re.MULTILINE)
	if confidence_match:
	try:
	confidence_score = float(confidence_match.group(1))
	# Clamp confidence between 1 and 5, then normalize to 0.2-1.0 range
	confidence = max(0.2, min(1.0, confidence_score / 5.0))
	logger.info(f"Parsed CONFIDENCE line: {confidence_score}/5 -> {confidence}")
	except ValueError:
	logger.warning(f"Could not parse CONFIDENCE value: {confidence_match.group(1)}")
	else:
	# Fallback: Look for rating/score patterns if confidence line missing
	score_match = re.search(r"(?:rating\|score)[:\s](\d(?:\.\d)?)\s/\s*(\d+)", response_text, re.IGNORECASE)
	if score_match:
	try:
	score = float(score_match.group(1))
	scale = float(score_match.group(2))
	if scale > 0:
	# Normalize to 0-1 range, clamping between 0.2 and 1.0
	confidence = max(0.2, min(1.0, score / scale))
	logger.info(f"Parsed score/rating: {score}/{scale} -> {confidence}")
	except ValueError:
	pass # Ignore if parsing fails

	# 3. Final checks and fallbacks if parsing failed
	if verdict == "UNDETERMINED":
	logger.warning(f"Could not reliably parse VERDICT from judge response: {response_text[:200]}...")
	# Simple keyword check as a last resort (less reliable)
	if "model a wins" in response_text.lower() and "model b wins" not in response_text.lower():
	verdict = "MODEL_A_WINS"
	elif "model b wins" in response_text.lower() and "model a wins" not in response_text.lower():
	verdict = "MODEL_B_WINS"
	elif "tie" in response_text.lower() or "comparable" in response_text.lower():
	verdict = "TIE"

	# If we have a verdict but no confidence, assign a default moderate confidence
	if verdict != "UNDETERMINED" and confidence == 0.0:
	confidence = 0.6 # Default confidence when parsing fails but verdict is found
	logger.info(f"Could not parse CONFIDENCE, assigning default {confidence} for verdict {verdict}")

	# Log the final parsed values
	logger.info(f"Final parsed judge result - Winner: {verdict}, Confidence: {confidence:.2f}")

	return {
	"winner": verdict,
	"confidence": confidence,
	# Reasoning is the full response text, handled in evaluate method
	}

	class ResultAggregator:
	"""Collects evaluation results and calculates summary statistics."""

	def aggregate(self, evaluation_results: List[EvaluationResult]) -> Dict[str, Any]:
	"""Aggregates results, calculating counts and percentages."""
	total_evaluations = len(evaluation_results)
	verdict_counts = {"MODEL_A_WINS": 0, "MODEL_B_WINS": 0, "TIE": 0, "UNDETERMINED": 0, "JUDGE_ERROR": 0}
	confidence_sum = 0
	valid_verdicts = 0

	# Track which test cases had undetermined verdicts for logging
	undetermined_cases = []
	judge_error_cases = []

	for result in evaluation_results:
	verdict = result.winner # Use the pre-parsed winner
	if verdict in verdict_counts:
	verdict_counts[verdict] += 1
	if verdict != "UNDETERMINED" and verdict != "JUDGE_ERROR":
	confidence_sum += result.confidence
	valid_verdicts += 1
	elif verdict == "UNDETERMINED":
	undetermined_cases.append(result.test_id)
	elif verdict == "JUDGE_ERROR":
	judge_error_cases.append(result.test_id)
	else:
	# Should not happen if parsing is robust, but handle defensively
	logger.warning(f"Unexpected verdict '{verdict}' encountered for test_id {result.test_id}. Counting as UNDETERMINED.")
	verdict_counts["UNDETERMINED"] += 1
	undetermined_cases.append(result.test_id)


	# Log summary of problematic cases
	if undetermined_cases:
	logger.warning(f"Found {len(undetermined_cases)} undetermined verdicts: {undetermined_cases[:5]}" +
	(f"... and {len(undetermined_cases)-5} more" if len(undetermined_cases) > 5 else ""))
	if judge_error_cases:
	logger.warning(f"Found {len(judge_error_cases)} judge errors: {judge_error_cases[:5]}" +
	(f"... and {len(judge_error_cases)-5} more" if len(judge_error_cases) > 5 else ""))

	# Calculate percentages based on determined verdicts only (excluding UNDETERMINED and JUDGE_ERROR)
	determined_verdicts = total_evaluations - verdict_counts["UNDETERMINED"] - verdict_counts["JUDGE_ERROR"]
	verdict_percentages = {}
	if determined_verdicts > 0:
	verdict_percentages["MODEL_A_WINS"] = round(
	(verdict_counts["MODEL_A_WINS"] / determined_verdicts) * 100, 2
	)
	verdict_percentages["MODEL_B_WINS"] = round(
	(verdict_counts["MODEL_B_WINS"] / determined_verdicts) * 100, 2
	)
	verdict_percentages["TIE"] = round(
	(verdict_counts["TIE"] / determined_verdicts) * 100, 2
	)
	else:
	verdict_percentages = {"MODEL_A_WINS": 0, "MODEL_B_WINS": 0, "TIE": 0}

	average_confidence = (confidence_sum / valid_verdicts) if valid_verdicts > 0 else 0

	# Convert EvaluationResult objects to dictionaries for JSON serialization
	raw_eval_dicts = []
	for res in evaluation_results:
	try:
	# Assuming EvaluationResult is a dataclass or has a simple structure
	raw_eval_dicts.append({
	"test_id": res.test_id,
	"winner": res.winner,
	"confidence": res.confidence,
	"champion_output": res.champion_output,
	"challenger_output": res.challenger_output,
	"reasoning": res.reasoning,
	})
	except AttributeError as e:
	logger.error(f"Error converting EvaluationResult to dict for test_id {res.test_id}: {e}")
	# Add a placeholder or skip
	raw_eval_dicts.append({"test_id": getattr(res, 'test_id', 'unknown'), "error": "Failed to serialize result"})


	return {
	"total_evaluations": total_evaluations,
	"verdict_counts": verdict_counts,
	"verdict_percentages": verdict_percentages, # Based on determined verdicts
	"average_confidence": round(average_confidence, 3), # Avg confidence for non-undetermined/error
	"raw_evaluations": raw_eval_dicts # Keep raw for output (as dicts)
	}

	class ModelTester:
	"""Main class that orchestrates the A/B testing process."""

	def __init__(
	self,
	champion_endpoint: ModelEndpoint,
	challenger_endpoint: ModelEndpoint,
	judge_endpoint: ModelEndpoint,
	model_prompt_template: str,
	judge_prompt_template: str = LMJudge.DEFAULT_EVALUATION_PROMPT # Add judge template param
	):
	self.champion_runner = ModelRunner(champion_endpoint, model_prompt_template)
	self.challenger_runner = ModelRunner(challenger_endpoint, model_prompt_template)
	# Pass the judge prompt template to the LMJudge constructor
	self.judge = LMJudge(judge_endpoint, evaluation_prompt_template=judge_prompt_template)
	self.aggregator = ResultAggregator() # Aggregator just collects/counts
	self.champion_endpoint = champion_endpoint
	self.challenger_endpoint = challenger_endpoint
	self.judge_endpoint = judge_endpoint

	def run_test(
	self,
	test_cases: List[TestCase],
	batch_size: int = 5,
	progress=None,
	batch_retry_attempts: int = 0, # Number of retry attempts for batches
	batch_backoff_factor: float = 2.0, # Exponential backoff factor
	batch_max_wait: int = 60, # Maximum wait time between retries in seconds
	batch_retry_trigger_strings: Optional[List[str]] = None # Strings that trigger a retry
	) -> Dict[str, Any]:
	"""
	Run the complete test process: generate responses, evaluate, aggregate.

	Includes batch retry mechanism for transient errors or problematic responses.
	Args:
	test_cases: List of test cases (potentially including image paths/URLs)
	batch_size: Number of test cases per batch
	progress: Gradio progress callback
	batch_retry_attempts: Max retries per batch
	batch_backoff_factor: Exponential backoff factor
	batch_max_wait: Max wait time between retries
	batch_retry_trigger_strings: List of strings triggering retry if found in outputs/reasoning
	"""
	all_evaluation_results: List[EvaluationResult] = []
	champion_metrics = {"total_latency": 0.0, "total_output_chars": 0, "success_count": 0, "error_count": 0, "image_load_errors": 0}
	challenger_metrics = {"total_latency": 0.0, "total_output_chars": 0, "success_count": 0, "error_count": 0, "image_load_errors": 0}
	judge_metrics = {"total_latency": 0.0, "total_output_chars": 0, "success_count": 0, "error_count": 0}

	num_cases = len(test_cases)
	if num_cases == 0:
	logger.warning("No test cases provided to run_test.")
	return {"evaluations": [], "summary": {"error": "No test cases loaded."}}

	total_batches = (num_cases + batch_size - 1) // batch_size
	processed_case_count = 0 # Track actual processed cases for progress
	global STOP_REQUESTED # Access the global flag

	previous_update_payload = None # Store the last yielded update

	# Process in batches
	for i in range(0, num_cases, batch_size):
	if STOP_REQUESTED:
	logger.warning(f"Stop requested. Finishing early after processing {processed_case_count} cases.")
	if progress:
	progress(processed_case_count / num_cases, f"Stopping early after {processed_case_count} cases...")
	break # Exit the batch loop

	current_batch = test_cases[i:min(i + batch_size, num_cases)]
	batch_num = i // batch_size + 1
	logger.info(f"--- Processing Batch {batch_num}/{total_batches} (Cases {i+1}-{min(i+batch_size, num_cases)}) ---")

	# Initialize retry counter and success flag for this batch
	retry_count = 0
	batch_success = False
	batch_eval_results: List[EvaluationResult] = [] # Store results for this successful batch attempt

	# Process this batch with retries if configured
	while not batch_success and retry_count <= batch_retry_attempts:
	if retry_count > 0:
	# Calculate backoff delay with exponential increase, capped at max_wait
	delay = min(batch_backoff_factor ** (retry_count - 1), batch_max_wait)
	logger.info(f"Retrying batch {batch_num} (attempt {retry_count}/{batch_retry_attempts}) after {delay:.2f}s delay")
	if progress is not None:
	# Update progress based on already processed cases, not 'i'
	progress(processed_case_count / num_cases, f"Retrying Batch {batch_num} ({retry_count}/{batch_retry_attempts})")
	time.sleep(delay)
	else:
	if progress is not None:
	progress(processed_case_count / num_cases, f"Running Batch {batch_num}/{total_batches}")

	# Reset batch-specific stores for this attempt
	current_attempt_champ_responses: Dict[str, ModelResponse] = {}
	current_attempt_chall_responses: Dict[str, ModelResponse] = {}
	current_attempt_eval_results: List[EvaluationResult] = []
	has_trigger_string_in_attempt = False

	# 1. Get responses from Champion and Challenger models for the current batch attempt
	for batch_idx, test_case in enumerate(current_batch):
	# Check if stop requested before processing this case
	if STOP_REQUESTED:
	logger.warning(f"Stop requested. Skipping remaining cases in batch {batch_num}.")
	break # Exit the inner loop for this batch

	# Generate a consistent ID if not present, using overall index 'i' + batch_idx
	case_id = test_case.id or f"case-{i + batch_idx + 1}"
	test_case.id = case_id # Ensure the test case object has the ID

	# --- Yield previous results before starting current case ---
	if previous_update_payload:
	# Check stop flag before yielding previous update
	if STOP_REQUESTED:
	logger.info("Stop requested before yielding previous update. Breaking batch.")
	break # Exit inner loop for this batch
	yield previous_update_payload # Indented correctly inside the if

	# --- Champion ---
	try:
	champ_resp = self.champion_runner.generate(test_case)
	current_attempt_champ_responses[case_id] = champ_resp
	# Only count metrics if not an image loading error generated by our code
	if not champ_resp.output.startswith("Error: Failed to load image"):
	champion_metrics["total_latency"] += champ_resp.latency
	champion_metrics["total_output_chars"] += len(champ_resp.output)
	if not champ_resp.output.startswith("Error:"): champion_metrics["success_count"] += 1
	else: champion_metrics["error_count"] += 1
	else:
	champion_metrics["image_load_errors"] += 1
	champion_metrics["error_count"] += 1 # Count as an error

	except Exception as e:
	logger.error(f"Critical error generating champion response for case {case_id}: {e}", exc_info=True)
	current_attempt_champ_responses[case_id] = ModelResponse(case_id, self.champion_endpoint.name, f"Error: Generation failed critically - {e}", 0)
	champion_metrics["error_count"] += 1

	# --- Challenger ---
	try:
	chall_resp = self.challenger_runner.generate(test_case)
	current_attempt_chall_responses[case_id] = chall_resp
	if not chall_resp.output.startswith("Error: Failed to load image"):
	challenger_metrics["total_latency"] += chall_resp.latency
	challenger_metrics["total_output_chars"] += len(chall_resp.output)
	if not chall_resp.output.startswith("Error:"): challenger_metrics["success_count"] += 1
	else: challenger_metrics["error_count"] += 1
	else:
	challenger_metrics["image_load_errors"] += 1
	challenger_metrics["error_count"] += 1
	except Exception as e:
	logger.error(f"Critical error generating challenger response for case {case_id}: {e}", exc_info=True)
	current_attempt_chall_responses[case_id] = ModelResponse(case_id, self.challenger_endpoint.name, f"Error: Generation failed critically - {e}", 0)
	challenger_metrics["error_count"] += 1

	# --- Yield intermediate update ---
	# Check if the test case actually had an image associated
	image_to_display = test_case.image_path_or_url if test_case.image_path_or_url else None # Use None if no image
	# Calculate combined latency (handle potential errors where response might be missing)
	champ_resp = current_attempt_champ_responses.get(case_id)
	chall_resp = current_attempt_chall_responses.get(case_id)
	champ_lat = champ_resp.latency if champ_resp else 0
	chall_lat = chall_resp.latency if chall_resp else 0
	combined_latency = round(champ_lat + chall_lat, 3)

	# Removed the intermediate yield from here. It will be moved inside the evaluation loop below.
	# 2. Evaluate with LM Judge for the current batch attempt
	if progress is not None:
	progress((processed_case_count + len(current_batch) * 0.5) / num_cases, f"Evaluating Batch {batch_num}")

	for test_case in current_batch:
	case_id = test_case.id # Should have been set above
	champ_response = current_attempt_champ_responses.get(case_id)
	chall_response = current_attempt_chall_responses.get(case_id)

	# Skip evaluation if either model failed critically or had image load error
	if not champ_response or not chall_response or \
	champ_response.output.startswith("Error:") or \
	chall_response.output.startswith("Error:"):
	logger.warning(f"Skipping evaluation for case {case_id} due to generation error in one or both models.")
	# Create a dummy eval result indicating skip? Or just don't add? Let's not add.
	# We need a placeholder if retry depends on judge output, otherwise skip.
	# For simplicity now, we'll create an error result if a model failed.
	eval_reason = f"Skipped: Champion Error: {champ_response.output[:100]}... Challenger Error: {chall_response.output[:100]}..." if champ_response and chall_response else "Skipped: Model generation failed."
	current_attempt_eval_results.append(EvaluationResult(
	test_id=case_id,
	champion_output=champ_response.output if champ_response else "GENERATION FAILED",
	challenger_output=chall_response.output if chall_response else "GENERATION FAILED",
	winner="JUDGE_ERROR", # Count as judge error if models failed
	confidence=0.0,
	reasoning=eval_reason
	))
	judge_metrics["error_count"] += 1
	continue # Skip to next test case in batch

	# Check for trigger strings in model responses before calling judge if retry is enabled
	if batch_retry_attempts > 0 and batch_retry_trigger_strings:
	for trigger in batch_retry_trigger_strings:
	if trigger in champ_response.output or trigger in chall_response.output:
	logger.warning(f"Trigger string '{trigger}' found in model responses for case {case_id}. Batch will be retried.")
	has_trigger_string_in_attempt = True
	break # No need to check other triggers for this case
	if has_trigger_string_in_attempt:
	# Add a placeholder result indicating retry trigger
	current_attempt_eval_results.append(EvaluationResult(
	test_id=case_id, champion_output=champ_response.output, challenger_output=chall_response.output,
	winner="UNDETERMINED", confidence=0.0, reasoning=f"Retry triggered by model output string."
	))
	continue # Skip judge call for this case if retry is triggered by models

	# If no model trigger, proceed to judge evaluation
	try:
	start_time = time.time()
	evaluation_result = self.judge.evaluate(
	test_case,
	champ_response,
	chall_response,
	)
	judge_latency = time.time() - start_time
	judge_metrics["total_latency"] += judge_latency
	judge_metrics["total_output_chars"] += len(evaluation_result.reasoning) # Judge output length
	current_attempt_eval_results.append(evaluation_result)
	# --- Yield intermediate update AFTER judge evaluation for this case ---
	image_to_display = test_case.image_path_or_url if test_case.image_path_or_url else None
	champ_lat = round(champ_response.latency, 3) if champ_response else 0.0
	chall_lat = round(chall_response.latency, 3) if chall_response else 0.0
	# Use the latency calculated just before (judge_latency variable)
	judge_lat = round(judge_latency, 3)

	# Check stop flag before yielding intermediate update
	if STOP_REQUESTED:
	logger.info("Stop requested during evaluation loop. Breaking batch.")
	break # Exit evaluation loop for this batch

	# Yield the full evaluation result (as dict) for intermediate updates
	# Store this update to be yielded before the next case starts
	new_update_payload = {
	"type": "update",
	"image_path": image_to_display,
	"champ_latency": champ_lat,
	"chall_latency": chall_lat,
	"judge_latency": judge_lat,
	"evaluation": evaluation_result.__dict__ # Pass evaluation details as dict
	}
	previous_update_payload = new_update_payload # Store for next iteration
	# Yield the current update
	yield new_update_payload
	# Check for trigger strings in judge reasoning if retry is configured
	if batch_retry_attempts > 0 and batch_retry_trigger_strings and not has_trigger_string_in_attempt:
	for trigger in batch_retry_trigger_strings:
	if trigger in evaluation_result.reasoning:
	logger.warning(f"Trigger string '{trigger}' found in judge reasoning for case {case_id}. Batch will be retried.")
	has_trigger_string_in_attempt = True
	# Overwrite the winner to UNDETERMINED if retry triggered by judge
	evaluation_result.winner = "UNDETERMINED"
	evaluation_result.reasoning += "\n[Retry triggered by judge reasoning]"
	break

	# Update judge success/error counts based on final verdict (after potential trigger overwrite)
	if evaluation_result.winner != "UNDETERMINED" and evaluation_result.winner != "JUDGE_ERROR": judge_metrics["success_count"] += 1
	else: judge_metrics["error_count"] += 1 # Count undetermined/judge_error as errors for judge metrics

	except Exception as e:
	logger.error(f"Error during judge evaluation for case {case_id}: {e}", exc_info=True)
	# Create a placeholder eval result indicating judge failure
	current_attempt_eval_results.append(EvaluationResult(
	test_id=case_id,
	champion_output=champ_response.output,
	challenger_output=chall_response.output,
	winner="JUDGE_ERROR",
	confidence=0.0,
	reasoning=f"Error: Judge evaluation failed critically - {e}"
	))
	judge_metrics["error_count"] += 1
	# If judge fails critically, maybe trigger retry? For now, just mark as error.
	# has_trigger_string_in_attempt = True # Option: Trigger retry on judge exception

	# --- Batch Retry Logic ---
	if has_trigger_string_in_attempt and retry_count < batch_retry_attempts:
	logger.warning(f"Batch {batch_num} attempt {retry_count+1} failed due to trigger strings. Retrying...")
	retry_count += 1
	# Clear temporary results for this failed attempt, metrics were already counted above
	current_attempt_eval_results = []
	continue # Go to the next iteration of the while loop (retry)
	else:
	# Conditions to accept the batch results:
	# 1. No trigger strings were found in this attempt.
	# 2. Trigger strings were found, but we've exhausted retry attempts.
	batch_success = True
	batch_eval_results = current_attempt_eval_results # Store the results of the successful (or final) attempt

	if has_trigger_string_in_attempt and retry_count >= batch_retry_attempts:
	logger.warning(f"Accepting batch {batch_num} results despite trigger strings after exhausting {batch_retry_attempts} retry attempts. Some results may be marked UNDETERMINED.")

	# Log summary for the completed batch attempt
	batch_summary = self.aggregator.aggregate(batch_eval_results) # Aggregate results of this specific batch
	log_prefix = f"Batch {batch_num} completed"
	if retry_count > 0: log_prefix += f" after {retry_count} retries"
	logger.info(f"{log_prefix}. Verdict Counts: {batch_summary['verdict_counts']}")


	# --- End of Batch Processing ---
	# Add the results of the successful batch attempt to the overall list
	all_evaluation_results.extend(batch_eval_results)
	processed_case_count += len(current_batch) # Update processed count after successful batch completion


	# 3. Aggregate final results across all successful batches
	aggregated_summary = self.aggregator.aggregate(all_evaluation_results)

	# 4. Calculate final metrics (using totals accumulated across all attempts)
	# Note: Parameter renamed from total_cases to processed_cases for clarity
	def calculate_avg_metrics(metrics, processed_cases):
	# Base counts on total cases attempted, errors include generation/image load issues
	total_attempts = metrics["success_count"] + metrics["error_count"]
	# Avg latency based on total attempts where latency was recorded (excludes critical failures before generation)
	valid_latency_runs = metrics["success_count"] + (metrics["error_count"] - metrics.get("image_load_errors", 0)) # Approx.
	avg_latency = round(metrics["total_latency"] / valid_latency_runs, 2) if valid_latency_runs > 0 else 0
	# Avg chars based only on successful generations
	avg_chars = int(metrics["total_output_chars"] / metrics["success_count"]) if metrics["success_count"] > 0 else 0
	# Success rate based on total test cases processed before stopping
	success_rate = round((metrics["success_count"] / processed_cases) * 100, 1) if processed_cases > 0 else 0

	return {
	"avg_latency_s": avg_latency,
	"avg_output_chars": avg_chars,
	"success_rate_pct": success_rate, # Now calculated based on processed cases
	"errors": metrics["error_count"],
	"image_load_errors": metrics.get("image_load_errors", 0)
	}

	# Use processed_case_count for denominators as it reflects actual attempts before potential early stopping
	# Pass processed_case_count to the updated function parameter
	champion_avg_metrics = calculate_avg_metrics(champion_metrics, processed_case_count)
	challenger_avg_metrics = calculate_avg_metrics(challenger_metrics, processed_case_count)
	# Judge metrics are based on cases where evaluation was attempted
	judge_attempts = judge_metrics["success_count"] + judge_metrics["error_count"]
	judge_avg_metrics = calculate_avg_metrics(judge_metrics, judge_attempts)


	# 5. Determine overall decision based on aggregated results
	decision = "MAINTAIN_CHAMPION" # Default
	reason = "Insufficient data or challenger did not significantly outperform."
	win_margin_threshold = 5 # Challenger needs to win by at least 5% points
	min_determined_verdicts = max(3, int(0.1 * processed_case_count)) # Need at least 3 or 10% determined verdicts

	percentages = aggregated_summary["verdict_percentages"]
	determined_verdicts = processed_case_count - aggregated_summary["verdict_counts"].get("UNDETERMINED", 0) - aggregated_summary["verdict_counts"].get("JUDGE_ERROR", 0)

	if determined_verdicts >= min_determined_verdicts:
	champ_wins_pct = percentages.get("MODEL_A_WINS", 0)
	chall_wins_pct = percentages.get("MODEL_B_WINS", 0)
	ties_pct = percentages.get("TIE", 0)

	# Calculate confidence-weighted percentages if we have confidence scores
	avg_confidence = aggregated_summary["average_confidence"]
	confidence_factor = f" with {avg_confidence:.2f} average confidence" if avg_confidence > 0 else ""

	if chall_wins_pct > champ_wins_pct + win_margin_threshold:
	decision = "REPLACE_WITH_CHALLENGER"
	reason = f"Challenger won {chall_wins_pct:.1f}% vs Champion's {champ_wins_pct:.1f}%{confidence_factor} (>{win_margin_threshold}% margin based on {determined_verdicts} determined verdicts)."
	elif champ_wins_pct > chall_wins_pct + win_margin_threshold:
	decision = "MAINTAIN_CHAMPION"
	reason = f"Champion won {champ_wins_pct:.1f}% vs Challenger's {chall_wins_pct:.1f}%{confidence_factor} (based on {determined_verdicts} determined verdicts)."
	else:
	# Closer results, consider ties or maintain status quo
	decision = "MAINTAIN_CHAMPION"
	reason = f"Results close ({champ_wins_pct:.1f}% vs {chall_wins_pct:.1f}%, {ties_pct:.1f}% ties){confidence_factor}. Challenger did not show clear superiority (based on {determined_verdicts} determined verdicts)."
	else:
	# Not enough determined verdicts for a reliable decision
	decision = "MAINTAIN_CHAMPION"
	reason = f"Insufficient determined verdicts ({determined_verdicts}/{processed_case_count}, need >= {min_determined_verdicts}) to make a reliable decision. Defaulting to maintaining champion."

	# Log final summary
	logger.info(f"--- Final Aggregated Results ({processed_case_count} cases processed) ---")
	logger.info(f"Verdict Counts: {aggregated_summary['verdict_counts']}")
	logger.info(f"Verdict Percentages (Determined Only): {aggregated_summary['verdict_percentages']}")
	logger.info(f"Average Confidence (Determined Only): {aggregated_summary['average_confidence']:.3f}")
	logger.info(f"Champion Metrics: {champion_avg_metrics}")
	logger.info(f"Challenger Metrics: {challenger_avg_metrics}")
	logger.info(f"Judge Metrics: {judge_avg_metrics}")
	logger.info(f"Decision: {decision} - {reason}")

	if progress is not None:
	final_status = "Testing completed" if not STOP_REQUESTED else "Testing stopped early"
	progress(1.0, final_status)

	final_summary = {
	"total_test_cases_processed": processed_case_count,
	"total_test_cases_loaded": num_cases,
	"verdicts": aggregated_summary["verdict_counts"],
	"verdict_percentages": aggregated_summary["verdict_percentages"],
	"average_confidence": aggregated_summary["average_confidence"],
	"decision": decision,
	"reason": reason,
	"champion_metrics": champion_avg_metrics,
	"challenger_metrics": challenger_avg_metrics,
	"judge_metrics": judge_avg_metrics,
	"champion_name": self.champion_endpoint.name,
	"challenger_name": self.challenger_endpoint.name,
	"judge_name": self.judge_endpoint.name,
	}

	# Yield the final results dictionary
	yield {
	"type": "final",
	"evaluations": aggregated_summary["raw_evaluations"],
	"summary": final_summary
	}

	# --- Gradio UI Components & Logic ---

	def parse_test_data(
	file_obj,
	text_data,
	key_field_name: str = "key",
	value_field_name: str = "value",
	image_field_name: str = "image_url" # Added image field name parameter
	) -> List[TestCase]:
	"""
	Parses test data from Gradio file upload or text input.
	Uses specified field names for key, value, and image path/URL.
	"""
	test_cases = []
	raw_data = None

	if file_obj is not None:
	# Use the temporary file path provided by Gradio
	file_path = file_obj.name
	logger.info(f"Loading test data from uploaded file: {file_path}")
	try:
	# Determine file type from extension, not relying on original name if temp name is different
	_, file_ext = os.path.splitext(file_path)
	file_ext = file_ext.lower()

	if file_ext == ".json":
	with open(file_path, 'r', encoding='utf-8') as f:
	raw_data = json.load(f)
	elif file_ext == ".csv":
	# Read CSV into pandas DataFrame first for easier handling
	try:
	# Try detecting delimiter, handle potential bad lines
	# Use sensible defaults, allow overriding later if needed
	df = pd.read_csv(
	file_path,
	sep=None, # Auto-detect
	engine='python',
	on_bad_lines='warn',
	quoting=csv.QUOTE_MINIMAL, # Default quoting
	escapechar='\\' # Common escape character
	)
	logger.info(f"CSV loaded successfully. Columns: {df.columns.tolist()}")
	# Convert NaN/NaT to None for cleaner processing -> convert to empty string later
	df = df.fillna('')
	# Convert DataFrame rows to list of dictionaries
	raw_data = df.to_dict(orient='records')
	except Exception as e:
	logger.error(f"Error reading CSV file '{file_path}': {e}")
	raise ValueError(f"Error reading CSV: {e}")
	elif file_ext in (".jsonl", ".ndjson"):
	# Handle JSONL (newline-delimited JSON)
	raw_data = []
	with open(file_path, 'r', encoding='utf-8') as f:
	for line_num, line in enumerate(f):
	line = line.strip()
	if not line: continue # Skip empty lines
	try:
	item = json.loads(line)
	raw_data.append(item)
	except json.JSONDecodeError:
	logger.warning(f"Skipping invalid JSON line #{line_num + 1} in file '{file_path}': {line[:100]}...")
	if not raw_data:
	raise ValueError("No valid JSON objects found in JSONL file.")
	else:
	allowed_extensions = ['.csv', '.json', '.jsonl', '.ndjson']
	raise ValueError(f"Invalid file type ({file_ext}). Please upload a file that is one of these formats: {allowed_extensions}")

	except Exception as e:
	logger.error(f"Error processing uploaded file {file_path}: {e}", exc_info=True)
	raise ValueError(f"Failed to process file: {e}")

	elif text_data and text_data.strip():
	logger.info("Loading test data from text input.")
	try:
	# Try parsing as JSON list first
	raw_data = json.loads(text_data)
	if not isinstance(raw_data, list):
	raise ValueError("Pasted text is valid JSON, but not a list of objects.")
	except json.JSONDecodeError as json_err:
	# If JSON fails, try treating it as line-delimited JSON (JSONL)
	logger.warning(f"Could not parse text as JSON list ({json_err}), trying as JSONL...")
	try:
	raw_data = [json.loads(line) for line in text_data.strip().splitlines() if line.strip()]
	if not raw_data:
	raise ValueError("No valid JSON objects found in text input lines.")
	except json.JSONDecodeError as line_err:
	logger.error(f"Invalid JSON format in text input (checked as list and line-by-line): {line_err}")
	raise ValueError(f"Invalid JSON format in text input. Ensure it's a list of objects `[ {{\"key\": ...}}, ... ]` or one JSON object per line.")
	except Exception as e:
	logger.error(f"Error processing text input data: {e}", exc_info=True)
	raise ValueError(f"Failed to process text data: {e}")

	else:
	raise ValueError("No test data provided. Please upload a file or paste JSON/JSONL.")

	# Convert raw_data (list of dicts) to TestCase objects
	if isinstance(raw_data, list):
	for i, item in enumerate(raw_data):
	if isinstance(item, dict):
	try:
	# Ensure the specified key field exists, value field is optional
	# 'id' is optional (defaults to None, ModelTester assigns later if needed)
	# 'image' field is optional
	key = item.get(key_field_name)
	if key is None:
	logger.warning(f"Skipping item {i+1} due to missing '{key_field_name}' field. Data: {item}")
	continue

	# Get image path/url if field exists and is not empty/None
	image_val = item.get(image_field_name) if image_field_name else None
	image_path_or_url = str(image_val).strip() if image_val and str(image_val).strip() else None

	test_cases.append(TestCase(
	id=str(item.get('id', f"item-{i+1}")), # Ensure ID is string, use item index
	key=str(key), # Ensure key is string
	value=str(item.get(value_field_name, '')), # Ensure value is string, default empty
	image_path_or_url=image_path_or_url,
	))
	except Exception as e:
	logger.warning(f"Skipping item {i+1} due to error during TestCase creation: {e}. Data: {item}")
	else:
	logger.warning(f"Skipping item {i+1} as it is not a dictionary. Data: {item}")
	else:
	raise ValueError("Parsed data is not a list of test cases (expected list of dictionaries).")

	if not test_cases:
	raise ValueError("No valid test cases could be loaded from the provided data.")

	logger.info(f"Successfully loaded {len(test_cases)} test cases.")
	return test_cases


	def format_summary_output(summary_data: Dict[str, Any]) -> str:
	"""Formats the summary dictionary into a readable string."""
	if not summary_data or summary_data.get("error"):
	return f"Error generating summary: {summary_data.get('error', 'Unknown error')}"

	output = f"--- Test Summary ---\n"
	output += f"Champion: {summary_data.get('champion_name', 'N/A')}\n"
	output += f"Challenger: {summary_data.get('challenger_name', 'N/A')}\n"
	output += f"Judge: {summary_data.get('judge_name', 'N/A')}\n"
	output += f"Test Cases Loaded: {summary_data.get('total_test_cases_loaded', 'N/A')}\n"
	output += f"Test Cases Processed: {summary_data.get('total_test_cases_processed', 'N/A')}\n"

	output += "\nVerdicts (Based on Processed Cases):\n"
	for verdict, count in summary_data.get('verdicts', {}).items():
	output += f" {verdict}: {count}\n"

	output += "\nVerdict Percentages (Based on Determined Verdicts):\n"
	determined = summary_data.get('total_test_cases_processed', 0) - \
	summary_data.get('verdicts', {}).get('UNDETERMINED', 0) - \
	summary_data.get('verdicts', {}).get('JUDGE_ERROR', 0)
	output += f" (Calculated from {determined} determined verdicts)\n"
	for verdict, pct in summary_data.get('verdict_percentages', {}).items():
	output += f" {verdict}: {pct:.1f}%\n"

	avg_conf = summary_data.get('average_confidence', 0)
	output += f"\nAverage Confidence (Determined Only): {avg_conf:.3f}\n"

	output += "\nMetrics (Avg Latency / Avg Output Chars / Success Rate / Errors / Image Load Errors):\n"
	champ_metrics = summary_data.get('champion_metrics', {})
	chall_metrics = summary_data.get('challenger_metrics', {})
	judge_metrics = summary_data.get('judge_metrics', {}) # Judge metrics are calculated differently
	output += (f" Champion: {champ_metrics.get('avg_latency_s', 0):.2f}s / "
	f"{champ_metrics.get('avg_output_chars', 0)} / "
	f"{champ_metrics.get('success_rate_pct', 0):.1f}% / "
	f"{champ_metrics.get('errors', 0)} / "
	f"{champ_metrics.get('image_load_errors', 0)}\n")
	output += (f" Challenger: {chall_metrics.get('avg_latency_s', 0):.2f}s / "
	f"{chall_metrics.get('avg_output_chars', 0)} / "
	f"{chall_metrics.get('success_rate_pct', 0):.1f}% / "
	f"{chall_metrics.get('errors', 0)} / "
	f"{chall_metrics.get('image_load_errors', 0)}\n")
	# Judge metrics are slightly different (no image errors, success based on valid eval)
	output += (f" Judge: {judge_metrics.get('avg_latency_s', 0):.2f}s / "
	f"{judge_metrics.get('avg_output_chars', 0)} / "
	f"{judge_metrics.get('success_rate_pct', 0):.1f}% / "
	f"{judge_metrics.get('errors', 0)} (Errors + Undetermined)\n")


	output += f"\nDecision: {summary_data.get('decision', 'N/A')}\n"
	output += f"Reason: {summary_data.get('reason', 'N/A')}\n"

	return output

	def run_test_from_ui(
	# Model Configs (18 inputs now, including upload method)
	champ_name, champ_api_url, champ_model_id, champ_temp, champ_max_tokens, champ_file_upload_method,
	chall_name, chall_api_url, chall_model_id, chall_temp, chall_max_tokens, chall_file_upload_method,
	judge_name, judge_api_url, judge_model_id, judge_temp, judge_max_tokens, judge_file_upload_method,
	# API Key (1 input)
	api_key_input,
	# Prompts (2 inputs)
	model_prompt_template_input,
	judge_prompt_template_input,
	# Test Data (2 inputs)
	test_data_file,
	test_data_text,
	# Parameters (5 inputs)
	batch_size_input,
	batch_retry_attempts_input,
	batch_backoff_factor_input,
	batch_max_wait_input,
	batch_retry_trigger_strings_input,
	# Data Field Names (3 inputs) - Added image field name
	key_field_name_input,
	value_field_name_input,
	image_field_name_input, # Added image field name input
	# Gradio progress object
	progress=gr.Progress(track_tqdm=True)
	):
	"""
	Handles the logic for running the A/B test triggered by the Gradio UI button.
	"""
	global STOP_REQUESTED
	STOP_REQUESTED = False # Reset stop flag at the beginning of each UI run
	logger.info("Starting test run from Gradio UI...")
	progress(0, desc="Initializing...")

	try:
	# 1. Get API Key from UI input (Treat as optional, let endpoint logic handle needs)
	# Also check environment variable as a fallback/override
	api_key_env = os.getenv("OPENROUTER_API_KEY") or os.getenv("ANTHROPIC_API_KEY") or os.getenv("GOOGLE_API_KEY") # Add other common key names if needed
	api_key_ui = str(api_key_input).strip() if api_key_input else None

	# Prioritize UI input if provided, otherwise use environment variable
	api_key = api_key_ui if api_key_ui else api_key_env

	if api_key_ui and api_key_env and api_key_ui != api_key_env:
	logger.warning("API Key provided via UI input overrides the environment variable.")
	elif api_key:
	logger.info(f"API Key found ({'UI input' if api_key_ui else 'environment variable'}).")
	else:
	logger.info("API Key not provided via UI input or environment variables. Only local/keyless endpoints will work.")


	progress(0.1, desc="Loading test data...")
	# 2. Load Test Cases (Pass field names from UI)
	try:
	key_field = str(key_field_name_input).strip() or "key"
	value_field = str(value_field_name_input).strip() or "value"
	image_field = str(image_field_name_input).strip() or "image_url" # Default if empty
	logger.info(f"Using data fields - Key: '{key_field}', Value: '{value_field}', Image: '{image_field}'")
	test_cases = parse_test_data(test_data_file, test_data_text, key_field, value_field, image_field)
	logger.info(f"Loaded {len(test_cases)} test cases.")
	except ValueError as e:
	logger.error(f"Failed to load test data: {e}")
	raise gr.Error(f"Test Data Error: {e}")
	except Exception as e:
	logger.exception("Unexpected error loading test data.")
	raise gr.Error(f"Unexpected error loading test data: {e}")

	if not test_cases:
	raise gr.Error("No valid test cases were loaded.")

	progress(0.2, desc="Configuring models...")
	# 3. Create Model Endpoints (Pass the potentially found api_key and upload method)
	try:
	# Helper to create endpoint, ensuring types and including upload method
	def create_ep(name, url, model_id, temp, max_tok, key, upload_method):
	# Strip whitespace from URL and model ID
	url = str(url).strip() if url else ""
	model_id = str(model_id).strip() if model_id else ""
	# Basic validation
	if not name: raise ValueError("Model Display Name cannot be empty.")
	if not url: raise ValueError(f"API URL cannot be empty for model '{name}'.")
	if not model_id: raise ValueError(f"Model ID cannot be empty for model '{name}'.")
	# Validate upload method
	if upload_method not in ["JSON (Embedded Data)", "Multipart Form Data"]:
	raise ValueError(f"Invalid file upload method '{upload_method}' for model '{name}'.")
	# Use the potentially loaded key and upload method
	return ModelEndpoint(
	name=str(name), api_url=url, api_key=key, model_id=model_id,
	temperature=float(temp), max_tokens=int(max_tok),
	file_upload_method=str(upload_method) # Add file upload method
	)

	champion_endpoint = create_ep(champ_name, champ_api_url, champ_model_id, champ_temp, champ_max_tokens, api_key, champ_file_upload_method)
	challenger_endpoint = create_ep(chall_name, chall_api_url, chall_model_id, chall_temp, chall_max_tokens, api_key, chall_file_upload_method)
	judge_endpoint = create_ep(judge_name, judge_api_url, judge_model_id, judge_temp, judge_max_tokens, api_key, judge_file_upload_method)

	# Log endpoints being used (mask key if present, add upload method)
	logger.info(f"Champion Endpoint: {champion_endpoint.name}, URL: {champion_endpoint.api_url}, Model: {champion_endpoint.model_id}, Upload: {champion_endpoint.file_upload_method}, Key Provided: {'Yes' if champion_endpoint.api_key else 'No'}")
	logger.info(f"Challenger Endpoint: {challenger_endpoint.name}, URL: {challenger_endpoint.api_url}, Model: {challenger_endpoint.model_id}, Upload: {challenger_endpoint.file_upload_method}, Key Provided: {'Yes' if challenger_endpoint.api_key else 'No'}")
	logger.info(f"Judge Endpoint: {judge_endpoint.name}, URL: {judge_endpoint.api_url}, Model: {judge_endpoint.model_id}, Upload: {judge_endpoint.file_upload_method}, Key Provided: {'Yes' if judge_endpoint.api_key else 'No'}")

	except ValueError as ve:
	logger.error(f"Model Configuration Error: {ve}")
	raise gr.Error(f"Model Configuration Error: {ve}")
	except Exception as e:
	logger.error(f"Error creating ModelEndpoint objects: {e}", exc_info=True)
	raise gr.Error(f"Model Configuration Error: {e}")

	# 4. Instantiate ModelTester
	try:
	tester = ModelTester(
	champion_endpoint=champion_endpoint,
	challenger_endpoint=challenger_endpoint,
	judge_endpoint=judge_endpoint,
	model_prompt_template=str(model_prompt_template_input),
	judge_prompt_template=str(judge_prompt_template_input)
	)
	except Exception as e:
	logger.error(f"Error instantiating ModelTester: {e}", exc_info=True)
	raise gr.Error(f"Tester Initialization Error: {e}")

	# 5. Run the Test
	batch_size = int(batch_size_input) if batch_size_input is not None and batch_size_input > 0 else 1
	logger.info(f"Running test with {len(test_cases)} cases, batch size {batch_size}...")
	progress(0.3, desc="Running A/B test...")
	try:
	# Process batch retry parameters
	batch_retry_attempts = int(batch_retry_attempts_input) if batch_retry_attempts_input is not None else 0
	batch_backoff_factor = float(batch_backoff_factor_input) if batch_backoff_factor_input is not None else 2.0
	batch_max_wait = int(batch_max_wait_input) if batch_max_wait_input is not None else 60

	# Process trigger strings (convert from comma-separated string to list)
	batch_retry_trigger_strings = None
	if batch_retry_trigger_strings_input and batch_retry_trigger_strings_input.strip():
	batch_retry_trigger_strings = [s.strip().lower() for s in batch_retry_trigger_strings_input.split(',') if s.strip()] # Lowercase for case-insensitive match later
	logger.info(f"Using batch retry trigger strings: {batch_retry_trigger_strings}")

	# Make trigger strings case-insensitive in the run_test method check
	# (Already done in list comprehension above)

	# Iterate through the generator yielded by run_test
	final_results = None
	last_image_path = None
	last_champ_latency = ""
	last_chall_latency = ""
	last_judge_latency = ""
	last_winner = ""
	running_eval_results = [] # List to store results incrementally
	current_summary = "" # Placeholder for summary updates
	current_details_df = pd.DataFrame() # Placeholder for details updates
	try: # Inner try for the loop
	for result_update in tester.run_test(
	test_cases,
	batch_size=batch_size,
	progress=progress,
	batch_retry_attempts=batch_retry_attempts,
	batch_backoff_factor=batch_backoff_factor,
	batch_max_wait=batch_max_wait,
	batch_retry_trigger_strings=batch_retry_trigger_strings
	):
	if STOP_REQUESTED: # Check stop flag during iteration
	logger.info("Stop requested, halting UI updates.")
	break # Exit the for loop

	if result_update.get("type") == "update":
	# Extract monitoring values
	last_image_path = result_update.get("image_path")
	last_champ_latency = str(result_update.get("champ_latency", ""))
	last_chall_latency = str(result_update.get("chall_latency", ""))
	last_judge_latency = str(result_update.get("judge_latency", ""))

	# Process evaluation details for incremental updates
	evaluation = result_update.get("evaluation") # Get the full evaluation dict
	if evaluation:
	running_eval_results.append(evaluation) # Add current result to list
	last_winner = str(evaluation.get("winner", "N/A"))
	# Format summary for the current case
	current_summary = (
	f"--- Last Processed Case ---\n"
	f"Case ID: {evaluation.get('test_id', 'N/A')}\n"
	f"Winner: {last_winner}\n"
	f"Confidence: {evaluation.get('confidence', 0.0):.2f}\n"
	f"Reasoning Snippet: {evaluation.get('reasoning', '')[:200]}..."
	)
	# Update DataFrame with all results so far
	try:
	display_columns_update = ['test_id', 'winner', 'confidence', 'champion_output', 'challenger_output', 'reasoning']
	current_details_df = pd.DataFrame(running_eval_results)
	for col in display_columns_update:
	if col not in current_details_df.columns:
	current_details_df[col] = None
	current_details_df = current_details_df[display_columns_update] # Reorder/select columns
	except Exception as df_err:
	logger.error(f"Error creating incremental DataFrame: {df_err}")
	current_details_df = pd.DataFrame([{"Error": "Failed to update details"}])
	else:
	last_winner = "Update Error"
	current_summary = "Error: No evaluation data in update."

	# Yield 8 values for incremental update
	yield current_summary, current_details_df, last_image_path, last_champ_latency, last_chall_latency, last_judge_latency, last_winner, running_eval_results # Yield current results to state
	elif result_update.get("type") == "final":
	# Store final results
	final_results = result_update
	# Don't break here, let the loop finish naturally to reach finally
	else:
	logger.warning(f"Received unexpected update type from run_test: {result_update.get('type')}")
	# End of the main 'for' loop

	except Exception as loop_err: # Catch errors during loop execution
	if STOP_REQUESTED:
	logger.warning(f"Caught exception during test loop after stop request (likely progress bar issue): {loop_err}")
	pass # Suppress error, proceed to finally
	else:
	logger.exception("An unexpected error occurred during the test execution loop.")
	raise loop_err # Re-raise unexpected errors

	finally: # Executes after loop finishes (normally or via break) or if suppressed exception occurred
	# --- Post-Loop Processing ---
	if 'result_update' not in locals() and not STOP_REQUESTED: # Handle case where loop didn't run and wasn't stopped
	final_results = None

	if STOP_REQUESTED:
	logger.info("Test run stopped by user.")
	summary_output = "Test run stopped by user."
	details_df = current_details_df if 'current_details_df' in locals() and not current_details_df.empty else pd.DataFrame(columns=['test_id', 'winner', 'confidence', 'champion_output', 'challenger_output', 'reasoning'])
	raw_evals = running_eval_results if 'running_eval_results' in locals() else []
	elif 'final_results' not in locals() or final_results is None:
	logger.error("Test run finished, but no final results structure was received.")
	summary_output = "Test Execution Error: No final results generated."
	details_df = pd.DataFrame(columns=['test_id', 'winner', 'confidence', 'champion_output', 'challenger_output', 'reasoning'])
	raw_evals = []
	last_image_path = last_image_path if 'last_image_path' in locals() else None
	last_champ_latency = last_champ_latency if 'last_champ_latency' in locals() else ""
	last_chall_latency = last_chall_latency if 'last_chall_latency' in locals() else ""
	last_judge_latency = last_judge_latency if 'last_judge_latency' in locals() else ""
	last_winner = last_winner if 'last_winner' in locals() else ""
	else:
	# Completed normally, process final results
	logger.info("Test run completed normally.")
	summary_data = final_results.get("summary", {})
	raw_evals = final_results.get("evaluations", [])
	summary_output = format_summary_output(summary_data)
	display_columns = ['test_id', 'winner', 'confidence', 'champion_output', 'challenger_output', 'reasoning']
	try:
	if raw_evals:
	details_df = pd.DataFrame(raw_evals)
	for col in display_columns:
	if col not in details_df.columns:
	details_df[col] = None
	details_df = details_df[display_columns]
	else:
	details_df = pd.DataFrame(columns=display_columns)
	summary_output += "\n\nNote: No evaluation results were generated."
	except Exception as df_err:
	logger.error(f"Error creating DataFrame from final results: {df_err}")
	summary_output += f"\n\nError displaying detailed results: {df_err}"
	details_df = pd.DataFrame(columns=display_columns)
	raw_evals = []

	# Ensure monitoring variables exist
	last_image_path = last_image_path if 'last_image_path' in locals() else None
	last_champ_latency = last_champ_latency if 'last_champ_latency' in locals() else ""
	last_chall_latency = last_chall_latency if 'last_chall_latency' in locals() else ""
	last_judge_latency = last_judge_latency if 'last_judge_latency' in locals() else ""
	last_winner = last_winner if 'last_winner' in locals() else ""
	raw_evals = raw_evals if 'raw_evals' in locals() else []

	# Final yield statement - yields 8 values (conditionally None for raw_evals if stopped)
	yield summary_output, details_df, last_image_path, last_champ_latency, last_chall_latency, last_judge_latency, last_winner, raw_evals
	# End of inner try...except...finally
	except Exception as test_exec_err: # Corresponds to try at line 1701
	logger.exception("An error occurred during the main test execution phase.")
	# Potentially yield an error state back to the UI if needed
	# For now, just re-raise to be caught by the outermost handler
	raise test_exec_err

	# --- Outer Exception Handling ---
	# --- Outer Exception Handling (indentation matches outer 'try' at line 1614) ---
	except gr.Error as e: # Catch Gradio-specific errors first
	logger.error(f"Gradio Error: {e}")
	error_message = str(e)
	error_df = pd.DataFrame([{"Error": error_message}])
	yield error_message, error_df, None, None, None, None, None, None # Yield 8 error values
	except Exception as e: # Catch any other exceptions (setup, re-raised from inner loop)
	logger.exception("An unexpected error occurred in run_test_from_ui.")
	error_message = f"An unexpected error occurred: {e}"
	error_df = pd.DataFrame([{"Error": error_message}])
	yield error_message, error_df, None, None, None, None, None, None # Yield 8 error values
	finally: # Outer finally (indentation matches outer 'try' at line 1614)
	# Ensure the stop requested flag is reset regardless of how the function exits
	STOP_REQUESTED = False

	# Function to be called by the Stop button
	# --- Helper Function for Downloads ---



	# --- Stop Request Handling ---
	def request_stop():
	"""Sets the global STOP_REQUESTED flag and returns a status message."""
	global STOP_REQUESTED
	status_message = ""
	if not STOP_REQUESTED:
	STOP_REQUESTED = True
	logger.warning("Stop requested via UI button.")
	status_message = "Stop request received. Finishing current batch..."
	else:
	logger.warning("Stop already requested.")
	status_message = "Stop already requested. Please wait..."
	return status_message
	# Function to generate JSONL file for download
	from typing import Optional # Add Optional import

	def generate_jsonl_download(results_list: Optional[List[Dict[str, Any]]]) -> Optional[gr.File]:
	"""
	Takes an optional list of evaluation result dictionaries, saves them as a JSONL file,
	and returns a Gradio File object for download. Returns None if input is None (e.g., run stopped).
	"""
	if not results_list:
	# Still handle the case of a completed run with zero results
	logger.warning("generate_jsonl_download called with empty results list (run completed but no results).")
	results_list = [] # Ensure it's an empty list for file creation below

	logger.info(f"generate_jsonl_download received results_list: type={type(results_list)}, len={len(results_list) if results_list is not None else 'None'}")
	try:
	# Use io.StringIO to build the JSONL string in memory
	jsonl_content = io.StringIO()
	for result in results_list:
	# Ensure result is serializable (convert dataclasses if needed)
	# The results passed should already be dicts from ModelTester.run_test final yield
	if isinstance(result, dict):
	serializable_result = result
	else:
	# Handle unexpected types if necessary
	logger.warning(f"Skipping non-dict item in results: {type(result)}")
	continue
	jsonl_content.write(json.dumps(serializable_result) + '\n')

	# Get the string content
	jsonl_string = jsonl_content.getvalue()
	jsonl_content.close()

	logger.info(f"Generated JSONL string length: {len(jsonl_string)}")
	# Create a temporary file path
	timestamp = time.strftime("%Y%m%d-%H%M%S")
	temp_dir = tempfile.gettempdir()
	file_path = os.path.join(temp_dir, f"abx_results_{timestamp}.jsonl")

	# Write the string content to the file
	with open(file_path, "w", encoding="utf-8") as f:
	f.write(jsonl_string)

	logger.info(f"Generated JSONL file for download at: {file_path}")
	logger.info(f"--- generate_jsonl_download returning file: {file_path}")
	# Return the file path wrapped in gr.File for download
	# Note: Gradio handles the cleanup of the temp file after download
	return gr.File(value=file_path, label="Download Results (JSONL)")

	except Exception as e:
	logger.error(f"Error generating JSONL file: {e}", exc_info=True)
	# How to signal error to user? Gradio download button doesn't easily show errors.
	# Maybe return None or raise an error that Gradio might catch?
	# Returning None might just make the download fail silently.
	# Let's re-raise for now, Gradio might handle it.
	raise gr.Error(f"Failed to generate JSONL download: {e}")


	def _generate_download_wrapper(results_state, *args):
	"""Wrapper to call generate_jsonl_download, ignoring extra args from .then()."""
	# *args will capture any extra positional arguments Gradio might pass
	logger.info(f"Download wrapper called. results_state type: {type(results_state)}, len: {len(results_state) if isinstance(results_state, list) else 'N/A'}. Ignoring {len(args)} extra args.")
	return generate_jsonl_download(results_state)


	# --- UI Creation ---
	# Removed incorrectly indented line


	def create_ui():
	"""Creates the Gradio web interface for the A/B testing tool."""
	logger.info("Creating Gradio UI...")

	# Default values for UI components
	default_api_url_openrouter = "https://openrouter.ai/api/v1/chat/completions"
	default_api_url_ollama = "http://localhost:11434/api/generate" # Default Ollama URL
	default_model_prompt = "User: {key}\nAssistant:" # Example prompt
	# Use the default judge prompt from the LMJudge class
	default_judge_prompt = LMJudge.DEFAULT_EVALUATION_PROMPT

	css = """
	.model-config-group .gr-form { background-color: #f0f0f0; padding: 10px; border-radius: 5px; margin-bottom: 10px; }
	.model-config-group .gr-form > :first-child { font-weight: bold; margin-bottom: 5px; } /* Style the label */
	.results-box { border: 1px solid #ccc; padding: 15px; border-radius: 5px; margin-top: 15px; }
	.api-key-warning { color: #cc5500; font-weight: bold; margin-bottom: 15px; }
	"""

	with gr.Blocks(css=css, theme=gr.themes.Soft()) as iface:
	gr.Markdown("# A/B x Judge: AI Testing & Auto-Evaluation")
	gr.Markdown(
	"1) Configure Champion, Challenger, and Judge.\n"
	"2) Provide Test Data & Reference Input.\n"
	"3) Run Evaluations & Compare Performance."
	)
	gr.Markdown(
	"""API Key: Enter as needed for Cloud Endpoints; env defaults auto-evaluated (see: [code](https://github.com/rabbidave/ZeroDay.Tools/blob/main/ABxJudge.py))""",
	elem_classes="api-key-warning"
	)
	gr.Markdown(
	"""Multimodal Input:
	1. Ensure your test data (CSV/JSON/JSONL) includes a column/field containing the local path or public URL to the file.
	2. Specify this column/field name in the 'Input Field Name' box below.
	3. Ensure your models and endpoints support multimodal input
	4. Prompt should contextualize the input (e.g., 'Describe this image.', 'Transcribe the audio.').""",
	elem_classes="api-key-warning"
	)


	with gr.Tabs():
	with gr.TabItem("Configuration"):
	# Add API Key input field
	with gr.Row():
	api_key_input = gr.Textbox(
	label="API Key (Optional)", # Simplified label
	type="password",
	placeholder="Enter key if required for cloud endpoints",
	info="Overrides environment variables (e.g., OPENROUTER_API_KEY). Leave blank to use ENV or for local models."
	)
	with gr.Row():
	# Champion Model Configuration
	with gr.Column(scale=1):
	with gr.Group(elem_classes="model-config-group"):
	gr.Label("Champion Model (Model A)")
	# Updated example for Ollama Mistral 3.1 (as requested default)
	champ_name = gr.Textbox(label="Display Name", value="Champion (LM Studio Gemma 3 12B)")
	champ_api_url = gr.Textbox(label="API URL", value="http://localhost:1234/v1/chat/completions") # LM Studio OpenAI endpoint
	champ_model_id = gr.Textbox(label="Model ID", value="gemma-3-12b-it") # User specified
	champ_temp = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, step=0.1, value=0.1)
	champ_max_tokens = gr.Number(label="Max Tokens", value=8192, precision=0)
	champ_file_upload_method = gr.Dropdown(
	label="File Upload Method",
	choices=["JSON (Embedded Data)", "Multipart Form Data"],
	value="JSON (Embedded Data)",
	info="How to send file data (if any) to this endpoint."
	)
	# Challenger Model Configuration
	with gr.Column(scale=1):
	with gr.Group(elem_classes="model-config-group"):
	gr.Label("Challenger Model (Model B)")
	# Updated examples for Ollama Gemma 3 27B (as requested default)
	chall_name = gr.Textbox(label="Display Name", value="Challenger (LM Studio Gemma 3 4B)")
	chall_api_url = gr.Textbox(label="API URL", value="http://localhost:1234/v1/chat/completions") # LM Studio OpenAI endpoint
	chall_model_id = gr.Textbox(label="Model ID", value="gemma-3-4b-it") # User specified
	chall_temp = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, step=0.1, value=0.1)
	chall_max_tokens = gr.Number(label="Max Tokens", value=8192, precision=0)
	chall_file_upload_method = gr.Dropdown(
	label="File Upload Method",
	choices=["JSON (Embedded Data)", "Multipart Form Data"],
	value="JSON (Embedded Data)",
	info="How to send file data (if any) to this endpoint."
	)
	# Judge Model Configuration
	with gr.Column(scale=1):
	with gr.Group(elem_classes="model-config-group"):
	gr.Label("Judge Model")
	judge_name = gr.Textbox(label="Display Name", value="Judge (LM Studio Gemma 3 27B)")
	judge_api_url = gr.Textbox(label="API URL", value="http://localhost:1234/v1/chat/completions") # LM Studio OpenAI endpoint
	judge_model_id = gr.Textbox(label="Model ID", value="gemma-3-27b-it") # User specified
	judge_temp = gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.0) # Judge usually deterministic
	judge_max_tokens = gr.Number(label="Max Tokens", value=8192, precision=0) # Judge might need more tokens
	judge_file_upload_method = gr.Dropdown(
	label="File Upload Method",
	choices=["JSON (Embedded Data)", "Multipart Form Data"],
	value="JSON (Embedded Data)",
	info="How to send file data (if any) to this endpoint."
	)

	with gr.Row():
	# Model Prompt Template
	with gr.Column(scale=1):
	gr.Markdown("### Model Prompt Template")
	model_prompt_template_input = gr.Textbox(
	label="Template for Champion/Challenger (use {key} for input)",
	value="{key}\nUser: Provide a detailed description\nAssistant:",
	lines=5,
	show_copy_button=True
	)
	# Judge Prompt Template
	with gr.Column(scale=1):
	gr.Markdown("### Judge Prompt Template")
	judge_prompt_template_input = gr.Textbox(
	label="Template for Judge (see code/docs for available placeholders)",
	value=default_judge_prompt,
	lines=15,
	show_copy_button=True
	)

	with gr.Row():
	# Test Data Input
	with gr.Column(scale=1):
	gr.Markdown("### Test Data")
	gr.Markdown("Upload a CSV/JSON/JSONL file or paste data below. Specify the field names containing the model input (key), optional reference answer (value), and optional input path/URL. Add an `id` field for stable identification (recommended).")
	test_data_file = gr.File(label="Upload Test Data (CSV, JSON, JSONL/NDJSON)", file_types=[".csv", ".json", ".jsonl", ".ndjson"])
	test_data_text = gr.Textbox(label="Or Paste Test Data (JSON list or JSONL format)", lines=8, placeholder='[{"id": "t1", "prompt": "Describe image", "image_url": "/path/to/img.jpg", "reference": "..."}]\n{"id": "t2", "prompt": "Question text", "image_url": null, "reference": "..."}')
	with gr.Row():
	key_field_name_input = gr.Textbox(label="Key Field Name", value="name", info="Field containing the text input/prompt.") # Swapped default
	value_field_name_input = gr.Textbox(label="Value Field Name", value="caption", info="Field containing the reference/ground truth (optional).") # Swapped default
	image_field_name_input = gr.Textbox(label="Image Field Name", value="image_url", info="Field containing the image path or URL (optional).") # Added image field input

	# Test Run Parameters
	with gr.Column(scale=1):
	gr.Markdown("### Test Parameters")
	batch_size_input = gr.Number(label="Batch Size", value=5, minimum=1, precision=0)

	# Add batch retry parameters
	gr.Markdown("#### Batch Retry Settings")
	batch_retry_attempts_input = gr.Number(
	label="Batch Retry Attempts",
	value=1, # Default to 1 retry
	precision=0,
	minimum=0,
	info="Number of times to retry a batch if trigger strings are found (0 = no retries)"
	)
	batch_backoff_factor_input = gr.Slider(
	label="Backoff Factor",
	value=2.0,
	minimum=1.0,
	maximum=5.0,
	step=0.1,
	info="Factor for exponential backoff between retries (e.g., 2.0 waits 1s, 2s, 4s...)"
	)
	batch_max_wait_input = gr.Number(
	label="Maximum Wait Time (seconds)",
	value=60,
	precision=0,
	minimum=1,
	info="Maximum wait time between retries in seconds"
	)
	batch_retry_trigger_strings_input = gr.Textbox(
	label="Retry Trigger Strings (Comma-separated)",
	placeholder="e.g., rate limit,error,timeout,empty response",
	info="Retry batch if these strings appear in model/judge output (case-insensitive check)"
	)
	# Add preprocessing options later if needed

	with gr.TabItem("Monitoring"):
	gr.Markdown("### Test Execution & Results")
	with gr.Row():
	run_button = gr.Button("Run A/B Test", variant="primary", scale=3) # Adjusted scale
	stop_button = gr.Button("Stop Test", variant="stop", scale=1)
	with gr.Row(): # New row for results display
	with gr.Column(scale=1): # Column for the new status window
	with gr.Group(elem_classes="results-box"):
	gr.Markdown("#### Last Processed")
	# Placeholder for the most recent image
	last_image_display = gr.Image(label="Last Image", type="filepath", interactive=False, height=200, value=None, show_label=True) # Explicitly show label, ensure preview area renders
	# Placeholder for the runtime of the last image
	# Replace single runtime display with three separate ones
	last_champ_latency_display = gr.Textbox(label="Champion Latency (s)", value="", interactive=False)
	last_chall_latency_display = gr.Textbox(label="Challenger Latency (s)", value="", interactive=False)
	last_judge_latency_display = gr.Textbox(label="Judge Latency (s)", value="", interactive=False)

	with gr.Column(scale=2): # Column for winner and download
	with gr.Group(elem_classes="results-box"):
	gr.Markdown("#### Last Winner")
	# Textbox to show the last winner
	last_winner_output = gr.Textbox(label="Last Test Case Winner", lines=2, show_copy_button=True, interactive=False)
	with gr.Group(elem_classes="results-box"):
	gr.Markdown("#### Download Results")
	# State to hold the raw evaluation results (list of dicts) for download
	results_state = gr.State([])
	# Download Button
	download_button = gr.DownloadButton(label="Download All Evaluations (JSONL)", value=None) # This button is triggered later

	# Add placeholders for final summary and details display
	with gr.Group(elem_classes="results-box"):
	gr.Markdown("#### Overall Results")
	summary_output = gr.Textbox(label="Summary", lines=10, interactive=False)
	# Moved Detailed Evaluations DataFrame here
	detailed_evaluations_output = gr.DataFrame(label="Individual Case Results", interactive=False)
	# Removed the separate "Detailed Evaluations" group
	# Define interactions & state

	# Define the single, complete run event listener
	run_event = run_button.click(
	fn=run_test_from_ui,
	inputs=[ # Add the new dropdown inputs
	champ_name, champ_api_url, champ_model_id, champ_temp, champ_max_tokens, champ_file_upload_method,
	chall_name, chall_api_url, chall_model_id, chall_temp, chall_max_tokens, chall_file_upload_method,
	judge_name, judge_api_url, judge_model_id, judge_temp, judge_max_tokens, judge_file_upload_method,
	api_key_input,
	model_prompt_template_input,
	judge_prompt_template_input,
	test_data_file,
	test_data_text,
	batch_size_input,
	batch_retry_attempts_input,
	batch_backoff_factor_input,
	batch_max_wait_input,
	batch_retry_trigger_strings_input,
	key_field_name_input,
	value_field_name_input,
	image_field_name_input
	],
	outputs=[
	# Map the 8 yielded values from run_test_from_ui
	summary_output, # 1. Final summary text / Stop message
	detailed_evaluations_output, # 2. Final details dataframe / Last partial DF
	last_image_display, # 3. Intermediate image path / Last image
	last_champ_latency_display, # 4. Intermediate champ latency / Last latency
	last_chall_latency_display, # 5. Intermediate chall latency / Last latency
	last_judge_latency_display, # 6. Intermediate judge latency / Last latency
	last_winner_output, # 7. Intermediate winner text / Last winner
	results_state # 8. Hidden state for final/partial raw evaluations
	],
	# cancels=[run_event] # Remove self-cancellation
	)

	# Now, trigger the download file generation after the run completes, using the state
	run_event.then(
	fn=_generate_download_wrapper,
	inputs=[results_state],
	outputs=[download_button] # Output the gr.File object to the button itself
	)

	# --- Add Stop Button Interaction ---
	# Connect the stop button to the request_stop function and make it cancel the run_event
	stop_event = stop_button.click(
	fn=request_stop,
	inputs=None, # request_stop takes no inputs from UI
	outputs=None, # request_stop doesn't need to update UI directly anymore
	cancels=[run_event] # Make the stop button cancel the main test run
	)


	return iface

	def run_cli_test():
	"""Runs the A/B test from the command line using hardcoded examples."""
	logger.info("Starting CLI execution of ModelTester...")

	# --- Configuration (API Key optional for local models) ---
	# Load API keys from .env file if it exists
	try:
	from dotenv import load_dotenv
	if load_dotenv():
	logger.info("Loaded environment variables from .env file.")
	else:
	logger.info(".env file not found or empty, relying on system environment variables or UI input.")
	except ImportError:
	logger.warning("python-dotenv not installed, cannot load .env file. Run 'pip install python-dotenv' or ensure packages are installed.")

	OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY") # Still useful for judge or cloud models
	OLLAMA_API_URL = "http://localhost:11434/api/generate"

	# Define Model Endpoints
	# Using Ollama Mistral as Champion
	champion_model = ModelEndpoint(
	name="Champion (Ollama Mistral)",
	api_url=OLLAMA_API_URL,
	api_key=None, # No key needed for local Ollama
	model_id="mistral:latest", # Adjust if your model name is different
	temperature=0.1
	)

	# Using Ollama Gemma 2 9B as Challenger
	challenger_model = ModelEndpoint(
	name="Challenger (Ollama Gemma2 9B)",
	api_url=OLLAMA_API_URL,
	api_key=None, # No key needed
	model_id="hf.co/stduhpf/google-gemma-3-27b-it-qat-q4_0-gguf-small:latest", # Updated to match ollama list
	temperature=0.1,
	max_tokens=2048
	)

	# Using OpenRouter GPT-4o Mini as Judge (Requires API Key)
	if not OPENROUTER_API_KEY:
	logger.warning("OPENROUTER_API_KEY not set (checked ENV and .env). Using Champion model as Judge (less ideal).")
	judge_model = champion_model # Fallback judge
	judge_model.name = "Judge (Fallback - Ollama Mistral)"
	else:
	logger.info("Using OpenRouter API Key for Judge.")
	judge_model = ModelEndpoint(
	name="Judge (GPT-4o Mini - OR)",
	api_url="https://openrouter.ai/api/v1/chat/completions",
	api_key=OPENROUTER_API_KEY,
	model_id="openai/gpt-4o-mini",
	temperature=0.0,
	max_tokens=2048
	)

	# Define Model Prompt Template
	model_prompt = "User: {key}\nAssistant:"

	# Define Sample Test Cases (Including Multimodal Example)
	# Create a dummy image file for testing if it doesn't exist
	dummy_image_path = "dummy_test_image.png"
	if not os.path.exists(dummy_image_path):
	try:
	from PIL import Image, ImageDraw
	img = Image.new('RGB', (100, 50), color = (73, 109, 137)) # Blueish background
	d = ImageDraw.Draw(img)
	d.text((10,10), "Test Img", fill=(255,255,0)) # Yellow text
	img.save(dummy_image_path)
	logger.info(f"Created dummy image file: {dummy_image_path}")
	except ImportError:
	logger.warning("Pillow (PIL) not installed, cannot create dummy image for CLI test. Ensure packages are installed via venv setup.")
	dummy_image_path = None # Cannot use image test case
	except Exception as e:
	logger.error(f"Failed to create dummy image: {e}")
	dummy_image_path = None


	test_cases = [
	TestCase(id="q1", key="What is the capital of France?", value="Paris", image_path_or_url=None),
	TestCase(id="q2", key="Summarize the plot of the movie 'Inception'.", value="A thief steals information by entering people's dreams.", image_path_or_url=None),
	]
	if dummy_image_path:
	test_cases.append(TestCase(id="img1", key=f"Describe this image.", value="Blue rectangle with yellow text 'Test Img'", image_path_or_url=dummy_image_path))
	else:
	logger.warning("Skipping multimodal test case in CLI as dummy image could not be created.")


	# --- Execution ---
	try:
	# Instantiate the tester
	tester = ModelTester(
	champion_endpoint=champion_model,
	challenger_endpoint=challenger_model,
	judge_endpoint=judge_model,
	model_prompt_template=model_prompt,
	judge_prompt_template=LMJudge.DEFAULT_EVALUATION_PROMPT # Use default judge prompt
	)

	logger.info(f"Running CLI test with {len(test_cases)} test cases...")
	# Run the test
	results_generator = tester.run_test( # Renamed variable for clarity
	test_cases,
	batch_size=2,
	batch_retry_attempts=1,
	batch_backoff_factor=2.0,
	batch_max_wait=30,
	batch_retry_trigger_strings=["rate limit", "error", "timeout"]
	)

	# Consume the generator to get the final result
	final_results_dict = None
	for res in results_generator:
	# Assuming intermediate yields might be progress updates (optional logging)
	# logger.debug(f"Intermediate result/progress: {res}")
	final_results_dict = res # Keep overwriting until the last one

	if final_results_dict is None:
	logger.error("Test run generator did not yield a final result.")
	# Handle error appropriately, maybe raise or return early
	# For now, let's just log and proceed, it might fail later anyway
	final_results_dict = {} # Assign empty dict to avoid immediate crash below

	# --- Output Results ---
	logger.info("Test completed. Final Results:")

	# Use the formatter function
	summary_output = format_summary_output(final_results_dict.get("summary", {})) # Use final_results_dict
	print("\n" + summary_output)


	# Optionally save full results to JSON
	results_filename = f"cli_results_{time.strftime('%Y%m%d-%H%M%S')}.json"
	try:
	# Need to ensure results are serializable (dataclasses might need conversion)
	# The aggregator already converts raw evals to dicts. Summary should be fine.
	with open(results_filename, 'w', encoding='utf-8') as f:
	json.dump(final_results_dict, f, indent=2, ensure_ascii=False) # Use final_results_dict
	logger.info(f"Full results saved to {results_filename}")
	print(f"\nFull results saved to: {results_filename}")
	except TypeError as e:
	logger.error(f"Failed to save results to JSON due to serialization issue: {e}")
	print(f"\nWarning: Could not save full results to JSON: {e}")
	except Exception as e:
	logger.error(f"Failed to save results to JSON: {e}")
	print(f"\nWarning: Could not save full results to JSON: {e}")


	except Exception as e:
	logger.exception("An error occurred during the CLI execution.")
	print(f"\nAn error occurred during CLI execution: {e}")
	finally:
	# Clean up dummy image if created and path exists
	if dummy_image_path and os.path.exists(dummy_image_path):
	try:
	os.remove(dummy_image_path)
	logger.info(f"Removed dummy image file: {dummy_image_path}")
	except Exception as e:
	logger.warning(f"Could not remove dummy image file {dummy_image_path}: {e}")

	# ==============================================================================
	# Main Execution Logic
	# ==============================================================================

	def main():
	"""Main function to parse arguments and run either CLI or UI."""
	# Basic argument parsing: run CLI test by default, or launch UI with --ui flag
	import argparse
	parser = argparse.ArgumentParser(description="Model A/B Testing Tool")
	parser.add_argument("--ui", action="store_true", help="Launch the Gradio web UI instead of running the CLI test.")
	args = parser.parse_args()

	if args.ui:
	logger.info("Launching Gradio UI...")
	iface = create_ui()
	if iface:
	# Add share=True for public link if needed, auth=("user", "pass") for security
	# Add server_name="0.0.0.0" to listen on all interfaces if running in Docker/remote
	iface.launch(share=True)
	else:
	logger.error("Failed to create Gradio UI.")
	print("Error: Could not create the Gradio UI.")
	else:
	# Set up signal handler for CLI stop (Ctrl+C)
	def signal_handler(sig, frame):
	global STOP_REQUESTED
	if not STOP_REQUESTED:
	print("\nCtrl+C detected. Requesting stop after current batch...")
	logger.warning("Stop requested via Ctrl+C.")
	STOP_REQUESTED = True
	else:
	# Allow force exit on second Ctrl+C
	print("\nCtrl+C detected again. Forcing exit.")
	logger.error("Forced exit via second Ctrl+C.")
	sys.exit(1)

	signal.signal(signal.SIGINT, signal_handler)

	# Run the command-line test
	run_cli_test()


	if __name__ == "__main__":
	# Ensure we are running in the correct virtual environment
	# ensure_venv() will handle creation, installation, and re-execution if necessary.
	# If ensure_venv() returns True, it means we are now in the correct venv.
	if ensure_venv():
	# Now that we are confirmed to be in the venv, execute the main logic
	main()
	# If ensure_venv() returned False (or exited), the script either failed or restarted itself.