diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..b87cde4ef37e7285d3c0477b2b76c1909fb790b5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,27 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+
+# Virtual environments
+venv/
+env/
+ENV/
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Build artifacts
+*.egg-info/
+dist/
+build/
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..02e8e6352061c1c5eff24631d8dc314c56e599c8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,44 @@
+
+
+---
+title: Universal Prompt Optimizer
+emoji: 🧬
+colorFrom: blue
+colorTo: cyan
+sdk: gradio
+sdk_version: 4.0.0
+app_file: app.py
+pinned: false
+license: mit
+---
+# Universal Prompt Optimizer
+
+A powerful genetic evolutionary prompt optimization tool built with GEPA (Genetic Evolutionary Prompt Agent). Optimize your prompts using genetic algorithms with optional LLEGO crossover for faster convergence.
+
+## Features
+
+- 🧬 **Genetic Algorithm Optimization**: Evolve prompts through multiple iterations
+- 🎯 **Multi-Model Support**: Works with OpenAI, Anthropic, Google, and custom models
+- 📊 **Real-time Metrics**: Track optimization progress and improvements
+- 🖼️ **Multi-modal Support**: Include images in your training examples
+- ⚡ **LLEGO Crossover**: Advanced genetic operations for faster convergence
+
+## How to Use
+
+1. **Select Model**: Choose your target LLM (GPT-4, Claude, Gemini, or custom)
+2. **Enter Seed Prompt**: Describe your task, constraints, and desired output format
+3. **Add Training Examples**: Provide input/output pairs (images optional)
+4. **Configure Optimization**: Set evolution rounds, batch size, and enable LLEGO
+5. **Start Optimization**: Watch as the genetic algorithm evolves your prompt
+
+## API Keys
+
+API keys are stored in-session only and never logged. You can provide them in the UI or set them as environment variables:
+
+- `OPENAI_API_KEY`
+- `ANTHROPIC_API_KEY`
+- `GOOGLE_API_KEY`
+
+## License
+
+MIT License
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..68f431f56a38e1a4767609a3d601b84667596c92
--- /dev/null
+++ b/app.py
@@ -0,0 +1,1563 @@
+"""
+🚀 Universal Prompt Optimizer - Enhanced Production UI v8.0
+Principal Engineer Edition: Linear/Vercel-style Dark Mode with Premium UX
+"""
+
+import sys
+import os
+# Add src directory to Python path for gepa_optimizer imports
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
+
+import gradio as gr
+import json
+import base64
+import io
+import os
+import logging
+import traceback
+import html
+import numpy as np
+from PIL import Image as PILImage
+from typing import List, Dict, Optional, Any, Tuple
+import threading
+from collections import deque
+
+# Optional import for URL image downloads
+try:
+ import requests
+ REQUESTS_AVAILABLE = True
+except ImportError:
+ REQUESTS_AVAILABLE = False
+
+# ==========================================
+# 0. LOGGING & BACKEND UTILS
+# ==========================================
+logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+# Global Candidates Store (Thread-safe)
+_candidates_store = {
+ 'candidates': deque(maxlen=100),
+ 'lock': threading.Lock(),
+ 'iteration': 0
+}
+
+def add_candidate_to_store(candidate: Dict[str, Any]):
+ with _candidates_store['lock']:
+ _candidates_store['candidates'].append({
+ 'iteration': _candidates_store['iteration'],
+ 'source': candidate.get('source', 'unknown'),
+ 'prompt': candidate.get('prompt', ''),
+ 'timestamp': candidate.get('timestamp', ''),
+ 'index': len(_candidates_store['candidates']) + 1
+ })
+
+def get_candidates_from_store() -> List[Dict[str, Any]]:
+ with _candidates_store['lock']:
+ return list(_candidates_store['candidates'])
+
+def clear_candidates_store():
+ with _candidates_store['lock']:
+ _candidates_store['candidates'].clear()
+ _candidates_store['iteration'] = 0
+
+def increment_iteration():
+ with _candidates_store['lock']:
+ _candidates_store['iteration'] += 1
+
+# ==========================================
+# 1. MOCK BACKEND (Kept as provided)
+# ==========================================
+try:
+ from gepa_optimizer import quick_optimize_sync, OptimizedResult
+ BACKEND_AVAILABLE = True
+except ImportError:
+ BACKEND_AVAILABLE = False
+ from dataclasses import dataclass
+
+ @dataclass
+ class OptimizedResult:
+ optimized_prompt: str
+ improvement_metrics: dict
+ iteration_history: list
+
+ def quick_optimize_sync(seed_prompt, dataset, model, **kwargs):
+ import time
+ iterations = kwargs.get('max_iterations', 5)
+ batch_size = kwargs.get('batch_size', 4)
+ use_llego = kwargs.get('use_llego', True)
+
+ # Simulate processing time based on iterations
+ time.sleep(0.5 * iterations)
+
+ llego_note = "with LLEGO crossover" if use_llego else "standard mutation only"
+
+ return OptimizedResult(
+ optimized_prompt=f"""# OPTIMIZED PROMPT FOR {model}
+# ----------------------------------------
+# Optimization: {iterations} iterations, batch size {batch_size}, {llego_note}
+
+## Task Context
+{seed_prompt}
+
+## Refined Instructions
+1. Analyse the input constraints strictly.
+2. Verify output format against expected schema.
+3. Apply chain-of-thought reasoning before answering.
+4. Cross-reference with provided examples for consistency.
+
+## Safety & Edge Cases
+- If input is ambiguous, ask for clarification.
+- Maintain a professional, neutral tone.
+- Handle edge cases gracefully with informative responses.""",
+ improvement_metrics={
+ "baseline_score": 0.45,
+ "final_score": 0.92,
+ "improvement": "+104.4%",
+ "iterations_run": iterations,
+ "candidates_evaluated": iterations * batch_size,
+ },
+ iteration_history=[
+ f"Iter 1: Baseline evaluation - Score: 0.45",
+ f"Iter 2: Added Chain-of-Thought constraints - Score: 0.62",
+ f"Iter 3: Refined output formatting rules - Score: 0.78",
+ f"Iter 4: {'LLEGO crossover applied' if use_llego else 'Mutation applied'} - Score: 0.88",
+ f"Iter 5: Final refinement - Score: 0.92",
+ ][:iterations],
+ )
+
+# ==========================================
+# 2. HELPER FUNCTIONS
+# ==========================================
+def gradio_image_to_base64(image_input) -> Optional[str]:
+ """Convert Gradio image input to base64 string with comprehensive error handling."""
+ if image_input is None:
+ return None
+
+ try:
+ pil_image = None
+
+ if isinstance(image_input, np.ndarray):
+ try:
+ # Validate array shape and dtype
+ if image_input.size == 0:
+ logger.warning("Empty image array provided")
+ return None
+ pil_image = PILImage.fromarray(image_input)
+ except (ValueError, TypeError) as e:
+ logger.error(f"Failed to convert numpy array to PIL Image: {str(e)}")
+ return None
+ elif isinstance(image_input, PILImage.Image):
+ pil_image = image_input
+ elif isinstance(image_input, str):
+ if not os.path.exists(image_input):
+ logger.warning(f"Image file not found: {image_input}")
+ return None
+ try:
+ pil_image = PILImage.open(image_input)
+ except (IOError, OSError) as e:
+ logger.error(f"Failed to open image file: {str(e)}")
+ return None
+ else:
+ logger.warning(f"Unsupported image input type: {type(image_input)}")
+ return None
+
+ if pil_image is None:
+ return None
+
+ try:
+ # Validate image before encoding
+ pil_image.verify()
+ # Reopen after verify (verify closes the image)
+ pil_image = PILImage.open(io.BytesIO(pil_image.tobytes()))
+ except Exception:
+ # If verify fails, try to proceed anyway
+ pass
+
+ try:
+ buffered = io.BytesIO()
+ pil_image.save(buffered, format="PNG")
+ img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+ return f"data:image/png;base64,{img_str}"
+ except (IOError, OSError, ValueError) as e:
+ logger.error(f"Failed to encode image to base64: {str(e)}")
+ return None
+ except Exception as e:
+ logger.error(f"Unexpected error in image conversion: {str(e)}\n{traceback.format_exc()}")
+ return None
+
+def validate_dataset(dataset: List[Dict]) -> Tuple[bool, str]:
+ """Validate dataset structure and content with detailed error messages."""
+ if not isinstance(dataset, list):
+ return False, "Dataset must be a list of examples."
+
+ if len(dataset) == 0:
+ return False, "Dataset is empty. Add at least one example."
+
+ # Validate each item in the dataset
+ for i, item in enumerate(dataset):
+ if not isinstance(item, dict):
+ return False, f"Dataset item {i+1} must be a dictionary with 'input' and 'output' keys."
+
+ if "input" not in item or "output" not in item:
+ return False, f"Dataset item {i+1} is missing required 'input' or 'output' field."
+
+ if not isinstance(item.get("input"), str) or not isinstance(item.get("output"), str):
+ return False, f"Dataset item {i+1} has invalid 'input' or 'output' type (must be strings)."
+
+ if not item.get("input", "").strip() or not item.get("output", "").strip():
+ return False, f"Dataset item {i+1} has empty 'input' or 'output' field."
+
+ return True, ""
+
+def validate_model(model: str, custom_model: str) -> Tuple[bool, str]:
+ """Validate model selection and custom model format."""
+ if not model:
+ return False, "Please select a foundation model."
+
+ if model == "custom":
+ if not custom_model or not custom_model.strip():
+ return False, "Custom model selected but no model ID provided."
+
+ # Validate custom model format (provider/model_name)
+ parts = custom_model.strip().split("/")
+ if len(parts) != 2:
+ return False, "Custom model ID must be in format 'provider/model_name' (e.g., 'openai/gpt-4')."
+
+ if not parts[0].strip() or not parts[1].strip():
+ return False, "Custom model ID provider and model name cannot be empty."
+
+ return True, ""
+
+def validate_api_keys(model: str, api_keys: Dict[str, str]) -> Tuple[bool, str]:
+ """Validate that required API keys are provided for the selected model."""
+ if not api_keys:
+ return True, "" # Keys are optional if already set in environment
+
+ model_provider = model.split("/")[0] if "/" in model else model.lower()
+
+ # Check if model requires a specific provider key
+ required_providers = {
+ "openai": "openai",
+ "anthropic": "anthropic",
+ "google": "google"
+ }
+
+ if model_provider in required_providers:
+ provider = required_providers[model_provider]
+ key_value = api_keys.get(provider, "").strip() if api_keys.get(provider) else ""
+
+ # Check environment variable as fallback
+ env_vars = {
+ "openai": "OPENAI_API_KEY",
+ "anthropic": "ANTHROPIC_API_KEY",
+ "google": "GOOGLE_API_KEY"
+ }
+
+ if not key_value and not os.environ.get(env_vars.get(provider, "")):
+ return False, f"API key for {provider.capitalize()} is required for model '{model}' but not provided."
+
+ return True, ""
+
+def safe_optimize(seed_prompt, dataset, model, custom_model="", max_iterations=5, max_metric_calls=50, batch_size=4, use_llego=True, api_keys=None):
+ """Safely run optimization with comprehensive error handling."""
+ try:
+ # Validate seed prompt
+ if not seed_prompt or not isinstance(seed_prompt, str):
+ return False, "Seed prompt is required and must be a string.", None
+
+ if not seed_prompt.strip():
+ return False, "Seed prompt cannot be empty.", None
+
+ # Validate dataset
+ is_valid, msg = validate_dataset(dataset)
+ if not is_valid:
+ return False, msg, None
+
+ # Determine final model
+ final_model = custom_model.strip() if custom_model and custom_model.strip() else model
+
+ # Validate model
+ model_valid, model_msg = validate_model(model, custom_model)
+ if not model_valid:
+ return False, model_msg, None
+
+ # Validate API keys
+ api_valid, api_msg = validate_api_keys(final_model, api_keys or {})
+ if not api_valid:
+ return False, api_msg, None
+
+ # Validate optimization parameters
+ if not isinstance(max_iterations, int) or max_iterations < 1 or max_iterations > 50:
+ return False, "Max iterations must be between 1 and 50.", None
+
+ if not isinstance(max_metric_calls, int) or max_metric_calls < 10 or max_metric_calls > 500:
+ return False, "Max metric calls must be between 10 and 500.", None
+
+ if not isinstance(batch_size, int) or batch_size < 1 or batch_size > 20:
+ return False, "Batch size must be between 1 and 20.", None
+
+ # Check backend availability
+ if not BACKEND_AVAILABLE:
+ logger.warning("Backend not available, using mock optimizer")
+
+ # Set API keys from UI if provided
+ if api_keys:
+ try:
+ key_mapping = {
+ "openai": "OPENAI_API_KEY",
+ "google": "GOOGLE_API_KEY",
+ "anthropic": "ANTHROPIC_API_KEY",
+ }
+ for provider, env_var in key_mapping.items():
+ if api_keys.get(provider) and api_keys[provider].strip():
+ os.environ[env_var] = api_keys[provider].strip()
+ logger.info(f"Set {provider} API key from UI")
+ except Exception as e:
+ logger.error(f"Failed to set API keys: {str(e)}")
+ return False, f"Failed to configure API keys: {str(e)}", None
+
+ # Run optimization
+ try:
+ result = quick_optimize_sync(
+ seed_prompt=seed_prompt,
+ dataset=dataset,
+ model=final_model,
+ max_iterations=max_iterations,
+ max_metric_calls=max_metric_calls,
+ batch_size=batch_size,
+ use_llego=use_llego,
+ verbose=True,
+ )
+
+ # Validate result structure
+ if not result:
+ return False, "Optimization returned no result.", None
+
+ if not hasattr(result, 'optimized_prompt'):
+ return False, "Optimization result is missing required fields.", None
+
+ return True, "Success", result
+
+ except KeyboardInterrupt:
+ logger.warning("Optimization interrupted by user")
+ return False, "Optimization was interrupted.", None
+ except TimeoutError:
+ logger.error("Optimization timed out")
+ return False, "Optimization timed out. Try reducing max_iterations or max_metric_calls.", None
+ except ConnectionError as e:
+ logger.error(f"Connection error during optimization: {str(e)}")
+ return False, f"Connection error: {str(e)}. Check your internet connection and API keys.", None
+ except ValueError as e:
+ logger.error(f"Invalid parameter in optimization: {str(e)}")
+ return False, f"Invalid configuration: {str(e)}", None
+ except Exception as e:
+ error_msg = str(e)
+ logger.error(f"Optimization failed: {error_msg}\n{traceback.format_exc()}")
+ # Provide user-friendly error messages
+ if "api" in error_msg.lower() or "key" in error_msg.lower():
+ return False, f"API error: {error_msg}. Please check your API keys.", None
+ elif "rate limit" in error_msg.lower():
+ return False, "Rate limit exceeded. Please wait a moment and try again.", None
+ elif "quota" in error_msg.lower():
+ return False, "API quota exceeded. Please check your account limits.", None
+ else:
+ return False, f"Optimization failed: {error_msg}", None
+
+ except Exception as e:
+ logger.error(f"Unexpected error in safe_optimize: {str(e)}\n{traceback.format_exc()}")
+ return False, f"Unexpected error: {str(e)}", None
+
+# ==========================================
+# 3. UI LOGIC
+# ==========================================
+def add_example(input_text, output_text, image_input, current_dataset):
+ """Add an example to the dataset with comprehensive error handling."""
+ try:
+ # Validate inputs
+ if not input_text:
+ raise gr.Error("Input text is required.")
+
+ if not output_text:
+ raise gr.Error("Output text is required.")
+
+ if not isinstance(input_text, str) or not isinstance(output_text, str):
+ raise gr.Error("Input and Output must be text strings.")
+
+ input_text = input_text.strip()
+ output_text = output_text.strip()
+
+ if not input_text:
+ raise gr.Error("Input text cannot be empty.")
+
+ if not output_text:
+ raise gr.Error("Output text cannot be empty.")
+
+ # Validate dataset state
+ if not isinstance(current_dataset, list):
+ raise gr.Error("Dataset state is invalid. Please refresh the page.")
+
+ # Process image with error handling
+ img_b64 = None
+ try:
+ img_b64 = gradio_image_to_base64(image_input)
+ except Exception as e:
+ logger.warning(f"Image processing failed, continuing without image: {str(e)}")
+ # Continue without image - it's optional
+
+ # Create new item
+ try:
+ new_item = {
+ "input": input_text,
+ "output": output_text,
+ "image": img_b64,
+                "image_preview": "🖼️ Image" if img_b64 else "-"
+ }
+
+ # Validate item structure
+ if not isinstance(new_item["input"], str) or not isinstance(new_item["output"], str):
+ raise gr.Error("Failed to create dataset item: invalid data types.")
+
+ current_dataset.append(new_item)
+
+ return current_dataset, "", "", None
+
+ except Exception as e:
+ logger.error(f"Failed to add example to dataset: {str(e)}")
+ raise gr.Error(f"Failed to add example: {str(e)}")
+
+ except gr.Error:
+ # Re-raise Gradio errors as-is
+ raise
+ except Exception as e:
+ logger.error(f"Unexpected error in add_example: {str(e)}\n{traceback.format_exc()}")
+ raise gr.Error(f"Unexpected error: {str(e)}")
+
+def update_table(dataset):
+ """Update the dataset table display with error handling."""
+ try:
+ if not dataset:
+ return []
+
+ if not isinstance(dataset, list):
+ logger.error(f"Invalid dataset type: {type(dataset)}")
+ return []
+
+ table_data = []
+ for i, item in enumerate(dataset):
+ try:
+ if not isinstance(item, dict):
+ logger.warning(f"Skipping invalid dataset item {i+1}: not a dictionary")
+ continue
+
+ input_text = str(item.get("input", ""))[:50] if item.get("input") else ""
+ output_text = str(item.get("output", ""))[:50] if item.get("output") else ""
+ image_preview = str(item.get("image_preview", "-"))
+
+ table_data.append([i+1, input_text, output_text, image_preview])
+ except Exception as e:
+ logger.warning(f"Error processing dataset item {i+1}: {str(e)}")
+ continue
+
+ return table_data
+
+ except Exception as e:
+ logger.error(f"Error updating table: {str(e)}\n{traceback.format_exc()}")
+ return []
+
+def clear_dataset():
+ """Clear the dataset with error handling."""
+ try:
+ return [], []
+ except Exception as e:
+ logger.error(f"Error clearing dataset: {str(e)}")
+ return [], []
+
+def get_candidates_display():
+ """Generate HTML display for candidates with error handling."""
+ try:
+ candidates = get_candidates_from_store()
+
+ if not candidates:
+            return "🧬 Waiting for optimization to start..."
+
+ if not isinstance(candidates, list):
+ logger.error(f"Invalid candidates type: {type(candidates)}")
+            return "Error loading candidates."
+
+ html_output = ""
+
+ # Show last 10 candidates
+ candidates_to_show = list(candidates)[-10:]
+ for c in reversed(candidates_to_show):
+ try:
+ if not isinstance(c, dict):
+ continue
+
+ iteration = str(c.get('iteration', '?'))
+ source = str(c.get('source', 'unknown')).upper()
+ prompt = str(c.get('prompt', ''))[:200]
+
+ # Escape HTML to prevent XSS
+ iteration = html.escape(iteration)
+ source = html.escape(source)
+ prompt = html.escape(prompt)
+
+ html_output += f"""
+
+
+
+ ITERATION {iteration}
+ {source}
+
+
+                    {prompt}...
+
+ """
+ except Exception as e:
+ logger.warning(f"Error rendering candidate: {str(e)}")
+ continue
+
+        html_output += ""
+ return html_output
+
+ except Exception as e:
+ logger.error(f"Error generating candidates display: {str(e)}\n{traceback.format_exc()}")
+        return "Error loading candidates display."
+
+def run_optimization_flow(seed, dataset, model, custom_model, iter_count, call_count, batch, llego, k_openai, k_google, k_anthropic, progress=gr.Progress()):
+ """Run the optimization flow with comprehensive error handling."""
+ import time
+
+ try:
+ # Validate inputs
+ if not seed:
+ raise gr.Error("Seed prompt is required.")
+
+ if not dataset:
+ raise gr.Error("Dataset is required. Add at least one example.")
+
+ if not model:
+ raise gr.Error("Model selection is required.")
+
+ # Validate numeric parameters
+ try:
+ iter_count = int(iter_count) if iter_count else 5
+ call_count = int(call_count) if call_count else 50
+ batch = int(batch) if batch else 4
+ except (ValueError, TypeError) as e:
+ raise gr.Error(f"Invalid optimization parameters: {str(e)}")
+
+ # Determine final model
+ try:
+ final_model = custom_model.strip() if custom_model and custom_model.strip() else model
+ except Exception as e:
+ logger.warning(f"Error processing custom model: {str(e)}")
+ final_model = model
+
+ # Clear candidates store
+ try:
+ clear_candidates_store()
+ except Exception as e:
+ logger.warning(f"Error clearing candidates store: {str(e)}")
+
+ # Prepare API keys
+ api_keys = {}
+ try:
+ api_keys = {
+ "openai": k_openai if k_openai else "",
+ "google": k_google if k_google else "",
+ "anthropic": k_anthropic if k_anthropic else ""
+ }
+ except Exception as e:
+ logger.warning(f"Error processing API keys: {str(e)}")
+
+ # Initial state
+ try:
+ yield (
+ gr.update(visible=True),
+ gr.update(visible=False),
+ gr.update(visible=False),
+                "🚀 Initializing Genetic Algorithm...",
+ "", {}, "", ""
+ )
+ time.sleep(0.5) # Brief pause for UI update
+ except Exception as e:
+ logger.error(f"Error in initial UI update: {str(e)}")
+ raise gr.Error(f"Failed to initialize UI: {str(e)}")
+
+ # Evolution loop (visual progress - actual work happens in safe_optimize)
+ try:
+ for i in range(1, iter_count + 1):
+ try:
+ increment_iteration()
+ add_candidate_to_store({
+ "source": "evolution_step",
+ "prompt": f"Candidate {i}: Optimizing instruction clarity and task alignment...",
+ "timestamp": "now"
+ })
+
+ progress(i/iter_count, desc=f"Evolution Round {i}/{iter_count}")
+ yield (
+ gr.update(), gr.update(), gr.update(),
+                        f"🧬 **Evolution Round {i}/{iter_count}**\n\n• Generating {batch} prompt mutations\n• Evaluating fitness scores\n• Selecting top candidates",
+ "", {}, "", get_candidates_display()
+ )
+ time.sleep(0.3) # Pause to show progress
+ except Exception as e:
+ logger.warning(f"Error in evolution step {i}: {str(e)}")
+ # Continue with next iteration
+ continue
+ except Exception as e:
+ logger.error(f"Error in evolution loop: {str(e)}")
+ # Continue to optimization attempt
+
+ # Final optimization
+ try:
+ success, msg, result = safe_optimize(
+ seed_prompt=seed,
+ dataset=dataset,
+ model=model,
+ custom_model=custom_model,
+ max_iterations=iter_count,
+ max_metric_calls=call_count,
+ batch_size=batch,
+ use_llego=llego,
+ api_keys=api_keys
+ )
+
+ if not success:
+ # Show error state
+ yield (
+ gr.update(visible=True),
+ gr.update(visible=False),
+ gr.update(visible=False),
+                        f"❌ **Optimization Failed**\n\n{msg}",
+ "", {}, "", get_candidates_display()
+ )
+ raise gr.Error(msg)
+
+ # Validate result before displaying
+ if not result:
+ raise gr.Error("Optimization completed but returned no result.")
+
+ if not hasattr(result, 'optimized_prompt'):
+ raise gr.Error("Optimization result is missing required fields.")
+
+ # Show results
+ try:
+ optimized_prompt = result.optimized_prompt if result.optimized_prompt else ""
+ improvement_metrics = result.improvement_metrics if hasattr(result, 'improvement_metrics') else {}
+ iteration_history = result.iteration_history if hasattr(result, 'iteration_history') else []
+
+ history_text = "\n".join(iteration_history) if isinstance(iteration_history, list) else str(iteration_history)
+
+ yield (
+ gr.update(visible=False),
+ gr.update(visible=False),
+ gr.update(visible=True),
+                        "✅ Optimization Complete",
+ optimized_prompt,
+ improvement_metrics,
+ history_text,
+ get_candidates_display()
+ )
+ except Exception as e:
+ logger.error(f"Error displaying results: {str(e)}")
+ raise gr.Error(f"Failed to display results: {str(e)}")
+
+ except gr.Error:
+ # Re-raise Gradio errors
+ raise
+ except Exception as e:
+ logger.error(f"Error in optimization: {str(e)}\n{traceback.format_exc()}")
+ raise gr.Error(f"Optimization error: {str(e)}")
+
+ except gr.Error:
+ # Re-raise Gradio errors as-is
+ raise
+ except KeyboardInterrupt:
+ logger.warning("Optimization interrupted by user")
+ raise gr.Error("Optimization was interrupted.")
+ except Exception as e:
+ logger.error(f"Unexpected error in optimization flow: {str(e)}\n{traceback.format_exc()}")
+ raise gr.Error(f"Unexpected error: {str(e)}")
+
+# ==========================================
+# 4. ENHANCED CSS (Linear/Vercel-style)
+# ==========================================
+CUSTOM_CSS = """
+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&family=JetBrains+Mono:wght@400;500;600&display=swap');
+
+:root {
+ --bg0: #070A0F;
+ --bg1: #0B1020;
+ --bg2: rgba(255,255,255,0.04);
+ --bg3: rgba(255,255,255,0.06);
+
+ --stroke0: rgba(148,163,184,0.14);
+ --stroke1: rgba(148,163,184,0.22);
+
+ --text0: #EAF0FF;
+ --text1: rgba(234,240,255,0.74);
+ --text2: rgba(234,240,255,0.56);
+
+ --teal: #06B6D4;
+ --blue: #3B82F6;
+
+ --ok: #10B981;
+ --okGlow: rgba(16,185,129,0.18);
+
+ --bad: #EF4444;
+
+ --shadow: 0 12px 40px rgba(0,0,0,0.45);
+ --shadowSoft: 0 10px 24px rgba(0,0,0,0.32);
+
+ --radius: 14px;
+ --radiusSm: 10px;
+}
+
+html, body {
+ background: radial-gradient(1200px 700px at 20% -10%, rgba(6,182,212,0.13), transparent 55%),
+ radial-gradient(1000px 650px at 90% 0%, rgba(59,130,246,0.10), transparent 60%),
+ linear-gradient(180deg, var(--bg0) 0%, var(--bg1) 100%);
+ color: var(--text0);
+ font-family: Inter, system-ui, -apple-system, Segoe UI, Roboto, sans-serif;
+}
+
+.gradio-container {
+ max-width: 1520px !important;
+ padding: 12px 18px !important;
+ margin: 0 auto !important;
+}
+
+/* --- App shell --- */
+.app-shell { min-height: auto !important; }
+.topbar {
+ padding: 12px 14px 12px 14px;
+ margin-bottom: 4px;
+ border: 1px solid var(--stroke0);
+ border-radius: var(--radius);
+ background: linear-gradient(180deg, rgba(255,255,255,0.04) 0%, rgba(255,255,255,0.02) 100%);
+ box-shadow: var(--shadowSoft);
+}
+.topbar-wrap { margin-bottom: 0 !important; }
+
+.brand-row { display: flex; align-items: center; justify-content: space-between; gap: 16px; }
+.brand-left { display: flex; align-items: center; gap: 14px; }
+.brand-mark {
+ width: 44px; height: 44px; border-radius: 12px;
+ background: linear-gradient(135deg, rgba(6,182,212,0.26), rgba(59,130,246,0.20));
+ border: 1px solid rgba(6,182,212,0.30);
+ box-shadow: 0 0 0 4px rgba(6,182,212,0.10);
+ display: flex; align-items: center; justify-content: center;
+ font-weight: 800;
+}
+.h1 {
+ font-size: 22px; font-weight: 800; letter-spacing: -0.02em;
+ margin: 0; line-height: 1.2;
+}
+.subtitle { margin-top: 4px; color: var(--text1); font-weight: 500; font-size: 13px; }
+
+.status-pill {
+ display: inline-flex; align-items: center; gap: 10px;
+ padding: 10px 12px; border-radius: 999px;
+ background: rgba(255,255,255,0.03);
+ border: 1px solid var(--stroke0);
+ color: var(--text1);
+ font-size: 12px; font-weight: 700; letter-spacing: 0.08em;
+ text-transform: uppercase;
+}
+.dot {
+ width: 10px; height: 10px; border-radius: 999px;
+ background: var(--ok);
+ box-shadow: 0 0 16px rgba(16,185,129,0.40);
+ animation: pulse 1.8s ease-in-out infinite;
+}
+@keyframes pulse { 0%, 100% { transform: scale(1); opacity: 0.95; } 50% { transform: scale(1.18); opacity: 0.70; } }
+
+/* --- Two-column layout helpers --- */
+.left-col, .right-col { min-width: 280px; }
+
+/* --- Cards / Sections --- */
+.card {
+ border-radius: var(--radius);
+ background: linear-gradient(180deg, rgba(255,255,255,0.045) 0%, rgba(255,255,255,0.022) 100%);
+ border: 1px solid var(--stroke0);
+ box-shadow: var(--shadowSoft);
+ padding: 16px;
+}
+.card + .card { margin-top: 14px; }
+
+.card-head {
+ display: flex; align-items: center; justify-content: space-between;
+ gap: 12px;
+ padding-bottom: 12px;
+ margin-bottom: 12px;
+ border-bottom: 1px solid var(--stroke0);
+}
+.card-title {
+ display: flex; align-items: center; gap: 10px;
+ font-size: 13px; font-weight: 800; letter-spacing: 0.12em;
+ text-transform: uppercase; color: var(--text1);
+}
+.step {
+ width: 30px; height: 30px; border-radius: 10px;
+ background: linear-gradient(135deg, rgba(6,182,212,0.95), rgba(59,130,246,0.95));
+ box-shadow: 0 10px 20px rgba(6,182,212,0.18);
+ display: flex; align-items: center; justify-content: center;
+ color: white; font-weight: 900; font-size: 13px;
+}
+.hint { color: var(--text2); font-size: 12px; line-height: 1.4; }
+
+.ds-count span {
+ display: inline-flex;
+ align-items: center;
+ padding: 7px 10px;
+ border-radius: 999px;
+ border: 1px solid var(--stroke0);
+ background: rgba(255,255,255,0.02);
+ color: var(--text1) !important;
+ font-weight: 700;
+ font-size: 12px;
+}
+
+/* --- Inputs --- */
+label { color: var(--text1) !important; font-weight: 650 !important; font-size: 12px !important; }
+
+textarea, input, select {
+ background: rgba(255,255,255,0.03) !important;
+ border: 1px solid var(--stroke0) !important;
+ border-radius: 12px !important;
+ color: var(--text0) !important;
+ transition: border-color 0.15s ease, box-shadow 0.15s ease, transform 0.15s ease;
+}
+
+textarea:focus, input:focus, select:focus {
+ outline: none !important;
+ border-color: rgba(6,182,212,0.55) !important;
+ box-shadow: 0 0 0 4px rgba(6,182,212,0.14) !important;
+}
+
+.keybox input { font-family: "JetBrains Mono", ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace !important; }
+
+.seed textarea { min-height: 160px !important; }
+.mono textarea { font-family: "JetBrains Mono", ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace !important; font-size: 12.5px !important; }
+
+/* --- Buttons --- */
+.cta button {
+ width: 100% !important;
+ border: 0 !important;
+ border-radius: 14px !important;
+ padding: 14px 16px !important;
+ font-size: 13px !important;
+ font-weight: 900 !important;
+ letter-spacing: 0.12em !important;
+ text-transform: uppercase !important;
+ color: white !important;
+ background: linear-gradient(135deg, rgba(6,182,212,1) 0%, rgba(59,130,246,1) 100%) !important;
+ box-shadow: 0 18px 48px rgba(6,182,212,0.22) !important;
+ position: relative !important;
+ overflow: hidden !important;
+}
+.cta button::after {
+ content: "";
+ position: absolute; inset: -120px;
+ background: radial-gradient(closest-side, rgba(255,255,255,0.18), transparent 60%);
+ transform: translateX(-40%);
+ transition: transform 0.45s ease;
+}
+.cta button:hover { transform: translateY(-1px); }
+.cta button:hover::after { transform: translateX(40%); }
+.cta button:active { transform: translateY(0px); }
+
+.btn-secondary button {
+ border-radius: 12px !important;
+ border: 1px solid var(--stroke1) !important;
+ background: rgba(255,255,255,0.03) !important;
+ color: var(--text0) !important;
+ font-weight: 800 !important;
+}
+.btn-secondary button:hover { border-color: rgba(6,182,212,0.55) !important; }
+
+.btn-danger button {
+ border-radius: 12px !important;
+ border: 1px solid rgba(239,68,68,0.55) !important;
+ background: rgba(239,68,68,0.06) !important;
+ color: rgba(255,170,170,1) !important;
+ font-weight: 900 !important;
+}
+
+/* --- Dataframe --- */
+.dataframe {
+ border-radius: 14px !important;
+ border: 1px solid var(--stroke0) !important;
+ background: rgba(255,255,255,0.02) !important;
+ overflow: hidden !important;
+}
+.dataframe thead th {
+ background: rgba(255,255,255,0.04) !important;
+ color: var(--text1) !important;
+ font-weight: 900 !important;
+ font-size: 11px !important;
+ letter-spacing: 0.10em !important;
+ text-transform: uppercase !important;
+ border-bottom: 1px solid var(--stroke0) !important;
+}
+.dataframe tbody td {
+ color: var(--text0) !important;
+ font-size: 12px !important;
+ border-bottom: 1px solid rgba(148,163,184,0.10) !important;
+}
+.dataframe tbody tr:hover { background: rgba(255,255,255,0.03) !important; }
+
+/* --- Status / Results --- */
+.panel {
+ border-radius: var(--radius);
+ border: 1px solid var(--stroke0);
+ background: linear-gradient(180deg, rgba(255,255,255,0.045), rgba(255,255,255,0.020));
+ box-shadow: var(--shadowSoft);
+ padding: 16px;
+}
+.panel-title {
+ display: flex; align-items: center; justify-content: space-between;
+ gap: 10px;
+ padding-bottom: 12px; margin-bottom: 12px;
+ border-bottom: 1px solid var(--stroke0);
+}
+.panel-title h3 { margin: 0; font-size: 13px; letter-spacing: 0.12em; text-transform: uppercase; color: var(--text1); }
+.running-pill {
+ display: inline-flex; align-items: center; gap: 10px;
+ padding: 8px 10px; border-radius: 999px;
+ border: 1px solid rgba(6,182,212,0.38);
+ background: rgba(6,182,212,0.08);
+ color: rgba(153,246,228,0.95);
+ font-weight: 900; font-size: 11px; letter-spacing: 0.10em; text-transform: uppercase;
+}
+.running-dot { width: 9px; height: 9px; border-radius: 99px; background: var(--teal); box-shadow: 0 0 18px rgba(6,182,212,0.45); animation: pulse 1.8s ease-in-out infinite; }
+
+.empty {
+ border-radius: var(--radius);
+ border: 1px dashed rgba(148,163,184,0.26);
+ background: rgba(255,255,255,0.02);
+ padding: 28px;
+ text-align: center;
+ color: var(--text2);
+}
+.empty .big { font-size: 40px; opacity: 0.22; margin-bottom: 10px; }
+.empty .t { color: var(--text1); font-weight: 800; margin-bottom: 6px; }
+.empty .s { font-size: 12px; }
+
+.results {
+ border-radius: var(--radius);
+ border: 1px solid rgba(16,185,129,0.55);
+ background: linear-gradient(180deg, rgba(16,185,129,0.12), rgba(255,255,255,0.02));
+ box-shadow: 0 0 0 4px rgba(16,185,129,0.10), 0 20px 60px rgba(0,0,0,0.42);
+ padding: 16px;
+}
+.results-banner {
+ display: flex; align-items: center; justify-content: space-between;
+ gap: 12px;
+ padding-bottom: 12px; margin-bottom: 12px;
+ border-bottom: 1px solid rgba(16,185,129,0.28);
+}
+.results-banner .k { display: flex; align-items: center; gap: 10px; }
+.results-banner .k .icon {
+ width: 36px; height: 36px; border-radius: 12px;
+ background: rgba(16,185,129,0.18);
+ border: 1px solid rgba(16,185,129,0.45);
+ display: flex; align-items: center; justify-content: center;
+}
+.results-banner .k .title { font-weight: 900; color: rgba(189,255,225,0.98); letter-spacing: 0.06em; text-transform: uppercase; font-size: 12px; }
+.results-banner .k .sub { margin-top: 2px; color: rgba(189,255,225,0.70); font-size: 12px; }
+
+.tabs { background: transparent !important; }
+.tab-nav button {
+ background: transparent !important;
+ border: 0 !important;
+ border-bottom: 2px solid transparent !important;
+ color: var(--text2) !important;
+ font-weight: 800 !important;
+ padding: 10px 12px !important;
+}
+.tab-nav button[aria-selected="true"] {
+ color: rgba(153,246,228,0.98) !important;
+ border-bottom-color: rgba(6,182,212,0.75) !important;
+}
+.tab-nav button:hover { color: var(--text0) !important; }
+
+.small-note { color: var(--text2); font-size: 12px; }
+
+/* --- Candidates stream --- */
+.cand-empty { padding: 28px; text-align: center; color: var(--text2); }
+.cand-empty-icon { font-size: 40px; opacity: 0.25; margin-bottom: 10px; }
+.cand-empty-title { color: var(--text1); font-weight: 900; margin-bottom: 4px; }
+.cand-empty-sub { font-size: 12px; }
+
+.cand-stream { display: flex; flex-direction: column; gap: 10px; }
+.cand-card {
+ border-radius: 14px;
+ border: 1px solid rgba(148,163,184,0.18);
+ background: linear-gradient(135deg, rgba(15,23,42,0.85), rgba(2,6,23,0.45));
+ overflow: hidden;
+}
+.cand-topbar { height: 2px; background: linear-gradient(90deg, var(--teal), var(--blue)); }
+.cand-header {
+ display: flex; align-items: center; justify-content: space-between;
+ gap: 10px;
+ padding: 10px 12px 0 12px;
+}
+.cand-iter { font-family: "JetBrains Mono", ui-monospace; font-size: 11px; color: rgba(153,246,228,0.92); font-weight: 800; letter-spacing: 0.08em; }
+.cand-pill {
+ font-size: 10px; font-weight: 900; letter-spacing: 0.10em;
+ padding: 5px 8px; border-radius: 999px;
+ border: 1px solid rgba(148,163,184,0.20);
+ background: rgba(255,255,255,0.03);
+ color: var(--text2);
+}
+.cand-body {
+ padding: 10px 12px 12px 12px;
+ font-family: "JetBrains Mono", ui-monospace;
+ font-size: 12px;
+ line-height: 1.6;
+ color: rgba(234,240,255,0.75);
+}
+
+/* --- Responsive --- */
+@media (max-width: 980px) {
+ .gradio-container { padding: 16px 12px !important; }
+ .brand-row { flex-direction: column; align-items: flex-start; }
+ .status-pill { align-self: stretch; justify-content: center; }
+}
+"""
+
+# JS snippet Gradio runs on page load: if the URL lacks ?__theme=dark it is
+# appended and the page reloads once, so the app always renders in dark mode.
+# Wrapped in try/catch so a restricted location API degrades to a no-op.
+FORCE_DARK_JS = """
+function forceDarkTheme() {
+  try {
+    const url = new URL(window.location.href);
+    if (url.searchParams.get("__theme") !== "dark") {
+      url.searchParams.set("__theme", "dark");
+      window.location.replace(url.toString());
+    }
+  } catch (e) {
+    // no-op
+  }
+}
+forceDarkTheme();
+"""
+
+# ==========================================
+# 5. UI CONSTRUCTION (Redesigned)
+# ==========================================
+APP_TITLE = "Universal Prompt Optimizer"
+APP_SUBTITLE = "Genetic Evolutionary Prompt Agent (GEPA)"
+STATUS_READY = "System Ready"
+
+with gr.Blocks(
+ title="Universal Prompt Optimizer",
+ theme=gr.themes.Base()
+) as app:
+ dataset_state = gr.State([])
+
+ # TOP BAR
+ gr.HTML(
+ f"""
+
+
+
+
GE
+
+
{APP_TITLE}
+
{APP_SUBTITLE}
+
+
+
{STATUS_READY}
+
+
+ """,
+ elem_classes=["topbar-wrap"]
+ )
+
+ # MAIN LAYOUT
+ with gr.Row():
+
+ # LEFT COLUMN: Configuration
+ with gr.Column(scale=5):
+
+ # Step 1
+ with gr.Group(elem_classes=["card"]):
+ gr.HTML(
+ """
+
+
+
Select a target model, then provide keys (stored in-session only).
+
+ """
+ )
+
+ with gr.Row():
+ model_select = gr.Dropdown(
+ label="Foundation Model",
+ choices=[
+ "openai/gpt-4o",
+ "openai/gpt-4-turbo",
+ "anthropic/claude-3-5-sonnet",
+ "google/gemini-1.5-pro",
+ "custom"
+ ],
+ value="openai/gpt-4o",
+ scale=2
+ )
+ custom_model_input = gr.Textbox(
+ label="Custom Model ID",
+ placeholder="provider/model_name",
+ scale=1
+ )
+
+ gr.HTML('API Access Keys
')
+ gr.Markdown("*Keys are stored in-session only and never logged*", elem_classes=["text-xs"])
+
+ with gr.Row():
+ key_openai = gr.Textbox(
+ label="OpenAI API Key",
+ type="password",
+ placeholder="sk-...",
+ scale=1
+ )
+ key_google = gr.Textbox(
+ label="Google API Key",
+ type="password",
+ placeholder="AIza...",
+ scale=1
+ )
+ key_anthropic = gr.Textbox(
+ label="Anthropic API Key",
+ type="password",
+ placeholder="sk-ant...",
+ scale=1
+ )
+
+ # Step 2
+ with gr.Group(elem_classes=["card"]):
+ gr.HTML(
+ """
+
+
+
Describe the task, constraints, output format, and tone.
+
+ """
+ )
+ seed_input = gr.Textbox(
+ label="Task Description",
+ placeholder="Example: You are a code reviewer that identifies security vulnerabilities in Python code. Return a JSON report with severity and fixes...",
+ lines=7,
+ max_lines=14,
+ elem_classes=["seed", "mono"]
+ )
+
+ # Step 3
+ with gr.Group(elem_classes=["card"]):
+ gr.HTML(
+ """
+
+
+
Add a few high-quality I/O pairs (images optional) to shape the optimizer.
+
+ """
+ )
+
+ with gr.Tabs():
+ with gr.Tab("Manual Entry"):
+ with gr.Row():
+ with gr.Column(scale=2):
+ d_in = gr.Textbox(
+ label="Input / User Prompt",
+ placeholder="Example user input...",
+ lines=3
+ )
+ d_out = gr.Textbox(
+ label="Ideal Output",
+ placeholder="Expected AI response...",
+ lines=3
+ )
+ with gr.Column(scale=1):
+ d_img = gr.Image(
+ label="Attach Image (Optional)",
+ type="numpy",
+ height=170
+ )
+
+ btn_add = gr.Button(
+ "Add Example",
+ elem_classes=["btn-secondary"]
+ )
+
+ with gr.Tab("Bulk Import (JSON)"):
+ gr.Markdown(
+ "Paste a JSON array like: `[{\"input\": \"...\", \"output\": \"...\"}]`",
+ elem_classes=["small-note"]
+ )
+ bulk_json = gr.Textbox(
+ show_label=False,
+ placeholder='[{"input": "...", "output": "..."}]',
+ lines=6
+ )
+ btn_import = gr.Button(
+ "Import JSON",
+ elem_classes=["btn-secondary"]
+ )
+
+ with gr.Row():
+ gr.HTML("Current dataset
")
+ ds_count = gr.HTML(
+ "0 examples loaded",
+ elem_classes=["ds-count"]
+ )
+
+ ds_table = gr.Dataframe(
+ headers=["ID", "Input", "Output", "Media"],
+ datatype=["number", "str", "str", "str"],
+ row_count=6,
+ column_count=(4, "fixed"),
+ interactive=False
+ )
+
+ with gr.Row():
+ btn_clear = gr.Button(
+ "Clear All",
+ elem_classes=["btn-danger"],
+ size="sm"
+ )
+
+ # Step 4 (Prominent, not buried)
+ with gr.Group(elem_classes=["card"]):
+ gr.HTML(
+ """
+
+
+
Tune evolution budget. Defaults are safe for quick runs.
+
+ """
+ )
+
+ with gr.Row():
+ slider_iter = gr.Slider(
+ minimum=1,
+ maximum=20,
+ value=5,
+ step=1,
+ label="Evolution Rounds",
+ info="Number of genetic iterations"
+ )
+ slider_calls = gr.Slider(
+ minimum=10,
+ maximum=200,
+ value=50,
+ step=10,
+ label="Max LLM Calls",
+ info="Total API call budget"
+ )
+
+ with gr.Row():
+ slider_batch = gr.Slider(
+ minimum=1,
+ maximum=10,
+ value=4,
+ step=1,
+ label="Batch Size",
+ info="Candidates per iteration"
+ )
+ check_llego = gr.Checkbox(
+ value=True,
+ label="Enable LLEGO Crossover",
+ info="Use advanced genetic operations"
+ )
+
+ btn_optimize = gr.Button(
+ "Start Optimization",
+ elem_classes=["cta", "mt-6"]
+ )
+
+ # RIGHT: STATUS + RESULTS
+ with gr.Column(scale=5, elem_classes=["right-col"]):
+ # STATUS PANEL (Hidden by default)
+ status_panel = gr.Group(visible=False, elem_classes=["panel"])
+ with status_panel:
+ gr.HTML(
+ """
+
+
Optimization status
+
Running
+
+ """
+ )
+ txt_status = gr.Markdown("Initializing genetic algorithm...")
+
+ # EMPTY STATE
+ empty_state = gr.HTML(
+ """
+
+
+ 🧬
+
Ready to optimize
+
+ Fill Steps 1–3, then click Start Optimization to begin prompt evolution.
+
+ """,
+ visible=True
+ )
+
+ # RESULTS PANEL (Hidden by default)
+ results_panel = gr.Group(visible=False, elem_classes=["results"])
+ with results_panel:
+ gr.HTML(
+ """
+
+
+
+ ✅
+
+
Optimization successful
+
Review the optimized prompt, metrics, and evolution traces.
+
+
+
+ """
+ )
+
+ with gr.Tabs():
+ with gr.Tab("Optimized Prompt"):
+ res_prompt = gr.Textbox(
+ label="Optimized Prompt",
+ lines=18,
+ max_lines=28,
+ interactive=False,
+ show_label=True,
+ elem_classes=["mono"]
+ )
+
+ with gr.Tab("Metrics & Log"):
+ res_metrics = gr.JSON(label="Performance Gains")
+ res_history = gr.TextArea(
+ label="Evolution Log",
+ interactive=False,
+ lines=10
+ )
+
+                    with gr.Tab("🧬 Live Candidates"):
+ gr.Markdown("Real-time stream of generated prompt candidates during optimization:")
+ live_candidates = gr.HTML()
+ btn_refresh_cand = gr.Button(
+                            "🔄 Refresh Stream",
+ elem_classes=["secondary-btn"],
+ size="sm"
+ )
+
+ # ==========================================
+ # 6. EVENT HANDLERS
+ # ==========================================
+
+ # Dataset Management
+    def update_dataset_count(dataset):
+        """Update dataset count display with error handling.
+
+        Returns a short status string ("N example(s) loaded") shown in the
+        ds_count HTML element; falls back to a safe default when the state
+        value is not a list (e.g. after a corrupted session).
+        """
+        try:
+            if not isinstance(dataset, list):
+                return "0 examples loaded"
+            count = len(dataset)
+            # Pluralize: "1 example loaded" vs "2 examples loaded".
+            return f"{count} example{'s' if count != 1 else ''} loaded"
+        except Exception as e:
+            logger.error(f"Error updating dataset count: {str(e)}")
+            return "Error"
+
+    # Wrap event handlers with error handling
+    def safe_add_example(*args):
+        """Wrapper for add_example with error handling.
+
+        gr.Error exceptions (already user-facing) propagate unchanged; any
+        other exception is logged and re-raised as gr.Error so Gradio shows
+        a friendly message instead of a raw traceback.
+        """
+        try:
+            return add_example(*args)
+        except gr.Error:
+            raise
+        except Exception as e:
+            logger.error(f"Unexpected error in add_example: {str(e)}")
+            raise gr.Error(f"Failed to add example: {str(e)}")
+
+    def safe_update_table(dataset):
+        """Wrapper for update_table with error handling.
+
+        Returns [] on failure so the Dataframe renders empty instead of
+        surfacing an error to the user.
+        """
+        try:
+            return update_table(dataset)
+        except Exception as e:
+            logger.error(f"Error updating table: {str(e)}")
+            return []
+
+    def safe_clear_dataset():
+        """Wrapper for clear_dataset with error handling.
+
+        Returns ([], []) on failure, matching the (dataset_state, ds_table)
+        outputs wired to btn_clear so both reset cleanly.
+        """
+        try:
+            return clear_dataset()
+        except Exception as e:
+            logger.error(f"Error clearing dataset: {str(e)}")
+            return [], []
+
+    # Chain: append the manual example (add_example also returns reset values
+    # for the entry widgets d_in/d_out/d_img), then refresh the preview table
+    # and the "N examples loaded" label.
+    btn_add.click(
+        safe_add_example,
+        inputs=[d_in, d_out, d_img, dataset_state],
+        outputs=[dataset_state, d_in, d_out, d_img]
+    ).then(
+        safe_update_table,
+        inputs=[dataset_state],
+        outputs=[ds_table]
+    ).then(
+        update_dataset_count,
+        inputs=[dataset_state],
+        outputs=[ds_count]
+    )
+
+    # Clear both the stored dataset and the table, then reset the count label.
+    btn_clear.click(
+        safe_clear_dataset,
+        outputs=[dataset_state, ds_table]
+    ).then(
+        lambda: "0 examples loaded",
+        outputs=[ds_count]
+    )
+
+ # Bulk Import
+ def import_bulk_json(json_text, current_dataset):
+ """Import examples from JSON with comprehensive error handling."""
+ try:
+ # Validate inputs
+ if not json_text or not json_text.strip():
+ raise gr.Error("JSON input is empty. Please provide a JSON array.")
+
+ if not isinstance(current_dataset, list):
+ raise gr.Error("Dataset state is invalid. Please refresh the page.")
+
+ # Parse JSON
+ try:
+ data = json.loads(json_text.strip())
+ except json.JSONDecodeError as e:
+ raise gr.Error(f"Invalid JSON format: {str(e)}. Please check your JSON syntax.")
+
+ # Validate structure
+ if not isinstance(data, list):
+ raise gr.Error("JSON must be an array of objects. Example: [{\"input\": \"...\", \"output\": \"...\"}]")
+
+ if len(data) == 0:
+ raise gr.Error("JSON array is empty. Add at least one example object.")
+
+ # Validate and import items
+ imported_count = 0
+ errors = []
+
+ for i, item in enumerate(data):
+ try:
+ if not isinstance(item, dict):
+ errors.append(f"Item {i+1}: not a dictionary")
+ continue
+
+ if "input" not in item or "output" not in item:
+ errors.append(f"Item {i+1}: missing 'input' or 'output' field")
+ continue
+
+ input_val = item["input"]
+ output_val = item["output"]
+
+ if not isinstance(input_val, str) or not isinstance(output_val, str):
+ errors.append(f"Item {i+1}: 'input' and 'output' must be strings")
+ continue
+
+ if not input_val.strip() or not output_val.strip():
+ errors.append(f"Item {i+1}: 'input' and 'output' cannot be empty")
+ continue
+
+ # Add valid item
+ current_dataset.append({
+ "input": input_val.strip(),
+ "output": output_val.strip(),
+ "image": item.get("image"), # Optional
+ "image_preview": "๐ผ๏ธ Image" if item.get("image") else "-"
+ })
+ imported_count += 1
+
+ except Exception as e:
+ errors.append(f"Item {i+1}: {str(e)}")
+ logger.warning(f"Error importing item {i+1}: {str(e)}")
+ continue
+
+ # Report results
+ if imported_count == 0:
+ error_msg = "No valid examples imported. "
+ if errors:
+ error_msg += "Errors: " + "; ".join(errors[:3])
+ if len(errors) > 3:
+ error_msg += f" (and {len(errors) - 3} more)"
+ raise gr.Error(error_msg)
+
+ if errors:
+ warning_msg = f"Imported {imported_count} example(s). "
+ if len(errors) <= 3:
+ warning_msg += f"Warnings: {'; '.join(errors)}"
+ else:
+ warning_msg += f"{len(errors)} items had errors."
+ logger.warning(warning_msg)
+
+ return current_dataset, ""
+
+ except gr.Error:
+ # Re-raise Gradio errors
+ raise
+ except Exception as e:
+ logger.error(f"Unexpected error in import_bulk_json: {str(e)}\n{traceback.format_exc()}")
+ raise gr.Error(f"Failed to import JSON: {str(e)}")
+
+    # Chain: parse/import the pasted JSON (returning "" clears the textbox),
+    # then refresh the preview table and the count label.
+    btn_import.click(
+        import_bulk_json,
+        inputs=[bulk_json, dataset_state],
+        outputs=[dataset_state, bulk_json]
+    ).then(
+        safe_update_table,
+        inputs=[dataset_state],
+        outputs=[ds_table]
+    ).then(
+        update_dataset_count,
+        inputs=[dataset_state],
+        outputs=[ds_count]
+    )
+
+    # Main Optimization Flow
+    # run_optimization_flow (defined earlier in the file, outside this hunk)
+    # receives the seed prompt, dataset, model/key settings and budget
+    # sliders, and drives the status/results panels plus the candidate stream.
+    btn_optimize.click(
+        run_optimization_flow,
+        inputs=[
+            seed_input, dataset_state, model_select, custom_model_input,
+            slider_iter, slider_calls, slider_batch, check_llego,
+            key_openai, key_google, key_anthropic
+        ],
+        outputs=[
+            status_panel, empty_state, results_panel,
+            txt_status, res_prompt, res_metrics, res_history, live_candidates
+        ]
+    )
+
+    # Refresh Candidates
+    def safe_get_candidates_display():
+        """Wrapper for get_candidates_display with error handling.
+
+        Returns fallback markup on failure so the stream area degrades
+        gracefully instead of raising.
+        """
+        try:
+            return get_candidates_display()
+        except Exception as e:
+            logger.error(f"Error refreshing candidates: {str(e)}")
+            # NOTE(review): the fallback literal below looks truncated in this
+            # patch (surrounding HTML tags appear stripped by extraction) —
+            # confirm against the original source file.
+            return "Error loading candidates.
+"
+
+    # Manual refresh of the live candidate stream HTML.
+    btn_refresh_cand.click(
+        safe_get_candidates_display,
+        outputs=[live_candidates]
+    )
+
+# ==========================================
+# 7. LAUNCH
+# ==========================================
+if __name__ == "__main__":
+ app.queue().launch(
+ server_name="0.0.0.0",
+ server_port=7860,
+ share=False, # Set to False for HF Spaces
+ show_error=True,
+ css=CUSTOM_CSS,
+ js=FORCE_DARK_JS
+ )
+
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b415c2f7705cd0945b80f6960401d2591fe23cf0
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,23 @@
+# Core dependencies - gepa from git
+git+https://github.com/gepa-ai/gepa.git
+numpy>=1.21.0
+pandas>=1.5.0
+pydantic>=2.0.0
+python-dotenv>=1.0.0
+
+# HTTP/API clients
+requests>=2.31.0
+aiohttp>=3.8.0
+asyncio-throttle>=1.0.0
+
+# LLM Provider SDKs
+openai>=1.0.0
+anthropic>=0.18.0
+google-generativeai>=0.3.0
+google-genai>=0.2.0
+
+# Image processing
+Pillow>=9.0.0
+
+# Gradio UI (version will be set by README.md sdk_version)
+gradio>=4.0.0
\ No newline at end of file
diff --git a/src/gepa_optimizer.egg-info/PKG-INFO b/src/gepa_optimizer.egg-info/PKG-INFO
new file mode 100644
index 0000000000000000000000000000000000000000..272d9e1fc41f406056f6ddb09898b32ddd8a6037
--- /dev/null
+++ b/src/gepa_optimizer.egg-info/PKG-INFO
@@ -0,0 +1,439 @@
+Metadata-Version: 2.4
+Name: gepa-optimizer
+Version: 0.1.0
+Summary: Universal prompt optimization framework based on GEPA
+Home-page: https://github.com/suhasb-dev/Prompt-Optimizer
+Author: Suhas
+Author-email: Suhas
+License: MIT
+Project-URL: Homepage, https://github.com/suhasb-dev/Prompt-Optimizer
+Project-URL: Repository, https://github.com/suhasb-dev/Prompt-Optimizer
+Project-URL: Documentation, https://suhasb-dev.gitbook.io/gepa-universal-prompt-optimizer/
+Project-URL: Bug Reports, https://github.com/suhasb-dev/Prompt-Optimizer/issues
+Keywords: prompt-optimization,llm,gepa,ai,machine-learning,ui-tree-extraction
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: gepa>=0.0.12
+Requires-Dist: pandas>=1.5.0
+Requires-Dist: pydantic>=2.0.0
+Requires-Dist: python-dotenv>=1.0.0
+Requires-Dist: requests>=2.31.0
+Requires-Dist: aiohttp>=3.8.0
+Requires-Dist: asyncio-throttle>=1.0.0
+Requires-Dist: google-generativeai>=0.3.0
+Requires-Dist: Pillow>=9.0.0
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0.0; extra == "dev"
+Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
+Requires-Dist: black>=23.0.0; extra == "dev"
+Requires-Dist: flake8>=6.0.0; extra == "dev"
+Requires-Dist: mypy>=1.0.0; extra == "dev"
+Provides-Extra: docs
+Requires-Dist: sphinx>=5.0.0; extra == "docs"
+Requires-Dist: sphinx-rtd-theme>=1.2.0; extra == "docs"
+Provides-Extra: all
+Requires-Dist: pytest>=7.0.0; extra == "all"
+Requires-Dist: pytest-asyncio>=0.21.0; extra == "all"
+Requires-Dist: black>=23.0.0; extra == "all"
+Requires-Dist: flake8>=6.0.0; extra == "all"
+Requires-Dist: mypy>=1.0.0; extra == "all"
+Requires-Dist: sphinx>=5.0.0; extra == "all"
+Requires-Dist: sphinx-rtd-theme>=1.2.0; extra == "all"
+Dynamic: author
+Dynamic: home-page
+Dynamic: license-file
+Dynamic: requires-python
+
+# GEPA Optimizer
+
+[](https://badge.fury.io/py/gepa-optimizer)
+[](https://www.python.org/downloads/)
+[](https://opensource.org/licenses/MIT)
+
+A universal prompt optimization framework built on [GEPA](https://arxiv.org/abs/2507.19457) with optional [LLEGO](https://arxiv.org/abs/2503.14217) genetic operators for accelerated convergence.
+
+## Overview
+
+GEPA Optimizer provides a modular architecture for optimizing prompts through reflective evolution. It requires custom evaluators and LLM clients, enabling domain-specific optimization for any use case.
+
+**Key capabilities:**
+- Multi-modal support (text + vision models)
+- Hybrid GEPA + LLEGO optimization modes
+- Configurable train/val/test data splitting
+- Batch API support for cost reduction
+- Async-first architecture
+
+## Installation
+
+```bash
+pip install gepa-optimizer
+```
+
+**From source:**
+```bash
+git clone https://github.com/suhasb-dev/Prompt-Optimizer.git
+cd Prompt-Optimizer
+pip install -e .
+```
+
+## Quick Start
+
+```python
+import asyncio
+from gepa_optimizer import (
+ GepaOptimizer,
+ OptimizationConfig,
+ BaseEvaluator,
+ BaseLLMClient
+)
+
+# Define custom evaluator
+class MyEvaluator(BaseEvaluator):
+ def evaluate(self, predicted: str, expected: str) -> dict:
+ score = 1.0 if predicted.strip() == expected.strip() else 0.0
+ return {"accuracy": score, "composite_score": score}
+
+# Define custom LLM client
+class MyLLMClient(BaseLLMClient):
+ def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> dict:
+ # Your LLM integration here
+ return {"content": "response"}
+
+async def main():
+ config = OptimizationConfig(
+ model="openai/gpt-4o",
+ reflection_model="openai/gpt-4o",
+ max_iterations=5,
+ max_metric_calls=50,
+ batch_size=8
+ )
+
+ optimizer = GepaOptimizer(
+ config=config,
+ llm_client=MyLLMClient("openai", "gpt-4o"),
+ evaluator=MyEvaluator()
+ )
+
+ result = await optimizer.train(
+ seed_prompt="Your initial prompt",
+ dataset=your_dataset
+ )
+
+ print(f"Optimized: {result.prompt}")
+ print(f"Score: {result.improvement_data}")
+
+asyncio.run(main())
+```
+
+## Project Structure
+
+```
+src/gepa_optimizer/
+├── core/                  # Core optimization logic
+│   ├── optimizer.py       # GepaOptimizer main class
+│   ├── base_adapter.py    # BaseGepaAdapter interface
+│   └── universal_adapter.py
+├── evaluation/            # Evaluator implementations
+│   ├── base_evaluator.py  # BaseEvaluator abstract class
+│   ├── scroll_evaluator.py
+│   ├── validation_evaluator.py
+│   └── index_caching_evaluator.py
+├── llms/                  # LLM client implementations
+│   ├── base_llm.py        # BaseLLMClient abstract class
+│   ├── vision_llm.py      # VisionLLMClient (OpenAI, Google, Anthropic)
+│   └── batch_llm.py       # BatchLLMClient (50% cost savings)
+├── operators/             # LLEGO genetic operators
+│   └── llego_operators.py # FitnessGuidedCrossover, DiversityGuidedMutation
+├── data/                  # Dataset loaders and converters
+├── models/                # Configuration and result models
+└── utils/                 # Utilities and helpers
+```
+
+## Configuration
+
+### Basic Configuration
+
+```python
+from gepa_optimizer import OptimizationConfig, ModelConfig
+
+config = OptimizationConfig(
+ # Required parameters
+ model="openai/gpt-4o", # or ModelConfig instance
+ reflection_model="openai/gpt-4o",
+ max_iterations=10,
+ max_metric_calls=100,
+ batch_size=8,
+
+ # Data splitting (train/val/test)
+ data_split=DataSplitConfig(
+ train_ratio=0.6,
+ val_ratio=0.2,
+ test_ratio=0.2
+ ),
+
+ # Optional settings
+ reflection_examples=3, # Examples per reflection (2-5 recommended)
+ evaluate_on_test=True, # Final evaluation on held-out test set
+ log_level="INFO" # DEBUG, INFO, WARNING, ERROR
+)
+```
+
+### LLEGO Genetic Operators
+
+Enable LLEGO for faster convergence through fitness-guided crossover and diversity-guided mutation:
+
+```python
+config = OptimizationConfig(
+ model="openai/gpt-4o",
+ reflection_model="openai/gpt-4o",
+ max_iterations=5,
+ max_metric_calls=50,
+ batch_size=8,
+
+ # Enable LLEGO
+ use_llego_operators=True,
+ alpha=0.15, # Fitness extrapolation factor
+ tau=10.0, # Diversity temperature
+ nu=4, # Parent arity
+ n_crossover=2, # Crossover offspring per iteration
+ n_mutation=3, # Mutation offspring per iteration
+ population_size=15
+)
+```
+
+### Hybrid Mode (GEPA + LLEGO)
+
+Combine GEPA's semantic reflection with LLEGO's structural diversity:
+
+```python
+config = OptimizationConfig(
+ model="openai/gpt-4o",
+ reflection_model="openai/gpt-4o",
+ max_iterations=6,
+ max_metric_calls=200,
+ batch_size=10,
+
+ # Hybrid mode
+ use_llego_operators=True,
+ enable_gepa_reflection_with_llego=True,
+ num_gepa_reflection_candidates=3,
+ n_crossover=3,
+ n_mutation=3
+ # Total: 9 candidates per iteration (3 GEPA + 3 crossover + 3 mutation)
+)
+```
+
+### Batch API (Cost Optimization)
+
+Use batch processing for 50% cost reduction:
+
+```python
+from gepa_optimizer.llms import BatchLLMClient
+
+llm_client = BatchLLMClient(
+ provider="google",
+ model_name="gemini-2.5-flash",
+ batch_size=20,
+ polling_interval=30
+)
+
+optimizer = GepaOptimizer(
+ config=config,
+ llm_client=llm_client,
+ evaluator=evaluator
+)
+```
+
+## Built-in Components
+
+### LLM Clients
+
+| Client | Description | Use Case |
+|--------|-------------|----------|
+| `VisionLLMClient` | Multi-modal client for OpenAI, Google, Anthropic | Real-time requests |
+| `BatchLLMClient` | Batch processing client | Cost-sensitive workloads |
+
+### Evaluators
+
+| Evaluator | Description |
+|-----------|-------------|
+| `ScrollElementEvaluator` | UI element detection scoring |
+| `ValidationEvaluator` | Screen validation tasks |
+| `IndexCachingEvaluator` | Index-based element selection |
+| `UITreeEvaluator` | UI tree extraction |
+
+### Dataset Loaders
+
+| Loader | Description |
+|--------|-------------|
+| `load_scroll_dataset()` | Load scroll detection datasets |
+| `load_validation_split()` | Load validation datasets with splits |
+| `load_index_caching_split()` | Load index caching datasets |
+
+## Creating Custom Components
+
+### Custom Evaluator
+
+```python
+from gepa_optimizer import BaseEvaluator
+
+class CustomEvaluator(BaseEvaluator):
+ def __init__(self):
+ super().__init__(metric_weights={
+ "accuracy": 0.5,
+ "completeness": 0.3,
+ "format": 0.2
+ })
+
+ def evaluate(self, predicted: str, expected: str) -> dict:
+ accuracy = self._compute_accuracy(predicted, expected)
+ completeness = self._compute_completeness(predicted, expected)
+ format_score = self._compute_format(predicted)
+
+ composite = (
+ accuracy * 0.5 +
+ completeness * 0.3 +
+ format_score * 0.2
+ )
+
+ return {
+ "accuracy": accuracy,
+ "completeness": completeness,
+ "format": format_score,
+ "composite_score": composite # Required key
+ }
+```
+
+### Custom LLM Client
+
+```python
+from gepa_optimizer import BaseLLMClient
+
+class CustomLLMClient(BaseLLMClient):
+ def __init__(self, api_key: str):
+ super().__init__(provider="custom", model_name="my-model")
+ self.api_key = api_key
+
+ def generate(
+ self,
+ system_prompt: str,
+ user_prompt: str,
+ image_base64: str = None,
+ **kwargs
+ ) -> dict:
+ # Your API call here
+ response = call_your_api(system_prompt, user_prompt, image_base64)
+ return {"content": response}
+```
+
+## Examples
+
+| File | Description |
+|------|-------------|
+| [`examples/basic_usage.py`](examples/basic_usage.py) | Basic optimization workflow |
+| [`examples/advanced_usage.py`](examples/advanced_usage.py) | Advanced configuration |
+| [`examples/batch_api_example.py`](examples/batch_api_example.py) | Batch API usage |
+| [`examples/gemini_usage.py`](examples/gemini_usage.py) | Google Gemini integration |
+
+**Run examples:**
+```bash
+python examples/basic_usage.py
+```
+
+## Testing
+
+```bash
+# Run all tests
+pytest tests/
+
+# Run unit tests only
+pytest tests/unit/
+
+# Run integration tests
+pytest tests/integration/
+```
+
+## API Reference
+
+### GepaOptimizer
+
+```python
+class GepaOptimizer:
+ def __init__(
+ self,
+ config: OptimizationConfig,
+ llm_client: BaseLLMClient,
+ evaluator: BaseEvaluator,
+ adapter_type: str = "universal"
+ )
+
+ async def train(
+ self,
+ seed_prompt: str,
+ dataset: Union[List, Dict],
+ **kwargs
+ ) -> OptimizedResult
+```
+
+### OptimizationConfig
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `model` | `str \| ModelConfig` | Required | Target model |
+| `reflection_model` | `str \| ModelConfig` | Required | Reflection model |
+| `max_iterations` | `int` | Required | Maximum optimization iterations |
+| `max_metric_calls` | `int` | Required | Maximum evaluation calls |
+| `batch_size` | `int` | Required | Samples per evaluation batch |
+| `use_llego_operators` | `bool` | `False` | Enable LLEGO genetic operators |
+| `enable_gepa_reflection_with_llego` | `bool` | `False` | Enable hybrid mode |
+| `use_llm_as_judge` | `bool` | `True` | Enable LLM-as-Judge feedback |
+| `log_level` | `str` | `"INFO"` | Logging verbosity |
+
+### OptimizedResult
+
+| Attribute | Type | Description |
+|-----------|------|-------------|
+| `prompt` | `str` | Optimized prompt |
+| `original_prompt` | `str` | Initial seed prompt |
+| `improvement_data` | `dict` | Score improvements |
+| `optimization_time` | `float` | Total time in seconds |
+| `is_successful` | `bool` | Optimization success status |
+
+## Environment Variables
+
+| Variable | Description |
+|----------|-------------|
+| `OPENAI_API_KEY` | OpenAI API key |
+| `ANTHROPIC_API_KEY` | Anthropic API key |
+| `GOOGLE_API_KEY` | Google AI API key |
+
+## References
+
+- **GEPA Paper:** [Reflective Prompt Evolution Can Outperform Reinforcement Learning](https://arxiv.org/abs/2507.19457)
+- **LLEGO Paper:** [Decision Tree Induction Through LLMs via Semantically-Aware Evolution](https://arxiv.org/abs/2503.14217)
+- **GEPA Library:** [github.com/gepa-ai/gepa](https://github.com/gepa-ai/gepa)
+
+## License
+
+MIT License - see [LICENSE](LICENSE) for details.
+
+## Contributing
+
+Contributions welcome. Please open an issue or submit a pull request.
+
+## Support
+
+- **Issues:** [GitHub Issues](https://github.com/suhasb-dev/Prompt-Optimizer/issues)
+- **Documentation:** [GitBook](https://suhasb-dev.gitbook.io/gepa-universal-prompt-optimizer/)
diff --git a/src/gepa_optimizer.egg-info/SOURCES.txt b/src/gepa_optimizer.egg-info/SOURCES.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f019258d7e1ba6587c93e9adafd7e606b206503b
--- /dev/null
+++ b/src/gepa_optimizer.egg-info/SOURCES.txt
@@ -0,0 +1,65 @@
+LICENSE
+README.md
+pyproject.toml
+setup.py
+src/gepa_optimizer/__init__.py
+src/gepa_optimizer/cli.py
+src/gepa_optimizer/types.py
+src/gepa_optimizer/version.py
+src/gepa_optimizer.egg-info/PKG-INFO
+src/gepa_optimizer.egg-info/SOURCES.txt
+src/gepa_optimizer.egg-info/dependency_links.txt
+src/gepa_optimizer.egg-info/entry_points.txt
+src/gepa_optimizer.egg-info/requires.txt
+src/gepa_optimizer.egg-info/top_level.txt
+src/gepa_optimizer/core/__init__.py
+src/gepa_optimizer/core/base_adapter.py
+src/gepa_optimizer/core/custom_adapter.py
+src/gepa_optimizer/core/optimizer.py
+src/gepa_optimizer/core/result.py
+src/gepa_optimizer/core/universal_adapter.py
+src/gepa_optimizer/data/__init__.py
+src/gepa_optimizer/data/converters.py
+src/gepa_optimizer/data/index_caching_loader.py
+src/gepa_optimizer/data/loaders.py
+src/gepa_optimizer/data/scroll_dataset_loader.py
+src/gepa_optimizer/data/validation_dataset_loader.py
+src/gepa_optimizer/data/validators.py
+src/gepa_optimizer/evaluation/__init__.py
+src/gepa_optimizer/evaluation/base_evaluator.py
+src/gepa_optimizer/evaluation/index_caching_evaluator.py
+src/gepa_optimizer/evaluation/scroll_evaluator.py
+src/gepa_optimizer/evaluation/ui_evaluator.py
+src/gepa_optimizer/evaluation/universal_evaluator.py
+src/gepa_optimizer/evaluation/validation_evaluator.py
+src/gepa_optimizer/infrastructure/__init__.py
+src/gepa_optimizer/infrastructure/logging/__init__.py
+src/gepa_optimizer/infrastructure/logging/context.py
+src/gepa_optimizer/infrastructure/logging/formatters.py
+src/gepa_optimizer/infrastructure/logging/logger.py
+src/gepa_optimizer/llms/__init__.py
+src/gepa_optimizer/llms/base_llm.py
+src/gepa_optimizer/llms/batch_llm.py
+src/gepa_optimizer/llms/llego_enhanced_llm.py
+src/gepa_optimizer/llms/vision_llm.py
+src/gepa_optimizer/models/__init__.py
+src/gepa_optimizer/models/config.py
+src/gepa_optimizer/models/dataset.py
+src/gepa_optimizer/models/result.py
+src/gepa_optimizer/operators/__init__.py
+src/gepa_optimizer/operators/base_operator.py
+src/gepa_optimizer/operators/crossover.py
+src/gepa_optimizer/operators/llego_operators.py
+src/gepa_optimizer/operators/models.py
+src/gepa_optimizer/operators/mutation.py
+src/gepa_optimizer/utils/__init__.py
+src/gepa_optimizer/utils/api_keys.py
+src/gepa_optimizer/utils/candidate_collector.py
+src/gepa_optimizer/utils/clean_logger.py
+src/gepa_optimizer/utils/exceptions.py
+src/gepa_optimizer/utils/helpers.py
+src/gepa_optimizer/utils/llm_judge_prompt.py
+src/gepa_optimizer/utils/log_parser.py
+src/gepa_optimizer/utils/logging.py
+src/gepa_optimizer/utils/metrics.py
+src/gepa_optimizer/utils/pareto_logger.py
\ No newline at end of file
diff --git a/src/gepa_optimizer.egg-info/dependency_links.txt b/src/gepa_optimizer.egg-info/dependency_links.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/src/gepa_optimizer.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/src/gepa_optimizer.egg-info/entry_points.txt b/src/gepa_optimizer.egg-info/entry_points.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c9b0dbe7680b3733ee391c2c83177f29594117eb
--- /dev/null
+++ b/src/gepa_optimizer.egg-info/entry_points.txt
@@ -0,0 +1,2 @@
+[console_scripts]
+gepa-optimize = gepa_optimizer.cli:main
diff --git a/src/gepa_optimizer.egg-info/requires.txt b/src/gepa_optimizer.egg-info/requires.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ecfbd2e6e4a4482e0036fa85ac2ca7d695b13be6
--- /dev/null
+++ b/src/gepa_optimizer.egg-info/requires.txt
@@ -0,0 +1,29 @@
+gepa>=0.0.12
+pandas>=1.5.0
+pydantic>=2.0.0
+python-dotenv>=1.0.0
+requests>=2.31.0
+aiohttp>=3.8.0
+asyncio-throttle>=1.0.0
+google-generativeai>=0.3.0
+Pillow>=9.0.0
+
+[all]
+pytest>=7.0.0
+pytest-asyncio>=0.21.0
+black>=23.0.0
+flake8>=6.0.0
+mypy>=1.0.0
+sphinx>=5.0.0
+sphinx-rtd-theme>=1.2.0
+
+[dev]
+pytest>=7.0.0
+pytest-asyncio>=0.21.0
+black>=23.0.0
+flake8>=6.0.0
+mypy>=1.0.0
+
+[docs]
+sphinx>=5.0.0
+sphinx-rtd-theme>=1.2.0
diff --git a/src/gepa_optimizer.egg-info/top_level.txt b/src/gepa_optimizer.egg-info/top_level.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a53df9f6ea55c2b670c462010432f5969311d777
--- /dev/null
+++ b/src/gepa_optimizer.egg-info/top_level.txt
@@ -0,0 +1 @@
+gepa_optimizer
diff --git a/src/gepa_optimizer/__init__.py b/src/gepa_optimizer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9a4dc05ed44b418078c9404690ba7af8d163d7f
--- /dev/null
+++ b/src/gepa_optimizer/__init__.py
@@ -0,0 +1,295 @@
+"""
+GEPA Universal Prompt Optimizer
+
+A modern, modular Python library for universal prompt optimization powered by GEPA.
+
+Quick Start (No custom evaluator needed!):
+
+ from gepa_optimizer import quick_optimize
+
+ result = await quick_optimize(
+ seed_prompt="Your initial prompt",
+ dataset=[
+ {"input": "task1", "output": "expected1"},
+ {"input": "task2", "output": "expected2"},
+ ],
+ model="openai/gpt-4o" # or any: "google/gemini-1.5-pro", "anthropic/claude-3-5-sonnet-20241022"
+ )
+ print(result.optimized_prompt)
+"""
+
+# Core functionality
+from .core import GepaOptimizer
+from .core.base_adapter import BaseGepaAdapter
+from .core.universal_adapter import UniversalGepaAdapter
+
+# Configuration and models
+from .models import OptimizationConfig, OptimizationResult, OptimizedResult, ModelConfig
+
+# Data processing
+from .data import UniversalConverter, DataLoader, DataValidator
+from .data.scroll_dataset_loader import ScrollDatasetLoader, load_scroll_dataset
+from .data.validation_dataset_loader import ValidationDatasetLoader, load_validation_dataset, load_validation_split
+from .data.index_caching_loader import IndexCachingDatasetLoader, load_index_caching_dataset, load_index_caching_split
+
+# LLM clients
+from .llms import VisionLLMClient
+from .llms.base_llm import BaseLLMClient
+from .llms.batch_llm import BatchLLMClient
+
+# Evaluators - including Universal Semantic Evaluator (works for ANY task!)
+from .evaluation import (
+ BaseEvaluator,
+ UniversalSemanticEvaluator,
+ create_universal_evaluator,
+ UITreeEvaluator,
+ ScrollElementEvaluator,
+ ValidationEvaluator,
+ IndexCachingEvaluator
+)
+
+# LLEGO Genetic Operators
+from .operators import (
+ # Base interfaces
+ BaseGeneticOperator,
+ BaseCrossoverOperator,
+ BaseMutationOperator,
+ # Concrete operators
+ FitnessGuidedCrossover,
+ DiversityGuidedMutation,
+ LLEGOIntegrationLayer,
+ # Data models
+ PromptCandidate,
+ PromptMetadata
+)
+
+# Utilities
+from .utils import setup_logging, calculate_metrics, sanitize_prompt, APIKeyManager
+from .utils.exceptions import GepaOptimizerError, GepaDependencyError, InvalidInputError, DatasetError
+
+# Logging infrastructure
+from .infrastructure.logging import get_logger, configure_logging, LogContext
+
+# Type definitions (for type hints in user code)
+from .types import (
+ DatasetItem,
+ EvaluationResult,
+ LLMResponse,
+ CandidateDict,
+ LLMClientProtocol,
+ EvaluatorProtocol,
+)
+
+__version__ = "0.1.0"
+
+
+# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+# CONVENIENCE FUNCTION: quick_optimize
+# No evaluator needed - uses Universal Semantic Evaluator automatically
+# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+
+async def quick_optimize(
+    seed_prompt: str,
+    dataset: list,
+    model: str,
+    max_iterations: int = 5,
+    max_metric_calls: int = 50,
+    batch_size: int = 4,
+    use_llego: bool = True,
+    verbose: bool = True
+) -> OptimizedResult:
+    """
+    ๐ Quick prompt optimization - no custom evaluator needed!
+
+    Uses Universal Semantic Evaluator that works for ANY task.
+
+    Args:
+        seed_prompt: Your initial prompt to optimize
+        dataset: List of dicts with 'input' and 'output' (expected) keys
+                Can also include 'image' key for multi-modal tasks
+        model: LLM model to use in format "provider/model-name" (REQUIRED)
+               Examples:
+               - "google/gemini-1.5-pro"
+               - "google/gemini-2.5-flash-preview-05-20"
+               - "openai/gpt-4o"
+               - "openai/gpt-4-turbo"
+               - "anthropic/claude-3-5-sonnet-20241022"
+        max_iterations: Maximum optimization iterations (default: 5)
+        max_metric_calls: Maximum evaluation calls (default: 50)
+        batch_size: Samples per evaluation batch (default: 4)
+        use_llego: Enable LLEGO genetic operators (default: True)
+        verbose: Show progress logs (default: True)
+
+    Returns:
+        OptimizedResult with optimized prompt and improvement metrics
+
+    Example:
+        >>> result = await quick_optimize(
+        ...     seed_prompt="Count the objects in the image",
+        ...     dataset=[
+        ...         {"input": "image1.jpg", "output": "5 objects", "image": "base64..."},
+        ...         {"input": "image2.jpg", "output": "3 objects", "image": "base64..."},
+        ...     ],
+        ...     model="openai/gpt-4o",  # or "google/gemini-1.5-pro", etc.
+        ...     max_iterations=3
+        ... )
+        >>> print(result.optimized_prompt)
+    """
+    import logging
+
+    if verbose:
+        # NOTE: logging.basicConfig is a no-op if the root logger already
+        # has handlers configured by the host application.
+        logging.basicConfig(level=logging.INFO)
+
+    # Create LLM client
+    llm_client = VisionLLMClient.from_model_string(model)
+
+    # Create Universal Semantic Evaluator (uses same LLM for analysis)
+    evaluator = UniversalSemanticEvaluator(
+        llm_client=llm_client,
+        use_llm_analysis=True
+    )
+
+    # Create configuration
+    # Hybrid mode: GEPA reflection runs alongside LLEGO operators when
+    # use_llego is True (both flags are driven by the same parameter here).
+    config = OptimizationConfig(
+        model=model,
+        reflection_model=model,
+        max_iterations=max_iterations,
+        max_metric_calls=max_metric_calls,
+        batch_size=batch_size,
+        use_llego_operators=use_llego,
+        enable_gepa_reflection_with_llego=use_llego,
+        num_gepa_reflection_candidates=3,
+        n_crossover=2,
+        n_mutation=2,
+        verbose=verbose
+    )
+
+    # Create optimizer
+    optimizer = GepaOptimizer(
+        config=config,
+        llm_client=llm_client,
+        evaluator=evaluator
+    )
+
+    # Run optimization
+    result = await optimizer.train(
+        seed_prompt=seed_prompt,
+        dataset=dataset
+    )
+
+    # NOTE(review): the docstring example reads result.optimized_prompt, but
+    # the README documents the attribute as `prompt` -- confirm OptimizedResult
+    # exposes both names.
+    return result
+
+
+def quick_optimize_sync(
+    seed_prompt: str,
+    dataset: list,
+    model: str,
+    max_iterations: int = 5,
+    max_metric_calls: int = 50,
+    batch_size: int = 4,
+    use_llego: bool = True,
+    verbose: bool = True
+) -> OptimizedResult:
+    """
+    ๐ Synchronous version of quick_optimize.
+
+    Same as quick_optimize but runs synchronously (blocks until complete).
+
+    Args:
+        model: LLM model to use in format "provider/model-name" (REQUIRED)
+               Examples: "openai/gpt-4o", "google/gemini-1.5-pro", "anthropic/claude-3-5-sonnet-20241022"
+
+    See quick_optimize for full documentation.
+    """
+    import asyncio
+    # asyncio.run creates a fresh event loop; it raises RuntimeError when
+    # called from a thread that already runs a loop (e.g. inside Jupyter).
+    return asyncio.run(quick_optimize(
+        seed_prompt=seed_prompt,
+        dataset=dataset,
+        model=model,
+        max_iterations=max_iterations,
+        max_metric_calls=max_metric_calls,
+        batch_size=batch_size,
+        use_llego=use_llego,
+        verbose=verbose
+    ))
+
+
+# Explicit public API: names exported by `from gepa_optimizer import *`.
+# Keep this list in sync with the imports above.
+__all__ = [
+    # ๐ Quick Start (recommended for new users)
+    "quick_optimize",
+    "quick_optimize_sync",
+
+    # Core functionality
+    "GepaOptimizer",
+    "BaseGepaAdapter",
+    "UniversalGepaAdapter",
+
+    # Configuration
+    "OptimizationConfig",
+    "OptimizationResult",
+    "OptimizedResult",
+    "ModelConfig",
+
+    # Data processing
+    "UniversalConverter",
+    "DataLoader",
+    "DataValidator",
+
+    # Dataset loaders
+    "ScrollDatasetLoader",
+    "load_scroll_dataset",
+    "ValidationDatasetLoader",
+    "load_validation_dataset",
+    "load_validation_split",
+    "IndexCachingDatasetLoader",
+    "load_index_caching_dataset",
+    "load_index_caching_split",
+
+    # LLM clients
+    "VisionLLMClient",
+    "BaseLLMClient",
+    "BatchLLMClient",
+
+    # Evaluators (Universal recommended for general use)
+    "UniversalSemanticEvaluator",
+    "create_universal_evaluator",
+    "BaseEvaluator",
+    "UITreeEvaluator",
+    "ScrollElementEvaluator",
+    "ValidationEvaluator",
+    "IndexCachingEvaluator",
+
+    # LLEGO Genetic Operators - Base interfaces
+    "BaseGeneticOperator",
+    "BaseCrossoverOperator",
+    "BaseMutationOperator",
+    # LLEGO Genetic Operators - Concrete implementations
+    "FitnessGuidedCrossover",
+    "DiversityGuidedMutation",
+    "LLEGOIntegrationLayer",
+    "PromptCandidate",
+    "PromptMetadata",
+
+    # Utilities
+    "APIKeyManager",
+    "GepaOptimizerError",
+    "GepaDependencyError",
+    "InvalidInputError",
+    "DatasetError",
+    "setup_logging",
+    "calculate_metrics",
+    "sanitize_prompt",
+
+    # Logging infrastructure
+    "get_logger",
+    "configure_logging",
+    "LogContext",
+
+    # Type definitions
+    "DatasetItem",
+    "EvaluationResult",
+    "LLMResponse",
+    "CandidateDict",
+    "LLMClientProtocol",
+    "EvaluatorProtocol",
+]
diff --git a/src/gepa_optimizer/cli.py b/src/gepa_optimizer/cli.py
new file mode 100644
index 0000000000000000000000000000000000000000..14a1593308fcd2ae0cab1fa15fc734815d877252
--- /dev/null
+++ b/src/gepa_optimizer/cli.py
@@ -0,0 +1,239 @@
+"""
+Command Line Interface for GEPA Optimizer
+"""
+
+import argparse
+import sys
+import json
+import asyncio
+from pathlib import Path
+from typing import Optional
+
+from .core import GepaOptimizer
+from .models import OptimizationConfig, ModelConfig
+from .utils import setup_logging, APIKeyManager
+
+
+def main():
+ """Main CLI entry point"""
+ parser = argparse.ArgumentParser(
+ description="GEPA Universal Prompt Optimizer CLI",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+Examples:
+ gepa-optimize --model openai/gpt-4-turbo --prompt "Extract UI elements" --dataset data.json
+ gepa-optimize --config config.json --prompt "Analyze interface" --dataset images/
+ """
+ )
+
+ # Required arguments
+ parser.add_argument(
+ "--prompt",
+ required=True,
+ help="Initial seed prompt to optimize"
+ )
+ parser.add_argument(
+ "--dataset",
+ required=True,
+ help="Path to dataset file or directory"
+ )
+
+ # Model configuration
+ parser.add_argument(
+ "--model",
+ help="Model specification (e.g., 'openai/gpt-4-turbo')"
+ )
+ parser.add_argument(
+ "--reflection-model",
+ help="Reflection model specification"
+ )
+ parser.add_argument(
+ "--config",
+ help="Path to configuration JSON file"
+ )
+
+ # Optimization parameters
+ parser.add_argument(
+ "--max-iterations",
+ type=int,
+ default=10,
+ help="Maximum optimization iterations (default: 10)"
+ )
+ parser.add_argument(
+ "--max-metric-calls",
+ type=int,
+ default=100,
+ help="Maximum metric evaluation calls (default: 100)"
+ )
+ parser.add_argument(
+ "--batch-size",
+ type=int,
+ default=4,
+ help="Batch size for evaluation (default: 4)"
+ )
+
+ # GEPA-specific parameters
+ parser.add_argument(
+ "--candidate-selection-strategy",
+ type=str,
+ default="pareto",
+ choices=["pareto", "best"],
+ help="Strategy for selecting candidates (default: pareto)"
+ )
+ parser.add_argument(
+ "--skip-perfect-score",
+ action="store_true",
+ help="Skip updating candidates with perfect scores"
+ )
+ parser.add_argument(
+ "--reflection-minibatch-size",
+ type=int,
+ default=None,
+ help="Number of examples to use for reflection (default: use batch_size)"
+ )
+ parser.add_argument(
+ "--perfect-score",
+ type=float,
+ default=1.0,
+ help="Perfect score threshold (default: 1.0)"
+ )
+ parser.add_argument(
+ "--module-selector",
+ type=str,
+ default="round_robin",
+ choices=["round_robin", "all"],
+ help="Component selection strategy (default: round_robin)"
+ )
+
+ # Output options
+ parser.add_argument(
+ "--output",
+ help="Output file path for results (default: stdout)"
+ )
+ parser.add_argument(
+ "--verbose", "-v",
+ action="store_true",
+ help="Enable verbose logging"
+ )
+
+ args = parser.parse_args()
+
+ # Setup logging
+ setup_logging(level="DEBUG" if args.verbose else "INFO")
+
+ try:
+ # Load configuration
+ if args.config:
+ config = load_config_from_file(args.config)
+ else:
+ config = create_config_from_args(args)
+
+ # Validate API keys
+ validate_api_keys(config)
+
+ # Create optimizer
+ optimizer = GepaOptimizer(config=config)
+
+ # Run optimization (async)
+ print(f"๐ Starting optimization with model: {config.model.model_name}")
+ result = asyncio.run(optimizer.train(
+ seed_prompt=args.prompt,
+ dataset=args.dataset
+ ))
+
+ # Output results
+ output_results(result, args.output)
+
+ print("โ
Optimization completed successfully!")
+
+ except Exception as e:
+ print(f"โ Error: {str(e)}", file=sys.stderr)
+ sys.exit(1)
+
+
+def load_config_from_file(config_path: str) -> OptimizationConfig:
+    """Load configuration from JSON file.
+
+    Nested dicts under 'model' and 'reflection_model' are converted to
+    ModelConfig instances before constructing OptimizationConfig.
+
+    Args:
+        config_path: Path to the JSON configuration file.
+
+    Raises:
+        FileNotFoundError: If config_path does not exist.
+    """
+    path = Path(config_path)
+    if not path.exists():
+        raise FileNotFoundError(f"Configuration file not found: {config_path}")
+
+    with open(path, 'r') as f:
+        config_data = json.load(f)
+
+    # Convert model configs
+    if 'model' in config_data and isinstance(config_data['model'], dict):
+        config_data['model'] = ModelConfig(**config_data['model'])
+
+    if 'reflection_model' in config_data and isinstance(config_data['reflection_model'], dict):
+        config_data['reflection_model'] = ModelConfig(**config_data['reflection_model'])
+
+    return OptimizationConfig(**config_data)
+
+
+def create_config_from_args(args) -> OptimizationConfig:
+    """Create configuration from command line arguments.
+
+    Raises:
+        ValueError: If neither --model nor --config was supplied.
+    """
+    if not args.model:
+        raise ValueError("Either --model or --config must be specified")
+
+    # Parse model specification
+    model_config = ModelConfig.from_string(args.model)
+
+    reflection_model_config = None
+    if args.reflection_model:
+        reflection_model_config = ModelConfig.from_string(args.reflection_model)
+
+    # NOTE(review): only these four options are forwarded; the GEPA-specific
+    # flags defined in main() (--candidate-selection-strategy, --perfect-score,
+    # --module-selector, ...) are silently ignored here -- confirm intent.
+    return OptimizationConfig(
+        model=model_config,
+        reflection_model=reflection_model_config,
+        max_iterations=args.max_iterations,
+        max_metric_calls=args.max_metric_calls,
+        batch_size=args.batch_size
+    )
+
+
+def validate_api_keys(config: OptimizationConfig):
+    """Validate that required API keys are available.
+
+    Side effect: prints the missing *_API_KEY names and exits the process
+    with status 1 when any provider key is absent.
+    """
+    api_manager = APIKeyManager()
+
+    # Reflection model may use a different provider than the main model.
+    providers = [config.model.provider]
+    if config.reflection_model:
+        providers.append(config.reflection_model.provider)
+
+    missing_keys = api_manager.get_missing_keys(providers)
+
+    if missing_keys:
+        print("โ Missing API keys for the following providers:")
+        for provider in missing_keys:
+            print(f"   - {provider.upper()}_API_KEY")
+        print("\nPlease set the required environment variables or use a .env file")
+        sys.exit(1)
+
+def output_results(result, output_path: Optional[str]):
+    """Output optimization results.
+
+    Writes a JSON summary to output_path when given, otherwise prints a
+    human-readable report to stdout.
+
+    Args:
+        result: Optimization result exposing prompt, original_prompt,
+                improvement_data, optimization_time, status and session_id.
+        output_path: Destination file path, or None for stdout.
+    """
+    output_data = {
+        "optimized_prompt": result.prompt,
+        "original_prompt": result.original_prompt,
+        "improvement_metrics": result.improvement_data,
+        "optimization_time": result.optimization_time,
+        "status": result.status,
+        "session_id": result.session_id
+    }
+
+    if output_path:
+        with open(output_path, 'w') as f:
+            json.dump(output_data, f, indent=2)
+        print(f"๐ Results saved to: {output_path}")
+    else:
+        print("\n๐ Optimization Results:")
+        print(f"Session ID: {result.session_id}")
+        print(f"Status: {result.status}")
+        print(f"Time: {result.optimization_time:.2f}s")
+        print(f"\nOriginal Prompt:\n{result.original_prompt}")
+        print(f"\nOptimized Prompt:\n{result.prompt}")
+
+        # Improvement percentage is optional in improvement_data.
+        if 'improvement_percent' in result.improvement_data:
+            print(f"\nImprovement: {result.improvement_data['improvement_percent']:.2f}%")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/gepa_optimizer/core/__init__.py b/src/gepa_optimizer/core/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..41630b803c65248370baa2b0874fdf287ed0d052
--- /dev/null
+++ b/src/gepa_optimizer/core/__init__.py
@@ -0,0 +1,8 @@
+"""
+Core functionality for GEPA Universal Prompt Optimizer
+"""
+
+from .optimizer import GepaOptimizer
+from .result import ResultProcessor
+
+__all__ = ["GepaOptimizer", "ResultProcessor"]
diff --git a/src/gepa_optimizer/core/base_adapter.py b/src/gepa_optimizer/core/base_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f1ff8ea3eb47fbbd0ffc89b011fe233cafc68ce
--- /dev/null
+++ b/src/gepa_optimizer/core/base_adapter.py
@@ -0,0 +1,85 @@
+"""
+Base adapter class for all GEPA adapters.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Optional
+import logging
+from gepa.core.adapter import GEPAAdapter, EvaluationBatch
+
+from ..llms.base_llm import BaseLLMClient
+from ..evaluation.base_evaluator import BaseEvaluator
+
+logger = logging.getLogger(__name__)
+
+class BaseGepaAdapter(GEPAAdapter, ABC):
+    """
+    Abstract base class for GEPA adapters.
+
+    Provides the foundation for creating task-specific adapters while
+    maintaining compatibility with the GEPA framework. Subclasses must
+    implement evaluate() and make_reflective_dataset().
+    """
+
+    def __init__(self, llm_client: BaseLLMClient, evaluator: BaseEvaluator):
+        """
+        Initialize adapter with LLM client and evaluator.
+
+        Args:
+            llm_client: LLM client for generating responses
+            evaluator: Evaluator for scoring predictions
+
+        Raises:
+            TypeError: If either argument is not of the required base type.
+        """
+        if not isinstance(llm_client, BaseLLMClient):
+            raise TypeError("llm_client must be an instance of BaseLLMClient")
+        if not isinstance(evaluator, BaseEvaluator):
+            raise TypeError("evaluator must be an instance of BaseEvaluator")
+
+        self.llm_client = llm_client
+        self.evaluator = evaluator
+        # Per-subclass logger, e.g. "<module>.CustomGepaAdapter".
+        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
+
+        # Performance tracking -- updated by subclasses during evaluate().
+        self._evaluation_count = 0
+        self._best_score = 0.0
+        self._best_candidate = None
+
+    @abstractmethod
+    def evaluate(self, batch: List[Dict[str, Any]], candidate: Dict[str, str],
+                 capture_traces: bool = False) -> EvaluationBatch:
+        """
+        Evaluate candidate on a batch of data.
+
+        Args:
+            batch: List of data items to evaluate
+            candidate: Prompt candidate to evaluate
+            capture_traces: Whether to capture detailed traces
+
+        Returns:
+            EvaluationBatch with outputs, scores, and optional trajectories
+        """
+        pass
+
+    @abstractmethod
+    def make_reflective_dataset(self, candidate: Dict[str, str],
+                               eval_batch: EvaluationBatch,
+                               components_to_update: List[str]) -> Dict[str, List[Dict[str, Any]]]:
+        """
+        Create reflective dataset for GEPA's reflection process.
+
+        Args:
+            candidate: Current prompt candidate
+            eval_batch: Results from evaluation
+            components_to_update: List of components to update
+
+        Returns:
+            Dictionary mapping components to reflection data
+        """
+        pass
+
+    def get_performance_stats(self) -> Dict[str, Any]:
+        """Get performance statistics for monitoring"""
+        # NOTE(review): assumes llm_client implements get_model_info() --
+        # not visible here; confirm against BaseLLMClient.
+        return {
+            'evaluation_count': self._evaluation_count,
+            'best_score': self._best_score,
+            'model_info': self.llm_client.get_model_info(),
+            'evaluator_class': self.evaluator.__class__.__name__
+        }
diff --git a/src/gepa_optimizer/core/custom_adapter.py b/src/gepa_optimizer/core/custom_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f0d32ddc2a39b866d77fa5693be5e178cf20d09
--- /dev/null
+++ b/src/gepa_optimizer/core/custom_adapter.py
@@ -0,0 +1,389 @@
+"""
+Custom GEPA Adapter for the GEPA Universal Prompt Optimizer
+"""
+
+import json
+import logging
+import re
+from typing import Any, Dict, List, Optional
+
+# Import ModelConfig
+from ..models import ModelConfig
+
+from gepa.core.adapter import GEPAAdapter, EvaluationBatch
+from ..llms.vision_llm import VisionLLMClient
+from ..evaluation.ui_evaluator import UITreeEvaluator
+from .base_adapter import BaseGepaAdapter
+
+logger = logging.getLogger(__name__)
+
+class CustomGepaAdapter(BaseGepaAdapter):
+ """
+ Custom adapter for the GEPA Universal Prompt Optimizer.
+ """
+
+    def __init__(self, model_config: 'ModelConfig', metric_weights: Optional[Dict[str, float]] = None):
+        """Initialize the custom GEPA adapter with model configuration.
+
+        Args:
+            model_config: A ModelConfig instance; any other value is coerced
+                via str() into an OpenAI model name with no API key.
+            metric_weights: Optional per-metric weights for UITreeEvaluator.
+        """
+        # Convert string model to ModelConfig if needed.
+        # NOTE: the fallback hard-codes provider='openai' with api_key=None.
+        if not isinstance(model_config, ModelConfig):
+            model_config = ModelConfig(
+                provider='openai',
+                model_name=str(model_config),
+                api_key=None
+            )
+
+        # Initialize components
+        llm_client = VisionLLMClient(
+            provider=model_config.provider,
+            model_name=model_config.model_name,
+            api_key=model_config.api_key,
+            base_url=model_config.base_url,
+            temperature=model_config.temperature,
+            max_tokens=model_config.max_tokens,
+            top_p=model_config.top_p,
+            frequency_penalty=model_config.frequency_penalty,
+            presence_penalty=model_config.presence_penalty
+        )
+
+        evaluator = UITreeEvaluator(metric_weights=metric_weights)
+
+        # Initialize parent class
+        super().__init__(llm_client, evaluator)
+
+        # Track candidates for logging (re-initializes the parent's counter).
+        self._last_candidate = None
+        self._evaluation_count = 0
+
+        self.logger.info(f"๐ Initialized UI Tree adapter with {model_config.provider}/{model_config.model_name}")
+
+    def _parse_json_safely(self, json_str: str) -> Dict[str, Any]:
+        """Safely parse JSON string to dictionary with enhanced parsing and repair.
+
+        Tries, in order: direct json.loads, a ```json fenced block, the first
+        {...} span, then _repair_json. Returns {} (never raises) on failure
+        or when the input is empty / not a string.
+        """
+        if not json_str or not isinstance(json_str, str):
+            return {}
+
+        # Try direct parsing first
+        try:
+            return json.loads(json_str)
+        except json.JSONDecodeError:
+            pass
+
+        # Try to extract JSON from markdown code blocks
+        json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', json_str, re.DOTALL)
+        if json_match:
+            try:
+                return json.loads(json_match.group(1))
+            except json.JSONDecodeError:
+                pass
+
+        # Try to find JSON object in the string
+        json_match = re.search(r'\{.*\}', json_str, re.DOTALL)
+        if json_match:
+            try:
+                return json.loads(json_match.group(0))
+            except json.JSONDecodeError:
+                pass
+
+        # Try repair and parse
+        repaired_json = self._repair_json(json_str)
+        if repaired_json:
+            try:
+                return json.loads(repaired_json)
+            except json.JSONDecodeError:
+                pass
+
+        self.logger.warning(f"Failed to parse JSON: {json_str[:100]}...")
+        return {}
+
+    def _repair_json(self, json_str: str) -> str:
+        """Attempt to repair common JSON issues.
+
+        Strips markdown fences, trims to the outermost {...}, removes
+        trailing commas, and quotes bare keys. Returns "" if repair fails.
+
+        NOTE(review): the bare-key regex can also rewrite `word:` sequences
+        inside string values; acceptable here since the result is re-parsed,
+        but verify before reusing elsewhere.
+        """
+        try:
+            # Remove markdown formatting
+            json_str = re.sub(r'```(?:json)?\s*', '', json_str)
+            json_str = re.sub(r'```\s*$', '', json_str)
+
+            # Remove extra text before/after JSON
+            json_match = re.search(r'\{.*\}', json_str, re.DOTALL)
+            if json_match:
+                json_str = json_match.group(0)
+
+            # Fix common issues
+            json_str = re.sub(r',\s*}', '}', json_str)  # Remove trailing commas
+            json_str = re.sub(r',\s*]', ']', json_str)  # Remove trailing commas in arrays
+            json_str = re.sub(r'([{,]\s*)(\w+):', r'\1"\2":', json_str)  # Quote unquoted keys
+
+            return json_str
+        except Exception as e:
+            self.logger.warning(f"๐ง JSON repair failed: {e}")
+            return ""
+
+ def evaluate(
+ self,
+ batch: List[Dict[str, Any]],
+ candidate: Dict[str, str],
+ capture_traces: bool = False,
+ ) -> EvaluationBatch:
+ """Evaluate the candidate on a batch of data."""
+ outputs = []
+ scores = []
+ trajectories = [] if capture_traces else None
+
+ system_prompt = candidate.get('system_prompt', '')
+
+ # Check if this is a new candidate (different from last one)
+ if self._last_candidate != system_prompt:
+ self._evaluation_count += 1
+ self.log_proposed_candidate(candidate, self._evaluation_count)
+ self._last_candidate = system_prompt
+
+ self.logger.info(f"๐ Evaluating {len(batch)} samples with prompt: '{system_prompt[:50]}...'")
+
+ for i, item in enumerate(batch):
+ input_text = item.get('input', '')
+ image_base64 = item.get('image', '')
+ ground_truth_json = item.get('output', '')
+
+ # Call the LLM client
+ llm_response = self.llm_client.generate(system_prompt, input_text, image_base64=image_base64)
+
+ # Extract content from the response dictionary
+ if isinstance(llm_response, dict):
+ llm_output_json_str = llm_response.get("content", "")
+ if not llm_output_json_str:
+ llm_output_json_str = str(llm_response)
+ else:
+ llm_output_json_str = str(llm_response) if llm_response else ""
+
+ # ๐ DEBUG: Log essential info only (removed verbose JSON content)
+ self.logger.debug(f"๐ Sample {i+1} - LLM Response Type: {type(llm_response)}")
+ self.logger.debug(f"๐ Sample {i+1} - Response Length: {len(llm_output_json_str)} chars")
+
+ outputs.append(llm_output_json_str)
+
+ # Parse JSON strings to dictionaries for evaluation
+ llm_output_dict = self._parse_json_safely(llm_output_json_str)
+ ground_truth_dict = self._parse_json_safely(ground_truth_json)
+
+ # Initialize evaluation_results with default values
+ evaluation_results = {
+ "composite_score": 0.0,
+ "element_completeness": 0.0,
+ "element_type_accuracy": 0.0,
+ "text_content_accuracy": 0.0,
+ "hierarchy_accuracy": 0.0,
+ "style_accuracy": 0.0
+ }
+
+ # Calculate composite score and evaluation results
+ if not llm_output_dict and not ground_truth_dict:
+ composite_score = 0.1
+ evaluation_results = {k: 0.1 for k in evaluation_results.keys()}
+ self.logger.warning(f"โ ๏ธ Sample {i+1}: Empty results - using default score: {composite_score}")
+ elif not llm_output_dict or not ground_truth_dict:
+ composite_score = 0.05
+ evaluation_results = {k: 0.05 for k in evaluation_results.keys()}
+ self.logger.warning(f"โ ๏ธ Sample {i+1}: Incomplete results - using low score: {composite_score}")
+ else:
+ # Calculate score using evaluator with parsed dictionaries
+ evaluation_results = self.evaluator.evaluate(llm_output_dict, ground_truth_dict)
+ composite_score = evaluation_results["composite_score"]
+
+ # Clean, readable logging (removed verbose JSON dumps)
+ llm_children = len(llm_output_dict.get('children', []))
+ gt_children = len(ground_truth_dict.get('children', []))
+
+ if composite_score < 0.1:
+ self.logger.warning(f"โ ๏ธ Sample {i+1}: Low score {composite_score:.4f} - LLM: {llm_children} elements, GT: {gt_children} elements")
+ self.logger.debug(f" Score breakdown: {evaluation_results}")
+ else:
+ self.logger.info(f"โ
Sample {i+1}: Score {composite_score:.4f} - LLM: {llm_children} elements, GT: {gt_children} elements")
+
+ scores.append(composite_score)
+
+ if capture_traces:
+ trajectories.append({
+ 'input_text': input_text,
+ 'image_base64': image_base64,
+ 'ground_truth_json': ground_truth_json,
+ 'llm_output_json': llm_output_json_str,
+ 'evaluation_results': evaluation_results
+ })
+
+ avg_score = sum(scores) / len(scores) if scores else 0.0
+
+ # Update performance tracking (handled by parent class)
+ if avg_score > self._best_score:
+ self._best_score = avg_score
+ self._best_candidate = candidate.copy()
+ self.logger.info(f"๐ฏ New best candidate found with score: {avg_score:.4f}")
+
+ self.logger.info(f"๐ Batch evaluation complete - Average score: {avg_score:.4f}")
+
+ return EvaluationBatch(outputs=outputs, scores=scores, trajectories=trajectories)
+
+    def make_reflective_dataset(
+        self,
+        candidate: Dict[str, str],
+        eval_batch: EvaluationBatch,
+        components_to_update: List[str],
+    ) -> Dict[str, List[Dict[str, Any]]]:
+        """Create a reflective dataset from the evaluation results.
+
+        Every component in components_to_update receives the same per-sample
+        records (prompt, input, output, ground truth, score, feedback).
+
+        NOTE: eval_batch.trajectories is iterated directly, so evaluate()
+        must have been called with capture_traces=True; otherwise this
+        raises TypeError on the None trajectories.
+        """
+        reflective_dataset = {}
+        system_prompt = candidate.get('system_prompt', '')
+
+        # ๐ฏ NEW: Log the proposed new prompt being evaluated
+        self.logger.info(f"๐ Creating reflection dataset for prompt: '{system_prompt[:100]}...'")
+
+        # Pretty print reflection dataset creation
+        self._log_reflection_dataset_creation(candidate, eval_batch, components_to_update)
+
+        for component in components_to_update:
+            reflective_dataset[component] = []
+            for i, trace in enumerate(eval_batch.trajectories):
+                feedback = self._generate_feedback(trace['evaluation_results'])
+                reflective_dataset[component].append({
+                    "current_prompt": system_prompt,
+                    "input_text": trace['input_text'],
+                    "image_base64": trace['image_base64'],
+                    "generated_json": trace['llm_output_json'],
+                    "ground_truth_json": trace['ground_truth_json'],
+                    "score": trace['evaluation_results']["composite_score"],
+                    "feedback": feedback,
+                    "detailed_scores": trace['evaluation_results']
+                })
+
+        # ๐ฏ NEW: Log reflection dataset summary
+        total_samples = sum(len(data) for data in reflective_dataset.values())
+        avg_score = sum(trace['score'] for data in reflective_dataset.values() for trace in data) / total_samples if total_samples > 0 else 0.0
+        self.logger.info(f"๐ Reflection dataset created - {total_samples} samples, avg score: {avg_score:.4f}")
+
+        return reflective_dataset
+
+    def _generate_feedback(self, evaluation_results: Dict[str, float]) -> str:
+        """Generate textual feedback based on evaluation results.
+
+        Builds one overall-quality sentence keyed off composite_score
+        (>=0.8 good, >=0.5 moderate, else low) plus one sentence per metric
+        that falls below the 0.7 threshold. Missing metrics default to 0.0.
+        """
+        composite_score = evaluation_results.get("composite_score", 0.0)
+
+        feedback_parts = []
+
+        # Overall quality assessment
+        if composite_score >= 0.8:
+            feedback_parts.append("The overall quality is good.")
+        elif composite_score >= 0.5:
+            feedback_parts.append("The overall quality is moderate.")
+        else:
+            feedback_parts.append("The overall quality is low. Focus on fundamental accuracy.")
+
+        # Specific metric feedback
+        if evaluation_results.get("element_completeness", 0.0) < 0.7:
+            feedback_parts.append("Element completeness is low. Ensure all UI elements are captured.")
+
+        if evaluation_results.get("element_type_accuracy", 0.0) < 0.7:
+            feedback_parts.append("Element type accuracy is low. Verify correct UI element identification (Button, Text, Image, etc.).")
+
+        if evaluation_results.get("text_content_accuracy", 0.0) < 0.7:
+            feedback_parts.append("Text content accuracy is low. Improve text extraction fidelity.")
+
+        if evaluation_results.get("hierarchy_accuracy", 0.0) < 0.7:
+            feedback_parts.append("Hierarchy accuracy is low. Ensure correct parent-child relationships.")
+
+        if evaluation_results.get("style_accuracy", 0.0) < 0.7:
+            feedback_parts.append("Style accuracy is low. Capture more styling properties (colors, sizes, positioning).")
+
+        return " ".join(feedback_parts)
+
+    def get_best_candidate(self) -> Optional[Dict[str, str]]:
+        """Return the best candidate (component name -> prompt text) found so far.
+
+        May be None when no candidate has been recorded yet (per the Optional
+        return annotation).
+        """
+        return self._best_candidate
+
+    def get_best_score(self) -> float:
+        """Return the highest score observed so far (tracked in self._best_score)."""
+        return self._best_score
+
+ def log_proposed_candidate(self, candidate: Dict[str, str], iteration: int = 0):
+ """
+ Log the new proposed candidate prompt.
+
+ Args:
+ candidate: The new candidate prompt from GEPA
+ iteration: Current optimization iteration
+ """
+ system_prompt = candidate.get('system_prompt', '')
+
+ logger.info("="*80)
+ logger.info(f"NEW PROPOSED CANDIDATE (Iteration {iteration})")
+ logger.info("="*80)
+ logger.info(f"PROPOSED PROMPT:")
+ logger.info("-" * 40)
+ logger.debug(f'"{system_prompt}"')
+ logger.info("-" * 40)
+ logger.info(f"Prompt Length: {len(system_prompt)} characters")
+ logger.info(f"Word Count: {len(system_prompt.split())} words")
+ logger.info("="*80)
+
+ def _log_reflection_dataset_creation(self, candidate: Dict[str, str], eval_batch: EvaluationBatch,
+ components_to_update: List[str]):
+ """
+ Log the reflection dataset creation process.
+
+ Args:
+ candidate: Current candidate being evaluated
+ eval_batch: Evaluation results
+ components_to_update: Components being updated
+ """
+ system_prompt = candidate.get('system_prompt', '')
+
+ logger.info("="*80)
+ logger.info("REFLECTION DATASET CREATION")
+ logger.info("="*80)
+
+ logger.info(f"CURRENT PROMPT BEING ANALYZED:")
+ logger.info("-" * 40)
+ logger.debug(f'"{system_prompt}"')
+ logger.info("-" * 40)
+
+ logger.info(f"EVALUATION SUMMARY:")
+ logger.info("-" * 40)
+ if eval_batch.scores:
+ avg_score = sum(eval_batch.scores) / len(eval_batch.scores)
+ min_score = min(eval_batch.scores)
+ max_score = max(eval_batch.scores)
+ logger.info(f" Average Score: {avg_score:.4f}")
+ logger.info(f" Min Score: {min_score:.4f}")
+ logger.info(f" Max Score: {max_score:.4f}")
+ logger.info(f" Total Samples: {len(eval_batch.scores)}")
+
+ logger.info(f"COMPONENTS TO UPDATE:")
+ logger.info("-" * 40)
+ for i, component in enumerate(components_to_update, 1):
+ logger.info(f" {i}. {component}")
+
+ if eval_batch.trajectories:
+ logger.debug(f"DETAILED ANALYSIS:")
+ logger.debug("-" * 40)
+ for i, trace in enumerate(eval_batch.trajectories[:3], 1): # Show first 3 samples
+ evaluation_results = trace['evaluation_results']
+ composite_score = evaluation_results.get("composite_score", 0.0)
+
+ logger.debug(f" Sample {i} (Score: {composite_score:.4f}):")
+
+ # Show input data (truncated)
+ input_text = trace['input_text'][:100] + "..." if len(trace['input_text']) > 100 else trace['input_text']
+ logger.debug(f" Input: \"{input_text}\"")
+
+ # Show predicted output (truncated)
+ predicted_output = trace['llm_output_json'][:100] + "..." if len(trace['llm_output_json']) > 100 else trace['llm_output_json']
+ logger.debug(f" Output: \"{predicted_output}\"")
+
+ # Show detailed scores
+ logger.debug(f" Detailed Scores:")
+ for metric, score in evaluation_results.items():
+ if metric != "composite_score":
+ logger.debug(f" {metric.replace('_', ' ').title()}: {score:.4f}")
+
+ # Show generated feedback
+ feedback = self._generate_feedback(evaluation_results)
+ logger.debug(f" Feedback: \"{feedback}\"")
+
+ if len(eval_batch.trajectories) > 3:
+ logger.debug(f" ... and {len(eval_batch.trajectories) - 3} more samples")
+
+ logger.info("="*80)
diff --git a/src/gepa_optimizer/core/optimizer.py b/src/gepa_optimizer/core/optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..30271dce298b80e1dc6b4478bb69aa8e2766b65c
--- /dev/null
+++ b/src/gepa_optimizer/core/optimizer.py
@@ -0,0 +1,1279 @@
+"""
+Main GepaOptimizer class - the heart of the optimization system
+"""
+
+import time
+import logging
+from typing import Any, Dict, List, Optional, Union
+import asyncio
+import io
+import sys
+from contextlib import redirect_stdout, redirect_stderr
+
+import gepa
+from ..utils.api_keys import APIKeyManager
+from .result import ResultProcessor
+from ..data.converters import UniversalConverter
+from ..models.result import OptimizationResult, OptimizedResult
+from ..models.config import OptimizationConfig, ModelConfig
+from ..utils.helpers import sanitize_prompt
+from ..utils.exceptions import GepaDependencyError, InvalidInputError, DatasetError, GepaOptimizerError
+
+logger = logging.getLogger(__name__)
+
+class GepaOptimizer:
+ """
+ Main class for prompt optimization using GEPA
+
+ This is the primary interface that users interact with.
+ Provides both simple and advanced optimization capabilities.
+ """
+
+ def __init__(self, config: Optional[OptimizationConfig] = None,
+ adapter_type: str = "universal",
+ custom_adapter: Optional[Any] = None,
+ llm_model_name: Optional[str] = None,
+ metric_weights: Optional[Dict[str, float]] = None,
+ **kwargs):
+ """
+ Initialize the optimizer
+
+ Args:
+ config: Optimization configuration (required)
+ adapter_type: Type of adapter to use ("universal" only - fully configurable)
+ custom_adapter: Custom adapter instance (overrides adapter_type)
+ llm_model_name: [Deprecated] Use config.model instead. Will be removed in future versions.
+ metric_weights: [Deprecated] Not used - evaluator handles metrics. Will be removed in future versions.
+ **kwargs: Additional parameters for universal adapter (llm_client, evaluator, etc.)
+
+ Raises:
+ ValueError: If required configuration is missing
+ GepaDependencyError: If GEPA library is not available
+ """
+ if config is None:
+ raise ValueError("config parameter is required. Use OptimizationConfig to configure the optimizer.")
+
+ # Initialize logger first
+ self.logger = logging.getLogger(__name__)
+
+ self.config = config
+ self.converter = UniversalConverter(data_split_config=config.data_split)
+ self.api_manager = APIKeyManager()
+ self.result_processor = ResultProcessor()
+
+ # Initialize adapter based on configuration
+ if custom_adapter:
+ # User provided custom adapter
+ from .base_adapter import BaseGepaAdapter
+ if not isinstance(custom_adapter, BaseGepaAdapter):
+ raise TypeError("custom_adapter must be an instance of BaseGepaAdapter")
+ self.adapter = custom_adapter
+ self.logger.info("Using user-provided custom adapter")
+ elif adapter_type == "universal":
+ # Universal adapter requires user to provide components
+ llm_client = kwargs.get('llm_client')
+ evaluator = kwargs.get('evaluator')
+
+ if not llm_client or not evaluator:
+ raise ValueError(
+ "llm_client and evaluator are required for universal adapter. "
+ "Example: GepaOptimizer(config=config, adapter_type='universal', "
+ "llm_client=llm_client, evaluator=evaluator)"
+ )
+
+ from .universal_adapter import UniversalGepaAdapter
+ self.adapter = UniversalGepaAdapter(
+ llm_client=llm_client,
+ evaluator=evaluator,
+ data_converter=kwargs.get('data_converter')
+ )
+ self.logger.info("Using universal adapter")
+ else:
+ raise ValueError(
+ f"Unknown adapter_type: {adapter_type}. "
+ f"Only 'universal' is supported. "
+ f"Provide llm_client and evaluator when using universal adapter."
+ )
+
+ # Keep backward compatibility
+ self.custom_adapter = self.adapter
+
+ # Log model configuration
+ model_info = self.adapter.get_performance_stats()
+ self.logger.info(f"Initialized adapter: {model_info}")
+
+ # Set up logging
+ logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+
+ # Validate GEPA availability
+ if gepa is None:
+ raise GepaDependencyError("GEPA library is not available. Please install it with: pip install gepa")
+
+ async def train(self,
+ seed_prompt: str,
+ dataset: Union[List[Any], str, Dict, Any],
+ **kwargs) -> OptimizedResult:
+ """
+ Main training method for prompt optimization
+
+ Args:
+ seed_prompt: Initial prompt to optimize
+ dataset: Training data in any format
+ **kwargs: Additional parameters that can override config
+
+ Returns:
+ OptimizedResult: Optimization result with improved prompt
+
+ Raises:
+ InvalidInputError: For invalid input parameters
+ DatasetError: For issues with dataset processing
+ GepaOptimizerError: For optimization failures
+ """
+ start_time = time.time()
+ session_id = f"opt_{int(start_time)}_{id(self)}"
+
+ try:
+ self.logger.info(f"Starting optimization session: {session_id}")
+ self.logger.info(f"Using model: {self.config.model.model_name} (provider: {self.config.model.provider})")
+
+ # #region agent log
+ import json as _json_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "E", "location": "optimizer.py:train_start", "message": "Optimization train() started", "data": {"session_id": session_id, "max_iterations": self.config.max_iterations}, "timestamp": int(time.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+
+ # ๐ฅ FIX E: Reset Pareto logger at start of each optimization run
+ from ..utils.pareto_logger import reset_pareto_logger
+ reset_pareto_logger()
+ self.logger.info("โ
Reset Pareto logger for new optimization run")
+
+ # Update config with any overrides from kwargs
+ self._update_config_from_kwargs(kwargs)
+
+ # Step 1: Validate inputs
+ self._validate_inputs(seed_prompt)
+
+ # Step 2: Convert dataset to GEPA format with 3-way split
+ # ๐ฅ FIX: Support pre-split datasets (user-provided train/val/test)
+ if isinstance(dataset, dict) and all(k in dataset for k in ['train', 'val', 'test']):
+ # User provided pre-split dataset - use it directly
+ self.logger.info("โ
Detected pre-split dataset - using user's split (no re-splitting)")
+ trainset_raw = dataset.get('train', [])
+ valset_raw = dataset.get('val', [])
+ testset_raw = dataset.get('test', [])
+
+ # Still need to standardize the format (convert to GEPA format)
+ trainset = self.converter._standardize(trainset_raw)
+ valset = self.converter._standardize(valset_raw)
+ testset = self.converter._standardize(testset_raw) if testset_raw else []
+
+ self.logger.info(
+ f"Using pre-split dataset: {len(trainset)} train (Dfeedback), "
+ f"{len(valset)} val (Dpareto), {len(testset)} test (held-out)"
+ )
+ else:
+ # Standard path: convert and split automatically
+ self.logger.info("Converting dataset to GEPA format with 3-way split...")
+ trainset, valset, testset = self.converter.convert(
+ dataset,
+ split_config=self.config.data_split
+ )
+
+ # Log split with adaptive strategy info
+ split_strategy = self.config.data_split.small_dataset_strategy
+ strategy_note = ""
+ if split_strategy == 'adaptive':
+ total_size = len(trainset) + len(valset) + len(testset)
+ train_ratio, val_ratio, test_ratio = self.config.data_split.get_adaptive_ratios(total_size)
+ strategy_note = f" (adaptive: {train_ratio*100:.0f}%/{val_ratio*100:.0f}%/{test_ratio*100:.0f}% ratios)"
+ self.logger.info(
+ f"Dataset split{strategy_note}: {len(trainset)} train (Dfeedback), "
+ f"{len(valset)} val (Dpareto), {len(testset)} test (held-out)"
+ )
+
+ if not trainset:
+ raise DatasetError("Dataset appears to be empty after conversion")
+
+ # Step 3: Create seed candidate
+ seed_candidate = self._create_seed_candidate(seed_prompt)
+
+ # ๐ฅ CRITICAL: Set valset info in adapter BEFORE baseline evaluation
+ # This ensures adapter correctly detects 'dpareto' dataset type
+ # Use direct assignment (don't rely on hasattr) to ensure attributes are set
+ try:
+ self.adapter._valset_size = len(valset) if valset else 0
+ self.logger.info(f"โ
Set valset_size in adapter: {len(valset) if valset else 0} for Dpareto detection")
+ except AttributeError:
+ self.logger.warning("โ ๏ธ Could not set _valset_size in adapter - attribute not supported")
+
+ try:
+ self.adapter._valset = valset
+ self.logger.info(f"โ
Stored valset in adapter ({len(valset) if valset else 0} samples)")
+ except AttributeError:
+ self.logger.warning("โ ๏ธ Could not set _valset in adapter - attribute not supported")
+
+ # Step 3.5: Calculate baseline score on VALIDATION set (not test set)
+ # This ensures fair comparison since optimization uses validation set for Pareto selection
+ baseline_val_score = None
+ if valset:
+ self.logger.info("๐ Evaluating seed prompt on validation set for baseline...")
+ # Set baseline flag so adapter knows this is baseline, not optimization
+ # Use direct assignment to ensure the flag is set
+ try:
+ self.adapter._is_baseline_evaluation = True
+ self.logger.info("โ
Set baseline evaluation flag in adapter")
+ except AttributeError:
+ self.logger.warning("โ ๏ธ Could not set _is_baseline_evaluation in adapter")
+
+ try:
+ # Evaluate on validation set (same as what GEPA will use for Pareto selection)
+ eval_result = self.adapter.evaluate(
+ batch=valset,
+ candidate=seed_candidate,
+ capture_traces=False
+ )
+ baseline_val_score = sum(eval_result.scores) / len(eval_result.scores) if eval_result.scores else 0.0
+ self.logger.info(f"๐ Baseline validation score: {baseline_val_score:.4f} (on {len(valset)} samples)")
+
+ # Store baseline in adapter for later use
+ if hasattr(self.adapter, '_baseline_score'):
+ self.adapter._baseline_score = baseline_val_score
+
+ # ๐ฅ CRITICAL FIX: Also set baseline in Pareto logger
+ # This ensures candidates can be properly evaluated against baseline
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+ pareto_log.set_baseline(baseline_val_score)
+ self.logger.info(f"โ
Baseline set in Pareto logger: {baseline_val_score:.4f}")
+
+ except Exception as e:
+ self.logger.warning(f"Baseline evaluation failed: {e}")
+ import traceback
+ self.logger.debug(f"Baseline evaluation error: {traceback.format_exc()}")
+ finally:
+ try:
+ self.adapter._is_baseline_evaluation = False
+ self.logger.debug("โ
Reset baseline evaluation flag - optimization can begin")
+ except AttributeError:
+ pass # Ignore if attribute not supported
+
+ # Step 4: Run GEPA optimization
+ self.logger.info("Starting GEPA optimization...")
+ gepa_result, actual_iterations = await self._run_gepa_optimization(
+ adapter=self.adapter,
+ seed_candidate=seed_candidate,
+ trainset=trainset,
+ valset=valset,
+ **kwargs
+ )
+
+ # Step 5: Extract best candidate
+ best_candidate = self._extract_best_candidate(gepa_result)
+
+ # ๐ฅ CRITICAL: Extract optimized prompt from best_candidate
+ # This is the actual optimized prompt that GEPA found
+ self.logger.info(f"\n{'โ'*80}")
+ self.logger.info(f"๐ EXTRACTING OPTIMIZED PROMPT FROM GEPA RESULT")
+ self.logger.info(f"{'โ'*80}")
+ self.logger.info(f"best_candidate keys: {list(best_candidate.keys()) if isinstance(best_candidate, dict) else 'N/A'}")
+
+ optimized_prompt = best_candidate.get('system_prompt', seed_prompt)
+ if not optimized_prompt or optimized_prompt.strip() == '':
+ # Fallback: try other keys or use seed prompt
+ optimized_prompt = best_candidate.get('prompt', best_candidate.get('text', seed_prompt))
+
+ # Get fitness score if available
+ best_fitness = best_candidate.get('fitness') or self.adapter.get_best_score() if hasattr(self.adapter, 'get_best_score') else None
+ candidate_source = best_candidate.get('source', 'unknown')
+
+ self.logger.info(f"\nโ
EXTRACTED OPTIMIZED PROMPT:")
+ self.logger.info(f" Source: {candidate_source}")
+ if best_fitness is not None:
+ self.logger.info(f" Fitness: f={best_fitness:.4f}")
+ self.logger.info(f" Length: {len(optimized_prompt)} characters")
+ self.logger.info(f" Words: {len(optimized_prompt.split())} words")
+ self.logger.info(f"\n๐ FULL OPTIMIZED PROMPT TEXT:")
+ self.logger.info(f"{'โ'*80}")
+ self.logger.info(optimized_prompt)
+ self.logger.info(f"{'โ'*80}")
+
+ if optimized_prompt != seed_prompt:
+ self.logger.info(f"\nโ
SUCCESS: Prompt WAS OPTIMIZED!")
+ self.logger.info(f" Seed length: {len(seed_prompt)} chars")
+ self.logger.info(f" Optimized length: {len(optimized_prompt)} chars")
+ self.logger.info(f" Difference: {len(optimized_prompt) - len(seed_prompt):+d} chars")
+ if best_fitness is not None:
+ baseline_fitness = 0.5 # Default baseline, could be improved
+ improvement = best_fitness - baseline_fitness
+ improvement_pct = (improvement / baseline_fitness * 100) if baseline_fitness > 0 else 0
+ self.logger.info(f" Fitness: f={best_fitness:.4f} (improvement: {improvement:+.4f} ({improvement_pct:+.1f}%))")
+ else:
+ self.logger.warning(f"\nโ ๏ธ WARNING: Optimized prompt is IDENTICAL to seed prompt")
+ self.logger.warning(f" This means GEPA didn't modify the prompt during optimization")
+ if best_fitness is not None:
+ self.logger.warning(f" Best fitness found: f={best_fitness:.4f}")
+ self.logger.warning(f" ๐ก Check if LLEGO best candidate is being properly extracted")
+
+ self.logger.info(f"{'โ'*80}\n")
+
+ # Step 5.5: Calculate improvement metrics (validation vs validation)
+ optimized_test_score = None
+ improvement_data = {}
+
+ # ๐ฅ FIX: Calculate improvement based on VALIDATION scores (fair comparison)
+ # Compare optimized VALIDATION score vs validation baseline (both on Dpareto)
+ # This ensures fair comparison - both evaluated on the same validation set
+ optimized_val_score = best_fitness # Best candidate's fitness is from validation set (Dpareto)
+
+ if baseline_val_score is not None and optimized_val_score is not None:
+ absolute_improvement = optimized_val_score - baseline_val_score
+ relative_improvement = (
+ (absolute_improvement / baseline_val_score * 100)
+ if baseline_val_score > 0 else 0
+ )
+
+ improvement_data = {
+ 'baseline_val_score': baseline_val_score,
+ 'optimized_val_score': optimized_val_score,
+ 'absolute_improvement': absolute_improvement,
+ 'relative_improvement_percent': relative_improvement
+ }
+
+ self.logger.info(
+ f"๐ Validation improvement: {relative_improvement:+.2f}% "
+ f"(baseline val: {baseline_val_score:.4f} โ optimized val: {optimized_val_score:.4f})"
+ )
+
+ # Step 5.6: Evaluate optimized prompt on test set (if available) for final reporting
+ if testset and self.config.evaluate_on_test:
+ self.logger.info("๐ Evaluating optimized prompt on test set...")
+
+ # ๐ฅ CRITICAL FIX: Clear LLEGO candidate queue before test evaluation
+ # This prevents the LLEGO wrapper from intercepting test evaluation calls
+ # and returning wrong candidates instead of actually running the optimized prompt
+ from ..llms.llego_enhanced_llm import LLEGOEnhancedLLMClient
+ if hasattr(self.adapter, 'llm_client') and isinstance(self.adapter.llm_client, LLEGOEnhancedLLMClient):
+ if hasattr(self.adapter.llm_client, '_adapter_generated_candidates'):
+ self.adapter.llm_client._adapter_generated_candidates = []
+ self.logger.info("โ
Cleared LLEGO candidate queue for clean test evaluation")
+ if hasattr(self.adapter.llm_client, '_candidate_queue'):
+ self.adapter.llm_client._candidate_queue = []
+ self.logger.info("โ
Cleared LLEGO hybrid candidate queue for clean test evaluation")
+
+ # Evaluate on test set for final reporting (but improvement is based on validation)
+ try:
+ optimized_test_score = self._evaluate_candidate_on_testset(
+ best_candidate,
+ testset
+ )
+ self.logger.info(f"๐ Optimized test score: {optimized_test_score:.4f}")
+
+ # Add test score to improvement_data for reference (but improvement is based on validation)
+ improvement_data['optimized_test_score'] = optimized_test_score
+
+ if baseline_val_score is not None:
+ test_vs_baseline = (
+ ((optimized_test_score - baseline_val_score) / baseline_val_score * 100)
+ if baseline_val_score > 0 else 0
+ )
+ self.logger.info(
+ f"๐ Test set vs validation baseline: {test_vs_baseline:+.2f}% "
+ f"(baseline val: {baseline_val_score:.4f} โ optimized test: {optimized_test_score:.4f})"
+ )
+ except Exception as e:
+ self.logger.warning(f"Test evaluation failed: {e}")
+
+ # Step 6: Process results
+ optimization_time = time.time() - start_time
+
+ processed_result = self.result_processor.process_full_result(
+ result=gepa_result,
+ original_prompt=seed_prompt,
+ optimization_time=optimization_time,
+ actual_iterations=actual_iterations,
+ test_metrics=improvement_data # Add test metrics
+ )
+
+ # Merge improvement data
+ final_improvement_data = {**processed_result.get('improvement_data', {}), **improvement_data}
+
+ # Step 7: Create result objects
+ # ๐ฅ CRITICAL: Use extracted optimized_prompt instead of processed_result
+ result = OptimizedResult(
+ original_prompt=seed_prompt,
+ optimized_prompt=optimized_prompt, # Use extracted prompt, not processed_result!
+ improvement_data=final_improvement_data,
+ optimization_time=optimization_time,
+ dataset_size=len(trainset) + len(valset) + len(testset),
+ total_iterations=processed_result.get('total_iterations', 0),
+ status=processed_result.get('status', 'completed'),
+ error_message=processed_result.get('error_message'),
+ detailed_result=OptimizationResult(
+ session_id=session_id,
+ original_prompt=seed_prompt,
+ optimized_prompt=optimized_prompt, # Use extracted prompt!
+ improvement_data=final_improvement_data,
+ optimization_time=optimization_time,
+ dataset_size=len(trainset) + len(valset) + len(testset),
+ total_iterations=processed_result.get('total_iterations', 0),
+ status=processed_result.get('status', 'completed'),
+ error_message=processed_result.get('error_message')
+ )
+ )
+
+ self.logger.info(f"โ
Optimization completed in {optimization_time:.2f}s")
+ return result
+
+ except Exception as e:
+ optimization_time = time.time() - start_time
+ error_msg = f"Optimization failed: {str(e)}"
+ self.logger.error(error_msg)
+
+ # Return failed result
+ return OptimizedResult(
+ original_prompt=seed_prompt,
+ optimized_prompt=seed_prompt, # Return original on failure
+ improvement_data={'error': error_msg},
+ optimization_time=optimization_time,
+ dataset_size=0,
+ total_iterations=0,
+ status='failed',
+ error_message=error_msg
+ )
+
+ def _update_config_from_kwargs(self, kwargs: Dict[str, Any]) -> None:
+ """Update configuration with runtime overrides from kwargs."""
+ updated_params = []
+
+ for key, value in kwargs.items():
+ if hasattr(self.config, key):
+ setattr(self.config, key, value)
+ updated_params.append(f"{key}={value}")
+ else:
+ self.logger.warning(f"Unknown parameter '{key}' ignored")
+
+ if updated_params:
+ self.logger.info(f"Updated config parameters: {', '.join(updated_params)}")
+
+ def _validate_inputs(self, seed_prompt: str) -> None:
+ """
+ Validate input parameters for optimization
+
+ Args:
+ seed_prompt: The seed prompt to validate
+
+ Raises:
+ InvalidInputError: If validation fails
+ """
+ if not seed_prompt or not isinstance(seed_prompt, str):
+ raise InvalidInputError("Seed prompt must be a non-empty string")
+
+ if len(seed_prompt.strip()) < 10:
+ raise InvalidInputError("Seed prompt is too short (minimum 10 characters)")
+
+ # Validate model configuration
+ model_config = self.config.model
+ if not hasattr(model_config, 'model_name') or not model_config.model_name:
+ raise InvalidInputError("Model name is required")
+
+ reflection_config = self.config.reflection_model
+ if not hasattr(reflection_config, 'model_name') or not reflection_config.model_name:
+ raise InvalidInputError("Reflection model name is required")
+
+ def _clean_reflection_prompt(self, prompt: str, max_length: int = 50000) -> str:
+ """
+ Clean reflection prompt by removing base64 images and truncating if too long.
+
+ ๐ฅ CRITICAL: GEPA's reflective dataset includes base64 images which create
+ massive prompts (7MB+) that exceed token limits. This function:
+ 1. Strips all base64 image data
+ 2. Removes excessive detailed_scores entries
+ 3. Truncates to reasonable size
+ 4. Preserves essential feedback information
+
+ Args:
+ prompt: Original prompt from GEPA (may contain base64)
+ max_length: Maximum length after cleaning (default: 50K chars)
+
+ Returns:
+ Cleaned prompt without base64, within size limits
+ """
+ import re
+
+ # Step 1: Remove base64 image strings (typically very long alphanumeric strings)
+ # Base64 images are usually 50K+ characters of A-Za-z0-9+/= pattern
+ # Look for very long base64-like sequences
+ base64_pattern = r'[A-Za-z0-9+/=]{5000,}' # Sequences of 5000+ base64 chars
+ cleaned = re.sub(base64_pattern, '[IMAGE_DATA_REMOVED]', prompt)
+
+ # Step 2: Remove detailed_scores sections that might contain base64 references
+ # These are usually in markdown format: "### detailed_scores\n...base64..."
+ detailed_scores_pattern = r'### detailed_scores[^\n]*\n[^#]*(?:image_base64|base64)[^\n]*(?:\n[^#]*)*'
+ cleaned = re.sub(detailed_scores_pattern, '### detailed_scores: [REMOVED_FOR_BREVITY]', cleaned, flags=re.IGNORECASE | re.MULTILINE)
+
+ # Step 3: Remove any remaining image_base64 references
+ cleaned = re.sub(r'image_base64[^\n]*', 'image_base64: [REMOVED]', cleaned, flags=re.IGNORECASE)
+ cleaned = re.sub(r'"[A-Za-z0-9+/=]{10000,}"', '[LARGE_DATA_STRING_REMOVED]', cleaned) # Very long strings likely base64
+
+ # Step 4: Truncate if still too long (keep the beginning which usually has the most important info)
+ if len(cleaned) > max_length:
+ # Keep first part (usually contains prompt and key feedback)
+ # Add truncation notice
+ truncated_size = len(cleaned) - max_length
+ cleaned = cleaned[:max_length] + f"\n\n[TRUNCATED {truncated_size} characters of detailed evaluation data]"
+ self.logger.warning(f"โ ๏ธ Prompt truncated: {len(prompt)} โ {len(cleaned)} chars")
+
+ return cleaned
+
+ def _validate_models(self, task_lm, reflection_lm):
+ """
+ Validate if specified models are supported.
+
+ Note: No hardcoded restrictions - the API provider will validate model existence.
+ This method is kept for potential future validation logic but doesn't restrict users.
+ """
+ # No hardcoded model restrictions - users can specify any model
+ # The API provider will handle validation and return errors if model doesn't exist
+ self.logger.debug(f"Using task model: {task_lm}, reflection model: {reflection_lm}")
+
+ def _create_seed_candidate(self, seed_prompt: str) -> Dict[str, str]:
+ """Create a seed candidate from the input prompt."""
+ sanitized_prompt = sanitize_prompt(seed_prompt)
+ return {'system_prompt': sanitized_prompt}
+
+ async def _run_gepa_optimization(self, adapter, seed_candidate: Any, trainset: List[Any], valset: List[Any], **kwargs) -> tuple: # Return tuple
+ """
+ Run GEPA optimization with the given adapter and data
+
+ Args:
+ adapter: Custom adapter for GEPA
+ seed_candidate: Initial prompt candidate
+ trainset: Training dataset
+ valset: Validation dataset
+ **kwargs: Additional optimization parameters that can override config
+
+ Returns:
+ Dict with optimization results
+
+ Raises:
+ GepaOptimizerError: If optimization fails
+
+ Note:
+ The following parameters are required in the config:
+ - max_metric_calls: Maximum number of metric evaluations
+ - batch_size: Batch size for evaluation
+ - max_iterations: Maximum number of optimization iterations
+ """
+ try:
+ # Get optimization parameters from config (these are required fields)
+ max_metric_calls = self.config.max_metric_calls
+ batch_size = self.config.batch_size
+ max_iterations = self.config.max_iterations
+
+ # Create reflection model client
+ from ..llms.vision_llm import VisionLLMClient
+ base_reflection_lm_client = VisionLLMClient(
+ provider=self.config.reflection_model.provider,
+ model_name=self.config.reflection_model.model_name,
+ api_key=self.config.reflection_model.api_key,
+ base_url=self.config.reflection_model.base_url,
+ temperature=self.config.reflection_model.temperature,
+ max_tokens=self.config.reflection_model.max_tokens,
+ top_p=self.config.reflection_model.top_p,
+ frequency_penalty=self.config.reflection_model.frequency_penalty,
+ presence_penalty=self.config.reflection_model.presence_penalty
+ )
+ # reflection_lm_client will be set below (may be wrapped with LLEGO)
+ reflection_lm_client = base_reflection_lm_client
+
+ # ๐ LLEGO Integration: Create enhanced reflection callable
+ if self.config.use_llego_operators:
+ self.logger.info("๐งฌ LLEGO genetic operators ENABLED")
+ self.logger.info(f" ฮฑ={self.config.alpha}, ฯ={self.config.tau}, ฮฝ={self.config.nu}")
+ self.logger.info(f" Crossover offspring: {self.config.n_crossover}, Mutation offspring: {self.config.n_mutation}")
+
+ # Import LLEGO operators
+ from ..operators.llego_operators import LLEGOIntegrationLayer, PromptCandidate
+
+ # Initialize LLEGO integration layer
+ llego = LLEGOIntegrationLayer(
+ alpha=self.config.alpha,
+ tau=self.config.tau,
+ nu=self.config.nu,
+ population_size=self.config.population_size,
+ n_crossover=self.config.n_crossover,
+ n_mutation=self.config.n_mutation
+ )
+
+ # Initialize with seed prompt
+ llego.initialize_population(
+ seed_prompt=seed_candidate.get('system_prompt', ''),
+ initial_fitness=0.5
+ )
+
+ # ๐ฅ HYBRID MODE FIX: Wrap reflection_lm_client with LLEGO for hybrid mode
+ # This ensures reflection calls go through LLEGO wrapper for candidate generation
+ if self.config.enable_gepa_reflection_with_llego:
+ self.logger.info("๐ฅ HYBRID MODE: Wrapping reflection_lm_client with LLEGO")
+ from ..llms.llego_enhanced_llm import LLEGOEnhancedLLMClient
+
+ # Wrap reflection_lm_client with LLEGO so hybrid generation is triggered
+ reflection_lm_client = LLEGOEnhancedLLMClient(
+ base_llm=base_reflection_lm_client,
+ llego_layer=llego,
+ config=self.config, # Pass config for hybrid mode!
+ verbose=True
+ )
+ self.logger.info("โ
reflection_lm_client wrapped with LLEGO (hybrid mode enabled)")
+
+ # ๐ฅ CRITICAL: Store reflection_lm_client reference in adapter so it can set context
+ # This allows make_reflective_dataset to set reflection context on BOTH clients
+ if hasattr(adapter, 'reflection_lm_client'):
+ adapter.reflection_lm_client = reflection_lm_client
+ self.logger.info("โ
Stored reflection_lm_client reference in adapter")
+ else:
+ # Add reflection_lm_client attribute to adapter
+ adapter.reflection_lm_client = reflection_lm_client
+ self.logger.info("โ
Added reflection_lm_client attribute to adapter")
+
+ # ๐ฅ NEW: Also store config and reflection_lm_client for adapter-level generation
+ if hasattr(adapter, '_config'):
+ adapter._config = self.config
+ self.logger.info("โ
Stored config in adapter for hybrid mode")
+ else:
+ adapter._config = self.config
+ self.logger.info("โ
Added _config attribute to adapter")
+
+ if hasattr(adapter, '_reflection_lm_client'):
+ adapter._reflection_lm_client = reflection_lm_client
+ self.logger.info("โ
Stored _reflection_lm_client in adapter for hybrid mode")
+ else:
+ adapter._reflection_lm_client = reflection_lm_client
+ self.logger.info("โ
Added _reflection_lm_client attribute to adapter")
+
+ # ๐ฅ CRITICAL FIX: Ensure LLEGO layer is stored in adapter
+ # Without this, adapter.llego will be None and population updates are skipped!
+ if hasattr(adapter, 'llego'):
+ if adapter.llego is None:
+ adapter.llego = llego
+ self.logger.info("โ
CRITICAL: Set LLEGO layer in adapter (was None)")
+ else:
+ self.logger.debug("โ
LLEGO layer already set in adapter")
+ else:
+ # Add llego attribute if it doesn't exist
+ adapter.llego = llego
+ self.logger.info("โ
CRITICAL: Added LLEGO layer to adapter")
+
+ # ๐ฅ CRITICAL: Always set _reflection_lm_client in adapter (even without hybrid mode)
+ # This is required for propose_new_texts() to work
+ if not hasattr(adapter, '_reflection_lm_client') or adapter._reflection_lm_client is None:
+ adapter._reflection_lm_client = reflection_lm_client
+ self.logger.info("โ
Set _reflection_lm_client in adapter (required for propose_new_texts)")
+
+ # ๐ฅ HYBRID MODE FIX: Inject config into LLEGO wrapper for hybrid mode
+ # The adapter already has LLEGO wrapper, we just need to update its config
+ if self.config.enable_gepa_reflection_with_llego:
+ # HYBRID MODE: Update the LLEGO wrapper's config
+ self.logger.info("๐ฅ HYBRID MODE: Enabling hybrid candidate generation in LLEGO wrapper")
+
+ # Get the LLM client (may already be wrapped)
+ llm_client = self.adapter.llm_client
+ from ..llms.llego_enhanced_llm import LLEGOEnhancedLLMClient
+
+ if isinstance(llm_client, LLEGOEnhancedLLMClient):
+ # Already wrapped, just update config
+ llm_client.config = self.config
+ self.logger.info("โ
Updated LLEGO wrapper with hybrid mode config")
+ else:
+ # Not wrapped yet, wrap it now with config
+ llego_wrapped_llm = LLEGOEnhancedLLMClient(
+ base_llm=llm_client,
+ llego_layer=llego,
+ config=self.config, # โ Pass config for hybrid mode!
+ verbose=True
+ )
+ # Update adapter's LLM client
+ self.adapter.llm_client = llego_wrapped_llm
+ self.logger.info("โ
Wrapped LLM client with LLEGO (hybrid mode enabled)")
+
+ adapter = self.adapter
+ else:
+ # LLEGO-ONLY MODE: Wrap adapter with LLEGO layer (no hybrid)
+ self.logger.info("๐งฌ LLEGO-ONLY MODE: Recreating adapter with LLEGO integration...")
+ if hasattr(self, 'adapter') and self.adapter:
+ from .universal_adapter import UniversalGepaAdapter
+
+ # Get original LLM client and evaluator from current adapter
+ original_llm = self.adapter.llm_client
+ # If it's already wrapped, unwrap it
+ if hasattr(original_llm, 'base_llm'):
+ original_llm = original_llm.base_llm
+
+ evaluator = self.adapter.evaluator
+ data_converter = self.adapter.data_converter
+
+ # Recreate adapter with LLEGO (no hybrid mode config)
+ from ..llms.llego_enhanced_llm import LLEGOEnhancedLLMClient
+ llego_wrapped_llm = LLEGOEnhancedLLMClient(
+ base_llm=original_llm,
+ llego_layer=llego,
+ config=None, # No hybrid mode
+ verbose=True
+ )
+
+ adapter = UniversalGepaAdapter(
+ llm_client=llego_wrapped_llm,
+ evaluator=evaluator,
+ data_converter=data_converter,
+ llego_layer=llego
+ )
+ self.logger.info("โ
Adapter recreated with LLEGO-enhanced LLM client")
+ else:
+ adapter = self.adapter
+
+ # Create LLEGO-enhanced reflection callable
+ # When hybrid mode is enabled, reflection_lm_client is wrapped with LLEGO
+ # The wrapper will automatically generate hybrid candidates when called
def reflection_lm_callable(prompt: str) -> str:
    """
    Reflection callable that delegates to the (possibly LLEGO-wrapped) client.

    In hybrid mode the wrapper generates candidates from both GEPA and LLEGO.
    The incoming prompt is cleaned first: base64 images are stripped and
    excessive data is truncated before it reaches the reflection model.

    Args:
        prompt: Raw reflection prompt produced by GEPA.

    Returns:
        The generated candidate text, or the original prompt on error.
    """
    # Remove base64 images and truncate excessive data before sending.
    cleaned_prompt = self._clean_reflection_prompt(prompt)

    self.logger.info(f"\n{'🔥' * 40}")
    self.logger.info("🔥 reflection_lm_callable CALLED (delegating to LLEGO wrapper)")
    self.logger.info(f"🔥 Original prompt length: {len(prompt)} chars")
    self.logger.info(f"🔥 Cleaned prompt length: {len(cleaned_prompt)} chars")
    self.logger.info(f"🔥 Truncation: {len(prompt) - len(cleaned_prompt)} chars removed")
    self.logger.info(f"🔥 First 200 chars (cleaned): {cleaned_prompt[:200]}...")
    self.logger.info(f"{'🔥' * 40}\n")

    try:
        # Signal reflection mode to the LLEGO wrapper BEFORE generating so
        # that generate() triggers hybrid candidate generation. The wrapper
        # handles queuing and returns candidates one by one.
        if isinstance(reflection_lm_client, LLEGOEnhancedLLMClient):
            reflection_lm_client.set_reflection_context(
                current_prompt=cleaned_prompt,  # use the cleaned prompt
                feedback=None,
                in_reflection=True,  # enable reflection mode
            )
            self.logger.info("✅ Reflection context set on reflection_lm_client")

        # The system prompt must instruct the LLM to emit an improved
        # prompt, not feedback or analysis.
        optimization_system_prompt = """You are an expert prompt engineer specializing in iterative prompt optimization.

Your task: Given the CURRENT PROMPT and its EVALUATION FEEDBACK, generate an IMPROVED version of the prompt that addresses all identified issues.

Core Requirements:
1. OUTPUT ONLY the improved prompt text (no explanations, no analysis, no meta-commentary)
2. START directly with the prompt (e.g., "You are a mobile GUI agent..." or similar task-appropriate opening)
3. PRESERVE the core task domain and output format requirements
4. INTEGRATE improvements from feedback naturally into the prompt structure
5. MAINTAIN clarity, specificity, and actionability

Quality Standards:
- Be specific and concrete (avoid vague instructions)
- Use clear, imperative language for task instructions
- Include edge case handling if feedback identifies confusion
- Ensure the prompt is self-contained and unambiguous

DO NOT include:
- Analysis of what went wrong
- Explanations of your changes
- Meta-text like "Here's an improved version..." or "Based on feedback..."
- Recommendations or suggestions (those are already in the feedback)

Output the improved prompt directly and only the prompt."""

        result = reflection_lm_client.generate(
            system_prompt=optimization_system_prompt,
            user_prompt=cleaned_prompt,  # cleaned: no base64, truncated
            image_base64=""
        )

        # Extract the candidate text; dict-shaped results also carry a
        # "source" tag identifying which engine produced the candidate.
        if isinstance(result, dict):
            candidate = result.get("content", str(result))
            source = result.get("source", "unknown")
            self.logger.info(f"✅ Candidate from {source} (FULL TEXT):")
            self.logger.info(f"   '{candidate}'")
            return candidate
        else:
            candidate = str(result)
            self.logger.info("✅ Candidate generated (FULL TEXT):")
            self.logger.info(f"   '{candidate}'")
            return candidate

    except Exception as e:
        self.logger.error(f"❌ Error in reflection_lm_callable: {e}")
        import traceback
        self.logger.error(traceback.format_exc())
        # Fallback: return the prompt unchanged so GEPA can continue.
        return prompt
+
+ # Set up reflection context for LLEGO wrapper
+ if self.config.enable_gepa_reflection_with_llego and isinstance(reflection_lm_client, LLEGOEnhancedLLMClient):
+ # Store current prompt in reflection context for LLEGO operators
+ reflection_lm_client.set_reflection_context(
+ current_prompt=seed_candidate.get('system_prompt', ''),
+ feedback=None,
+ in_reflection=True
+ )
+
+ else:
+ # Standard GEPA reflection (no LLEGO)
+ adapter = self.adapter # Use the original adapter
+
+ # ๐ฅ CRITICAL: Always set _reflection_lm_client in adapter (even without LLEGO)
+ # This is required for propose_new_texts() to work
+ if not hasattr(adapter, '_reflection_lm_client') or adapter._reflection_lm_client is None:
+ adapter._reflection_lm_client = reflection_lm_client
+ self.logger.info("โ
Set _reflection_lm_client in adapter (required for propose_new_texts)")
+
+ # Define standard reflection callable (no LLEGO enhancement)
def reflection_lm_callable(prompt: str) -> str:
    """Standard callable wrapper for the reflection model that GEPA expects."""
    # System prompt steering the LLM to emit an improved prompt rather
    # than feedback or analysis of the old one.
    optimization_system_prompt = """You are an expert prompt engineer specializing in iterative prompt optimization.

Your task: Given the CURRENT PROMPT and its EVALUATION FEEDBACK, generate an IMPROVED version of the prompt that addresses all identified issues.

Core Requirements:
1. OUTPUT ONLY the improved prompt text (no explanations, no analysis, no meta-commentary)
2. START directly with the prompt (e.g., "You are a mobile GUI agent..." or similar task-appropriate opening)
3. PRESERVE the core task domain and output format requirements
4. INTEGRATE improvements from feedback naturally into the prompt structure
5. MAINTAIN clarity, specificity, and actionability

Quality Standards:
- Be specific and concrete (avoid vague instructions)
- Use clear, imperative language for task instructions
- Include edge case handling if feedback identifies confusion
- Ensure the prompt is self-contained and unambiguous

DO NOT include:
- Analysis of what went wrong
- Explanations of your changes
- Meta-text like "Here's an improved version..." or "Based on feedback..."
- Recommendations or suggestions (those are already in the feedback)

Output the improved prompt directly and only the prompt."""

    try:
        # Reflection is text-only, so no image payload is attached.
        response = reflection_lm_client.generate(
            system_prompt=optimization_system_prompt,
            user_prompt=prompt,
            image_base64=""
        )
        # Dict responses carry the text under "content"; anything else is
        # stringified as-is.
        if isinstance(response, dict):
            return response.get("content", str(response))
        return str(response)
    except Exception as e:
        self.logger.error(f"Reflection model error: {e}")
        return prompt  # fall back to the unmodified prompt on failure
+ self.logger.info(
+ f"Starting GEPA optimization with {max_iterations} iterations, "
+ f"batch size {batch_size}, max metric calls: {max_metric_calls}"
+ )
+ self.logger.info(
+ f"GEPA parameters: candidate_selection_strategy=pareto, "
+ f"reflection_minibatch_size={batch_size}, "
+ f"skip_perfect_score=False, "
+ f"module_selector=round_robin"
+ )
+
+ # Prepare optimization parameters with ONLY valid GEPA parameters
+ # Note: 'adapter' variable is set above (either LLEGO-enhanced or standard)
+ # ๐ฅ REMOVED: Excessive diagnostic warnings - moved to DEBUG level
+ reflection_lm_passed = reflection_lm_callable if self.config.use_llego_operators else None
+ if reflection_lm_passed:
+ self.logger.debug(f"reflection_lm_callable passed to GEPA (may be ignored in adapter mode)")
+
+ # #region agent log
+ import json as _json_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "A", "location": "optimizer.py:gepa_params", "message": "GEPA params construction", "data": {"max_iterations_from_config": max_iterations, "max_metric_calls": max_metric_calls, "batch_size": batch_size}, "timestamp": int(time.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+
+ gepa_params = {
+ 'adapter': adapter, # Use the adapter created above (with or without LLEGO)
+ 'seed_candidate': seed_candidate,
+ 'trainset': trainset,
+ 'valset': valset,
+ 'max_metric_calls': max_metric_calls,
+ # NOTE: GEPA does NOT have num_iterations - it uses max_metric_calls to control iterations
+
+ # ๐ฅ CRITICAL: When using an adapter, GEPA expects:
+ # - adapter.make_reflective_dataset() to create feedback data
+ # - GEPA's internal proposer to generate candidates from that data
+ # - task_lm and reflection_lm must be None (GEPA will use model from adapter)
+ 'task_lm': None, # Don't pass - adapter handles this
+ 'reflection_lm': reflection_lm_passed, # Pass LLEGO-enhanced reflection (may be ignored!)
+
+ # Valid GEPA parameters based on actual library
+ 'candidate_selection_strategy': 'pareto', # Use Pareto selection
+ 'skip_perfect_score': False, # Don't skip perfect scores
+ 'reflection_minibatch_size': batch_size, # Use batch size for reflection
+ 'perfect_score': 1.0, # Perfect score threshold
+ 'module_selector': 'round_robin', # Cycle through components
+ 'display_progress_bar': self.config.verbose, # Show progress if verbose
+ 'raise_on_exception': True, # Raise exceptions for debugging
+ }
+
+ # ๐ฅ CRITICAL FIX: Filter kwargs to only include valid GEPA parameters
+ # GEPA does NOT accept num_iterations, max_iterations, or other non-GEPA params
+ VALID_GEPA_PARAMS = {
+ 'seed_candidate', 'trainset', 'valset', 'adapter', 'task_lm', 'reflection_lm',
+ 'candidate_selection_strategy', 'skip_perfect_score', 'batch_sampler',
+ 'reflection_minibatch_size', 'perfect_score', 'reflection_prompt_template',
+ 'module_selector', 'use_merge', 'max_merge_invocations', 'merge_val_overlap_floor',
+ 'max_metric_calls', 'stop_callbacks', 'logger', 'run_dir', 'use_wandb',
+ 'wandb_api_key', 'wandb_init_kwargs', 'use_mlflow', 'mlflow_tracking_uri',
+ 'mlflow_experiment_name', 'track_best_outputs', 'display_progress_bar',
+ 'use_cloudpickle', 'seed', 'raise_on_exception', 'val_evaluation_policy'
+ }
+
+ # Only add valid kwargs that aren't already in gepa_params
+ for key, value in kwargs.items():
+ if key in VALID_GEPA_PARAMS and key not in gepa_params:
+ gepa_params[key] = value
+ elif key not in VALID_GEPA_PARAMS:
+ self.logger.debug(f"โ ๏ธ Filtering out invalid GEPA parameter: {key}")
+
+ # #region agent log
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "A", "location": "optimizer.py:gepa_params_final", "message": "Final GEPA params keys", "data": {"params_keys": list(gepa_params.keys()), "max_metric_calls": gepa_params.get('max_metric_calls', 'NOT_PASSED')}, "timestamp": int(time.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+
+ # ๐ฏ NEW: Capture GEPA's internal logging for pareto front information
+ gepa_output = io.StringIO()
+
+ # Log iteration start
+ from ..utils.clean_logger import get_clean_logger
+ clean_log = get_clean_logger()
+ clean_log.log_iteration_start(1, seed_prompt=seed_candidate.get('system_prompt', ''))
+
+ # ๐ฅ CRITICAL: Pass valset size to adapter for better dataset type detection
+ if hasattr(adapter, '_valset_size'):
+ adapter._valset_size = len(valset)
+ self.logger.debug(f"โ
Set valset_size in adapter: {len(valset)} for Dpareto detection")
+
+ # ๐ฅ CRITICAL FIX: Store valset in adapter so we can evaluate generated candidates on it
+ # This ensures generated candidates are evaluated on Dpareto for Pareto selection
+ if hasattr(adapter, '_valset'):
+ adapter._valset = valset
+ self.logger.debug(f"โ
Stored valset in adapter ({len(valset)} samples) for Dpareto evaluation of generated candidates")
+ else:
+ # Add _valset attribute if it doesn't exist
+ adapter._valset = valset
+ self.logger.debug(f"โ
Added _valset attribute to adapter ({len(valset)} samples)")
+
+ # Run GEPA optimization (synchronous call wrapped in async)
+ result = await asyncio.get_event_loop().run_in_executor(
+ None,
+ lambda: self._run_gepa_with_logging(gepa_params, gepa_output)
+ )
+
+ # ๐ฏ NEW: Process and log pareto front information, extract iteration count
+ gepa_logs = gepa_output.getvalue()
+ actual_iterations = self._log_pareto_front_info(gepa_logs) # Get iteration count
+
+ return result, actual_iterations # Return both result and iteration count
+ except Exception as e:
+ # Try to extract partial results before failing
+ self.logger.warning(f"GEPA optimization failed: {e}")
+
+ # Check if we have any cached results from the adapter
+ best_candidate = adapter.get_best_candidate()
+ best_score = adapter.get_best_score()
+
+ if best_candidate and best_score > 0:
+ self.logger.info(f"๐ฏ Using cached best result with score: {best_score:.4f}")
+
+ # Create a mock GEPA result with the best candidate found
+ return {
+ 'best_candidate': best_candidate,
+ 'best_score': best_score,
+ 'partial_result': True,
+ 'error': f'GEPA failed but returning best result found: {str(e)}'
+ }
+ else:
+ # If no cached results, re-raise the error
+ raise GepaOptimizerError(f"GEPA optimization failed: {str(e)}")
+
def _run_gepa_with_logging(self, gepa_params: Dict[str, Any], output_buffer: io.StringIO) -> Any:
    """Run GEPA optimization while capturing its stdout/stderr output.

    GEPA reports progress via print statements; both streams are routed
    into the caller-supplied buffer so they can be parsed afterwards.
    """
    with redirect_stdout(output_buffer):
        with redirect_stderr(output_buffer):
            return gepa.optimize(**gepa_params)
+
+ def _log_pareto_front_info(self, gepa_logs: str) -> int: # Return int instead of None
+ """Extract and log pareto front information from GEPA logs. Returns max iteration count."""
+ lines = gepa_logs.split('\n')
+ current_iteration = 0
+ max_iteration = 0 # Track max iteration
+
+ for line in lines:
+ # Look for iteration information
+ if 'iteration' in line.lower():
+ # Try to extract iteration number
+ import re
+ iteration_match = re.search(r'iteration\s+(\d+)', line.lower())
+ if iteration_match:
+ current_iteration = int(iteration_match.group(1))
+ max_iteration = max(max_iteration, current_iteration) # Track max
+ # Log iteration change
+ from ..utils.clean_logger import get_clean_logger
+ clean_log = get_clean_logger()
+ if current_iteration > clean_log.current_iteration:
+ clean_log.current_iteration = current_iteration
+
+ # Look for pareto front information
+ if 'pareto front' in line.lower() or 'new program' in line.lower():
+ self.logger.info(f"GEPA Pareto Update: {line.strip()}")
+ elif 'iteration' in line.lower() and ('score' in line.lower() or 'program' in line.lower()):
+ self.logger.debug(f"{line.strip()}")
+ elif 'best' in line.lower() and 'score' in line.lower():
+ self.logger.info(f"{line.strip()}")
+
+ # Look for evaluation information
+ if 'evaluating' in line.lower() and 'candidate' in line.lower():
+ self.logger.debug(f"{line.strip()}")
+
+ self.logger.info(f"GEPA Optimization Complete: {max_iteration} iterations")
+
+ # #region agent log
+ import json as _json_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "F", "location": "optimizer.py:gepa_complete", "message": "GEPA optimization complete - iteration count", "data": {"max_iteration_from_logs": max_iteration, "expected_iterations": self.config.max_iterations, "off_by_one": max_iteration != self.config.max_iterations, "gepa_logs_length": len(gepa_logs)}, "timestamp": int(time.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+
+ return max_iteration # Return the max iteration count
+
+ def _extract_best_candidate(self, gepa_result: Any) -> Dict[str, str]:
+ """
+ Extract the best candidate from GEPA Pareto front (single source of truth).
+
+ GEPA Pareto front is the single source of truth because:
+ - All candidates (GEPA reflection, LLEGO crossover, LLEGO mutation) are evaluated on Dpareto
+ - All non-dominated candidates are added to GEPA Pareto front
+ - Therefore, the best candidate MUST be in GEPA Pareto front
+
+ Args:
+ gepa_result: Raw result from gepa.optimize() (used only as fallback edge case)
+
+ Returns:
+ Best candidate dictionary with prompt components from GEPA Pareto front
+ """
+ try:
+ self.logger.info(f"\n{'โ'*80}")
+ self.logger.info(f"๐ EXTRACTING BEST CANDIDATE FROM GEPA PARETO FRONT")
+ self.logger.info(f"{'โ'*80}")
+
+ # ========================================================================
+ # PRIMARY: Get best candidate from GEPA Pareto front (single source of truth)
+ # ========================================================================
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+
+ if pareto_log.pareto_front:
+ try:
+ # Get best candidate from GEPA Pareto front (highest score = best)
+ gepa_pareto_best = max(pareto_log.pareto_front, key=lambda x: x['score'])
+ gepa_pareto_fitness = gepa_pareto_best['score']
+ gepa_pareto_prompt = gepa_pareto_best['prompt']
+ gepa_pareto_type = gepa_pareto_best.get('type', 'unknown')
+ gepa_pareto_notation = gepa_pareto_best.get('notation', 'S')
+
+ best_candidate = {
+ 'system_prompt': gepa_pareto_prompt,
+ 'fitness': gepa_pareto_fitness,
+ 'source': 'gepa_pareto_front',
+ 'candidate_type': gepa_pareto_type,
+ 'notation': gepa_pareto_notation
+ }
+
+ self.logger.info(f"โ
SELECTED: Best candidate from GEPA Pareto front")
+ self.logger.info(f" Notation: {gepa_pareto_notation}")
+ self.logger.info(f" Fitness: f({gepa_pareto_notation})={gepa_pareto_fitness:.4f}")
+ self.logger.info(f" Type: {gepa_pareto_type}")
+ self.logger.info(f" Prompt length: {len(gepa_pareto_prompt)} chars")
+ self.logger.info(f" ๐ก GEPA Pareto front is single source of truth (all candidates evaluated on Dpareto)")
+
+ return best_candidate
+
+ except Exception as e:
+ self.logger.error(f"โ Failed to extract from GEPA Pareto front: {e}")
+ import traceback
+ self.logger.error(traceback.format_exc())
+
+ # ========================================================================
+ # EDGE CASE FALLBACK: Pareto front empty (shouldn't happen, but handle gracefully)
+ # ========================================================================
+ self.logger.warning(f"โ ๏ธ GEPA Pareto front is empty - using gepa_result as fallback")
+ self.logger.warning(f" This should not happen if all candidates are evaluated on Dpareto")
+
+ # Try to extract from gepa_result (last resort)
+ if hasattr(gepa_result, 'best_candidate'):
+ gepa_candidate = gepa_result.best_candidate
+ gepa_prompt = gepa_candidate.get('system_prompt') if isinstance(gepa_candidate, dict) else str(gepa_candidate)
+ gepa_fitness = getattr(gepa_result, 'best_score', None)
+
+ if gepa_prompt:
+ self.logger.info(f"โ
Using gepa_result.best_candidate as fallback")
+ return {
+ 'system_prompt': gepa_prompt,
+ 'fitness': float(gepa_fitness) if gepa_fitness is not None else None,
+ 'source': 'gepa_result_fallback',
+ 'candidate_type': 'unknown',
+ 'notation': 'S'
+ }
+
+ # Last resort: return empty prompt
+ self.logger.error(f"โ No candidates found anywhere - returning empty prompt")
+ return {'system_prompt': ''}
+
+ except Exception as e:
+ self.logger.error(f"โ Error extracting best candidate: {e}")
+ import traceback
+ self.logger.error(traceback.format_exc())
+ return {'system_prompt': ''}
+
def _evaluate_candidate_on_testset(
    self,
    candidate: Dict[str, str],
    testset: List[Dict]
) -> float:
    """
    Evaluate a candidate prompt on the held-out test set.

    Args:
        candidate: Prompt candidate to evaluate.
        testset: Test dataset (never used during optimization).

    Returns:
        Average composite score on the test set.

    Raises:
        TestSetEvaluationError: If evaluation fails or yields no scores.
    """
    from ..utils.exceptions import TestSetEvaluationError

    try:
        # Evaluate via the adapter (same path GEPA uses internally);
        # detailed traces are not needed for a plain test-set score.
        eval_result = self.adapter.evaluate(
            batch=testset,
            candidate=candidate,
            capture_traces=False
        )

        if not eval_result.scores:
            raise TestSetEvaluationError("No scores returned from test evaluation")

        # Average composite score across all test samples.
        avg_score = sum(eval_result.scores) / len(eval_result.scores)

        self.logger.debug(
            f"Test set evaluation: {len(eval_result.scores)} samples, "
            f"scores: {eval_result.scores}, avg: {avg_score:.4f}"
        )

        return avg_score

    except TestSetEvaluationError:
        # Already descriptive -- don't double-wrap the message
        # (previously re-wrapped as "Failed to evaluate...: No scores...").
        raise
    except Exception as e:
        # Chain the original exception for easier debugging.
        raise TestSetEvaluationError(f"Failed to evaluate on test set: {str(e)}") from e
+
def optimize_sync(self,
                  model: str,
                  seed_prompt: str,
                  dataset: Any,
                  reflection_lm: str,
                  max_metric_calls: int = 150,
                  **kwargs) -> "OptimizedResult":
    """
    Synchronous version of the optimization method.

    Args:
        model: Target model to optimize for
        seed_prompt: Initial prompt to optimize
        dataset: Training data in any format
        reflection_lm: Model for reflection
        max_metric_calls: Budget for optimization attempts
        **kwargs: Additional optimization parameters

    Returns:
        OptimizedResult: Optimization result

    Raises:
        RuntimeError: If called from a thread that already runs an event loop.
    """
    # asyncio.run creates, runs, and cleanly tears down a fresh event loop.
    # (The previous new_event_loop/set_event_loop/close sequence left a
    # CLOSED loop installed as the thread's current loop after returning,
    # breaking any later asyncio use on the same thread.)
    return asyncio.run(
        self.train(model, seed_prompt, dataset, reflection_lm, max_metric_calls, **kwargs)
    )
+
+
+# Convenience function for quick optimization
def optimize_prompt(
    model: Union[str, ModelConfig],
    seed_prompt: str,
    dataset: Any,
    reflection_model: Optional[Union[str, ModelConfig]] = None,
    **kwargs
) -> OptimizedResult:
    """
    Convenience function for quick prompt optimization without creating an
    optimizer instance.

    Args:
        model: Target model configuration
        seed_prompt: Initial prompt to optimize
        dataset: Training data
        reflection_model: Model for reflection (optional; defaults to *model*)
        **kwargs: Additional optimization parameters

    Returns:
        OptimizedResult: Optimization result
    """
    # Fall back to the target model for reflection when none was given.
    config = OptimizationConfig(
        model=model,
        reflection_model=model if reflection_model is None else reflection_model,
        max_iterations=kwargs.get('max_iterations', 10),
        max_metric_calls=kwargs.get('max_metric_calls', 50),
        batch_size=kwargs.get('batch_size', 4)
    )

    # NOTE(review): GepaOptimizer.optimize_sync calls train(model, seed_prompt,
    # dataset, reflection_lm, max_metric_calls, ...) while this call passes only
    # (seed_prompt, dataset, **kwargs) -- confirm train() accepts this shorter form.
    return asyncio.run(GepaOptimizer(config=config).train(seed_prompt, dataset, **kwargs))
+
+
+
+
+
+
diff --git a/src/gepa_optimizer/core/result.py b/src/gepa_optimizer/core/result.py
new file mode 100644
index 0000000000000000000000000000000000000000..d23bb98840b4e023873ef435df846afebe748187
--- /dev/null
+++ b/src/gepa_optimizer/core/result.py
@@ -0,0 +1,180 @@
+"""
+Result processing for GEPA Optimizer
+Handles extraction and processing of GEPA optimization results
+"""
+
+from typing import Any, Dict, Optional
+import logging
+
+logger = logging.getLogger(__name__)
+
class ResultProcessor:
    """
    Processes raw GEPA optimization results into clean, usable formats.
    """

    @staticmethod
    def extract_optimized_prompt(result: Any) -> str:
        """
        Extract the optimized prompt from a GEPA result object.

        Args:
            result: Raw GEPA optimization result

        Returns:
            str: The optimized prompt text
        """
        try:
            if not hasattr(result, 'best_candidate'):
                # Unknown result shape -- fall back to its string form.
                return str(result)

            candidate = result.best_candidate
            if not isinstance(candidate, dict):
                return str(candidate)

            # Probe the usual prompt keys in priority order.
            for prompt_key in ('system_prompt', 'prompt', 'text'):
                if prompt_key in candidate:
                    return str(candidate[prompt_key])

            # No standard key found -- stringify the whole candidate.
            return str(candidate)

        except Exception as e:
            logger.warning(f"Failed to extract optimized prompt: {e}")
            return "Optimization completed (prompt extraction failed)"

    @staticmethod
    def extract_metrics(result: Any) -> Dict[str, Any]:
        """
        Extract performance metrics from a GEPA result.

        Args:
            result: Raw GEPA optimization result

        Returns:
            Dict[str, Any]: Extracted metrics
        """
        metrics: Dict[str, Any] = {}

        try:
            # (attribute, converter) pairs for the common numeric metrics.
            numeric_attrs = (
                ('best_score', float),
                ('baseline_score', float),
                ('improvement', float),
                ('iterations', int),
            )
            for attr_name, caster in numeric_attrs:
                if hasattr(result, attr_name):
                    metrics[attr_name] = caster(getattr(result, attr_name))

            # Derive a percentage improvement when both scores are present
            # and the baseline is positive (avoids division by zero).
            if 'best_score' in metrics and 'baseline_score' in metrics:
                baseline = metrics['baseline_score']
                if baseline > 0:
                    pct = ((metrics['best_score'] - baseline) / baseline) * 100
                    metrics['improvement_percent'] = round(pct, 2)

            # Carry through any additional metadata untouched.
            if hasattr(result, 'metadata'):
                metrics['metadata'] = result.metadata

        except Exception as e:
            logger.warning(f"Failed to extract metrics: {e}")

        return metrics

    @staticmethod
    def extract_reflection_history(result: Any) -> list:
        """
        Extract the reflection/optimization history from a GEPA result.

        Args:
            result: Raw GEPA optimization result

        Returns:
            list: List of reflection iterations
        """
        history = []

        try:
            if hasattr(result, 'optimization_history'):
                for idx, step in enumerate(result.optimization_history):
                    history.append({
                        'iteration': idx,
                        'score': step.get('score', 0.0),
                        'candidate': step.get('candidate', {}),
                        'feedback': step.get('feedback', ''),
                        'improvement': step.get('improvement', 0.0),
                    })

        except Exception as e:
            logger.warning(f"Failed to extract reflection history: {e}")

        return history

    @staticmethod
    def process_full_result(
        result: Any,
        original_prompt: str,
        optimization_time: float,
        actual_iterations: Optional[int] = None,
        test_metrics: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """
        Process a complete GEPA result into a structured format.

        Args:
            result: Raw GEPA optimization result
            original_prompt: Original seed prompt
            optimization_time: Time taken for optimization
            actual_iterations: Actual iteration count from GEPA logs (optional)
            test_metrics: Metrics from test set evaluation (optional)

        Returns:
            Dict[str, Any]: Complete processed result
        """
        metrics = ResultProcessor.extract_metrics(result)

        # Resolve the iteration count from the most reliable source available:
        # explicit log-derived count first, then result attributes, then metrics.
        total_iterations = 0
        try:
            if actual_iterations is not None:
                total_iterations = actual_iterations
            elif hasattr(result, 'iterations'):
                total_iterations = int(result.iterations)
            elif hasattr(result, 'num_iterations'):
                total_iterations = int(result.num_iterations)
            elif hasattr(result, 'optimization_history'):
                total_iterations = len(result.optimization_history)
            elif 'iterations' in metrics:
                total_iterations = metrics['iterations']
        except Exception as e:
            logger.warning(f"Failed to extract iterations: {e}")

        # Test-set metrics (if any) become the improvement data.
        improvement_data: Dict[str, Any] = dict(test_metrics) if test_metrics else {}

        return {
            'original_prompt': original_prompt,
            'optimized_prompt': ResultProcessor.extract_optimized_prompt(result),
            'metrics': metrics,
            'improvement_data': improvement_data,
            'reflection_history': ResultProcessor.extract_reflection_history(result),
            'optimization_time': optimization_time,
            'total_iterations': total_iterations,
            'status': 'completed',
            'raw_result': result  # keep the raw result for advanced users
        }
diff --git a/src/gepa_optimizer/core/universal_adapter.py b/src/gepa_optimizer/core/universal_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..539b7413219278c5e28085fb159456aa664c3600
--- /dev/null
+++ b/src/gepa_optimizer/core/universal_adapter.py
@@ -0,0 +1,2386 @@
+"""
+Universal GEPA adapter for user-defined metrics and LLM clients.
+"""
+
+from .base_adapter import BaseGepaAdapter
+from ..data.converters import UniversalConverter
+from typing import Any, Dict, List, Optional
+import logging
+import re
+from gepa.core.adapter import EvaluationBatch
+
+logger = logging.getLogger(__name__)
+
+class UniversalGepaAdapter(BaseGepaAdapter):
+ """
+ Universal GEPA adapter that works with any LLM client and evaluator.
+
+ This adapter uses the existing UniversalConverter for data processing
+ and delegates LLM generation and evaluation to user-provided components.
+
+ Features:
+ - Optimized multi-variation JSON generation (66% cost reduction)
+ - Robust parsing with multiple fallback strategies
+ - Automatic fallback to sequential generation if JSON parsing fails
+ """
+
    # Fallback system prompt for sequential generation, used when the optimized
    # multi-variation JSON generation path fails to parse. It instructs the
    # reflection LLM to emit ONLY the improved prompt text (no analysis or
    # meta-commentary), so the raw completion can be used verbatim as the next
    # candidate system prompt.
    _FALLBACK_SYSTEM_PROMPT = """You are an expert prompt engineer specializing in iterative prompt optimization.

Your task: Given the CURRENT PROMPT and its EVALUATION FEEDBACK, generate an IMPROVED version of the prompt that addresses all identified issues.

Core Requirements:
1. OUTPUT ONLY the improved prompt text (no explanations, no analysis, no meta-commentary)
2. START directly with the prompt (e.g., "You are a mobile GUI agent..." or similar task-appropriate opening)
3. PRESERVE the core task domain and output format requirements
4. INTEGRATE improvements from feedback naturally into the prompt structure
5. MAINTAIN clarity, specificity, and actionability

Quality Standards:
- Be specific and concrete (avoid vague instructions)
- Use clear, imperative language for task instructions
- Include edge case handling if feedback identifies confusion
- Ensure the prompt is self-contained and unambiguous

DO NOT include:
- Analysis of what went wrong
- Explanations of your changes
- Meta-text like "Here's an improved version..." or "Based on feedback..."
- Recommendations or suggestions (those are already in the feedback)

Output the improved prompt directly and only the prompt."""
+
+ def __init__(self, llm_client, evaluator, data_converter=None, llego_layer=None):
+ """
+ Initialize universal adapter.
+
+ Args:
+ llm_client: User-provided LLM client (must inherit from BaseLLMClient)
+ evaluator: User-provided evaluator (must inherit from BaseEvaluator)
+ data_converter: Optional custom data converter (uses UniversalConverter by default)
+ llego_layer: Optional LLEGO integration layer for genetic operations
+ """
+ # Store LLEGO layer first
+ self.llego = llego_layer
+
+ # If LLEGO is provided, wrap the LLM client
+ # Note: If config is passed separately, it will be handled by optimizer
+ if llego_layer is not None:
+ from ..llms.llego_enhanced_llm import LLEGOEnhancedLLMClient
+ # Only wrap if not already wrapped (optimizer may have wrapped it with config)
+ if not isinstance(llm_client, LLEGOEnhancedLLMClient):
+ # Wrap before calling super().__init__
+ # Config will be set later by optimizer if hybrid mode is enabled
+ llm_client = LLEGOEnhancedLLMClient(llm_client, llego_layer, config=None, verbose=True)
+ else:
+ # Already wrapped, but update config if available
+ if hasattr(llm_client, 'config') and llm_client.config is None:
+ # Config will be set by optimizer later
+ pass
+
+ # Initialize parent (this sets up self.logger)
+ super().__init__(llm_client, evaluator)
+
+ # Use existing UniversalConverter for data processing
+ self.data_converter = data_converter or UniversalConverter()
+
+ # ๐ฅ NEW: Initialize optimization state tracking
+ self._is_baseline_evaluation = False # Flag to distinguish baseline vs optimization
+ self._last_candidate = None # Track last candidate to detect changes
+ self._gepa_iteration = 0 # Track actual GEPA iteration (not evaluation count)
+
+ # Track candidates for logging
+ self._evaluation_count = 0
+
+ # Track current evaluation context
+ self._current_evaluation_type = None # 'seed', 'gepa_reflection', 'llego_crossover', 'llego_mutation'
+ self._current_dataset_type = None # 'dfeedback' or 'dpareto'
+ self._baseline_score = None # Store baseline score for comparison
+
+ # Track candidate sources by prompt text (in case GEPA doesn't pass source field)
+ self._candidate_sources = {} # Maps prompt_text -> source_type
+
+ # Track validation set size for better dataset type detection
+ self._valset_size = None # Will be set by optimizer
+ self._valset = None # Will be set by optimizer - stores actual valset for Dpareto evaluation
+
+ # ๐ฅ CRITICAL: Track which candidates have been evaluated on Dpareto to avoid double evaluation
+ # Key: normalized prompt text, Value: (fitness_score, candidate_type, timestamp)
+ self._dpareto_evaluated_candidates = {} # Maps prompt -> (score, type)
+
+ # ๐ฅ HYBRID MODE: Storage for generated candidates
+ self._generated_candidates = [] # Store hybrid mode candidates
+ self._candidate_generation_active = False # Track if we're generating candidates
+ self._config = None # Will be set by optimizer if hybrid mode enabled
+ self._reflection_lm_client = None # Will be set by optimizer
+
+ # ๐ฅ FORMAT AWARENESS: Store detected output format for better prompts
+ self._detected_format = None # Will be populated from expected outputs
+ self._format_detection_done = False # Only detect once
+
+ # Log initialization
+ model_info = llm_client.get_model_info()
+ if llego_layer is not None:
+ self.logger.info(f"๐ Initialized Universal adapter with {model_info}")
+ self.logger.info(f"๐งฌ LLEGO integration ENABLED - LLM client is wrapped for genetic operations")
+ else:
+ self.logger.info(f"๐ Initialized Universal adapter with {model_info}")
+
+ def _clean_llm_output(self, output: str) -> str:
+ """
+ ๐ฅ CRITICAL: Clean LLM output before evaluation.
+
+ LLMs often wrap JSON/structured output in markdown code blocks.
+ This causes evaluation to fail because the evaluator sees:
+ "```json\n{\"key\": \"value\"}\n```"
+ Instead of:
+ "{\"key\": \"value\"}"
+
+ This method extracts the clean content for fair comparison.
+ """
+ if not output or not isinstance(output, str):
+ return output
+
+ cleaned = output.strip()
+
+ # Remove markdown code blocks (```json ... ``` or ``` ... ```)
+ code_block_match = re.search(r'```(?:json|JSON)?\s*([\s\S]*?)\s*```', cleaned)
+ if code_block_match:
+ extracted = code_block_match.group(1).strip()
+ # Only use extracted if it looks like valid content
+ if extracted and (extracted.startswith('{') or extracted.startswith('[') or len(extracted) > 10):
+ self.logger.debug(f"๐ฆ Cleaned markdown code block from LLM output")
+ return extracted
+
+ # Remove leading/trailing markdown artifacts
+ # Handle cases like "Here is the JSON:\n```json\n...\n```"
+ if '```' in cleaned:
+ # Try to extract content between first ``` and last ```
+ parts = cleaned.split('```')
+ if len(parts) >= 3:
+ # Content is in the middle part(s)
+ middle_content = parts[1]
+ # Remove language tag if present (e.g., "json\n")
+ middle_content = re.sub(r'^(?:json|JSON|python|text)\s*\n?', '', middle_content).strip()
+ if middle_content:
+ return middle_content
+
+ return cleaned
+
+ def _detect_and_cache_format(self, batch: List[Dict[str, Any]]) -> None:
+ """
+ Detect output format from expected outputs and cache for future use.
+
+ This enables format-aware prompting and feedback generation.
+ """
+ try:
+ from ..utils.format_detection import detect_output_format
+
+ # Extract expected outputs from batch
+ expected_outputs = []
+ for item in batch:
+ # Try to extract output directly, or standardize if needed
+ output = None
+ if isinstance(item, dict):
+ # Try common output field names first
+ output = item.get('output') or item.get('expected_output') or item.get('result') or item.get('answer')
+ if not output:
+ # Standardize using converter's private method (same as _evaluate_batch_mode)
+ try:
+ standardized = self.data_converter._standardize([item])[0]
+ output = standardized.get('output')
+ except Exception:
+ pass
+
+ if output and isinstance(output, str) and output.strip():
+ expected_outputs.append(output)
+
+ if expected_outputs:
+ self._detected_format = detect_output_format(expected_outputs)
+ self.logger.info(f"๐ FORMAT DETECTED: {self._detected_format['format_type']}")
+ self.logger.info(f" Spec: {self._detected_format['format_spec'][:100]}...")
+ self.logger.info(f" Avg length: {self._detected_format['avg_length']} chars")
+ # #region agent log
+ import json as _json_debug
+ import time as _time_debug
+ import os as _os_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "FORMAT_DETECT", "location": "universal_adapter.py:format_detected", "message": "Format detection successful", "data": {"format_type": self._detected_format['format_type'], "num_outputs": len(expected_outputs), "avg_length": self._detected_format['avg_length'], "has_constraint": bool(self._detected_format.get('format_constraint'))}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+ else:
+ self.logger.warning("โ ๏ธ No expected outputs found for format detection")
+ self._detected_format = None
+ # #region agent log
+ import json as _json_debug
+ import time as _time_debug
+ import os as _os_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "FORMAT_DETECT", "location": "universal_adapter.py:format_detected", "message": "Format detection failed - no outputs", "data": {"batch_size": len(batch)}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+
+ except Exception as e:
+ self.logger.warning(f"โ ๏ธ Format detection failed: {e}")
+ self._detected_format = None
+
+ def evaluate(self, batch: List[Dict[str, Any]], candidate: Dict[str, str],
+ capture_traces: bool = False) -> EvaluationBatch:
+ """
+ Evaluate candidates using user-provided LLM client and evaluator.
+
+ This method automatically detects BatchLLMClient and uses batch processing
+ for cost savings, or falls back to standard individual processing.
+
+ This method works with any data type supported by UniversalConverter.
+
+ ๐ฅ IMPORTANT: We only optimize system_prompt, NOT user_prompt.
+ The user_prompt varies per tester and is not part of optimization.
+
+ ๐ฅ CACHING: Seed prompt is evaluated ONLY ONCE on Dpareto (validation set).
+ Subsequent evaluations return cached result to save API calls and ensure consistency.
+ """
+ system_prompt = candidate.get('system_prompt', '')
+
+ # ๐ฅ FORMAT DETECTION: Detect output format from expected outputs (once)
+ if not self._format_detection_done and batch:
+ self._detect_and_cache_format(batch)
+ self._format_detection_done = True
+
+ # Determine dataset type first (needed for cache check)
+ batch_size_threshold = self._config.batch_size if hasattr(self, '_config') and self._config else 8
+
+ # ๐ฅ CRITICAL FIX: If _is_baseline_evaluation is True, we KNOW this is the validation set
+ # This fixes the issue where valset_size might not be set yet when baseline detection happens
+ if hasattr(self, '_is_baseline_evaluation') and self._is_baseline_evaluation:
+ dataset_type = 'dpareto' # Baseline is ALWAYS evaluated on validation set
+ self.logger.debug(f"๐ฏ Forced dataset_type to 'dpareto' (baseline evaluation flag is True)")
+ elif hasattr(self, '_valset_size') and self._valset_size is not None and len(batch) >= self._valset_size:
+ dataset_type = 'dpareto' # Full validation set size = Dpareto
+ elif len(batch) > batch_size_threshold * 1.5:
+ dataset_type = 'dpareto' # Much larger than batch = likely full valset
+ else:
+ dataset_type = 'dfeedback' # Small batch = training minibatch for reflection
+
+ # ๐ฅ CRITICAL: Check cache to avoid re-evaluating same prompt on Dpareto
+ # This ensures seed prompt is evaluated ONLY ONCE
+ if dataset_type == 'dpareto':
+ normalized_prompt = system_prompt.strip().strip('"\'')
+ if normalized_prompt in self._dpareto_evaluated_candidates:
+ existing_score, existing_type, _ = self._dpareto_evaluated_candidates[normalized_prompt]
+ self.logger.info(
+ f"โป๏ธ CACHE HIT: Prompt already evaluated on Dpareto "
+ f"(score={existing_score:.4f}, type={existing_type}) - skipping re-evaluation"
+ )
+
+ # Return cached result - create EvaluationBatch with cached score
+ cached_outputs = [f"[CACHED: {existing_type}]"] * len(batch)
+ cached_scores = [existing_score] * len(batch)
+
+ # Still update baseline if this is seed and baseline not set
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+
+ if existing_type == 'seed' and self._baseline_score is None:
+ self._baseline_score = existing_score
+ pareto_log.set_baseline(existing_score)
+ self.logger.info(f"๐ Baseline score set from cache: {existing_score:.4f}")
+
+ # Log to Pareto logger (for tracking, but no re-evaluation)
+ pareto_log.log_candidate_evaluation(
+ prompt=system_prompt,
+ score=existing_score,
+ candidate_type=existing_type,
+ dataset_type='dpareto'
+ )
+
+ return EvaluationBatch(
+ outputs=cached_outputs,
+ scores=cached_scores,
+ trajectories=None # No traces for cached results
+ )
+
+ # Determine candidate type
+ # Priority order:
+ # 1. Check candidate dict for 'source' field (from LLM wrapper)
+ # 2. Check _candidate_sources mapping (from previous evaluations)
+ # 3. Check _current_evaluation_type (from log_proposed_candidate)
+ # 4. Infer from context (seed, repeat, etc.)
+
+ candidate_type = candidate.get('source') # First try candidate dict
+ if not candidate_type or candidate_type == 'unknown':
+ candidate_type = self._candidate_sources.get(system_prompt) # Check mapping
+ if not candidate_type or candidate_type == 'unknown':
+ candidate_type = self._current_evaluation_type # Use stored type
+ if not candidate_type or candidate_type == 'unknown':
+ # Try to infer from prompt or metadata
+ if system_prompt == self._last_candidate:
+ candidate_type = 'repeat' # Same prompt being re-evaluated
+ elif self._evaluation_count == 0 or 'seed' in str(candidate.get('source', '')).lower():
+ candidate_type = 'seed' # Explicitly mark as seed
+ self.logger.debug("๐ฑ Detected seed prompt (Sโ)")
+ else:
+ candidate_type = 'unknown' # Truly unknown
+
+ # #region agent log
+ import json as _json_debug
+ import time as _time_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "C", "location": "universal_adapter.py:candidate_type_detect", "message": "Candidate type detection", "data": {"candidate_type": candidate_type, "evaluation_count": self._evaluation_count, "from_candidate_dict": candidate.get('source'), "from_sources_mapping": self._candidate_sources.get(system_prompt), "from_current_type": self._current_evaluation_type}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+
+ # Store source for future lookups (always update if we found a valid type)
+ if candidate_type and candidate_type != 'unknown' and system_prompt not in self._candidate_sources:
+ self._candidate_sources[system_prompt] = candidate_type
+ self.logger.debug(f" ๐ Stored candidate type: {candidate_type} for prompt (length: {len(system_prompt)})")
+
+ # Dataset type already determined above for cache check - reuse it
+
+ # #region agent log
+ try:
+ import json as _json_debug
+ import time as _time_debug
+ import os as _os_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "H", "location": "universal_adapter.py:dataset_type_detect", "message": "Dataset type detection", "data": {"batch_size": len(batch), "valset_size": getattr(self, '_valset_size', None), "batch_size_threshold": batch_size_threshold, "detected_type": dataset_type, "evaluation_count": self._evaluation_count}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ except Exception:
+ pass
+ # #endregion
+
+ # Check if this is a new candidate (different from last one)
+ if self._last_candidate != system_prompt:
+ self._evaluation_count += 1
+ # ๐ฅ CRITICAL: If this is baseline evaluation, force candidate_type to 'seed'
+ if self._is_baseline_evaluation:
+ candidate_type = 'seed'
+ self.logger.debug(f"๐ฑ Baseline evaluation detected - setting candidate_type to 'seed'")
+ self._current_evaluation_type = candidate_type
+ self._current_dataset_type = dataset_type
+ self._last_candidate = system_prompt
+
+ # Minimal logging - just track what we're evaluating
+ if self._is_baseline_evaluation:
+ self.logger.debug(f"Evaluating baseline (Sโ) on {dataset_type}")
+ else:
+ self.logger.debug(f"Evaluating candidate #{self._evaluation_count} ({candidate_type}) on {dataset_type}")
+
+ # Detect and use batch mode if available
+ from ..llms.batch_llm import BatchLLMClient
+ is_batch_mode = isinstance(self.llm_client, BatchLLMClient)
+
+ if is_batch_mode:
+ outputs, scores, trajectories = self._evaluate_batch_mode(
+ batch, system_prompt, capture_traces
+ )
+ else:
+ outputs, scores, trajectories = self._evaluate_standard_mode(
+ batch, system_prompt, capture_traces
+ )
+
+ avg_score = sum(scores) / len(scores) if scores else 0.0
+
+ # #region agent log
+ import json as _json_debug
+ import time as _time_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "B,C", "location": "universal_adapter.py:baseline_check", "message": "Baseline check conditions", "data": {"baseline_score_is_none": self._baseline_score is None, "current_dataset_type": self._current_dataset_type, "current_evaluation_type": self._current_evaluation_type, "is_baseline_evaluation": self._is_baseline_evaluation, "batch_size": len(batch), "avg_score": avg_score}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+
+ # ๐ฅ CRITICAL FIX: Baseline MUST be set from seed's first Dpareto evaluation ONLY
+ # This ensures FAIR comparison: seed and candidates evaluated on SAME dataset (Dpareto) with SAME number of datapoints
+ #
+ # Fair evaluation requires:
+ # - Seed baseline: Dpareto (validation set) - first evaluation during optimization
+ # - Candidates: Dpareto (validation set) - same dataset, same size
+ # - Same conditions = fair comparison โ
+ #
+ # We IGNORE test set for baseline - baseline must come from Dpareto to ensure same dataset/size
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+
+ # ๐ฅ FIX: Check if this is baseline evaluation AND dpareto - set baseline with priority
+ is_baseline_eval = hasattr(self, '_is_baseline_evaluation') and self._is_baseline_evaluation
+
+ if self._baseline_score is None:
+ # ๐ฅ FIX B: Set baseline on FIRST Dpareto evaluation, regardless of candidate type
+ # Also set baseline if this is explicitly marked as baseline evaluation
+ if self._current_dataset_type == 'dpareto' or is_baseline_eval:
+ # โ
PRIMARY: Set baseline from FIRST Dpareto evaluation (seed or first candidate)
+ self._baseline_score = avg_score
+ pareto_log.set_baseline(avg_score)
+ self.logger.info(f"๐ Baseline score (Dpareto, {len(batch)} samples): {avg_score:.4f}")
+ self.logger.info(f" โ
Baseline set from {'baseline evaluation' if is_baseline_eval else 'first Dpareto'} (type: {self._current_evaluation_type})")
+ # #region agent log
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "B", "location": "universal_adapter.py:baseline_set", "message": "Baseline score SET", "data": {"baseline_score": avg_score, "candidate_type": self._current_evaluation_type, "dataset_type": self._current_dataset_type, "is_baseline_eval": is_baseline_eval}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+ # Note: Test set evaluations are ignored for baseline - baseline comes from Dpareto
+ else:
+ # ๐ฅ SAFETY CHECK: Ensure Pareto logger also has baseline if adapter has it
+ # This handles the case where optimizer set baseline in adapter but Pareto logger wasn't updated
+ if (self._current_dataset_type == 'dpareto' or is_baseline_eval) and pareto_log.baseline_score is None:
+ pareto_log.set_baseline(self._baseline_score)
+ self.logger.info(f"โ
Synchronized baseline in Pareto logger: {self._baseline_score:.4f}")
+
+ # Track Dpareto evaluations for Pareto front
+ if self._current_dataset_type == 'dpareto':
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+ pareto_log.log_candidate_evaluation(
+ prompt=system_prompt,
+ score=avg_score,
+ candidate_type=self._current_evaluation_type or 'unknown',
+ dataset_type=self._current_dataset_type
+ )
+
+ # Track evaluated candidates
+ normalized_prompt = system_prompt.strip().strip('"\'')
+ if normalized_prompt not in self._dpareto_evaluated_candidates:
+ self._dpareto_evaluated_candidates[normalized_prompt] = (
+ avg_score, self._current_evaluation_type or 'unknown', 'evaluated_by_gepa'
+ )
+
+ self.logger.debug(f"Evaluation complete: score={avg_score:.4f}")
+
+ # ๐ฅ CRITICAL: Update _best_candidate and _best_score with average fitness for Dpareto evaluations
+ # This ensures the adapter tracks the best average fitness, not just per-sample scores
+ # Only update if this score is better than current best
+ if self._current_dataset_type == 'dpareto':
+ if self._best_score is None or avg_score > self._best_score:
+ self._best_score = avg_score
+ self._best_candidate = {
+ 'system_prompt': system_prompt,
+ 'fitness': avg_score,
+ 'source': self._current_evaluation_type or 'unknown'
+ }
+ self.logger.info(f"โ
Updated best candidate from Dpareto evaluation: f={avg_score:.4f} (type: {self._current_evaluation_type})")
+
+ return EvaluationBatch(outputs=outputs, scores=scores, trajectories=trajectories)
+
+ def _evaluate_batch_mode(
+ self,
+ batch: List[Dict],
+ system_prompt: str,
+ capture_traces: bool
+ ) -> tuple:
+ """
+ Batch mode evaluation - process all samples in one API call.
+
+ This method prepares all requests, submits them as a batch job to Gemini,
+ waits for completion, then evaluates all results.
+ """
+ # Prepare all requests
+ requests = []
+ standardized_items = []
+
+ for item in batch:
+ standardized_item = self.data_converter._standardize([item])[0]
+ standardized_items.append(standardized_item)
+
+ request = {
+ 'system_prompt': system_prompt,
+ 'user_prompt': standardized_item['input']
+ }
+
+ if standardized_item.get('image'):
+ request['image_base64'] = standardized_item['image']
+
+ requests.append(request)
+
+ # Submit batch job and get all results at once
+ batch_results = self.llm_client.generate_batch(requests)
+
+ # Process results
+ outputs = []
+ scores = []
+ trajectories = [] if capture_traces else None
+
+ for i, (llm_response, standardized_item) in enumerate(zip(batch_results, standardized_items)):
+ # Extract content
+ raw_output = llm_response.get("content", "")
+
+ # ๐ฅ CRITICAL: Clean markdown wrappers before evaluation
+ predicted_output = self._clean_llm_output(raw_output)
+ outputs.append(predicted_output)
+
+ # Evaluate with cleaned output
+ evaluation_results = self.evaluator.evaluate(
+ predicted_output,
+ standardized_item['output']
+ )
+
+ composite_score = evaluation_results.get("composite_score", 0.0)
+ scores.append(composite_score)
+
+ # Update tracking
+ if composite_score > self._best_score:
+ self._best_score = composite_score
+ self._best_candidate = {'system_prompt': system_prompt}
+
+ # Capture traces
+ if capture_traces:
+ trajectories.append({
+ 'input_data': standardized_item,
+ 'predicted_output': predicted_output,
+ 'evaluation_results': evaluation_results
+ })
+
+ # Concise logging with element IDs and candidate notation
+ predicted_element = evaluation_results.get('predicted_element', '?')
+ expected_element = evaluation_results.get('expected_element', '?')
+ status = "โ
" if composite_score == 1.0 else "โ"
+
+ # Add notation for candidate type
+ notation_map = {'seed': 'Sโ', 'gepa_reflection': 'Sแตฃ', 'llego_crossover': 'Oโโ', 'llego_mutation': 'Oโแตคโ'}
+ notation = notation_map.get(self._current_evaluation_type, 'S')
+
+ self.logger.info(f" [{notation}] Sample {i+1}: Predicted={predicted_element}, Expected={expected_element}, Score={composite_score:.2f} {status}")
+
+ return outputs, scores, trajectories
+
+ def _evaluate_standard_mode(
+ self,
+ batch: List[Dict],
+ system_prompt: str,
+ capture_traces: bool
+ ) -> tuple:
+ """
+ Standard mode evaluation - process samples individually (existing logic).
+
+ This is the original implementation, preserved for backward compatibility
+ and for use with non-batch LLM clients.
+ """
+ outputs = []
+ scores = []
+ trajectories = [] if capture_traces else None
+
+ for i, item in enumerate(batch):
+ # Use existing data processing logic
+ standardized_item = self.data_converter._standardize([item])[0]
+
+ # Prepare generation parameters
+ generation_params = {
+ 'system_prompt': system_prompt,
+ 'user_prompt': standardized_item['input']
+ }
+
+ # Add image if present
+ if standardized_item.get('image'):
+ generation_params['image_base64'] = standardized_item['image']
+
+ # Generate response using user's LLM client
+ llm_response = self.llm_client.generate(**generation_params)
+
+ # Extract content
+ if isinstance(llm_response, dict):
+ raw_output = llm_response.get("content", "")
+ else:
+ raw_output = str(llm_response)
+
+ # ๐ฅ CRITICAL: Clean markdown wrappers before evaluation
+ predicted_output = self._clean_llm_output(raw_output)
+ outputs.append(predicted_output)
+
+ # Evaluate using user's evaluator with cleaned output
+ evaluation_results = self.evaluator.evaluate(
+ predicted_output,
+ standardized_item['output']
+ )
+
+ composite_score = evaluation_results.get("composite_score", 0.0)
+ scores.append(composite_score)
+
+ # #region agent log
+ try:
+ import json as _json_debug
+ import time as _time_debug
+ import os as _os_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "G", "location": "universal_adapter.py:evaluation_result", "message": "Individual evaluation result", "data": {"sample_idx": i, "composite_score": composite_score, "semantic_sim": evaluation_results.get("semantic_similarity", -1), "structural_sim": evaluation_results.get("structural_similarity", -1), "format_mismatch": evaluation_results.get("analysis", {}).get("format_mismatch", False), "predicted_len": len(predicted_output) if predicted_output else 0, "expected_len": len(standardized_item.get('output', ''))}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ except Exception:
+ pass
+ # #endregion
+
+ # Update performance tracking
+ self._evaluation_count += 1
+ if composite_score > self._best_score:
+ self._best_score = composite_score
+ self._best_candidate = {'system_prompt': system_prompt}
+
+ # Capture traces if requested
+ if capture_traces:
+ trajectories.append({
+ 'input_data': standardized_item,
+ 'predicted_output': predicted_output,
+ 'evaluation_results': evaluation_results
+ })
+
+ # Concise logging with element IDs and candidate notation
+ predicted_element = evaluation_results.get('predicted_element', '?')
+ expected_element = evaluation_results.get('expected_element', '?')
+ status = "โ
" if composite_score == 1.0 else "โ"
+
+ # Add notation for candidate type
+ notation_map = {'seed': 'Sโ', 'gepa_reflection': 'Sแตฃ', 'llego_crossover': 'Oโโ', 'llego_mutation': 'Oโแตคโ'}
+ notation = notation_map.get(self._current_evaluation_type, 'S')
+
+ self.logger.info(f" [{notation}] Sample {i+1}: Predicted={predicted_element}, Expected={expected_element}, Score={composite_score:.2f} {status}")
+
+ return outputs, scores, trajectories
+
+ def make_reflective_dataset(self, candidate: Dict[str, str], eval_batch: EvaluationBatch,
+ components_to_update: List[str]) -> Dict[str, List[Dict[str, Any]]]:
+ """
+ Create reflective dataset using user-provided evaluator.
+
+ This method generates feedback based on the evaluation results
+ from the user's custom evaluator.
+
+ ๐ฅ NEW: If hybrid mode is enabled, this method ALSO generates hybrid candidates
+ (GEPA Reflection + LLEGO Operators) and stores them for GEPA to use.
+ """
+ # ๐ฅ REMOVED: Excessive diagnostic logs - moved to DEBUG level if needed
+ self.logger.debug(f"make_reflective_dataset() called - generating feedback and hybrid candidates")
+
+ reflective_dataset = {}
+ system_prompt = candidate.get('system_prompt', '')
+
+ # ๐ฅ REMOVED: Verbose diagnostic checks - only log if hybrid mode is actually enabled
+ hybrid_mode_enabled = (self._config and
+ hasattr(self._config, 'enable_gepa_reflection_with_llego') and
+ self._config.enable_gepa_reflection_with_llego and
+ self._reflection_lm_client)
+
+ if hybrid_mode_enabled:
+ self.logger.debug(f"โ
Hybrid mode conditions met - will generate hybrid candidates")
+
+ # ========================================================================
+ # ๐ฅ CRITICAL FIX: Update LLEGO population with evaluated candidate
+ # ========================================================================
+ # This is the MISSING LINK! After a candidate is evaluated, we need to add it
+ # to the LLEGO population so it can be used for crossover/mutation.
+ # Without this, the population only contains the seed, so Pareto front stays at 1!
+ #
+ # This is called for EVERY candidate that GEPA evaluates:
+ # - Seed prompt (baseline) โ added to population
+ # - New candidate 1 (from reflection/crossover/mutation) โ added to population
+ # - New candidate 2 โ added to population
+ # - etc.
+ if self.llego:
+ # Calculate average fitness from evaluation scores
+ if eval_batch.scores and len(eval_batch.scores) > 0:
+ avg_fitness = sum(eval_batch.scores) / len(eval_batch.scores)
+ else:
+ # Fallback: extract from trajectories if scores not available
+ scores = [t.get('evaluation_results', {}).get('composite_score', 0.0)
+ for t in eval_batch.trajectories if 'evaluation_results' in t]
+ avg_fitness = sum(scores) / len(scores) if scores else 0.0
+
+ self.logger.debug(f"Updating LLEGO population: fitness={avg_fitness:.4f}")
+
+ # Create PromptCandidate from evaluated prompt
+ from ..operators.llego_operators import PromptCandidate
+
+ # Check if this candidate already exists in population (avoid duplicates)
+ # ๐ฅ FIX: Normalize prompts for comparison (strip whitespace, remove quotes)
+ normalized_new_prompt = system_prompt.strip().strip('"\'')
+ existing_prompts = {p.prompt.strip().strip('"\'') for p in self.llego.population}
+
+ # Also check normalized versions
+ if normalized_new_prompt not in existing_prompts:
+ prompt_candidate = PromptCandidate(
+ prompt=system_prompt, # Keep original prompt (not normalized)
+ fitness=avg_fitness,
+ metadata={
+ 'generation': self.llego.current_generation,
+ 'operator': 'evaluated',
+ 'prompt_length': len(system_prompt),
+ 'word_count': len(system_prompt.split()),
+ 'evaluation_samples': len(eval_batch.scores) if eval_batch.scores else 0,
+ 'candidate_type': self._current_evaluation_type or 'unknown', # Store type for notation
+ 'dataset_evaluated': self._current_dataset_type or 'unknown'
+ }
+ )
+
+ # Update population - this will add the candidate and keep top N by fitness
+ population_before = len(self.llego.population)
+ self.llego.update_population([prompt_candidate])
+ population_after = len(self.llego.population)
+
+ self.logger.debug(f"Added to LLEGO population: fitness={avg_fitness:.4f}, size={population_after}")
+ else:
+ # Update fitness if candidate already exists (seed prompt, etc.)
+ # ๐ฅ FIX: Also normalize for comparison
+ updated = False
+ for p in self.llego.population:
+ normalized_existing = p.prompt.strip().strip('"\'')
+ if normalized_existing == normalized_new_prompt:
+ old_fitness = p.fitness
+ if avg_fitness > p.fitness:
+ p.fitness = avg_fitness
+ updated = True
+ self.logger.debug(f"Updated fitness: {old_fitness:.4f} โ {avg_fitness:.4f}")
+ # Update candidate type if we have new information
+ if self._current_evaluation_type and p.metadata:
+ old_type = p.metadata.get('candidate_type', 'unknown')
+ if self._current_evaluation_type != old_type:
+ p.metadata['candidate_type'] = self._current_evaluation_type
+ else:
+ self.logger.debug(f"โน๏ธ Candidate already exists with better/equal fitness: {p.fitness:.4f} >= {avg_fitness:.4f}")
+ break
+
+ if not updated:
+ self.logger.debug(f"Candidate already in population with higher fitness")
+ else:
+ self.logger.debug("LLEGO not initialized - skipping population update")
+
+ # ========================================================================
+ # ๐ฅ HYBRID MODE: Generate candidates at adapter level
+ # ========================================================================
+ if (self._config and
+ hasattr(self._config, 'enable_gepa_reflection_with_llego') and
+ self._config.enable_gepa_reflection_with_llego and
+ self._reflection_lm_client):
+
+ self.logger.debug("Generating hybrid candidates")
+
+ # Generate hybrid candidates FIRST
+ generated_candidates = self._generate_hybrid_candidates_adapter_level(
+ current_prompt=system_prompt,
+ eval_batch=eval_batch,
+ candidate=candidate
+ )
+
+ # ๐ฅ CRITICAL: Store generated candidates so we can inject them
+ # _generate_hybrid_candidates_adapter_level now returns list of dicts with metadata
+ if generated_candidates:
+ candidate_dicts = []
+ for cand in generated_candidates:
+ if isinstance(cand, dict) and 'prompt' in cand:
+ # Already a dict with metadata (preferred format)
+ candidate_dicts.append(cand)
+ elif isinstance(cand, str):
+ # Just a string - determine source based on position (fallback)
+ # This shouldn't happen if _generate_hybrid_candidates_adapter_level is fixed
+ self.logger.warning(f"โ ๏ธ Received string candidate instead of dict - using fallback logic")
+ if len(candidate_dicts) < self._config.num_gepa_reflection_candidates:
+ source = 'gepa_reflection'
+ elif len(candidate_dicts) < self._config.num_gepa_reflection_candidates + self._config.n_crossover:
+ source = 'llego_crossover'
+ else:
+ source = 'llego_mutation'
+ candidate_dicts.append({
+ 'prompt': cand,
+ 'source': source,
+ 'index': len(candidate_dicts) + 1
+ })
+ else:
+ self.logger.warning(f"โ ๏ธ Unknown candidate format: {type(cand)}")
+
+ self._generated_candidates = candidate_dicts
+
+ # Store candidate sources for tracking
+ for cand_dict in candidate_dicts:
+ if 'prompt' in cand_dict and 'source' in cand_dict:
+ self._candidate_sources[cand_dict['prompt']] = cand_dict['source']
+
+ # ๐ฅ CRITICAL: Inject into LLM client wrapper so it can return them when GEPA calls
+ # This is the key mechanism: when GEPA calls adapter.llm_client.generate() for proposals,
+ # our wrapper will detect it and return our pre-generated candidates
+ if hasattr(self.llm_client, '_adapter_generated_candidates'):
+ self.llm_client._adapter_generated_candidates = candidate_dicts.copy()
+ self.logger.debug(f"Injected {len(candidate_dicts)} candidates")
+ else:
+ try:
+ self.llm_client._adapter_generated_candidates = candidate_dicts.copy()
+ except Exception as e:
+ self.logger.error(f"Failed to inject candidates: {e}")
+
+ # Evaluate generated candidates on Dpareto for fair comparison
+ if hasattr(self, '_evaluating_generated_candidates'):
+ pass # Skip to prevent recursion
+ elif self._valset and len(self._valset) > 0:
+ self._evaluating_generated_candidates = True
+ self.logger.debug(f"Evaluating {len(candidate_dicts)} candidates on Dpareto ({len(self._valset)} samples)")
+
+ # ๐ฅ NEW: Collect all candidates with scores for batch update
+ candidates_with_scores = []
+
+ for i, cand_dict in enumerate(candidate_dicts, 1):
+ cand_prompt = cand_dict.get('prompt', '')
+ cand_source = cand_dict.get('source', 'unknown')
+
+ if not cand_prompt:
+ continue
+
+ # Normalize prompt for duplicate detection
+ normalized_prompt = cand_prompt.strip().strip('"\'')
+
+ # Check if already evaluated on Dpareto (avoid double evaluation)
+ if normalized_prompt in self._dpareto_evaluated_candidates:
+ existing_score, existing_type, _ = self._dpareto_evaluated_candidates[normalized_prompt]
+
+ # Still add to batch for Pareto update (with existing score)
+ notation_map = {
+ 'seed': 'Sโ',
+ 'gepa_reflection': 'Sแตฃ',
+ 'llego_crossover': 'Oโโ',
+ 'llego_mutation': 'Oโแตคโ'
+ }
+ cand_notation = notation_map.get(cand_source, 'S')
+ candidates_with_scores.append({
+ 'prompt': cand_prompt,
+ 'score': existing_score,
+ 'type': cand_source,
+ 'notation': cand_notation
+ })
+ continue
+
+ # Evaluate this candidate on valset (Dpareto)
+ try:
+ # Set candidate type for proper logging
+ self._current_evaluation_type = cand_source
+
+ # ๐ฅ CRITICAL: Temporarily disable individual Pareto updates
+ # We'll do batch update after all evaluations
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+ original_log_method = pareto_log.log_candidate_evaluation
+
+ # Temporarily replace to prevent individual updates
+ def noop_log(*args, **kwargs):
+ pass # Skip individual logging - we'll batch update later
+
+ pareto_log.log_candidate_evaluation = noop_log
+
+ # Evaluate on valset - THIS IS THE FAIR EVALUATION ON SAME DATASET
+ valset_eval = self.evaluate(
+ batch=self._valset, # Same valset as seed!
+ candidate={'system_prompt': cand_prompt, 'source': cand_source},
+ capture_traces=True
+ )
+
+ # Restore original method
+ pareto_log.log_candidate_evaluation = original_log_method
+
+ avg_score = sum(valset_eval.scores) / len(valset_eval.scores) if valset_eval.scores else 0.0
+
+ # Store evaluation result to avoid double evaluation
+ self._dpareto_evaluated_candidates[normalized_prompt] = (
+ avg_score,
+ cand_source,
+ 'evaluated_in_make_reflective_dataset'
+ )
+
+ self.logger.debug(f"Candidate {i} evaluated: score={avg_score:.4f}")
+
+ # Generate notation
+ notation_map = {
+ 'seed': 'Sโ',
+ 'gepa_reflection': 'Sแตฃ',
+ 'llego_crossover': 'Oโโ',
+ 'llego_mutation': 'Oโแตคโ'
+ }
+ cand_notation = notation_map.get(cand_source, 'S')
+
+ # Add to batch for Pareto update
+ candidates_with_scores.append({
+ 'prompt': cand_prompt,
+ 'score': avg_score,
+ 'type': cand_source,
+ 'notation': cand_notation
+ })
+
+ # ๐ฅ CRITICAL: Explicitly add this candidate to LLEGO population with Dpareto fitness
+ if self.llego:
+ from ..operators.llego_operators import PromptCandidate
+
+ # Check if already in population
+ existing_in_pop = False
+ for p in self.llego.population:
+ if p.prompt.strip().strip('"\'') == normalized_prompt:
+ # Update fitness if this Dpareto score is better
+ if avg_score > p.fitness:
+ old_fitness = p.fitness
+ p.fitness = avg_score
+ if p.metadata:
+ p.metadata['candidate_type'] = cand_source
+ p.metadata['dataset_evaluated'] = 'dpareto'
+ self.logger.debug(f"Updated LLEGO fitness: {old_fitness:.4f} โ {avg_score:.4f}")
+ existing_in_pop = True
+ break
+
+ if not existing_in_pop:
+ # Add new candidate to population
+ prompt_candidate = PromptCandidate(
+ prompt=cand_prompt,
+ fitness=avg_score,
+ metadata={
+ 'generation': self.llego.current_generation,
+ 'operator': 'evaluated_on_dpareto',
+ 'prompt_length': len(cand_prompt),
+ 'word_count': len(cand_prompt.split()),
+ 'evaluation_samples': len(valset_eval.scores) if valset_eval.scores else 0,
+ 'candidate_type': cand_source,
+ 'dataset_evaluated': 'dpareto'
+ }
+ )
+ self.llego.update_population([prompt_candidate])
+
+ except Exception as e:
+ self.logger.error(f" โ Error evaluating candidate #{i} on Dpareto: {e}")
+ import traceback
+ self.logger.error(traceback.format_exc())
+
+ # Batch Pareto front update
+ if candidates_with_scores:
+
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+ added_candidates = pareto_log.batch_update_pareto_front(candidates_with_scores)
+
+ # ๐ฅ CRITICAL: Update queue with scores for best-candidate selection
+ # Create a mapping of prompt -> score for quick lookup
+ prompt_to_score = {c['prompt'].strip().strip('"\''): c['score'] for c in candidates_with_scores}
+
+ # Update candidates in queue with their scores
+ if hasattr(self.llm_client, '_adapter_generated_candidates'):
+ updated_queue = []
+ for cand in self.llm_client._adapter_generated_candidates:
+ if isinstance(cand, dict):
+ cand_prompt = cand.get('prompt', '')
+ normalized = cand_prompt.strip().strip('"\'')
+ if normalized in prompt_to_score:
+ # Update with score
+ cand['score'] = prompt_to_score[normalized]
+ updated_queue.append(cand)
+ else:
+ updated_queue.append(cand)
+ else:
+ updated_queue.append(cand)
+
+ self.llm_client._adapter_generated_candidates = updated_queue
+
+ self.logger.debug(f"Pareto update: {len(added_candidates)} added, front size={len(pareto_log.pareto_front)}")
+
+ # Clear flag after evaluation complete
+ self._evaluating_generated_candidates = False
+ elif not hasattr(self, '_evaluating_generated_candidates'):
+ self.logger.error("Valset not available - cannot evaluate generated candidates")
+
+ # Signal LLEGO-enhanced client for reflection mode
+ if self.llego and hasattr(self.llm_client, 'set_reflection_context'):
+ self.llm_client.set_reflection_context(
+ current_prompt=system_prompt,
+ feedback=eval_batch,
+ in_reflection=True
+ )
+
+ # ๐ฅ CRITICAL: Also set reflection context on reflection_lm_client if it exists
+ # This ensures hybrid mode candidate generation is triggered when GEPA calls reflection_lm_callable
+ if hasattr(self, 'reflection_lm_client') and self.reflection_lm_client:
+ if hasattr(self.reflection_lm_client, 'set_reflection_context'):
+ self.logger.info("๐ฅ CRITICAL: Setting reflection context on reflection_lm_client for hybrid mode")
+ self.reflection_lm_client.set_reflection_context(
+ current_prompt=system_prompt,
+ feedback=eval_batch,
+ in_reflection=True # This enables hybrid candidate generation!
+ )
+
+ self._log_reflection_dataset_creation(candidate, eval_batch, components_to_update)
+
+ # Inject generated candidates into reflective dataset
+ suggested_prompts = []
+ if hasattr(self, '_generated_candidates') and self._generated_candidates:
+ suggested_prompts = [c['prompt'] for c in self._generated_candidates if isinstance(c, dict) and 'prompt' in c]
+ self.logger.debug(f"Injecting {len(suggested_prompts)} suggested prompts")
+
+ for component in components_to_update:
+ reflective_dataset[component] = []
+ for trace in eval_batch.trajectories:
+ # Generate feedback based on evaluation results
+ # ๐ Phase 2: Pass trace and current_prompt for LLM-as-Judge
+ feedback = self._generate_feedback(
+ trace['evaluation_results'],
+ trace=trace,
+ current_prompt=system_prompt
+ )
+
+ # Base reflection data
+ # ๐ฅ FIX: Strip image_base64 from input_data to prevent massive base64 strings in logs
+ input_data_clean = trace['input_data'].copy() if isinstance(trace['input_data'], dict) else {}
+ if 'image_base64' in input_data_clean:
+ input_data_clean['image_base64'] = f"[IMAGE_DATA_{len(input_data_clean['image_base64'])}_chars]"
+
+ # ๐ฅ FIX: Clean detailed_scores to remove any base64 references or large data
+ detailed_scores_clean = {}
+ if isinstance(trace['evaluation_results'], dict):
+ for key, value in trace['evaluation_results'].items():
+ # Skip any values that look like base64 (very long strings)
+ if isinstance(value, str) and len(value) > 1000:
+ detailed_scores_clean[key] = f"[DATA_{len(value)}_chars]"
+ else:
+ detailed_scores_clean[key] = value
+ else:
+ detailed_scores_clean = trace['evaluation_results']
+
+ reflection_entry = {
+ "current_prompt": system_prompt,
+ "input_data": input_data_clean, # Use cleaned version without full base64
+ "predicted_output": trace['predicted_output'],
+ "score": trace['evaluation_results'].get("composite_score", 0.0),
+ "feedback": feedback,
+ "detailed_scores": detailed_scores_clean # Cleaned scores without large data
+ }
+
+ # ๐ฅ CRITICAL: Only optimize system_prompt, NOT user_prompt
+ # The user_prompt contains the task description (command) and should NOT be modified
+ if component == 'system_prompt' and suggested_prompts:
+ # Add suggested improved prompts to the reflection entry
+ # GEPA might use these if the structure supports it
+ reflection_entry["suggested_improved_prompts"] = suggested_prompts
+ reflection_entry["num_suggestions"] = len(suggested_prompts)
+ # Also add the best suggested prompt as a direct suggestion
+ if suggested_prompts:
+ reflection_entry["suggested_prompt"] = suggested_prompts[0] # First candidate as primary suggestion
+ reflection_entry["optimize_component"] = "system_prompt_only" # Mark that we only optimize system_prompt
+ elif component != 'system_prompt':
+ # For non-system_prompt components (like user_prompt), do NOT add suggestions
+ # We only want to optimize system_prompt
+ reflection_entry["optimize_component"] = "skip" # Mark to skip optimization
+ self.logger.info(f"โ ๏ธ Skipping optimization for component '{component}' - only optimizing system_prompt")
+
+ reflective_dataset[component].append(reflection_entry)
+
+ total_samples = sum(len(data) for data in reflective_dataset.values())
+ avg_score = sum(trace['score'] for data in reflective_dataset.values() for trace in data) / total_samples if total_samples > 0 else 0.0
+ self.logger.info(f"๐ Reflection dataset created - {total_samples} samples, avg score: {avg_score:.4f}")
+
+ return reflective_dataset
+
+ def _generate_feedback(
+ self,
+ evaluation_results: Dict[str, Any],
+ trace: Optional[Dict[str, Any]] = None,
+ current_prompt: Optional[str] = None
+ ) -> str:
+ """
+ Generate feedback using hybrid approach:
+ - LLM-as-Judge for low/medium scores (detailed, actionable)
+ - Simple feedback for high scores (efficient)
+
+ Args:
+ evaluation_results: Evaluation scores and extracted data
+ trace: Full trace with input_data, predicted_output, etc. (optional)
+ current_prompt: The current system prompt being optimized (optional)
+
+ Returns:
+ Feedback string focused on prompt improvement
+ """
+ composite_score = evaluation_results.get("composite_score", 0.0)
+
+ # Check if LLM-as-Judge is enabled
+ use_llm_judge = getattr(self._config, 'use_llm_as_judge', True)
+ threshold = getattr(self._config, 'llm_as_judge_threshold', 0.8)
+
+ # ๐ฅ FIX: Check both attribute names (inconsistency in codebase)
+ reflection_lm = getattr(self, '_reflection_lm_client', None) or getattr(self, 'reflection_lm_client', None)
+
+ # Debug logging - use INFO so we can see what's happening
+ self.logger.info(f"๐ Feedback generation: score={composite_score:.4f}, use_llm_judge={use_llm_judge}, threshold={threshold}, has_trace={trace is not None}, has_reflection_lm={reflection_lm is not None}")
+ if trace:
+ input_data = trace.get('input_data', {})
+ predicted = trace.get('predicted_output', '')[:100] if trace.get('predicted_output') else 'N/A'
+ expected = input_data.get('output', '')[:100] if input_data.get('output') else 'N/A'
+ self.logger.info(f" Predicted preview: {predicted}...")
+ self.logger.info(f" Expected preview: {expected}...")
+
+ # Use LLM-as-Judge for scores needing improvement
+ if use_llm_judge and composite_score < threshold and trace:
+ if not reflection_lm:
+ self.logger.warning("โ ๏ธ LLM-as-Judge requested but reflection_lm_client not available - using simple feedback")
+ self.logger.warning(f" Checked: _reflection_lm_client={getattr(self, '_reflection_lm_client', None) is not None}, reflection_lm_client={getattr(self, 'reflection_lm_client', None) is not None}")
+ else:
+ try:
+ self.logger.info(f"๐ค Calling LLM-as-Judge for detailed feedback (score: {composite_score:.4f} < threshold: {threshold})")
+ feedback = self._llm_as_judge_feedback(
+ evaluation_results,
+ trace,
+ current_prompt
+ )
+ self.logger.info(f"โ
LLM-as-Judge returned feedback (length: {len(feedback)} chars)")
+ return feedback
+ except Exception as e:
+ self.logger.error(f"โ LLM-as-Judge failed: {e}, falling back to simple feedback")
+ import traceback
+ self.logger.error(traceback.format_exc())
+ # Fall through to simple feedback
+
+ # Simple actionable feedback (for high scores or as fallback)
+ if composite_score >= threshold:
+ self.logger.debug(f"โ
Score {composite_score:.4f} >= threshold {threshold} - using simple feedback")
+ elif not trace:
+ self.logger.debug(f"โ ๏ธ No trace provided - using simple feedback")
+ elif not use_llm_judge:
+ self.logger.debug(f"โ ๏ธ LLM-as-Judge disabled - using simple feedback")
+
+ feedback = self._simple_actionable_feedback(
+ evaluation_results,
+ trace,
+ current_prompt
+ )
+
+ # ๐ฅ ADD FORMAT FEEDBACK: Append format-specific feedback if available
+ if self._detected_format and trace:
+ from ..utils.format_detection import generate_format_feedback
+ input_data = trace.get('input_data', {})
+ format_feedback = generate_format_feedback(
+ predicted_output=trace.get('predicted_output', ''),
+ expected_output=input_data.get('output', ''),
+ format_info=self._detected_format
+ )
+ if format_feedback:
+ feedback += format_feedback
+
+ return feedback
+
+ def _llm_as_judge_feedback(
+ self,
+ evaluation_results: Dict[str, Any],
+ trace: Dict[str, Any],
+ current_prompt: Optional[str] = None
+ ) -> str:
+ """
+ Generate detailed, actionable feedback using LLM-as-Judge.
+
+ ๐ฅ UNIVERSAL VERSION: Works for ANY task type (text, JSON, structured outputs).
+ No UI-specific assumptions. Pure semantic and structural comparison.
+
+ Args:
+ evaluation_results: Evaluation scores and extracted data
+ trace: Full trace with input_data, predicted_output, etc.
+ current_prompt: The current system prompt being optimized
+
+ Returns:
+ Detailed feedback string focused on prompt improvement
+ """
+ # Import universal judge prompt builder
+ from ..utils.universal_judge_prompt import (
+ build_universal_judge_prompt,
+ get_universal_judge_system_prompt,
+ format_universal_judge_feedback,
+ build_empty_output_feedback
+ )
+
+ # Extract data from trace
+ input_data = trace.get('input_data', {})
+ predicted_output = trace.get('predicted_output', '') or ''
+ expected_output = input_data.get('output', '') or ''
+ task_input = input_data.get('input', '') or ''
+
+ # Get image if available (for multi-modal tasks)
+ image_base64 = input_data.get('image', '') or input_data.get('image_base64', '')
+
+ # Log what we're working with
+ self.logger.info(f"๐ LLM-as-Judge input check:")
+ self.logger.info(f" predicted_output length: {len(predicted_output)} chars")
+ self.logger.info(f" expected_output length: {len(expected_output)} chars")
+ self.logger.info(f" image available: {bool(image_base64)} (length: {len(image_base64) if image_base64 else 0} chars)")
+ self.logger.info(f" predicted_output preview: {predicted_output[:200] if predicted_output else '[EMPTY]'}...")
+ self.logger.info(f" expected_output preview: {expected_output[:200] if expected_output else '[EMPTY]'}...")
+
+ # Handle empty predicted output specially
+ if not predicted_output or not predicted_output.strip():
+ self.logger.warning(f"โ ๏ธ Predicted output is empty - generating specialized feedback")
+ return build_empty_output_feedback(task_input, expected_output, current_prompt)
+
+ if not image_base64:
+ self.logger.debug(f"โน๏ธ No image provided - text-only analysis")
+
+ # Get the LLM for judging
+ judge_llm = getattr(self, '_reflection_lm_client', None) or getattr(self, 'reflection_lm_client', None)
+
+ if not judge_llm:
+ self.logger.error("โ CRITICAL: No reflection_lm_client available for LLM-as-Judge!")
+ raise ValueError("reflection_lm_client not available")
+
+ # Build the universal judge prompt
+ judge_prompt = build_universal_judge_prompt(
+ task_input=task_input,
+ predicted_output=predicted_output,
+ expected_output=expected_output,
+ current_prompt=current_prompt,
+ evaluation_results=evaluation_results,
+ image_base64=image_base64
+ )
+
+ # Get the universal system prompt
+ system_prompt = get_universal_judge_system_prompt(has_image=bool(image_base64))
+
+ # Call LLM-as-Judge
+ try:
+ self.logger.info(f"๐ค Calling Universal LLM-as-Judge for semantic analysis")
+ result = judge_llm.generate(
+ system_prompt=system_prompt,
+ user_prompt=judge_prompt,
+ image_base64=image_base64 if image_base64 else ""
+ )
+
+ if isinstance(result, dict):
+ judge_output = result.get('content', '')
+ else:
+ judge_output = str(result)
+
+ # Format the feedback using the universal formatter
+ score = evaluation_results.get('composite_score', 0.0)
+ feedback = format_universal_judge_feedback(
+ judge_output=judge_output,
+ task_input=task_input,
+ predicted_output=predicted_output,
+ expected_output=expected_output,
+ score=score
+ )
+
+ # ๐ฅ ADD FORMAT FEEDBACK: Append format-specific feedback
+ if self._detected_format:
+ from ..utils.format_detection import generate_format_feedback
+ format_feedback = generate_format_feedback(
+ predicted_output=predicted_output,
+ expected_output=expected_output,
+ format_info=self._detected_format
+ )
+ if format_feedback:
+ feedback += format_feedback
+
+ # Also add format constraint for next iteration
+ feedback += f"\n\n{self._detected_format['format_constraint']}"
+
+ self.logger.info(f"โ
Universal LLM-as-Judge generated feedback")
+ return feedback
+
+ except Exception as e:
+ self.logger.error(f"LLM-as-Judge failed: {e}")
+ import traceback
+ self.logger.error(traceback.format_exc())
+ # Fallback to simple feedback
+ return self._simple_actionable_feedback(evaluation_results, trace, current_prompt)
+
+ def _extract_reasoning_from_expected(self, expected_output: str) -> str:
+ """Extract reasoning section from expected output."""
+ if not expected_output:
+ return ""
+
+ # Look for "Reason:" or "Reasoning:" section
+ reason_patterns = [
+ r'Reason[:\s]+(.*?)(?:\n\n|\Z)',
+ r'Reasoning[:\s]+(.*?)(?:\n\n|\Z)',
+ ]
+
+ for pattern in reason_patterns:
+ match = re.search(pattern, expected_output, re.IGNORECASE | re.DOTALL)
+ if match:
+ return match.group(1).strip()[:500] # Truncate to 500 chars
+
+ return ""
+
+ def _extract_reasoning_from_predicted(self, predicted_output: str) -> str:
+ """Extract reasoning from predicted output if available."""
+ # Similar to _extract_reasoning_from_expected
+ # Or return first 200 chars if no clear reasoning section
+ if not predicted_output:
+ return ""
+
+ # Look for reasoning patterns
+ reason_patterns = [
+ r'Reason[:\s]+(.*?)(?:\n\n|\Z)',
+ r'Reasoning[:\s]+(.*?)(?:\n\n|\Z)',
+ ]
+
+ for pattern in reason_patterns:
+ match = re.search(pattern, predicted_output, re.IGNORECASE | re.DOTALL)
+ if match:
+ return match.group(1).strip()[:500]
+
+ # If no reasoning found, return first 200 chars
+ if len(predicted_output) > 200:
+ return predicted_output[:200] + "..."
+ return predicted_output
+
+ def _simple_actionable_feedback(
+ self,
+ evaluation_results: Dict[str, Any],
+ trace: Dict[str, Any] = None,
+ current_prompt: Optional[str] = None
+ ) -> str:
+ """
+ Simple feedback without LLM-as-Judge.
+
+ ๐ฅ UNIVERSAL VERSION: Works for any task type.
+ """
+ composite_score = evaluation_results.get("composite_score", 0.0)
+ semantic_sim = evaluation_results.get("semantic_similarity", 0.0)
+ structural_sim = evaluation_results.get("structural_similarity", 0.0)
+
+ feedback_parts = []
+
+ # Extract task context if available
+ if trace:
+ input_data = trace.get('input_data', {})
+ predicted = trace.get('predicted_output', '')
+ expected = input_data.get('output', '')
+
+ # Check for empty output
+ if not predicted or not predicted.strip():
+ feedback_parts.append(
+ "โ CRITICAL: No output generated. "
+ "Add explicit output instructions to the prompt."
+ )
+ # Check for format mismatch
+ elif structural_sim < 0.5:
+ feedback_parts.append(
+ f"โ ๏ธ Format mismatch (structural similarity: {structural_sim:.0%}). "
+ "Add output format instructions (e.g., 'Return as JSON with fields: ...')."
+ )
+ # Check for semantic mismatch
+ elif semantic_sim < 0.5:
+ feedback_parts.append(
+ f"โ ๏ธ Semantic mismatch (similarity: {semantic_sim:.0%}). "
+ "The output meaning differs from expected. Add clearer task instructions."
+ )
+
+ # Score-based feedback
+ if composite_score >= 0.9:
+ feedback_parts.append("โ
Excellent match - prompt is working well.")
+ elif composite_score >= 0.8:
+ feedback_parts.append("โ
Good match - minor refinements possible.")
+ elif composite_score >= 0.6:
+ feedback_parts.append(
+ f"โ ๏ธ Partial match (score: {composite_score:.0%}). "
+ "Consider adding examples or more specific field names to the prompt."
+ )
+ elif composite_score >= 0.3:
+ feedback_parts.append(
+ f"โ ๏ธ Low match (score: {composite_score:.0%}). "
+ "The prompt needs clearer instructions about expected output format and content."
+ )
+ else:
+ feedback_parts.append(
+ f"โ Poor match (score: {composite_score:.0%}). "
+ "Major revision required - add explicit output format, field names, and examples."
+ )
+
+ return "\n".join(feedback_parts) if feedback_parts else f"Score: {composite_score:.0%}"
+
+ def get_best_candidate(self) -> Optional[Dict[str, str]]:
+ """
+ Get the best candidate from GEPA Pareto front.
+
+ GEPA Pareto front is the single source of truth because:
+ - All candidates (GEPA reflection, LLEGO crossover, LLEGO mutation) are evaluated on Dpareto
+ - All non-dominated candidates are added to GEPA Pareto front
+ - Therefore, the best candidate MUST be in GEPA Pareto front
+
+ Returns:
+ Best candidate dictionary from GEPA Pareto front, or None if empty
+ """
+ # PRIMARY: Get best candidate from GEPA Pareto front (single source of truth)
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+
+ if pareto_log.pareto_front:
+ try:
+ # Get best candidate from GEPA Pareto front (highest score = best)
+ gepa_best = max(pareto_log.pareto_front, key=lambda x: x['score'])
+ gepa_fitness = gepa_best['score']
+ gepa_prompt = gepa_best['prompt']
+ gepa_type = gepa_best.get('type', 'unknown')
+ gepa_notation = gepa_best.get('notation', 'S')
+
+ self.logger.info(f"โ
Best candidate from GEPA Pareto front: {gepa_notation} with f({gepa_notation})={gepa_fitness:.4f}")
+ self.logger.info(f" Type: {gepa_type}, Prompt length: {len(gepa_prompt)} chars")
+ self.logger.info(f" ๐ก GEPA Pareto front is single source of truth (all candidates evaluated on Dpareto)")
+
+ return {
+ 'system_prompt': gepa_prompt,
+ 'fitness': gepa_fitness,
+ 'source': 'gepa_pareto_front',
+ 'candidate_type': gepa_type,
+ 'notation': gepa_notation
+ }
+ except Exception as e:
+ self.logger.error(f"โ Failed to get best from GEPA Pareto front: {e}")
+ import traceback
+ self.logger.error(traceback.format_exc())
+
+ # EDGE CASE: Pareto front empty (shouldn't happen, but handle gracefully)
+ self.logger.warning("โ ๏ธ GEPA Pareto front is empty - no best candidate available")
+ self.logger.warning(" This should not happen if all candidates are evaluated on Dpareto")
+ return None
+
+ def get_best_score(self) -> float:
+ """Get the best score from GEPA Pareto front (single source of truth)."""
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+
+ if pareto_log.pareto_front:
+ try:
+ gepa_best_fitness = max(p['score'] for p in pareto_log.pareto_front)
+ return gepa_best_fitness
+ except Exception as e:
+ self.logger.warning(f"โ ๏ธ Failed to get best fitness from GEPA Pareto front: {e}")
+
+ # Edge case: Pareto front empty - fallback to adapter's score
+ return self._best_score
+
+ def log_proposed_candidate(self, candidate: Dict[str, str], iteration: int = 0):
+ """
+ Pretty print the new proposed candidate prompt.
+
+ Args:
+ candidate: The new candidate prompt from GEPA
+ iteration: Current optimization iteration
+ """
+ system_prompt = candidate.get('system_prompt', '')
+ candidate_source = candidate.get('source', 'unknown')
+
+ # Store source in adapter state so evaluate() can access it
+ self._current_evaluation_type = candidate_source
+
+ # Also store in mapping by prompt text for lookup
+ if candidate_source != 'unknown' and system_prompt:
+ self._candidate_sources[system_prompt] = candidate_source
+
+ # Use clean logger for simpler output
+ from ..utils.clean_logger import get_clean_logger
+ clean_log = get_clean_logger()
+
+ # Update iteration if needed
+ if iteration > clean_log.current_iteration:
+ clean_log.log_iteration_start(iteration, seed_prompt=None)
+
+ # Don't log here - let evaluate() handle it with full context
+
+ def _log_reflection_dataset_creation(self, candidate: Dict[str, str], eval_batch: EvaluationBatch,
+ components_to_update: List[str]):
+ """
+ Pretty print the reflection dataset creation process.
+
+ Args:
+ candidate: Current candidate being evaluated
+ eval_batch: Evaluation results
+ components_to_update: Components being updated
+ """
+ system_prompt = candidate.get('system_prompt', '')
+
+ self.logger.info(f"๐ DEBUG: Inside _log_reflection_dataset_creation")
+ self.logger.info(f"๐ DEBUG: system_prompt length: {len(system_prompt)}")
+ self.logger.info(f"๐ DEBUG: eval_batch.scores: {eval_batch.scores}")
+ self.logger.info(f"๐ DEBUG: eval_batch.trajectories: {len(eval_batch.trajectories) if eval_batch.trajectories else 0}")
+
+ # Determine candidate notation
+ notation_map = {'seed': 'Sโ', 'gepa_reflection': 'Sแตฃ', 'llego_crossover': 'Oโโ', 'llego_mutation': 'Oโแตคโ'}
+ notation = notation_map.get(self._current_evaluation_type, 'S')
+ cand_num = self._evaluation_count if hasattr(self, '_evaluation_count') else '?'
+ cand_label = f"{notation}{cand_num}"
+
+ # Use logger for the main output too
+ self.logger.info("\n" + "="*80)
+ self.logger.info("๐ REFLECTION DATASET CREATION")
+ self.logger.info("="*80)
+
+ self.logger.info(f"\n๐ CURRENT PROMPT BEING ANALYZED: {cand_label}")
+ self.logger.info(f" Candidate Type: {self._current_evaluation_type or 'unknown'}")
+ self.logger.info("-" * 40)
+ self.logger.info(f'"{system_prompt}"')
+ self.logger.info("-" * 40)
+
+ self.logger.info(f"\n๐ EVALUATION SUMMARY:")
+ self.logger.info("-" * 40)
+ if eval_batch.scores:
+ avg_score = sum(eval_batch.scores) / len(eval_batch.scores)
+ min_score = min(eval_batch.scores)
+ max_score = max(eval_batch.scores)
+ self.logger.info(f" โข Average Score: {avg_score:.4f}")
+ self.logger.info(f" โข Min Score: {min_score:.4f}")
+ self.logger.info(f" โข Max Score: {max_score:.4f}")
+ self.logger.info(f" โข Total Samples: {len(eval_batch.scores)}")
+
+ self.logger.info(f"\n๐ฏ COMPONENTS TO UPDATE:")
+ self.logger.info("-" * 40)
+ for i, component in enumerate(components_to_update, 1):
+ self.logger.info(f" {i}. {component}")
+
+ if eval_batch.trajectories:
+ self.logger.info(f"\n๐ DETAILED ANALYSIS (FULL FEEDBACK - NO TRUNCATION):")
+ self.logger.info("-" * 80)
+ for i, trace in enumerate(eval_batch.trajectories[:5], 1): # Show first 5 samples with FULL details
+ evaluation_results = trace['evaluation_results']
+ composite_score = evaluation_results.get("composite_score", 0.0)
+
+ # Extract element IDs for concise logging
+ predicted_element = evaluation_results.get('predicted_element', 'Unknown')
+ expected_element = evaluation_results.get('expected_element', 'Unknown')
+
+ # Concise, direct logging with candidate notation
+ status_icon = "โ
" if composite_score == 1.0 else "โ"
+
+ # Add notation for candidate type
+ notation_map = {'seed': 'Sโ', 'gepa_reflection': 'Sแตฃ', 'llego_crossover': 'Oโโ', 'llego_mutation': 'Oโแตคโ'}
+ notation = notation_map.get(self._current_evaluation_type, 'S')
+
+ self.logger.info(f" [{notation}] Sample {i}: Predicted={predicted_element}, Expected={expected_element}, Score={composite_score:.2f} {status_icon}")
+
+ # ๐ฅ FIX: Pass trace and current_prompt to enable LLM-as-Judge!
+ feedback = self._generate_feedback(
+ evaluation_results,
+ trace=trace, # Pass the full trace!
+ current_prompt=system_prompt # Pass current prompt being analyzed!
+ )
+ self.logger.info(f" ๐ฌ FEEDBACK (FULL):")
+ self.logger.info(f" \"{feedback}\"")
+
+ if len(eval_batch.trajectories) > 5:
+ self.logger.info(f"\n ... and {len(eval_batch.trajectories) - 5} more samples (all logged similarly)")
+
+ self.logger.info("="*80)
+
+ def _extract_clean_prompt_from_reflection(self, reflection_output: str) -> str:
+ """
+ ๐ก๏ธ DEFENSIVE FALLBACK: Extract clean prompt if LLM adds analysis despite system prompt instructions.
+
+ NOTE: The system prompt now explicitly instructs the LLM to output ONLY the prompt text.
+ However, this extraction logic serves as a safety net in case the LLM still adds:
+ "Based on the performance analysis...
+ ### Recommendations...
+ ### Revised Prompt Example:
+ [THE ACTUAL PROMPT HERE]
+ ### Conclusion..."
+
+ This is now a defensive measure, not the primary mechanism.
+
+ Args:
+ reflection_output: Full reflection output (should be clean prompt, but may contain analysis)
+
+ Returns:
+ str: Clean, extracted prompt (or original if extraction fails or not needed)
+ """
+ if not reflection_output or not isinstance(reflection_output, str):
+ return reflection_output
+
+ # Pattern 1: Look for "Revised Prompt Example:" or "### Revised Prompt Example:"
+ patterns = [
+ r'(?:###\s*)?Revised\s+Prompt\s+(?:Example|:)?\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
+ r'(?:###\s*)?Revised\s+Prompt\s*:\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
+ r'(?:###\s*)?Optimized\s+Prompt\s*:\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
+ r'(?:###\s*)?New\s+Prompt\s*:\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
+ r'(?:Here\s+is|Here\'s)\s+a?\s*refined?\s+(?:version\s+of\s+)?(?:the\s+)?prompt\s*[:\n](.*?)(?:\n###|\n##|\n---|\Z)',
+ ]
+
+ for pattern in patterns:
+ match = re.search(pattern, reflection_output, re.IGNORECASE | re.DOTALL)
+ if match:
+ extracted = match.group(1).strip()
+ # Clean up common artifacts
+ extracted = re.sub(r'^```(?:plaintext|markdown|text)?\s*\n', '', extracted, flags=re.MULTILINE)
+ extracted = re.sub(r'\n```\s*$', '', extracted, flags=re.MULTILINE)
+ extracted = extracted.strip()
+
+ if len(extracted) > 50: # Reasonable minimum length for a prompt
+ self.logger.debug(f"โ
Extracted clean prompt using pattern: {pattern[:50]}...")
+ self.logger.debug(f" Original length: {len(reflection_output)} chars")
+ self.logger.debug(f" Extracted length: {len(extracted)} chars")
+ return extracted
+
+ # Pattern 2: If output starts with a quote or prompt-like structure
+ # Look for text that starts with "You are..." and is substantial
+ if 'You are' in reflection_output:
+ # Find the longest continuous block that starts with "You are"
+ prompt_match = re.search(r'(You are[^#]*?)(?:\n###|\n##|###|##|Conclusion|\Z)',
+ reflection_output, re.IGNORECASE | re.DOTALL)
+ if prompt_match:
+ extracted = prompt_match.group(1).strip()
+ if len(extracted) > 50:
+ self.logger.debug(f"โ
Extracted prompt starting with 'You are...'")
+ return extracted
+
+ # Pattern 3: If the reflection output is actually just a clean prompt (no analysis)
+ # Check if it's relatively short and doesn't contain analysis keywords
+ analysis_keywords = ['recommendation', 'suggestion', 'improvement', 'conclusion',
+ 'optimization', 'analysis', 'feedback']
+ if (len(reflection_output) < 2000 and
+ not any(keyword in reflection_output.lower() for keyword in analysis_keywords)):
+ # Likely a clean prompt, return as-is
+ self.logger.debug(f"โ
Reflection output appears to be a clean prompt (no analysis detected)")
+ return reflection_output.strip()
+
+ # Fallback: Return original (with warning)
+ self.logger.warning(f"โ ๏ธ Could not extract clean prompt from reflection output")
+ self.logger.warning(f" Output length: {len(reflection_output)} chars")
+ self.logger.warning(f" Output preview: {reflection_output[:200]}...")
+ self.logger.warning(f" Returning original output (may contain analysis text)")
+ return reflection_output.strip()
+
+ def _parse_json_variations(self, response_text: str, num_expected: int) -> List[str]:
+ """
+ ๐ฅ OPTIMIZED: Parse N prompt variations from JSON format response.
+
+ Uses robust JSON parsing with multiple fallback strategies:
+ 1. Extract JSON from markdown code blocks (```json ... ```)
+ 2. Find JSON object directly in text
+ 3. Attempt JSON repair for common issues
+ 4. Fallback to numbered section parsing if JSON fails
+
+ Args:
+ response_text: LLM response containing JSON with variations
+ num_expected: Expected number of variations
+
+ Returns:
+ List[str]: List of prompt variations (in order by index)
+
+ Raises:
+ ValueError: If parsing fails and no valid variations found
+ """
+ import json
+ import re
+
+ if not response_text or not isinstance(response_text, str):
+ raise ValueError("Empty or invalid response text")
+
+ # Strategy 1: Extract JSON from markdown code block
+ json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response_text, re.DOTALL)
+ if json_match:
+ json_str = json_match.group(1)
+ try:
+ data = json.loads(json_str)
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError as e:
+ self.logger.debug(f"JSON in code block invalid: {e}, trying repair...")
+
+ # Strategy 2: Find JSON object directly in text
+ json_match = re.search(r'\{[^{}]*"variations"[^{}]*\[.*?\]\s*[^{}]*\}', response_text, re.DOTALL)
+ if json_match:
+ json_str = json_match.group(0)
+ try:
+ data = json.loads(json_str)
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError:
+ # Try to find largest JSON object
+ json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
+ if json_match:
+ try:
+ data = json.loads(json_match.group(0))
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError:
+ pass
+
+ # Strategy 3: Attempt JSON repair (common issues: trailing commas, unescaped quotes)
+ json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
+ if json_match:
+ json_str = json_match.group(0)
+ # Try common repairs
+ repaired = re.sub(r',\s*}', '}', json_str) # Remove trailing commas before }
+ repaired = re.sub(r',\s*]', ']', repaired) # Remove trailing commas before ]
+ try:
+ data = json.loads(repaired)
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError:
+ pass
+
+ # Strategy 4: Fallback to numbered section parsing
+ self.logger.warning(f"JSON parsing failed, trying numbered section fallback...")
+ try:
+ return self._parse_numbered_section_variations(response_text, num_expected)
+ except ValueError:
+ pass
+
+ # All strategies failed
+ raise ValueError(f"Could not parse {num_expected} variations from response. Response preview: {response_text[:300]}...")
+
+ def _extract_variations_from_json(self, data: Dict[str, Any], num_expected: int) -> List[str]:
+ """Extract and validate variations from parsed JSON data."""
+ if not isinstance(data, dict):
+ raise ValueError("JSON data is not a dictionary")
+
+ variations_list = data.get('variations', [])
+ if not isinstance(variations_list, list):
+ raise ValueError("'variations' field is not a list")
+
+ if len(variations_list) < num_expected:
+ self.logger.warning(f"Expected {num_expected} variations, found {len(variations_list)} in JSON")
+
+ # Extract and sort by index
+ variations_with_index = []
+ for var in variations_list:
+ if not isinstance(var, dict):
+ continue
+ index = var.get('index', 0)
+ prompt = var.get('prompt', '')
+ if prompt and isinstance(prompt, str):
+ variations_with_index.append((index, prompt.strip()))
+
+ # Sort by index
+ variations_with_index.sort(key=lambda x: x[0])
+
+ # Extract just the prompts
+ variations = [v[1] for v in variations_with_index]
+
+ # Validate count
+ if len(variations) < num_expected:
+ self.logger.warning(f"Only {len(variations)} valid variations found, expected {num_expected}")
+ # Pad with duplicates if needed (not ideal but better than failing)
+ while len(variations) < num_expected:
+ variations.append(variations[-1] if variations else "")
+
+ # Take first N if we got more
+ variations = variations[:num_expected]
+
+ # Validate all variations are non-empty
+ if not all(v for v in variations):
+ raise ValueError(f"Some variations are empty after parsing")
+
+ return variations
+
+ def _parse_numbered_section_variations(self, response_text: str, num_expected: int) -> List[str]:
+ """
+ Fallback parser: Extract variations from numbered sections.
+
+ Format: --- VARIATION N --- or Variation N: or similar
+ """
+ variations = []
+
+ # Pattern 1: --- VARIATION N ---
+ pattern1 = r'---\s*VARIATION\s+(\d+)\s*---\s*\n(.*?)(?=\n---\s*VARIATION|\Z)'
+ matches1 = re.findall(pattern1, response_text, re.DOTALL | re.IGNORECASE)
+
+ # Pattern 2: Variation N:
+ pattern2 = r'Variation\s+(\d+)\s*:?\s*\n(.*?)(?=\nVariation\s+\d+|$)'
+ matches2 = re.findall(pattern2, response_text, re.DOTALL | re.IGNORECASE)
+
+ # Pattern 3: Numbered list (1. 2. 3.)
+ pattern3 = r'(\d+)\.\s*\n(.*?)(?=\n\d+\.|$)'
+ matches3 = re.findall(pattern3, response_text, re.DOTALL)
+
+ # Use the pattern with most matches
+ matches = matches1 if len(matches1) >= num_expected else (matches2 if len(matches2) >= num_expected else matches3)
+
+ if len(matches) >= num_expected:
+ # Sort by index
+ matches.sort(key=lambda x: int(x[0]))
+ # Extract prompts
+ variations = [match[1].strip() for match in matches[:num_expected]]
+
+ if len(variations) != num_expected:
+ raise ValueError(f"Numbered section parsing found {len(variations)} variations, expected {num_expected}")
+
+ return variations
+
+ def _generate_hybrid_candidates_adapter_level(
+ self,
+ current_prompt: str,
+ eval_batch: EvaluationBatch,
+ candidate: Dict[str, str]
+ ) -> List[str]:
+ """
+ ๐ฅ ADAPTER-LEVEL HYBRID CANDIDATE GENERATION
+
+ Generate candidates from BOTH GEPA reflection AND LLEGO operators
+ when GEPA's adapter mode ignores the reflection_lm parameter.
+
+ This method:
+ 1. Builds comprehensive feedback from evaluation results
+ 2. Generates GEPA reflection candidates
+ 3. Generates LLEGO crossover/mutation candidates
+ 4. Logs ALL candidates with FULL prompts (no truncation)
+ 5. Stores candidates for potential use
+
+ Args:
+ current_prompt: The current prompt being optimized
+ eval_batch: Evaluation results with trajectories
+ candidate: Current candidate dict
+
+ Returns:
+ List of generated candidate prompts
+ """
+ try:
+ from ..llms.llego_enhanced_llm import LLEGOEnhancedLLMClient
+
+ all_candidates = []
+ gepa_count = 0
+
+ # ๐ฅ CRITICAL: Pass format info to LLM client before generating candidates
+ if self._detected_format and self._reflection_lm_client:
+ if isinstance(self._reflection_lm_client, LLEGOEnhancedLLMClient):
+ self._reflection_lm_client._detected_format = self._detected_format
+ self.logger.info(f"๐ Passed format info to reflection LLM: {self._detected_format['format_type']}")
+
+ self.logger.info(f"๐ฅ STEP 1: Building comprehensive feedback from evaluation")
+
+ # ๐ฅ REMOVED: Excessive diagnostic logs - moved to DEBUG level
+ # Build comprehensive feedback text from trajectories
+ if not hasattr(eval_batch, 'trajectories'):
+ self.logger.error(f"โ eval_batch has no 'trajectories' attribute! Type: {type(eval_batch)}")
+ return []
+
+ trajectories = eval_batch.trajectories
+ if not trajectories:
+ self.logger.warning(f"โ ๏ธ eval_batch.trajectories is empty - no feedback to generate candidates from")
+ return []
+
+ self.logger.debug(f"Processing {len(trajectories)} trajectories for feedback generation")
+
+ feedback_lines = []
+ feedback_lines.append(f"Current prompt performance analysis:\n")
+ feedback_lines.append(f"Current prompt:\n{current_prompt}\n")
+ feedback_lines.append(f"\nEvaluation results:\n")
+
+ for i, trace in enumerate(trajectories[:8], 1): # Use up to 8 samples for feedback
+ try:
+ eval_results = trace.get('evaluation_results', {})
+ score = eval_results.get("composite_score", 0.0) if isinstance(eval_results, dict) else 0.0
+ input_data = trace.get('input_data', {})
+ predicted = trace.get('predicted_output', '')
+ expected = input_data.get('output', '') if isinstance(input_data, dict) else ''
+
+ # ๐ฅ FIX: Clean input_data to remove base64 images before logging
+ input_data_clean = input_data.copy() if isinstance(input_data, dict) else {}
+ if 'image_base64' in input_data_clean:
+ input_data_clean['image_base64'] = f"[IMAGE_DATA_{len(input_data_clean['image_base64'])}_chars]"
+
+ feedback_lines.append(f" Sample {i}:")
+ feedback_lines.append(f" Input: {input_data_clean.get('input', '') if isinstance(input_data_clean, dict) else ''}")
+ feedback_lines.append(f" Expected: {expected}")
+ feedback_lines.append(f" Predicted: {predicted}")
+ feedback_lines.append(f" Score: {score:.4f}")
+
+ if isinstance(eval_results, dict):
+ # ๐ฅ FIX: Pass trace and current_prompt to enable LLM-as-Judge!
+ feedback = self._generate_feedback(
+ eval_results,
+ trace=trace, # Pass the full trace!
+ current_prompt=current_prompt # Pass current prompt!
+ )
+ feedback_lines.append(f" Feedback: {feedback}")
+ else:
+ feedback_lines.append(f" Feedback: Evaluation results not in expected format")
+ feedback_lines.append("")
+ except Exception as e:
+ self.logger.error(f"โ Error processing trace {i}: {e}")
+ import traceback
+ self.logger.error(traceback.format_exc())
+ continue
+
+ feedback_text = "\n".join(feedback_lines)
+
+ self.logger.info(f"\n๐ FULL FEEDBACK TEXT (NO TRUNCATION):")
+ self.logger.info(feedback_text)
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # PART 1: GEPA REFLECTION CANDIDATES
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ self.logger.info(f"๐ PART 2: GEPA REFLECTION - Semantic Understanding")
+
+ num_gepa = self._config.num_gepa_reflection_candidates if hasattr(self._config, 'num_gepa_reflection_candidates') else 3
+
+ self.logger.info(f"\n๐ Generating {num_gepa} GEPA Reflection candidates in single optimized call...")
+
+ # Set reflection context
+ if isinstance(self._reflection_lm_client, LLEGOEnhancedLLMClient):
+ self._reflection_lm_client.set_reflection_context(
+ current_prompt=current_prompt,
+ feedback=eval_batch,
+ in_reflection=True
+ )
+
+ # ๐ฅ OPTIMIZED: Single call with JSON format for multiple variations
+ try:
+ # Precision-engineered system prompt requesting JSON format
+ optimization_system_prompt = f"""You are an expert prompt engineer specializing in iterative prompt optimization.
+
+Your task: Given the CURRENT PROMPT and its EVALUATION FEEDBACK, generate {num_gepa} DISTINCT variations of improved prompts that address the identified issues through DIFFERENT improvement strategies.
+
+CRITICAL OUTPUT FORMAT - MUST BE VALID JSON:
+{{
+ "variations": [
+ {{
+ "index": 1,
+ "prompt": "[First improved prompt text - complete and self-contained]"
+ }},
+ {{
+ "index": 2,
+ "prompt": "[Second improved prompt text - complete and self-contained]"
+ }},
+ {{
+ "index": 3,
+ "prompt": "[Third improved prompt text - complete and self-contained]"
+ }}
+ ]
+}}
+
+DIVERSITY REQUIREMENTS:
+- Variation 1: Focus on clarity, specificity, and explicit instructions
+- Variation 2: Focus on edge case handling, robustness, and error prevention
+- Variation 3: Focus on structural organization, examples, and step-by-step guidance
+- Each variation must be MEANINGFULLY DIFFERENT (not just rewordings)
+- Each variation must address ALL feedback issues but through different approaches
+
+QUALITY STANDARDS (apply to all variations):
+- Be specific and concrete (avoid vague instructions)
+- Use clear, imperative language for task instructions
+- Include edge case handling if feedback identifies confusion
+- Ensure each prompt is self-contained and unambiguous
+- Preserve the core task domain and output format requirements
+
+OUTPUT FORMAT:
+- Output MUST be valid JSON (can be wrapped in ```json ... ``` markdown code block)
+- Generate EXACTLY {num_gepa} variations
+- Index must be 1, 2, 3, ... (sequential, starting at 1)
+- Each "prompt" field must contain the complete, self-contained prompt text
+- NO explanations, NO analysis, NO meta-commentary - just the JSON structure
+
+DO NOT include:
+- Analysis of what went wrong
+- Explanations of your changes
+- Meta-text like "Here's an improved version..." or "Based on feedback..."
+- Recommendations or suggestions (those are already in the feedback)
+- Any text outside the JSON structure
+
+Output ONLY the JSON object with the variations."""
+
+ # Construct user prompt with clear structure
+ optimization_user_prompt = f"""CURRENT PROMPT (to be improved):
+{current_prompt}
+
+{feedback_text}
+
+TASK: Generate {num_gepa} DISTINCT variations of improved prompts. Each variation should:
+- Address ALL feedback issues identified above
+- Use a DIFFERENT improvement strategy (clarity, robustness, structure)
+- Be meaningfully different from the others (not just rewordings)
+- Be complete and self-contained
+
+Remember: Output ONLY the JSON object with {num_gepa} variations. No explanations."""
+
+ result = self._reflection_lm_client.generate(
+ system_prompt=optimization_system_prompt,
+ user_prompt=optimization_user_prompt,
+ image_base64=""
+ )
+
+ if isinstance(result, dict):
+ response_text = result.get("content", str(result))
+ else:
+ response_text = str(result)
+
+ # Parse JSON variations
+ gepa_variations = self._parse_json_variations(response_text, num_gepa)
+
+ # Add all variations to candidates
+ for idx, variation_prompt in enumerate(gepa_variations, 1):
+ # ๐ก๏ธ DEFENSIVE FALLBACK: Extract clean prompt if LLM adds analysis despite instructions
+ gepa_candidate = self._extract_clean_prompt_from_reflection(variation_prompt)
+
+ if gepa_candidate != variation_prompt:
+ self.logger.debug(f" Variation {idx}: Extracted clean prompt (removed {len(variation_prompt) - len(gepa_candidate)} chars)")
+
+ all_candidates.append({
+ 'prompt': gepa_candidate,
+ 'source': 'gepa_reflection',
+ 'index': idx
+ })
+
+ # ๐ฅ CAPTURE CANDIDATE FOR LIVE UI DISPLAY
+ try:
+ import sys
+ if 'app' in sys.modules:
+ app_module = sys.modules['app']
+ if hasattr(app_module, 'add_candidate_to_store'):
+ app_module.add_candidate_to_store({
+ 'prompt': gepa_candidate,
+ 'source': 'gepa_reflection',
+ 'timestamp': f"Candidate #{idx}"
+ })
+ except Exception:
+ pass # Silent fail - UI capture is optional
+
+ self.logger.info(f"\nโ
GEPA REFLECTION CANDIDATE #{idx}/{num_gepa} (FULL PROMPT - NO TRUNCATION):")
+ self.logger.info(f"{'โ'*80}")
+ self.logger.info(f"{gepa_candidate}")
+ self.logger.info(f"{'โ'*80}")
+ self.logger.info(f" Length: {len(gepa_candidate)} chars, Words: {len(gepa_candidate.split())}")
+
+ gepa_count = len(all_candidates)
+ self.logger.info(f"\nโ
GEPA Reflection: {gepa_count} candidates generated in single optimized call")
+
+ except Exception as e:
+ self.logger.error(f"โ Error generating GEPA reflection candidates: {e}")
+ self.logger.warning(f" Falling back to sequential generation...")
+ import traceback
+ self.logger.debug(traceback.format_exc())
+
+ # Fallback: Sequential generation (when JSON parsing fails)
+ for i in range(num_gepa):
+ self.logger.info(f"\n๐ Generating GEPA Reflection candidate #{i+1}/{num_gepa} (fallback mode)...")
+ try:
+ fallback_user_prompt = f"""CURRENT PROMPT (to be improved):
+{current_prompt}
+
+{feedback_text}
+
+TASK: Generate an improved version of the CURRENT PROMPT that addresses all issues identified in the evaluation feedback above.
+
+Remember: Output ONLY the improved prompt text. No explanations."""
+
+ result = self._reflection_lm_client.generate(
+ system_prompt=self._FALLBACK_SYSTEM_PROMPT,
+ user_prompt=fallback_user_prompt,
+ image_base64=""
+ )
+
+ if isinstance(result, dict):
+ gepa_candidate_raw = result.get("content", str(result))
+ else:
+ gepa_candidate_raw = str(result)
+
+ gepa_candidate = self._extract_clean_prompt_from_reflection(gepa_candidate_raw)
+
+ all_candidates.append({
+ 'prompt': gepa_candidate,
+ 'source': 'gepa_reflection',
+ 'index': i + 1
+ })
+
+ # ๐ฅ CAPTURE CANDIDATE FOR LIVE UI DISPLAY
+ try:
+ import sys
+ if 'app' in sys.modules:
+ app_module = sys.modules['app']
+ if hasattr(app_module, 'add_candidate_to_store'):
+ app_module.add_candidate_to_store({
+ 'prompt': gepa_candidate,
+ 'source': 'gepa_reflection',
+ 'timestamp': f"Fallback #{i+1}"
+ })
+ except Exception:
+ pass # Silent fail - UI capture is optional
+ except Exception as fallback_error:
+ self.logger.error(f"โ Error in fallback generation #{i+1}: {fallback_error}")
+
+ gepa_count = len(all_candidates)
+ if gepa_count > 0:
+ self.logger.info(f"\nโ
GEPA Reflection: {gepa_count} candidates generated")
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # PART 2: LLEGO GENETIC OPERATORS
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ self.logger.info(f"๐งฌ PART 3: LLEGO GENETIC OPERATORS - Structural Diversity")
+
+ if self.llego:
+ # ๐ฅ FIX 2: Get Pareto front from GEPA (not LLEGO population)
+ # This ensures LLEGO operators use true non-dominated solutions
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+ gepa_pareto_front = pareto_log.pareto_front
+
+ # Convert GEPA Pareto front to PromptCandidate format
+ pareto_candidates = self.llego._convert_gepa_pareto_to_candidates(gepa_pareto_front)
+ pareto_front = pareto_candidates
+
+ self.logger.info(f" Using GEPA Pareto front (size: {len(gepa_pareto_front)})")
+ self.logger.info(f" Converted to {len(pareto_front)} PromptCandidate objects")
+ for idx, p in enumerate(pareto_front, 1):
+ cand_type = p.metadata.get('candidate_type', 'unknown') if p.metadata else 'unknown'
+ notation = p.metadata.get('notation', 'S') if p.metadata else 'S'
+ self.logger.info(f" {notation}: [fitness={p.fitness:.3f}, type={cand_type}, length={len(p.prompt)} chars]")
+
+ # Create LLM callable for LLEGO
+ def llm_callable(genetic_prompt: str) -> str:
+ # ๐ฅ LLEGO genetic prompt already contains full instructions
+ # Use minimal system prompt to avoid instruction conflict
+ result = self._reflection_lm_client.generate(
+ system_prompt="You are an expert prompt engineer. Follow the instructions provided in the user message to generate an improved prompt. Output only the prompt text, no explanations.",
+ user_prompt=genetic_prompt,
+ image_base64=""
+ )
+ if isinstance(result, dict):
+ return result.get('content', str(result))
+ return str(result)
+
+ # Generate LLEGO offspring
+ try:
+ llego_prompts = self.llego.evolve_generation(
+ llm=llm_callable,
+ pareto_front=pareto_front
+ )
+
+ n_crossover = self._config.n_crossover if hasattr(self._config, 'n_crossover') else 2
+ crossover_count = min(n_crossover, len(llego_prompts))
+
+ for i, prompt in enumerate(llego_prompts):
+ if i < crossover_count:
+ source = 'llego_crossover'
+ else:
+ source = 'llego_mutation'
+
+ all_candidates.append({
+ 'prompt': prompt,
+ 'source': source,
+ 'index': i + 1
+ })
+
+ # ๐ฅ CAPTURE CANDIDATE FOR LIVE UI DISPLAY
+ try:
+ import sys
+ if 'app' in sys.modules:
+ app_module = sys.modules['app']
+ if hasattr(app_module, 'add_candidate_to_store'):
+ app_module.add_candidate_to_store({
+ 'prompt': prompt,
+ 'source': source,
+ 'timestamp': f"Candidate #{i+1}"
+ })
+ except Exception:
+ pass # Silent fail - UI capture is optional
+
+ border_char = "โ" if source == 'llego_crossover' else "โ"
+ self.logger.info(f"\n{border_char*80}")
+ self.logger.info(f"{border_char} {'๐ LLEGO CROSSOVER' if source == 'llego_crossover' else '๐ฒ LLEGO MUTATION'} candidate #{i+1}")
+ self.logger.info(f"{border_char*80}")
+ self.logger.info(f"{prompt}")
+ self.logger.info(f"{border_char*80}")
+ self.logger.info(f" Length: {len(prompt)} chars, Words: {len(prompt.split())}")
+
+ self.logger.info(f"โ
LLEGO Genetic Operators: {len(llego_prompts)} candidates generated")
+
+ except Exception as e:
+ self.logger.error(f"โ Error generating LLEGO candidates: {e}")
+ import traceback
+ self.logger.error(traceback.format_exc())
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # SUMMARY
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ self.logger.info(f"\n{'='*80}")
+ self.logger.info(f"๐ ADAPTER-LEVEL HYBRID GENERATION SUMMARY")
+ self.logger.info(f"{'='*80}")
+ self.logger.info(f" ๐ GEPA Reflection: {gepa_count} candidates")
+ self.logger.info(f" ๐ LLEGO Crossover: {len([c for c in all_candidates if c['source'] == 'llego_crossover'])} candidates")
+ self.logger.info(f" ๐ฒ LLEGO Mutation: {len([c for c in all_candidates if c['source'] == 'llego_mutation'])} candidates")
+ self.logger.info(f" ๐ฆ TOTAL: {len(all_candidates)} diverse candidates")
+ self.logger.info(f"{'='*80}\n")
+
+ # Store candidates (GEPA might access them through some mechanism)
+ self._generated_candidates = all_candidates
+
+ # Log each candidate with FULL text
+ self.logger.info(f"\n{'='*80}")
+ self.logger.info(f"๐ ALL GENERATED CANDIDATES (FULL PROMPTS - NO TRUNCATION)")
+ self.logger.info(f"{'='*80}")
+ for i, cand in enumerate(all_candidates, 1):
+ source_emoji = "๐" if cand['source'] == 'gepa_reflection' else "๐" if cand['source'] == 'llego_crossover' else "๐ฒ"
+ self.logger.info(f"\n{source_emoji} CANDIDATE #{i} - {cand['source'].upper().replace('_', ' ')}")
+ self.logger.info(f"{cand['prompt']}")
+ self.logger.info(f" Length: {len(cand['prompt'])} characters")
+ self.logger.info(f" Words: {len(cand['prompt'].split())} words")
+ self.logger.info(f"{'='*80}\n")
+
+ # Return candidates as list of dicts with metadata (not just strings)
+ # This ensures source information is preserved
+ return all_candidates # Return full dicts with source info
+
+ except Exception as e:
+ self.logger.error(f"\n{'โ'*80}")
+ self.logger.error(f"โ CRITICAL ERROR in _generate_hybrid_candidates_adapter_level!")
+ self.logger.error(f"โ Error: {str(e)}")
+ self.logger.error(f"{'โ'*80}\n")
+ import traceback
+ self.logger.error(traceback.format_exc())
+ return []
+
+ def propose_new_texts(
+ self,
+ candidate: Dict[str, str],
+ reflective_dataset: Dict[str, List[Dict[str, Any]]],
+ components_to_update: List[str]
+ ) -> Dict[str, str]:
+ """
+ ๐ฅ CRITICAL: This method is called by GEPA to propose new component texts.
+
+ This is the KEY integration point - GEPA checks if adapter.propose_new_texts exists,
+ and if it does, uses it instead of the default InstructionProposalSignature.
+
+ This method:
+ 1. Uses reflective_dataset to generate improved prompts
+ 2. Optionally uses LLEGO for additional diversity
+ 3. Returns dict mapping component_name -> new component text
+
+ Args:
+ candidate: Current candidate dict (component_name -> component_text)
+ reflective_dataset: Feedback data per component (from make_reflective_dataset)
+ components_to_update: List of component names to update
+
+ Returns:
+ Dict mapping component_name -> new component text
+ """
+ self.logger.info(f"\n{'='*80}")
+ self.logger.info(f"๐ฏ PROPOSE_NEW_TEXTS CALLED BY GEPA")
+ self.logger.info(f"{'='*80}")
+ self.logger.info(f" Components to update: {components_to_update}")
+ self.logger.info(f" Reflective dataset keys: {list(reflective_dataset.keys())}")
+
+ # ๐ฅ FIX: Check if we already generated candidates in hybrid mode
+ # If yes, return one of them instead of generating a new one (avoids duplicate work and context overflow)
+ if hasattr(self, '_generated_candidates') and self._generated_candidates:
+ self.logger.info(f"\nโ
HYBRID MODE: Using pre-generated candidates from make_reflective_dataset")
+ self.logger.info(f" Available candidates: {len(self._generated_candidates)}")
+ self.logger.info(f" Returning first candidate (GEPA will evaluate all of them)")
+
+ # Return the first candidate (GEPA will get others via queue)
+ first_candidate = self._generated_candidates[0]
+ new_texts = {}
+ for component in components_to_update:
+ if isinstance(first_candidate, dict) and 'prompt' in first_candidate:
+ new_texts[component] = first_candidate['prompt']
+ source = first_candidate.get('source', 'unknown')
+ self.logger.info(f" Returning {source} candidate (length: {len(first_candidate['prompt'])} chars)")
+ else:
+ new_texts[component] = str(first_candidate)
+
+ self.logger.info(f"{'='*80}\n")
+ return new_texts
+
+ new_texts = {}
+
+ # Check if we have reflection_lm_client (required for proposal)
+ if not self._reflection_lm_client:
+ self.logger.error("โ reflection_lm_client not available - cannot generate proposals")
+ # Fallback: return current candidate (no change)
+ for component in components_to_update:
+ new_texts[component] = candidate.get(component, '')
+ return new_texts
+
+ # For each component to update
+ for component_name in components_to_update:
+ self.logger.info(f"๐ Proposing new text for component: {component_name}")
+
+ current_text = candidate.get(component_name, '')
+ dataset = reflective_dataset.get(component_name, [])
+
+ if not dataset:
+ self.logger.warning(f"โ ๏ธ No feedback data for {component_name}, keeping current text")
+ new_texts[component_name] = current_text
+ continue
+
+ self.logger.info(f" Current text length: {len(current_text)} chars")
+ self.logger.info(f" Feedback examples: {len(dataset)}")
+
+ # Generate improved prompt using reflection LM
+ try:
+ # ๐ฅ FIX: Clean dataset to remove base64 images (prevents context overflow)
+ cleaned_dataset = []
+ for item in dataset:
+ cleaned_item = item.copy()
+ # Remove or truncate base64 image data
+ if 'image_base64' in cleaned_item:
+ img_len = len(cleaned_item['image_base64'])
+ cleaned_item['image_base64'] = f'[IMAGE_DATA_REMOVED_{img_len}_chars]'
+ if 'image' in cleaned_item and isinstance(cleaned_item['image'], str) and len(cleaned_item['image']) > 1000:
+ img_len = len(cleaned_item['image'])
+ cleaned_item['image'] = f'[IMAGE_DATA_REMOVED_{img_len}_chars]'
+ # Also clean any nested detailed_scores
+ if 'detailed_scores' in cleaned_item and isinstance(cleaned_item['detailed_scores'], dict):
+ for key in list(cleaned_item['detailed_scores'].keys()):
+ val = cleaned_item['detailed_scores'][key]
+ if isinstance(val, str) and len(val) > 5000:
+ cleaned_item['detailed_scores'][key] = f'[LARGE_DATA_REMOVED_{len(val)}_chars]'
+ cleaned_dataset.append(cleaned_item)
+
+ self.logger.info(f" ๐ Cleaned dataset: removed base64 images to prevent context overflow")
+
+ # Use GEPA's default instruction proposal format
+ from gepa.strategies.instruction_proposal import InstructionProposalSignature
+
+ # Build input dict for GEPA's instruction proposal
+ input_dict = {
+ "current_instruction_doc": current_text,
+ "dataset_with_feedback": cleaned_dataset # Use cleaned dataset!
+ }
+
+ # Generate prompt using GEPA's signature
+ prompt = InstructionProposalSignature.prompt_renderer(input_dict)
+
+ # Call reflection LM to generate new instruction
+ self.logger.info(f" Generating improved prompt via reflection LM...")
+
+ result = self._reflection_lm_client.generate(
+ system_prompt="You are an expert prompt engineer. Follow the instructions in the user message to generate an improved prompt.",
+ user_prompt=prompt,
+ image_base64=""
+ )
+
+ # Extract response
+ if isinstance(result, dict):
+ response_text = result.get("content", str(result))
+ else:
+ response_text = str(result)
+
+ # Extract instruction using GEPA's extractor
+ extracted = InstructionProposalSignature.output_extractor(response_text)
+ new_instruction = extracted.get("new_instruction", response_text.strip())
+
+ # Clean up the instruction (remove markdown, quotes, etc.)
+ new_instruction = self._clean_extracted_prompt(new_instruction)
+
+ self.logger.info(f" โ
Generated new text (length: {len(new_instruction)} chars)")
+ self.logger.info(f" Preview: {new_instruction[:150]}...")
+
+ new_texts[component_name] = new_instruction
+
+ except Exception as e:
+ self.logger.error(f"โ Error generating proposal for {component_name}: {e}")
+ import traceback
+ self.logger.error(traceback.format_exc())
+ # Fallback: return current text
+ new_texts[component_name] = current_text
+
+ self.logger.info(f"\n{'='*80}")
+ self.logger.info(f"โ
PROPOSE_NEW_TEXTS COMPLETE")
+ self.logger.info(f" Generated {len(new_texts)} new component texts")
+ self.logger.info(f"{'='*80}\n")
+
+ return new_texts
+
+ def _clean_extracted_prompt(self, prompt: str) -> str:
+ """
+ Clean extracted prompt by removing markdown, quotes, and extra whitespace.
+
+ Args:
+ prompt: Raw extracted prompt text
+
+ Returns:
+ Cleaned prompt text
+ """
+ if not prompt:
+ return prompt
+
+ # Remove markdown code blocks
+ prompt = re.sub(r'```[\w]*\n?', '', prompt)
+ prompt = re.sub(r'```', '', prompt)
+
+ # Remove quotes if entire prompt is quoted
+ prompt = prompt.strip()
+ if (prompt.startswith('"') and prompt.endswith('"')) or \
+ (prompt.startswith("'") and prompt.endswith("'")):
+ prompt = prompt[1:-1]
+
+ # Remove leading/trailing whitespace
+ prompt = prompt.strip()
+
+ return prompt
\ No newline at end of file
diff --git a/src/gepa_optimizer/data/__init__.py b/src/gepa_optimizer/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fbd2af54c03c26e5d54c79d12e6d16140d76190
--- /dev/null
+++ b/src/gepa_optimizer/data/__init__.py
@@ -0,0 +1,27 @@
+"""
+Data module for GEPA Optimizer
+"""
+
+from .converters import UniversalConverter
+from .loaders import DataLoader
+from .validators import DataValidator
+from .scroll_dataset_loader import ScrollDatasetLoader, load_scroll_dataset
+from .validation_dataset_loader import ValidationDatasetLoader, load_validation_dataset, load_validation_split
+from .index_caching_loader import IndexCachingDatasetLoader, load_index_caching_dataset, load_index_caching_split
+
+__all__ = [
+ "UniversalConverter",
+ "DataLoader",
+ "DataValidator",
+ # Scroll dataset
+ "ScrollDatasetLoader",
+ "load_scroll_dataset",
+ # Validation dataset
+ "ValidationDatasetLoader",
+ "load_validation_dataset",
+ "load_validation_split",
+ # Index caching dataset
+ "IndexCachingDatasetLoader",
+ "load_index_caching_dataset",
+ "load_index_caching_split",
+]
diff --git a/src/gepa_optimizer/data/converters.py b/src/gepa_optimizer/data/converters.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc46ec186d799e47b92c756e594a8f796b8e8364
--- /dev/null
+++ b/src/gepa_optimizer/data/converters.py
@@ -0,0 +1,265 @@
+"""
+Universal converter for dataset to GEPA format with 3-way split (train/val/test)
+"""
+
+import os
+import json
+from typing import Any, List, Tuple, Union, Dict, Optional
+from pathlib import Path
+import pandas as pd
+import logging
+
+from .loaders import DataLoader
+from ..utils.exceptions import DatasetError
+from ..models.config import DataSplitConfig
+
logger = logging.getLogger(__name__)

class UniversalConverter:
    """
    Universal converter for datasets to GEPA format.

    Handles 3-way splitting (train/val/test) with configurable ratios and
    graceful handling of small datasets. All conversion failures are wrapped
    in DatasetError with the original exception chained (``from e``) so the
    root cause is preserved in tracebacks.
    """

    def __init__(self, data_split_config: Optional[DataSplitConfig] = None):
        """
        Initialize converter with optional split configuration.

        Args:
            data_split_config: Configuration for train/val/test splits.
                If None, uses default 60/20/20 split.
        """
        self.supported_extensions = [
            '.csv', '.json', '.jsonl', '.txt', '.md',
            '.png', '.jpg', '.jpeg'
        ]
        self.loader = DataLoader()
        self.data_split_config = data_split_config or DataSplitConfig()

    def convert(
        self,
        dataset: Union[List[Any], str, Any, Dict[str, Any]],
        split_config: Optional[DataSplitConfig] = None
    ) -> Tuple[List[dict], List[dict], List[dict]]:
        """
        Convert any dataset to GEPA format with 3-way split (train/val/test).

        Args:
            dataset: Input dataset in any supported format (file path, list,
                pandas DataFrame, single record, or a UI-tree descriptor dict).
            split_config: Optional split configuration (overrides instance config)

        Returns:
            Tuple of (trainset, valset, testset) where:
            - trainset: Used for reflection/feedback (Dfeedback in GEPA paper)
            - valset: Used for Pareto selection (Dpareto in GEPA paper)
            - testset: Held-out for final evaluation (not passed to GEPA)

        Raises:
            DatasetError: If dataset cannot be converted or is too small
        """
        try:
            # Use provided split config or instance default.
            config = split_config or self.data_split_config

            # Handle UI tree dataset format (directories of JSON + screenshots).
            if isinstance(dataset, dict) and dataset.get('type') == 'ui_tree_dataset':
                return self.convert_ui_tree_dataset(
                    dataset.get('json_dir', 'json_tree'),
                    dataset.get('screenshots_dir', 'screenshots'),
                    split_config=config
                )
            elif isinstance(dataset, str):
                data = self._load_from_path(dataset)
            elif hasattr(dataset, 'to_dict'):  # pandas DataFrame (duck-typed)
                data = dataset.to_dict(orient='records')
            elif isinstance(dataset, list):
                data = dataset
            else:
                # Single record: wrap so the rest of the pipeline sees a list.
                data = [dataset]

            logger.info(f"Normalized data length: {len(data)}")
            standardized = self._standardize(data)
            train, val, test = self._split_three_way(standardized, config)
            return train, val, test
        except (FileNotFoundError, ValueError, TypeError) as e:
            # Chain the original exception so the root cause is preserved.
            raise DatasetError(f"Failed to convert dataset: {str(e)}") from e

    def _load_from_path(self, path: str) -> List[Any]:
        """Load data from a file path; raises on missing/unsupported files."""
        p = Path(path)
        if not p.exists():
            raise FileNotFoundError(f"File not found: {path}")

        ext = p.suffix.lower()
        if ext in self.supported_extensions:
            return [self.loader.load(p)]
        else:
            raise DatasetError(f"Unsupported file extension: {ext}")

    def _standardize(self, data: List[Any]) -> List[dict]:
        """Standardize data to input/output format.

        Handles both UI tree JSON format and simple text inputs.
        UI tree format should have: {'screenshot': str, 'ui_tree': dict, 'expected_output': str}
        Simple format can be: {'input': str, 'output': str} or {'question': str, 'answer': str} etc.
        """
        out = []
        for item in data:
            if not isinstance(item, dict):
                item = {'input': str(item)}

            # Handle UI tree JSON format.
            if 'ui_tree' in item and 'screenshot' in item:
                ui_tree = item['ui_tree']
                input_text = ui_tree.get('text', '')
                output_text = item.get('expected_output', '')
                image = item.get('screenshot', '')
                out.append({'input': input_text, 'output': output_text, 'image': image})
            # Handle simple text format: probe common key aliases.
            else:
                inp = self._extract(item, ['input', 'question', 'text', 'prompt']) or ''
                outp = self._extract(item, ['output', 'result', 'response', 'answer', 'expected_output']) or ''
                image = self._extract(item, ['image', 'image_base64', 'screenshot']) or ''
                out.append({'input': inp, 'output': outp, 'image': image})

        return out

    def _extract(self, d: dict, keys: List[str]) -> Union[str, None]:
        """Return the value of the first key present in *d*, or None."""
        for k in keys:
            if k in d:
                return d[k]
        return None

    def _split_three_way(
        self,
        data: List[dict],
        config: DataSplitConfig
    ) -> Tuple[List[dict], List[dict], List[dict]]:
        """
        Split data into train, validation, and test sets.

        Args:
            data: Standardized dataset
            config: Split configuration with ratios and strategies

        Returns:
            Tuple of (train, val, test) datasets

        Raises:
            DatasetError: If dataset is too small for configured splits
        """
        dataset_size = len(data)

        # Log the adaptive strategy when it is in use.
        if config.small_dataset_strategy == 'adaptive':
            train_ratio, val_ratio, test_ratio = config.get_adaptive_ratios(dataset_size)
            logger.info(
                f"Adaptive dataset splitting (strategy: adaptive, size: {dataset_size}): "
                f"ratios = {train_ratio*100:.0f}%/{val_ratio*100:.0f}%/{test_ratio*100:.0f}% "
                f"(prioritizes validation for reliable candidate ranking)"
            )

        # Get split indices from config.
        try:
            train_end, val_end, test_end, _ = config.get_split_indices(dataset_size)
        except ValueError as e:
            logger.error(f"Dataset split error: {e}")
            raise DatasetError(str(e)) from e

        # Perform the split.
        train = data[:train_end]
        val = data[train_end:val_end]
        test = data[val_end:test_end]

        # Log split information with strategy.
        strategy_note = ""
        if config.small_dataset_strategy == 'adaptive':
            strategy_note = " (adaptive)"
        logger.info(
            f"Dataset split{strategy_note}: {len(train)} train ({len(train)/dataset_size*100:.1f}%), "
            f"{len(val)} val ({len(val)/dataset_size*100:.1f}%), "
            f"{len(test)} test ({len(test)/dataset_size*100:.1f}%)"
        )

        # Validate splits are not empty.
        if len(train) == 0:
            raise DatasetError("Training set is empty after split")
        if len(val) == 0:
            logger.warning("Validation set is empty - this may cause issues with Pareto selection")
            val = [train[-1]]  # Use last training sample as fallback
        if len(test) == 0:
            logger.warning("Test set is empty - final evaluation will not be performed")

        return train, val, test

    def _split(self, data: List[dict], ratio: float = 0.8) -> Tuple[List[dict], List[dict]]:
        """
        DEPRECATED: Legacy 2-way split for backwards compatibility.

        Use _split_three_way() instead for production code.

        Args:
            data: Standardized dataset
            ratio: Train ratio (0.0-1.0)

        Returns:
            Tuple of (train, val) datasets
        """
        import warnings
        warnings.warn(
            "_split() is deprecated. Use _split_three_way() for 3-way splitting.",
            DeprecationWarning,
            stacklevel=2
        )

        split = max(1, int(len(data) * ratio))
        train = data[:split]
        val = data[split:] or data[-1:]  # Ensure val is not empty
        return train, val

    def convert_ui_tree_dataset(
        self,
        json_dir: str,
        screenshots_dir: str,
        split_config: Optional[DataSplitConfig] = None
    ) -> Tuple[List[dict], List[dict], List[dict]]:
        """
        Convert UI tree dataset (JSON + screenshots) to GEPA format with 3-way split.

        Args:
            json_dir: Directory containing JSON files
            screenshots_dir: Directory containing screenshot images
            split_config: Optional split configuration (overrides instance config)

        Returns:
            Tuple of (train_data, val_data, test_data) in GEPA format

        Raises:
            DatasetError: If dataset cannot be loaded or is invalid
        """
        try:
            # Load paired dataset.
            dataset = self.loader.load_ui_tree_dataset(json_dir, screenshots_dir)

            if not dataset:
                raise DatasetError("No valid image-JSON pairs found")

            logger.info(f"Loaded {len(dataset)} UI tree samples")

            # Use provided config or instance default.
            config = split_config or self.data_split_config

            # Split into train/val/test.
            train, val, test = self._split_three_way(dataset, config)

            logger.info(
                f"Split UI tree dataset: {len(train)} train, "
                f"{len(val)} validation, {len(test)} test"
            )
            return train, val, test

        except DatasetError:
            # Re-raise converter errors untouched so messages are not double-wrapped.
            raise
        except Exception as e:
            raise DatasetError(f"Failed to convert UI tree dataset: {str(e)}") from e
diff --git a/src/gepa_optimizer/data/index_caching_loader.py b/src/gepa_optimizer/data/index_caching_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5e38edd78fbd009bca23e28505da62c89841c99
--- /dev/null
+++ b/src/gepa_optimizer/data/index_caching_loader.py
@@ -0,0 +1,278 @@
+"""
+Index Caching Dataset Loader
+
+Loads index caching dataset from JSON file (note2_debug.json format) and converts to GEPA-compatible format.
+"""
+
+import os
+import json
+import base64
+import logging
+from typing import List, Dict, Any, Optional
+from pathlib import Path
+
logger = logging.getLogger(__name__)


class IndexCachingDatasetLoader:
    """
    Loads the index-caching dataset from a JSON file (note2_debug.json format)
    and converts every entry to a GEPA-compatible item.

    Each JSON entry supplies a command, an element screenshot path, an XML
    dump path, and the expected structured answer. The loader resolves and
    validates the file paths, base64-encodes the image, inlines the XML into
    the user prompt, and keeps every original field inside item metadata.

    GEPA item layout:
        input             command + fenced XML (used for evaluation)
        reflection_input  command only (used for reflection)
        output            expected answer serialized as a JSON string
        image_base64      base64 image at TOP LEVEL for UniversalConverter
        metadata          all original fields plus resolved paths
    """

    def __init__(self, json_path: Optional[str] = None, base_dir: Optional[str] = None):
        """
        Initialize the loader.

        Args:
            json_path: Path to the dataset JSON file. Defaults to the
                INDEX_CACHING_DATASET_PATH env var or "./note2_debug.json".
            base_dir: Base directory used to resolve relative image/XML
                paths. Defaults to the directory containing the JSON file.

        Raises:
            FileNotFoundError: If the JSON file does not exist.
        """
        candidate = json_path if json_path is not None else os.getenv(
            "INDEX_CACHING_DATASET_PATH", "./note2_debug.json"
        )
        self.json_path = Path(candidate).resolve()

        if not self.json_path.exists():
            raise FileNotFoundError(
                f"Dataset file not found: {self.json_path}\n"
                f"Make sure note2_debug.json exists in the project root."
            )

        # Relative paths inside the JSON are resolved against this directory.
        root = base_dir if base_dir is not None else self.json_path.parent
        self.base_dir = Path(root).resolve()

    def load_dataset(self) -> List[Dict[str, Any]]:
        """
        Load the JSON file and convert every entry to GEPA format.

        Returns:
            List of GEPA items (see class docstring for the item layout).

        Raises:
            FileNotFoundError: If a referenced image or XML file is missing.
            json.JSONDecodeError: If the JSON file is invalid.
        """
        with open(self.json_path, "r", encoding="utf-8") as handle:
            raw_entries = json.load(handle)

        items: List[Dict[str, Any]] = []

        for position, record in enumerate(raw_entries):
            command = record.get("command", "")
            rel_image = record.get("image", "")
            rel_xml = record.get("xml", "")
            expected = record.get("expected", {})

            image_file = (self.base_dir / rel_image).resolve()
            xml_file = (self.base_dir / rel_xml).resolve()

            # Fail fast with a message naming the offending entry.
            if not image_file.exists():
                raise FileNotFoundError(
                    f"Image file not found: {image_file}\n"
                    f"Entry {position + 1}: {command}"
                )
            if not xml_file.exists():
                raise FileNotFoundError(
                    f"XML file not found: {xml_file}\n"
                    f"Entry {position + 1}: {command}"
                )

            image_base64 = base64.b64encode(image_file.read_bytes()).decode("utf-8")
            xml_content = xml_file.read_text(encoding="utf-8")

            # The user prompt mirrors what the agent sends: command plus the
            # full XML dump in a fenced block. Reflection only needs the bare
            # command, since it improves the prompt from evaluation feedback
            # rather than from any specific XML structure.
            items.append({
                "input": f"{command}\n\nXML Content:\n\n```xml\n{xml_content}\n```",
                "reflection_input": command,
                "output": json.dumps(expected, ensure_ascii=False),
                "image_base64": image_base64,  # TOP LEVEL for UniversalConverter
                "metadata": {
                    "command": command,
                    "image_path": str(rel_image),
                    "xml_path": str(rel_xml),
                    "abs_image_path": str(image_file),
                    "abs_xml_path": str(xml_file),
                    "xml_content": xml_content,  # XML kept separately in metadata
                    "expected": expected,
                    "dataset_index": position,
                },
            })

        return items

    def load_split(
        self,
        train_ratio: float = 0.6,
        val_ratio: float = 0.4
    ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
        """
        Load the dataset and split it into train/val sets (no test set).

        Args:
            train_ratio: Fraction of items for training (default 0.6).
            val_ratio: Fraction of items for validation (default 0.4).

        Returns:
            Tuple of (train_set, val_set).

        Raises:
            ValueError: If the ratios do not sum to 1.0 (within 0.01).
        """
        if abs(train_ratio + val_ratio - 1.0) > 0.01:
            raise ValueError(
                f"Split ratios must sum to 1.0, got {train_ratio + val_ratio:.3f}"
            )

        everything = self.load_dataset()
        cut = int(len(everything) * train_ratio)
        return everything[:cut], everything[cut:]
+
+
def load_index_caching_dataset(
    json_path: Optional[str] = None,
    base_dir: Optional[str] = None
) -> List[Dict[str, Any]]:
    """
    Convenience wrapper: build an IndexCachingDatasetLoader and load everything.

    Args:
        json_path: Path to the dataset JSON file.
        base_dir: Base directory for resolving relative paths.

    Returns:
        List of dataset items in GEPA format.
    """
    return IndexCachingDatasetLoader(json_path=json_path, base_dir=base_dir).load_dataset()
+
+
def load_index_caching_split(
    json_path: Optional[str] = None,
    base_dir: Optional[str] = None,
    train_ratio: float = 0.6,
    val_ratio: float = 0.4
) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Convenience wrapper: load the index caching dataset and split it.

    Args:
        json_path: Path to the dataset JSON file.
        base_dir: Base directory for resolving relative paths.
        train_ratio: Fraction of items for training.
        val_ratio: Fraction of items for validation.

    Returns:
        Tuple of (train_set, val_set) - no test set.
    """
    return IndexCachingDatasetLoader(
        json_path=json_path, base_dir=base_dir
    ).load_split(train_ratio=train_ratio, val_ratio=val_ratio)
+
+
# Example usage / smoke test (run this module directly).
if __name__ == "__main__":
    # NOTE: the original decorative emoji were mojibake (mis-decoded UTF-8),
    # one of which even split an f-string literal across two lines; plain
    # ASCII markers are used instead.
    print("Testing Index Caching Dataset Loader...")

    try:
        loader = IndexCachingDatasetLoader(json_path="./note2_debug.json")
        dataset = loader.load_dataset()

        print(f"\n[OK] Loaded {len(dataset)} items")

        # Show one sample so the item layout can be eyeballed.
        if dataset:
            sample = dataset[0]
            print("\nSample Item:")
            print(f"  Command: {sample['input']}")
            print(f"  Image path: {sample['metadata']['image_path']}")
            print(f"  XML path: {sample['metadata']['xml_path']}")
            print(f"  Expected: {sample['output'][:100]}...")
            print(f"  Image base64 length: {len(sample['image_base64'])}")
            print(f"  XML content length: {len(sample['metadata'].get('xml_content', ''))}")

        # Exercise the train/val split as well.
        train, val = loader.load_split()
        print("\nDataset Split:")
        print(f"  Training: {len(train)} samples")
        print(f"  Validation: {len(val)} samples")
        print("  Test: Not used (no test set)")

    except Exception as e:
        print(f"[ERROR] {e}")
diff --git a/src/gepa_optimizer/data/loaders.py b/src/gepa_optimizer/data/loaders.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f70b857e9972f5dac3a267ec6f3db9d073ca0b0
--- /dev/null
+++ b/src/gepa_optimizer/data/loaders.py
@@ -0,0 +1,237 @@
+"""
+Data loading utilities for various file formats
+"""
+
+import json
+import base64
+import pandas as pd
+from typing import Any, Optional, Union, List , Dict
+from pathlib import Path
+import logging
+
logger = logging.getLogger(__name__)

class DataLoader:
    """
    Utility class for loading data from various file formats.

    Every loader method returns the parsed data on success and None on
    failure, logging the error instead of raising, so callers can treat a
    missing or corrupt file as "no data".
    """

    def __init__(self):
        # Extensions that load() knows how to dispatch on.
        self.supported_formats = [
            '.csv', '.json', '.jsonl', '.txt', '.md', '.xlsx',
            '.png', '.jpg', '.jpeg'
        ]

    def load(self, source: Union[str, Path], format_hint: Optional[str] = None) -> Optional[Any]:
        """
        Load data from any supported source.

        Args:
            source: File path or data source
            format_hint: Optional format hint (e.g. '.json') to override
                auto-detection from the file extension.

        Returns:
            Loaded data or None if loading failed or the format is unsupported.
        """
        try:
            path = Path(source)

            if not path.exists():
                logger.error(f"File not found: {source}")
                return None

            # Use format hint or detect from extension.
            file_format = format_hint or path.suffix.lower()

            # Dispatch table keeps the format -> loader mapping in one place
            # instead of a long if/elif chain.
            dispatch = {
                '.csv': self.load_csv,
                '.json': self.load_json,
                '.jsonl': self.load_jsonl,
                '.txt': self.load_text,
                '.md': self.load_text,
                '.xlsx': self.load_excel,
                '.png': self.load_image_base64,
                '.jpg': self.load_image_base64,
                '.jpeg': self.load_image_base64,
            }
            loader = dispatch.get(file_format)
            if loader is None:
                logger.warning(f"Unsupported format: {file_format}")
                return None
            return loader(path)

        except Exception as e:
            # Boundary handler: any loader failure is logged and mapped to None.
            logger.error(f"Failed to load data from {source}: {str(e)}")
            return None

    def load_csv(self, path: Union[str, Path]) -> Optional[pd.DataFrame]:
        """Load a CSV file as a pandas DataFrame, or None on failure."""
        try:
            df = pd.read_csv(path)
            logger.info(f"Loaded CSV with {len(df)} rows and {len(df.columns)} columns")
            return df
        except Exception as e:
            logger.error(f"Failed to load CSV {path}: {str(e)}")
            return None

    def load_json(self, path: Union[str, Path]) -> Optional[Any]:
        """Load a JSON file (object or array), or None on failure."""
        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if isinstance(data, list):
                logger.info(f"Loaded JSON with {len(data)} items")
            else:
                logger.info("Loaded JSON object")

            return data
        except Exception as e:
            logger.error(f"Failed to load JSON {path}: {str(e)}")
            return None

    def load_jsonl(self, path: Union[str, Path]) -> Optional[List[Dict]]:
        """Load a JSONL (JSON Lines) file; invalid lines are skipped with a warning."""
        try:
            data = []
            with open(path, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if line:
                        try:
                            data.append(json.loads(line))
                        except json.JSONDecodeError as e:
                            # Tolerate bad lines; report which one was dropped.
                            logger.warning(f"Invalid JSON on line {line_num}: {str(e)}")

            logger.info(f"Loaded JSONL with {len(data)} items")
            return data
        except Exception as e:
            logger.error(f"Failed to load JSONL {path}: {str(e)}")
            return None

    def load_text(self, path: Union[str, Path]) -> Optional[str]:
        """Load a plain text file as a single string, or None on failure."""
        try:
            with open(path, 'r', encoding='utf-8') as f:
                content = f.read()

            logger.info(f"Loaded text file with {len(content)} characters")
            return content
        except Exception as e:
            logger.error(f"Failed to load text {path}: {str(e)}")
            return None

    def load_excel(self, path: Union[str, Path]) -> Optional[pd.DataFrame]:
        """Load an Excel file as a pandas DataFrame, or None on failure."""
        try:
            df = pd.read_excel(path)
            logger.info(f"Loaded Excel with {len(df)} rows and {len(df.columns)} columns")
            return df
        except Exception as e:
            logger.error(f"Failed to load Excel {path}: {str(e)}")
            return None

    def load_image_base64(self, path: Union[str, Path]) -> Optional[str]:
        """Load an image file and return it as a Base64-encoded string, or None."""
        try:
            with open(path, 'rb') as f:
                encoded_string = base64.b64encode(f.read()).decode('utf-8')
            logger.info(f"Loaded image {path} and encoded to Base64")
            return encoded_string
        except Exception as e:
            logger.error(f"Failed to load image {path}: {str(e)}")
            return None

    def is_supported_format(self, file_path: Union[str, Path]) -> bool:
        """Return True if the file's extension is one this loader handles."""
        path = Path(file_path)
        return path.suffix.lower() in self.supported_formats

    def get_file_info(self, file_path: Union[str, Path]) -> Dict[str, Any]:
        """Return basic metadata about a file ({'exists': False} if missing)."""
        path = Path(file_path)

        if not path.exists():
            return {'exists': False}

        return {
            'exists': True,
            'size': path.stat().st_size,
            'format': path.suffix.lower(),
            'supported': self.is_supported_format(path),
            'name': path.name,
            'stem': path.stem,
            'parent': str(path.parent)
        }

    def load_ui_tree_dataset(self, json_dir: str, screenshots_dir: str) -> List[Dict[str, Any]]:
        """
        Load UI tree dataset by pairing JSON files with corresponding screenshots.

        Args:
            json_dir: Directory containing JSON files (e.g., "json_tree")
            screenshots_dir: Directory containing screenshot images (e.g., "screenshots")

        Returns:
            List of dictionaries with 'input', 'output', and 'image' keys.

        Raises:
            FileNotFoundError: If either directory does not exist.
        """
        json_path = Path(json_dir)
        screenshots_path = Path(screenshots_dir)

        if not json_path.exists():
            raise FileNotFoundError(f"JSON directory not found: {json_dir}")
        if not screenshots_path.exists():
            raise FileNotFoundError(f"Screenshots directory not found: {screenshots_dir}")

        dataset = []

        # Get all JSON files.
        json_files = list(json_path.glob("*.json"))
        logger.info(f"Found {len(json_files)} JSON files in {json_dir}")

        for json_file in json_files:
            # Pair by shared stem, e.g. "2.json" with "2.png"/"2.jpg".
            file_stem = json_file.stem

            image_extensions = ['.jpg', '.jpeg', '.png']
            image_file = None

            for ext in image_extensions:
                potential_image = screenshots_path / f"{file_stem}{ext}"
                if potential_image.exists():
                    image_file = potential_image
                    break

            if not image_file:
                logger.warning(f"No corresponding image found for {json_file.name}")
                continue

            try:
                # Load JSON content.
                json_data = self.load_json(json_file)
                if not json_data:
                    logger.warning(f"Failed to load JSON: {json_file}")
                    continue

                # Load image as base64.
                image_base64 = self.load_image_base64(image_file)
                if not image_base64:
                    logger.warning(f"Failed to load image: {image_file}")
                    continue

                # Create dataset entry.
                dataset_entry = {
                    'input': 'Extract UI elements from this screenshot and provide the complete UI tree structure',
                    'output': json.dumps(json_data, indent=2),  # Convert JSON to string
                    'image': image_base64
                }

                dataset.append(dataset_entry)
                logger.debug(f"Loaded pair: {json_file.name} + {image_file.name}")

            except Exception as e:
                # Skip only the failing pair; keep loading the rest.
                logger.error(f"Error loading {json_file.name}: {str(e)}")
                continue

        logger.info(f"Successfully loaded {len(dataset)} image-JSON pairs")
        return dataset
diff --git a/src/gepa_optimizer/data/scroll_dataset_loader.py b/src/gepa_optimizer/data/scroll_dataset_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0c29e9e6119b2aca10de309e709db45374fb95c
--- /dev/null
+++ b/src/gepa_optimizer/data/scroll_dataset_loader.py
@@ -0,0 +1,334 @@
+"""
+Scroll Element Dataset Loader for Drizz Mobile App Testing
+
+Loads screenshots with bounding boxes and commands to identify scroll elements.
+Converts to GEPA-compatible format for prompt optimization.
+"""
+
+import base64
+import random
+import logging
+from typing import List, Dict, Any, Tuple, Optional
+from pathlib import Path
+
logger = logging.getLogger(__name__)


class ScrollDatasetLoader:
    """
    GENERIC dataset loader for image-based tasks.

    This is a LIBRARY class - NO hardcoded assumptions about:
    - What the task is (OCR, element detection, classification, etc.)
    - Input format (questions, commands, descriptions, etc.)
    - Output format (IDs, text, JSON, etc.)

    Users define their dataset in the test script and pass it here.

    Dataset format per item: (image_filename, input_text, expected_output)

    Example usage (ANY task):
        my_dataset = [
            ("img1.png", "What is the main color?", "blue"),
            ("img2.png", "Count the objects", "5"),
        ]
        loader = ScrollDatasetLoader(images_dir="images", dataset_config=my_dataset)
        data = loader.load_dataset()
    """

    def __init__(
        self,
        images_dir: str = "images",
        dataset_config: Optional[List[Tuple[str, str, str]]] = None
    ):
        """
        Initialize dataset loader.

        Args:
            images_dir: Directory containing images
            dataset_config: List of (image_filename, input_text, expected_output)
                tuples. REQUIRED - no hardcoded defaults, to keep the library
                generic.

        Raises:
            FileNotFoundError: If images_dir doesn't exist
            ValueError: If dataset_config is None
        """
        self.images_dir = Path(images_dir)

        if not self.images_dir.exists():
            raise FileNotFoundError(f"Images directory not found: {images_dir}")

        if dataset_config is None:
            raise ValueError(
                "dataset_config is required. This is a library class - define your "
                "dataset in the test script:\n"
                "  dataset = [('img1.png', 'your input', 'expected output'), ...]\n"
                "  loader = ScrollDatasetLoader(images_dir='...', dataset_config=dataset)"
            )

        self.dataset_config = dataset_config

    def load_dataset(self) -> List[Dict[str, Any]]:
        """
        Load the complete dataset with images.

        Items whose image is missing or unreadable are skipped with a warning.
        An element_id is extracted from the expected output for robust
        evaluation (None when extraction fails).

        Returns:
            List of dataset items in GEPA format:
            {"input": ..., "output": ..., "image_base64": ...,
             "metadata": {image_path, input_text, expected_output,
                          image_filename, element_id}}

        Raises:
            ValueError: If no configured image could be loaded at all.
        """
        dataset = []

        # Generic variable names - no assumptions about data type.
        for image_filename, input_text, expected_output in self.dataset_config:
            image_path = self.images_dir / image_filename

            # Validate image exists.
            if not image_path.exists():
                logger.warning(f"Image not found: {image_path}")
                continue

            # Read and encode image.
            try:
                image_base64 = self._encode_image(image_path)
            except Exception as e:
                logger.warning(f"Error encoding {image_filename}: {e}")
                continue

            # Extract element_id from expected_output for robust evaluation.
            element_id = self._extract_element_id(expected_output)
            if element_id is None:
                logger.warning(f"Could not extract element_id from '{expected_output}' in {image_filename}")

            # Create dataset item - COMPLETELY GENERIC: image + input text +
            # expected output text. The library doesn't know the task.
            # IMPORTANT: image_base64 sits at TOP LEVEL so UniversalConverter
            # can find it.
            dataset_item = {
                "input": input_text,
                "output": expected_output,
                "image_base64": image_base64,
                "metadata": {
                    "image_path": str(image_path),
                    "input_text": input_text,
                    "expected_output": expected_output,
                    "image_filename": image_filename,
                    "element_id": element_id  # int or None
                }
            }

            dataset.append(dataset_item)

        if not dataset:
            raise ValueError("No valid images found in dataset")

        logger.info(f"Loaded {len(dataset)} scroll element detection samples")
        return dataset

    def _extract_element_id(self, expected_output: str) -> Optional[int]:
        """
        Extract an element ID from the expected output string.

        Handles multiple formats:
        - "Element: 4" / "Element 4"
        - "4" (standalone)
        - "Element: 4, Description: ..." (full reasoning)

        Args:
            expected_output: Full expected output string with reasoning

        Returns:
            Element ID as integer (range 1-100), or None if not found
        """
        import re

        if not expected_output:
            return None

        # Pattern 1: "Element: X" or "Element X" (case insensitive).
        patterns = [
            r'element[:\s]+(\d+)',   # "Element: 4" or "Element 4"
            r'\belement\s+(\d+)\b',  # "element 4" (word boundary)
        ]

        for pattern in patterns:
            match = re.search(pattern, expected_output, re.IGNORECASE)
            if match:
                try:
                    element_id = int(match.group(1))
                    # Validate range (reasonable UI element IDs).
                    if 1 <= element_id <= 100:
                        return element_id
                except (ValueError, IndexError):
                    continue

        # Pattern 2: first standalone number, only if it is a reasonable
        # element ID (1-100).
        number_match = re.search(r'\b(\d{1,3})\b', expected_output)
        if number_match:
            try:
                element_id = int(number_match.group(1))
                if 1 <= element_id <= 100:
                    return element_id
            except ValueError:
                pass

        return None

    def _encode_image(self, image_path: Path) -> str:
        """
        Encode an image file to a base64 string.

        Args:
            image_path: Path to image file

        Returns:
            Base64 encoded image string
        """
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def split_dataset(
        self,
        dataset: List[Dict[str, Any]],
        train_size: int = 4,
        val_size: int = 1,
        test_size: int = 1,
        shuffle: bool = True,
        seed: Optional[int] = None
    ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
        """
        Split dataset into train, validation, and test sets.

        Shuffling ensures a different image distribution across splits,
        preventing hard images from always landing in the validation set.

        Args:
            dataset: Complete dataset
            train_size: Number of samples for training (default: 4)
            val_size: Number of samples for validation (default: 1)
            test_size: Number of samples for test (default: 1)
            shuffle: Whether to shuffle dataset before splitting (default: True)
            seed: Random seed for reproducible shuffling (default: None = random)

        Returns:
            Tuple of (train_set, val_set, test_set)
        """
        n = len(dataset)

        # Validate split sizes; shrink proportionally if they exceed the data.
        total_size = train_size + val_size + test_size
        if total_size > n:
            logger.warning(f"Requested split ({total_size}) exceeds dataset size ({n}). Adjusting split proportionally...")
            ratio = n / total_size
            train_size = int(train_size * ratio)
            val_size = int(val_size * ratio)
            test_size = n - train_size - val_size

        # Shuffle a copy so the caller's list is untouched. A dedicated
        # Random instance is used when a seed is given so we do not perturb
        # the global random module state (random.Random(seed) produces the
        # same shuffle as random.seed(seed) would).
        dataset_copy = dataset.copy()
        if shuffle:
            if seed is not None:
                rng = random.Random(seed)
                logger.debug(f"Shuffling dataset with seed={seed} for reproducible splits")
            else:
                rng = random
                logger.debug("Shuffling dataset randomly (no seed)")
            rng.shuffle(dataset_copy)
        else:
            logger.warning("Not shuffling dataset - using original order")

        # Split shuffled dataset.
        train_set = dataset_copy[:train_size]
        val_set = dataset_copy[train_size:train_size + val_size]
        test_set = dataset_copy[train_size + val_size:train_size + val_size + test_size]

        logger.info(f"Dataset split: {len(train_set)} train, {len(val_set)} val, {len(test_set)} test")

        # Log which images landed in each split for debugging (via the module
        # logger rather than print, consistent with the rest of the module).
        if shuffle:
            train_images = [item['metadata'].get('image_filename', 'N/A') for item in train_set]
            val_images = [item['metadata'].get('image_filename', 'N/A') for item in val_set]
            test_images = [item['metadata'].get('image_filename', 'N/A') for item in test_set]
            logger.info(f"  Train images: {train_images[:5]}{'...' if len(train_images) > 5 else ''}")
            logger.info(f"  Val images: {val_images}")
            logger.info(f"  Test images: {test_images[:5]}{'...' if len(test_images) > 5 else ''}")

        return train_set, val_set, test_set
+
+
def load_scroll_dataset(
    images_dir: str = "images",
    dataset_config: List[Tuple[str, str, str]] = None,
    split: bool = True
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Convenience function to load a GENERIC image-based dataset.

    Args:
        images_dir: Directory containing the images.
        dataset_config: List of (image_filename, input_text, expected_output)
            tuples describing the task; required by the underlying loader.
        split: Whether to split the data into train/val/test.

    Returns:
        If split=True: (train_set, val_set, test_set)
        If split=False: (full_dataset, [], [])

    Example (works for ANY task):
        dataset_config = [
            ("img1.png", "What color is the sky?", "blue"),
            ("img2.png", "Count the dogs", "2"),
        ]
        train, val, test = load_scroll_dataset(
            images_dir="images",
            dataset_config=dataset_config
        )
    """
    loader = ScrollDatasetLoader(images_dir, dataset_config=dataset_config)
    full = loader.load_dataset()
    if not split:
        return full, [], []
    return loader.split_dataset(full)
+
+
# Example usage (for testing the library loader itself).
if __name__ == "__main__":
    # NOTE: the decorative characters in the original banner were mojibake
    # (mis-decoded UTF-8 emoji); plain ASCII is used instead.
    print("Testing Scroll Dataset Loader...")
    print("NOTE: This is a library class. Define your dataset in your test script.")
    print("\nExample:")
    print("  dataset_config = [")
    print("      ('image1.png', 'Scroll down by 50%', '3'),")
    print("      ('image2.png', 'Swipe left', '4'),")
    print("  ]")
    print("  train, val, test = load_scroll_dataset(")
    print("      images_dir='images',")
    print("      dataset_config=dataset_config")
    print("  )")
diff --git a/src/gepa_optimizer/data/validation_dataset_loader.py b/src/gepa_optimizer/data/validation_dataset_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c500db6127ae4c136eaf7c17ce0af0b88eca955
--- /dev/null
+++ b/src/gepa_optimizer/data/validation_dataset_loader.py
@@ -0,0 +1,376 @@
+"""
+Validation Dataset Loader for UI Validation Use Case
+
+Loads validation datapoints from SQLite database and converts to GEPA-compatible format.
+Supports filtering by data_type (trainset/valset/testset) and confirmed status.
+"""
+
+import os
+import sqlite3
+import base64
+import logging
+from typing import List, Dict, Any, Optional, Literal
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
class ValidationDatasetLoader:
    """
    Loads validation dataset from SQLite database.

    Database schema:
    - validation_data: id, image_id, command, result (0/1), reasoning, data_type, confirmed, created_at
    - images: image_id, mime, bytes (BLOB), created_at

    Converts to GEPA format:
    - input: command text (seed prompt will be provided in test script)
    - output: "true" or "false" (converted from 0/1)
    - image_base64: base64 encoded image (TOP LEVEL for UniversalConverter)
    - metadata: All original fields plus converted values

    Note: The seed prompt is NOT stored in database - it will be provided in the test script.
    The input field contains just the command, and the image is at top level.
    """

    def __init__(
        self,
        db_path: Optional[str] = None,
        confirmed_only: bool = True
    ):
        """
        Initialize validation dataset loader.

        Args:
            db_path: Path to SQLite database file.
                     Default: "./validation_data.db" or from VD_DB_PATH env var
            confirmed_only: If True, only load datapoints where confirmed=1.
                            Default: True (only manually reviewed data)

        Raises:
            FileNotFoundError: If database file doesn't exist
            sqlite3.Error: If database connection fails
        """
        # Get database path from env or use default
        if db_path is None:
            db_path = os.getenv("VD_DB_PATH", "./validation_data.db")

        # Resolve to an absolute path so error messages point at a concrete file.
        self.db_path = Path(db_path).resolve()

        # Fail fast at construction time rather than on first query.
        if not self.db_path.exists():
            raise FileNotFoundError(
                f"Database file not found: {self.db_path}\n"
                f"Make sure validation_data_ui_server_async.py has been run at least once to create the database."
            )

        # Default confirmed filter; per-call override is possible in load_dataset().
        self.confirmed_only = confirmed_only

    def load_dataset(
        self,
        data_type: Optional[Literal["trainset", "valset", "testset"]] = None,
        confirmed_only: Optional[bool] = None
    ) -> List[Dict[str, Any]]:
        """
        Load dataset from database and convert to GEPA format.

        Args:
            data_type: Filter by data_type. If None, loads all types.
                       Options: "trainset", "valset", "testset"
            confirmed_only: Override instance default. If True, only load confirmed datapoints.
                            If None, uses instance default (self.confirmed_only)

        Returns:
            List of dataset items in GEPA format:
            [
                {
                    "input": "Validate Submit button is visible",  # Command only (seed prompt in test script)
                    "output": "true",  # or "false" (converted from 0/1)
                    "image_base64": "",  # TOP LEVEL (image + command together)
                    "metadata": {
                        "id": 1,
                        "image_id": "abc123...",
                        "command": "Validate Submit button is visible",
                        "result": True,  # Boolean
                        "result_int": 1,  # Original 0/1
                        "reasoning": "Detailed explanation...",
                        "data_type": "trainset",
                        "confirmed": True,
                        "created_at": "2024-01-01 12:00:00"
                    }
                },
                ...
            ]

        Note: Seed prompt is provided separately in test script, not in database.

        Raises:
            sqlite3.Error: If database query fails
            ValueError: If no datapoints found matching criteria
        """
        # Use provided confirmed_only or instance default
        use_confirmed = confirmed_only if confirmed_only is not None else self.confirmed_only

        # One short-lived connection per call; closed in the finally block below.
        conn = sqlite3.connect(str(self.db_path))
        conn.row_factory = sqlite3.Row  # Access columns by name
        dataset: List[Dict[str, Any]] = []

        try:
            # Build query with filters
            # "WHERE 1=1" lets the optional filters below be appended uniformly.
            query = """
                SELECT
                    v.id,
                    v.image_id,
                    v.command,
                    v.result,
                    v.reasoning,
                    v.data_type,
                    v.confirmed,
                    v.created_at,
                    i.mime,
                    i.bytes
                FROM validation_data v
                INNER JOIN images i ON v.image_id = i.image_id
                WHERE 1=1
            """
            params = []

            # Add filters
            if use_confirmed:
                query += " AND v.confirmed = 1"

            if data_type:
                query += " AND v.data_type = ?"
                params.append(data_type)

            query += " ORDER BY v.id ASC"

            # Execute query
            cursor = conn.execute(query, params)
            rows = cursor.fetchall()

            if not rows:
                # Describe exactly which filters produced the empty result.
                filter_msg = []
                if use_confirmed:
                    filter_msg.append("confirmed=1")
                if data_type:
                    filter_msg.append(f"data_type='{data_type}'")

                filter_str = " with filters: " + ", ".join(filter_msg) if filter_msg else ""
                raise ValueError(
                    f"No datapoints found{filter_str} in database: {self.db_path}\n"
                    f"Make sure you have generated and saved datapoints using the validation UI."
                )

            # Convert rows to GEPA format
            for row in rows:
                # Convert 0/1 to "true"/"false" string for GEPA
                result_str = "true" if row["result"] == 1 else "false"

                # Encode image bytes to base64
                # NOTE(review): assumes images.bytes is a non-NULL BLOB — a NULL here
                # would raise TypeError in b64encode; confirm against the writer side.
                image_base64 = base64.b64encode(row["bytes"]).decode("utf-8")

                # Create GEPA format item
                # Input: command (seed prompt will be provided in test script)
                # Image: separate at top level (image_base64)
                # Output: "true" or "false" (converted from 0/1)
                dataset_item = {
                    "input": row["command"],  # Just the command - seed prompt will be in test script
                    "output": result_str,  # "true" or "false" (string)
                    "image_base64": image_base64,  # TOP LEVEL for UniversalConverter (image + command together)
                    "metadata": {
                        "id": row["id"],
                        "image_id": row["image_id"],
                        "command": row["command"],  # Keep original for reference
                        "result": bool(row["result"]),  # Boolean for reference
                        "result_int": row["result"],  # Original 0/1 for reference
                        "reasoning": row["reasoning"],
                        "data_type": row["data_type"],
                        "confirmed": bool(row["confirmed"]),
                        "created_at": row["created_at"],
                        "mime": row["mime"],
                    }
                }

                dataset.append(dataset_item)

            # Log summary
            data_type_str = f" ({data_type})" if data_type else ""
            confirmed_str = " (confirmed only)" if use_confirmed else " (all)"
            logger.info(f"Loaded {len(dataset)} validation datapoints{data_type_str}{confirmed_str}")

            return dataset

        finally:
            conn.close()

    def load_split_dataset(
        self,
        confirmed_only: Optional[bool] = None
    ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
        """
        Load dataset split by data_type (trainset/valset/testset).

        Convenience method that loads all three splits at once.

        Args:
            confirmed_only: Override instance default. If True, only load confirmed datapoints.

        Returns:
            Tuple of (train_set, val_set, test_set) in GEPA format

        Raises:
            ValueError: Propagated from load_dataset() when any split is empty.

        Example:
            loader = ValidationDatasetLoader(db_path="./validation_data.db")
            train, val, test = loader.load_split_dataset()
        """
        # Three separate queries — each raises ValueError if its split is empty.
        train_set = self.load_dataset(data_type="trainset", confirmed_only=confirmed_only)
        val_set = self.load_dataset(data_type="valset", confirmed_only=confirmed_only)
        test_set = self.load_dataset(data_type="testset", confirmed_only=confirmed_only)

        logger.info(f"Dataset Split Summary: Training={len(train_set)}, Validation={len(val_set)}, Test={len(test_set)}, Total={len(train_set) + len(val_set) + len(test_set)}")

        return train_set, val_set, test_set

    def get_dataset_stats(self) -> Dict[str, Any]:
        """
        Get statistics about the dataset in the database.

        Returns:
            Dictionary with dataset statistics:
            {
                "total": 100,
                "confirmed": 95,
                "unconfirmed": 5,
                "by_data_type": {
                    "trainset": 70,
                    "valset": 15,
                    "testset": 15
                },
                "by_result": {
                    "true": 50,
                    "false": 50
                }
            }
        """
        conn = sqlite3.connect(str(self.db_path))
        conn.row_factory = sqlite3.Row

        try:
            stats: Dict[str, Any] = {}

            # Total counts
            total = conn.execute("SELECT COUNT(*) FROM validation_data").fetchone()[0]
            confirmed = conn.execute("SELECT COUNT(*) FROM validation_data WHERE confirmed = 1").fetchone()[0]
            stats["total"] = total
            stats["confirmed"] = confirmed
            stats["unconfirmed"] = total - confirmed

            # By data_type
            data_type_rows = conn.execute("""
                SELECT data_type, COUNT(*) as count
                FROM validation_data
                GROUP BY data_type
            """).fetchall()
            stats["by_data_type"] = {row["data_type"]: row["count"] for row in data_type_rows}

            # By result (true/false)
            # NOTE(review): rows whose result is neither 0 nor 1 are silently
            # excluded from both buckets — confirm result is constrained to 0/1.
            result_rows = conn.execute("""
                SELECT result, COUNT(*) as count
                FROM validation_data
                GROUP BY result
            """).fetchall()
            stats["by_result"] = {
                "true": sum(row["count"] for row in result_rows if row["result"] == 1),
                "false": sum(row["count"] for row in result_rows if row["result"] == 0)
            }

            return stats

        finally:
            conn.close()
+
+
def load_validation_dataset(
    db_path: Optional[str] = None,
    data_type: Optional[Literal["trainset", "valset", "testset"]] = None,
    confirmed_only: bool = True
) -> List[Dict[str, Any]]:
    """
    Load validation datapoints from the SQLite store in GEPA format.

    Thin one-shot wrapper around ValidationDatasetLoader.

    Args:
        db_path: Path to SQLite database file. Default: "./validation_data.db"
        data_type: Restrict to one split ("trainset"/"valset"/"testset"); None loads all.
        confirmed_only: When True, only manually confirmed datapoints are returned.

    Returns:
        List of dataset items in GEPA format.

    Example:
        # Load all confirmed training data
        train_data = load_validation_dataset(data_type="trainset", confirmed_only=True)

        # Load all confirmed data
        all_data = load_validation_dataset(confirmed_only=True)
    """
    return ValidationDatasetLoader(
        db_path=db_path, confirmed_only=confirmed_only
    ).load_dataset(data_type=data_type, confirmed_only=confirmed_only)
+
+
def load_validation_split(
    db_path: Optional[str] = None,
    confirmed_only: bool = True
) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Load the train/val/test validation splits from the SQLite store in one call.

    Args:
        db_path: Path to SQLite database file. Default: "./validation_data.db"
        confirmed_only: When True, only manually confirmed datapoints are returned.

    Returns:
        Tuple of (train_set, val_set, test_set) in GEPA format.

    Example:
        train, val, test = load_validation_split(confirmed_only=True)
    """
    split_loader = ValidationDatasetLoader(db_path=db_path, confirmed_only=confirmed_only)
    return split_loader.load_split_dataset(confirmed_only=confirmed_only)
+
+
# Example usage and testing
if __name__ == "__main__":
    print("๐ Testing Validation Dataset Loader...")

    try:
        dataset_loader = ValidationDatasetLoader()

        # Print database-level counts first.
        print("\n๐ Dataset Statistics:")
        db_stats = dataset_loader.get_dataset_stats()
        print(f" Total: {db_stats['total']}")
        print(f" Confirmed: {db_stats['confirmed']}")
        print(f" Unconfirmed: {db_stats['unconfirmed']}")
        print(f" By data_type: {db_stats['by_data_type']}")
        print(f" By result: {db_stats['by_result']}")

        # Then pull all three splits.
        print("\n๐ฆ Loading split dataset...")
        train_set, val_set, test_set = dataset_loader.load_split_dataset()

        # Preview one converted item, if any exist.
        if train_set:
            first_item = train_set[0]
            print(f"\n๐ Sample Training Item:")
            print(f" Input: {first_item['input']}")
            print(f" Output: {first_item['output']}")
            print(f" Image ID: {first_item['metadata']['image_id'][:8]}...")
            print(f" Data Type: {first_item['metadata']['data_type']}")
            print(f" Result: {first_item['metadata']['result']} (int: {first_item['metadata']['result_int']})")

    except FileNotFoundError as err:
        print(f"โ {err}")
        print("\n๐ก Make sure validation_data_ui_server_async.py has been run to create the database.")
    except ValueError as err:
        print(f"โ {err}")
        print("\n๐ก Generate and save some datapoints using the validation UI first.")
+
diff --git a/src/gepa_optimizer/data/validators.py b/src/gepa_optimizer/data/validators.py
new file mode 100644
index 0000000000000000000000000000000000000000..28fbc6048f91a25c1a4d54befc7f62be4498f898
--- /dev/null
+++ b/src/gepa_optimizer/data/validators.py
@@ -0,0 +1,207 @@
+"""
+Data validation utilities for GEPA optimizer
+"""
+
+from typing import List, Dict, Any, Optional, Tuple
+import logging
+
+logger = logging.getLogger(__name__)
+
class DataValidator:
    """
    Checks datasets for structural completeness and GEPA compatibility.

    All public methods return their findings instead of raising, so callers
    can aggregate and report every problem at once.
    """

    def __init__(self):
        # Field-name conventions this validator enforces / recognizes.
        self.required_fields = ['input', 'output']
        self.optional_fields = ['metadata', 'id', 'tags']

    def validate_dataset(self, dataset: List[Dict[str, Any]]) -> Tuple[bool, List[str]]:
        """
        Validate a whole dataset, collecting every error found.

        Args:
            dataset: Candidate list of data items.

        Returns:
            Tuple[bool, List[str]]: (is_valid, list_of_errors)
        """
        # Empty check comes first so an empty non-list reads as "empty".
        if not dataset:
            return False, ["Dataset is empty"]
        if not isinstance(dataset, list):
            return False, ["Dataset must be a list"]

        errors: List[str] = []
        for position, entry in enumerate(dataset):
            errors += self.validate_item(entry, position)

        # Need at least two items to split into train and validation.
        if len(dataset) < 2:
            errors.append("Dataset should have at least 2 items for proper train/val split")

        if errors:
            logger.warning(f"Dataset validation failed with {len(errors)} errors")
        else:
            logger.info(f"Dataset validation passed for {len(dataset)} items")

        return not errors, errors

    def validate_item(self, item: Dict[str, Any], index: Optional[int] = None) -> List[str]:
        """
        Validate one dataset entry.

        Args:
            item: Single data item to check.
            index: Optional position, used only to label error messages.

        Returns:
            List[str]: Error messages for this item (empty when valid).
        """
        item_ref = "item" if index is None else f"item {index}"

        # Non-dict entries can't be inspected further.
        if not isinstance(item, dict):
            return [f"{item_ref}: Must be a dictionary"]

        problems: List[str] = []

        # 'input' is mandatory: present, a string, and non-blank.
        if 'input' not in item:
            problems.append(f"{item_ref}: Missing required 'input' field")
        else:
            text = item['input']
            if not isinstance(text, str):
                problems.append(f"{item_ref}: 'input' field must be a string")
            elif not text.strip():
                problems.append(f"{item_ref}: 'input' field cannot be empty")

        # 'output' may be absent, but when present it must be a string.
        if 'output' in item and not isinstance(item['output'], str):
            problems.append(f"{item_ref}: 'output' field must be a string")

        # 'metadata' is optional but must be a dict when supplied.
        if not isinstance(item.get('metadata', {}), dict):
            problems.append(f"{item_ref}: 'metadata' field must be a dictionary")

        return problems

    def validate_gepa_format(self, gepa_data: List[Dict[str, Any]]) -> Tuple[bool, List[str]]:
        """
        Validate data already converted to GEPA format.

        Args:
            gepa_data: Items expected to carry input/expected_output/metadata.

        Returns:
            Tuple[bool, List[str]]: (is_valid, list_of_errors)
        """
        if not gepa_data:
            return False, ["GEPA dataset is empty"]

        errors: List[str] = []
        for idx, item in enumerate(gepa_data):
            for required in ('input', 'expected_output'):
                if required not in item:
                    errors.append(f"GEPA item {idx}: Missing '{required}' field")

            if 'metadata' not in item:
                errors.append(f"GEPA item {idx}: Missing 'metadata' field")
            elif not isinstance(item['metadata'], dict):
                errors.append(f"GEPA item {idx}: 'metadata' must be a dictionary")

        return not errors, errors

    def validate_split(self, trainset: List[Dict], valset: List[Dict]) -> Tuple[bool, List[str]]:
        """
        Validate a train/validation split for emptiness and proportion.

        Args:
            trainset: Training data.
            valset: Validation data.

        Returns:
            Tuple[bool, List[str]]: (is_valid, list_of_errors)
        """
        errors: List[str] = []
        if not trainset:
            errors.append("Training set is empty")
        if not valset:
            errors.append("Validation set is empty")

        # Proportion check: train share should sit between 50% and 95%.
        total = len(trainset) + len(valset)
        if total:
            train_ratio = len(trainset) / total
            if train_ratio < 0.5:
                errors.append(f"Training set too small: {train_ratio:.2%} of total data")
            elif train_ratio > 0.95:
                errors.append(f"Validation set too small: {1-train_ratio:.2%} of total data")

        return not errors, errors

    def get_dataset_stats(self, dataset: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Compute descriptive statistics for a dataset.

        Args:
            dataset: Dataset to analyze.

        Returns:
            Dict[str, Any]: Counts, average input/output lengths, and a
            heuristic 'valid' flag (fewer than 50% empty inputs).
        """
        if not dataset:
            return {'total_items': 0, 'valid': False}

        stats: Dict[str, Any] = {
            'total_items': len(dataset),
            'has_output': sum(1 for entry in dataset if entry.get('output')),
            'avg_input_length': 0,
            'avg_output_length': 0,
            'empty_inputs': 0,
            'empty_outputs': 0,
        }

        input_lengths: List[int] = []
        output_lengths: List[int] = []

        for entry in dataset:
            if not isinstance(entry, dict):
                continue
            text_in = entry.get('input', '')
            text_out = entry.get('output', '')

            if isinstance(text_in, str):
                input_lengths.append(len(text_in))
                if not text_in.strip():
                    stats['empty_inputs'] += 1

            if isinstance(text_out, str):
                output_lengths.append(len(text_out))
                if not text_out.strip():
                    stats['empty_outputs'] += 1

        if input_lengths:
            stats['avg_input_length'] = sum(input_lengths) / len(input_lengths)
        if output_lengths:
            stats['avg_output_length'] = sum(output_lengths) / len(output_lengths)

        # Heuristic validity: non-empty and mostly non-blank inputs.
        stats['valid'] = (
            stats['total_items'] > 0
            and stats['empty_inputs'] < stats['total_items'] * 0.5
        )

        return stats
diff --git a/src/gepa_optimizer/evaluation/__init__.py b/src/gepa_optimizer/evaluation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e464dcf245f5c1bf1f11eba3eb30d64fe60499be
--- /dev/null
+++ b/src/gepa_optimizer/evaluation/__init__.py
@@ -0,0 +1,28 @@
+"""
+Evaluation module for GEPA Optimizer
+
+Includes:
+- UniversalSemanticEvaluator: Works for ANY task (recommended for general use)
+- BaseEvaluator: Abstract base class for custom evaluators
+- Task-specific evaluators for specialized use cases
+"""
+
+from .base_evaluator import BaseEvaluator
+from .universal_evaluator import UniversalSemanticEvaluator, create_universal_evaluator
+from .ui_evaluator import UITreeEvaluator
+from .scroll_evaluator import ScrollElementEvaluator
+from .validation_evaluator import ValidationEvaluator
+from .index_caching_evaluator import IndexCachingEvaluator
+
+__all__ = [
+ # Universal (recommended)
+ "UniversalSemanticEvaluator",
+ "create_universal_evaluator",
+ # Base class
+ "BaseEvaluator",
+ # Task-specific
+ "UITreeEvaluator",
+ "ScrollElementEvaluator",
+ "ValidationEvaluator",
+ "IndexCachingEvaluator",
+]
diff --git a/src/gepa_optimizer/evaluation/base_evaluator.py b/src/gepa_optimizer/evaluation/base_evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..c63f322935ceffa0d9fb38a7d1b2049c078bda6c
--- /dev/null
+++ b/src/gepa_optimizer/evaluation/base_evaluator.py
@@ -0,0 +1,51 @@
+"""
+Base evaluator class for all evaluation strategies.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional
+import logging
+
+logger = logging.getLogger(__name__)
+
class BaseEvaluator(ABC):
    """
    Common contract for every evaluation strategy.

    Subclasses implement evaluate(); this base handles metric-weight storage
    and a per-class logger, keeping the interface uniform while allowing the
    scoring logic itself to be fully customized per use case.
    """

    def __init__(self, metric_weights: Optional[Dict[str, float]] = None):
        """
        Store metric weights and create a class-specific logger.

        Args:
            metric_weights: Optional mapping of metric name -> weight.
                Falls back to an empty dict; subclasses typically supply
                their own defaults.
        """
        self.metric_weights = metric_weights or {}
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")

    @abstractmethod
    def evaluate(self, predicted: Any, expected: Any) -> Dict[str, float]:
        """
        Score *predicted* against the ground-truth *expected*.

        Args:
            predicted: The model's predicted output.
            expected: The ground truth expected output.

        Returns:
            Mapping of metric name -> score; must include a
            'composite_score' key for GEPA integration.
        """

    def validate_weights(self) -> bool:
        """Return True when the configured weights sum to ~1.0 (or are unset)."""
        if not self.metric_weights:
            return True
        # Tolerate tiny floating-point drift around 1.0.
        return abs(sum(self.metric_weights.values()) - 1.0) < 0.01
diff --git a/src/gepa_optimizer/evaluation/index_caching_evaluator.py b/src/gepa_optimizer/evaluation/index_caching_evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d4ca53c9277645fa8037e27c65aae633fe47ca1
--- /dev/null
+++ b/src/gepa_optimizer/evaluation/index_caching_evaluator.py
@@ -0,0 +1,357 @@
+"""
+Index Caching Evaluator for Index-Based Element Selection Use Case
+
+Evaluates predicted index caching results against expected results.
+Compares all 5 fields with equal weight:
+- is_index_based
+- index_value
+- parent_element_id
+- element_id_of_nth_child_of_parent
+- selected_element_is_correct
+"""
+
+from typing import Dict, Any, Optional
+import json
+import re
+import logging
+
+from .base_evaluator import BaseEvaluator
+
+
class IndexCachingEvaluator(BaseEvaluator):
    """
    Evaluator for index caching use case.

    Features:
    - Compares all 5 fields with equal weight (20% each)
    - Parses JSON from LLM response
    - Handles null values correctly
    - Returns detailed field-by-field comparison
    """

    def __init__(self, metric_weights: Optional[Dict[str, float]] = None):
        """
        Initialize index caching evaluator.

        Args:
            metric_weights: Weights for evaluation metrics
                            Default: Equal weight for all 5 fields (0.2 each)
        """
        # Each field gets 20% weight (5 fields * 0.2 = 1.0)
        default_weights = {
            "is_index_based_match": 0.2,
            "index_value_match": 0.2,
            "parent_element_id_match": 0.2,
            "element_id_of_nth_child_match": 0.2,
            "selected_element_correct_match": 0.2,
        }

        weights = metric_weights or default_weights
        super().__init__(metric_weights=weights)

    def evaluate(self, predicted: str, expected: str) -> Dict[str, float]:
        """
        Evaluate predicted index caching result against expected result.

        Args:
            predicted: LLM's output (JSON string with all 5 fields)
            expected: Expected output (JSON string or dict with all 5 fields)

        Returns:
            Dictionary with evaluation metrics:
            {
                "is_index_based_match": 1.0 or 0.0,
                "index_value_match": 1.0 or 0.0,
                "parent_element_id_match": 1.0 or 0.0,
                "element_id_of_nth_child_match": 1.0 or 0.0,
                "selected_element_correct_match": 1.0 or 0.0,
                "composite_score": 0.0 to 1.0,
                "predicted_output": str,
                "expected_output": str,
                "field_scores": {...},
                "evaluation_reason": str
            }
        """
        # Degenerate inputs short-circuit to an all-zero score.
        if not predicted or not expected:
            return {
                "is_index_based_match": 0.0,
                "index_value_match": 0.0,
                "parent_element_id_match": 0.0,
                "element_id_of_nth_child_match": 0.0,
                "selected_element_correct_match": 0.0,
                "composite_score": 0.0,
                "predicted_output": str(predicted).strip() if predicted else "",
                "expected_output": str(expected).strip() if expected else "",
                "field_scores": {},
                "evaluation_reason": "โ Empty or missing input/output"
            }

        # Parse expected (could be JSON string or dict)
        try:
            if isinstance(expected, str):
                expected_dict = json.loads(expected)
            else:
                expected_dict = expected
        except (json.JSONDecodeError, TypeError):
            # If expected is already a dict from dataset
            expected_dict = expected if isinstance(expected, dict) else {}

        # Parse predicted (must be JSON string)
        try:
            predicted_dict = self._parse_json_response(predicted)
        except Exception as e:
            # Log the actual response for debugging
            response_preview = predicted[:200] if predicted else "(empty)"
            self.logger.warning(f"Failed to parse predicted JSON: {e}")
            self.logger.warning(f"Response preview: {response_preview}...")
            predicted_dict = {}

        # NOTE: "notes" field is present in the output but is NOT used for scoring or reflection
        # It's kept for reference but ignored in evaluation

        # Compare each field (only the 5 core fields, ignoring "notes")
        field_scores = {}
        field_reasons = []

        # 1. is_index_based (boolean)
        # A missing value on either side counts as a mismatch (no credit for None==None here).
        pred_is_index = predicted_dict.get("is_index_based")
        exp_is_index = expected_dict.get("is_index_based")
        is_index_match = (pred_is_index == exp_is_index) if (pred_is_index is not None and exp_is_index is not None) else False
        field_scores["is_index_based"] = 1.0 if is_index_match else 0.0
        field_reasons.append(f"is_index_based: {pred_is_index} vs {exp_is_index} โ {'โ' if is_index_match else 'โ'}")

        # 2. index_value (int or null)
        pred_index_val = predicted_dict.get("index_value")
        exp_index_val = expected_dict.get("index_value")
        # Handle null/None comparison
        index_val_match = (pred_index_val == exp_index_val) or (pred_index_val is None and exp_index_val is None)
        field_scores["index_value"] = 1.0 if index_val_match else 0.0
        field_reasons.append(f"index_value: {pred_index_val} vs {exp_index_val} โ {'โ' if index_val_match else 'โ'}")

        # 3. parent_element_id (string or null)
        pred_parent = predicted_dict.get("parent_element_id")
        exp_parent = expected_dict.get("parent_element_id")
        # Handle null/None comparison
        parent_match = (pred_parent == exp_parent) or (pred_parent is None and exp_parent is None)
        field_scores["parent_element_id"] = 1.0 if parent_match else 0.0
        field_reasons.append(f"parent_element_id: {pred_parent} vs {exp_parent} โ {'โ' if parent_match else 'โ'}")

        # 4. element_id_of_nth_child_of_parent (string or null)
        pred_element = predicted_dict.get("element_id_of_nth_child_of_parent")
        exp_element = expected_dict.get("element_id_of_nth_child_of_parent")
        # Handle null/None comparison
        element_match = (pred_element == exp_element) or (pred_element is None and exp_element is None)
        field_scores["element_id_of_nth_child_of_parent"] = 1.0 if element_match else 0.0
        field_reasons.append(f"element_id_of_nth_child: {pred_element} vs {exp_element} โ {'โ' if element_match else 'โ'}")

        # 5. selected_element_is_correct (boolean)
        pred_selected = predicted_dict.get("selected_element_is_correct")
        exp_selected = expected_dict.get("selected_element_is_correct")
        selected_match = (pred_selected == exp_selected) if (pred_selected is not None and exp_selected is not None) else False
        field_scores["selected_element_is_correct"] = 1.0 if selected_match else 0.0
        field_reasons.append(f"selected_element_is_correct: {pred_selected} vs {exp_selected} โ {'โ' if selected_match else 'โ'}")

        # Calculate composite score (weighted average)
        # NOTE(review): weights are hardcoded at 0.2 here; self.metric_weights
        # (whose keys are "*_match", not the field_scores keys) is never
        # consulted — confirm the constructor's metric_weights is intentional.
        composite_score = (
            field_scores["is_index_based"] * 0.2 +
            field_scores["index_value"] * 0.2 +
            field_scores["parent_element_id"] * 0.2 +
            field_scores["element_id_of_nth_child_of_parent"] * 0.2 +
            field_scores["selected_element_is_correct"] * 0.2
        )

        # Build evaluation reason
        all_match = composite_score == 1.0
        reason = "โAll fields match!" if all_match else f"โ Partial match ({composite_score:.1%})"
        reason += "\n" + "\n".join(f" {r}" for r in field_reasons)

        # Log evaluation details
        self.logger.info(f"\n{'โ'*70}")
        self.logger.info(f"๐ INDEX CACHING EVALUATION")
        self.logger.info(f"{'โ'*70}")
        self.logger.info(f" ๐ฏ COMPOSITE SCORE: {composite_score:.2f} ({composite_score:.1%})")
        for field, score in field_scores.items():
            status = "โ" if score == 1.0 else "โ"
            self.logger.info(f" {status} {field}: {score:.0f}")
        self.logger.info(f"{'โ'*70}\n")

        return {
            "is_index_based_match": field_scores["is_index_based"],
            "index_value_match": field_scores["index_value"],
            "parent_element_id_match": field_scores["parent_element_id"],
            "element_id_of_nth_child_match": field_scores["element_id_of_nth_child_of_parent"],
            "selected_element_correct_match": field_scores["selected_element_is_correct"],
            "composite_score": composite_score,
            "predicted_output": predicted,
            "expected_output": json.dumps(expected_dict) if isinstance(expected_dict, dict) else str(expected),
            "predicted_dict": predicted_dict,
            "expected_dict": expected_dict,
            "field_scores": field_scores,
            "evaluation_reason": reason
        }

    def _parse_json_response(self, response: str) -> Dict[str, Any]:
        """
        Parse JSON from LLM response, handling markdown code blocks and various formats.

        Args:
            response: LLM response string (may contain markdown)

        Returns:
            Parsed JSON dictionary (empty dict if parsing fails)
        """
        if not response or not isinstance(response, str):
            return {}

        response = response.strip()

        # If response is empty, return empty dict
        if not response:
            return {}

        # Strategy 1: Try to extract JSON from markdown code block
        json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response, re.DOTALL)
        if json_match:
            try:
                json_str = json_match.group(1).strip()
                return json.loads(json_str)
            except json.JSONDecodeError:
                pass

        # Strategy 2: Find JSON object in response (handle nested braces)
        json_start = response.find('{')
        if json_start != -1:
            # Find matching closing brace
            brace_count = 0
            json_end = json_start
            for i in range(json_start, len(response)):
                if response[i] == '{':
                    brace_count += 1
                elif response[i] == '}':
                    brace_count -= 1
                    if brace_count == 0:
                        json_end = i + 1
                        break

            if brace_count == 0:
                json_str = response[json_start:json_end]
                try:
                    return json.loads(json_str)
                except json.JSONDecodeError:
                    pass

        # Strategy 3: Try to find any JSON-like structure (more lenient)
        # Look for patterns like {"key": "value"} even if not perfectly formatted
        json_pattern = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', response, re.DOTALL)
        if json_pattern:
            try:
                return json.loads(json_pattern.group(0))
            except json.JSONDecodeError:
                pass

        # Strategy 4: Try parsing entire response as JSON
        try:
            return json.loads(response)
        except json.JSONDecodeError:
            pass

        # If all strategies fail, return empty dict
        self.logger.debug(f"Could not parse JSON from response: {response[:100]}...")
        return {}

    def get_evaluation_summary(self, results: list) -> Dict[str, Any]:
        """
        Get summary statistics for a batch of evaluations.

        Args:
            results: List of evaluation result dictionaries

        Returns:
            Summary statistics including accuracy per field and overall
        """
        if not results:
            return {
                "total_samples": 0,
                "overall_accuracy": 0.0,
                "field_accuracies": {},
                "perfect_matches": 0
            }

        total = len(results)
        # "Overall accuracy" here means the fraction of PERFECT (all-field) matches.
        perfect_matches = sum(1 for r in results if r.get("composite_score", 0.0) == 1.0)
        overall_accuracy = perfect_matches / total if total > 0 else 0.0

        # Calculate accuracy per field
        field_accuracies = {
            "is_index_based": sum(1 for r in results if r.get("is_index_based_match", 0.0) == 1.0) / total,
            "index_value": sum(1 for r in results if r.get("index_value_match", 0.0) == 1.0) / total,
            "parent_element_id": sum(1 for r in results if r.get("parent_element_id_match", 0.0) == 1.0) / total,
            "element_id_of_nth_child": sum(1 for r in results if r.get("element_id_of_nth_child_match", 0.0) == 1.0) / total,
            "selected_element_is_correct": sum(1 for r in results if r.get("selected_element_correct_match", 0.0) == 1.0) / total,
        }

        return {
            "total_samples": total,
            "overall_accuracy": overall_accuracy,
            "field_accuracies": field_accuracies,
            "perfect_matches": perfect_matches,
            "partial_matches": total - perfect_matches
        }
+
+
# Example usage and testing
if __name__ == "__main__":
    # Self-test: exercises the evaluator on hand-built cases; no external deps.
    print("๐ Testing Index Caching Evaluator...")

    evaluator = IndexCachingEvaluator()

    # Test cases
    test_cases = [
        # (predicted, expected, should_be_perfect)
        (
            '{"is_index_based": true, "index_value": 1, "parent_element_id": "aaaabf", "element_id_of_nth_child_of_parent": "aaaabg", "selected_element_is_correct": true}',
            {"is_index_based": True, "index_value": 1, "parent_element_id": "aaaabf", "element_id_of_nth_child_of_parent": "aaaabg", "selected_element_is_correct": True},
            True
        ),
        (
            '{"is_index_based": false, "index_value": null, "parent_element_id": null, "element_id_of_nth_child_of_parent": null, "selected_element_is_correct": true}',
            {"is_index_based": False, "index_value": None, "parent_element_id": None, "element_id_of_nth_child_of_parent": None, "selected_element_is_correct": True},
            True
        ),
        (
            '{"is_index_based": true, "index_value": 3, "parent_element_id": null, "element_id_of_nth_child_of_parent": "aaaaaw", "selected_element_is_correct": true}',
            {"is_index_based": True, "index_value": 3, "parent_element_id": None, "element_id_of_nth_child_of_parent": "aaaaaw", "selected_element_is_correct": True},
            True
        ),
        (
            '{"is_index_based": true, "index_value": 2, "parent_element_id": "aaaabf", "element_id_of_nth_child_of_parent": "aaaabg", "selected_element_is_correct": true}',
            {"is_index_based": True, "index_value": 1, "parent_element_id": "aaaabf", "element_id_of_nth_child_of_parent": "aaaabg", "selected_element_is_correct": True},
            False  # index_value mismatch
        ),
    ]

    print("\n๐ Running test cases:")
    print("-" * 80)

    results = []
    for predicted, expected, should_be_perfect in test_cases:
        result = evaluator.evaluate(predicted, expected)
        is_perfect = result["composite_score"] == 1.0

        # Each case passes when its perfect/imperfect outcome matches expectations.
        status = "โ" if is_perfect == should_be_perfect else "โ"
        print(f"{status} Test: Perfect match = {is_perfect} (expected {should_be_perfect})")
        print(f" Score: {result['composite_score']:.2f}")
        print()

        results.append(result)

    # Summary
    print("\n๐ Summary:")
    summary = evaluator.get_evaluation_summary(results)
    print(f" Total: {summary['total_samples']}")
    print(f" Perfect matches: {summary['perfect_matches']}")
    print(f" Overall accuracy: {summary['overall_accuracy']:.1%}")
    print(f" Field accuracies:")
    for field, acc in summary['field_accuracies'].items():
        print(f" {field}: {acc:.1%}")
diff --git a/src/gepa_optimizer/evaluation/scroll_evaluator.py b/src/gepa_optimizer/evaluation/scroll_evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..64171b2b9ae384339eec3d846e7d34c90fb36070
--- /dev/null
+++ b/src/gepa_optimizer/evaluation/scroll_evaluator.py
@@ -0,0 +1,251 @@
+"""
+GENERIC String Match Evaluator
+
+Compares predicted output against expected output (simple string comparison).
+NO assumptions about what the output represents (IDs, text, JSON, etc.).
+
+Let GEPA discover the correct output format through evolution and feedback!
+"""
+
+from typing import Dict, Any
+
+try:
+ from .base_evaluator import BaseEvaluator
+except ImportError:
+ # For standalone testing
+ import sys
+ from pathlib import Path
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+ from gepa_optimizer.evaluation.base_evaluator import BaseEvaluator
+
+
class ScrollElementEvaluator(BaseEvaluator):
    """
    Binary evaluator for scroll-element selection outputs.

    Extracts an element number from both the predicted and the expected
    text using a cascade of regex strategies, then compares the two numbers
    as integers. Scoring is strictly binary: 1.0 on a match, 0.0 otherwise.

    NOTE(review): the module docstring describes a "generic string match"
    evaluator, but the implementation is element-ID aware; this docstring
    documents the actual behaviour.
    """

    def __init__(self, metric_weights: Dict[str, float] = None):
        """
        Initialize evaluator.

        Args:
            metric_weights: Weights for evaluation metrics.
                Default: {"output_match": 1.0}
        """
        default_weights = {
            "output_match": 1.0  # Single binary metric
        }

        weights = metric_weights or default_weights
        super().__init__(metric_weights=weights)

    def evaluate(self, predicted: str, expected: str) -> Dict[str, Any]:
        """
        Binary evaluation with element ID extraction.

        Strategy:
            1. Extract an element number from both outputs (several formats
               supported, falling back to the first bare number).
            2. Compare using integer arithmetic for robustness (prevents
               string-comparison bugs such as "4" vs "04").
            3. Return 1.0 if they match, 0.0 otherwise (no partial credit).

        Args:
            predicted: LLM's output (may include verbose explanation).
            expected: Expected output (may include verbose explanation).

        Returns:
            Dictionary with evaluation metrics and the extracted element IDs.
            (Annotation fixed: values include strings, not only floats.)
        """
        import re
        import logging

        # One logger for the whole method (was created three times before).
        logger = logging.getLogger(__name__)

        if not predicted or not expected:
            return {
                "content_match": 0.0,
                "output_match": 0.0,
                "composite_score": 0.0,
                "predicted_output": str(predicted).strip() if predicted else "",
                "expected_output": str(expected).strip() if expected else "",
                "predicted_element": "None",
                "expected_element": "None",
                "evaluation_reason": "❌ Empty or missing input/output"
            }

        predicted_str = str(predicted).strip()
        expected_str = str(expected).strip()

        # 1. Extraction strategies, tried in order of specificity.
        # Strategy A: "Element: X" / "Element X" (explicit format)
        element_pattern_a = r'element[:\s]+(\d+)'
        # Strategy B: "element X" anywhere in text (word-bounded)
        element_pattern_b = r'\belement\s+(\d+)\b'
        # Strategy C: any bare number (last resort)
        number_pattern = r'\b(\d+)\b'

        def _extract(text: str):
            """Apply strategies A -> B -> C; return the first match or None."""
            match = re.search(element_pattern_a, text, re.IGNORECASE)
            if not match:
                match = re.search(element_pattern_b, text, re.IGNORECASE)
            if not match:
                match = re.search(number_pattern, text)
            return match

        pred_match = _extract(predicted_str)
        exp_match = _extract(expected_str)

        # 2. Compare the extracted numbers.
        if not exp_match:
            # Expected has no element pattern - fall back to exact match.
            content_score = 1.0 if predicted_str.lower() == expected_str.lower() else 0.0
        elif not pred_match:
            # Predicted has no element number - wrong.
            content_score = 0.0
        else:
            pred_element = pred_match.group(1)
            exp_element = exp_match.group(1)

            # Integer comparison is more robust than string comparison.
            try:
                pred_num = int(pred_element)
                exp_num = int(exp_element)
                content_score = 1.0 if pred_num == exp_num else 0.0
                if pred_num != exp_num:
                    logger.debug(f"Element mismatch: predicted={pred_num}, expected={exp_num}")
            except (ValueError, TypeError) as e:
                # Fallback to string comparison if conversion fails.
                logger.warning(f"Could not convert elements to integers: {e}, using string comparison")
                content_score = 1.0 if pred_element == exp_element else 0.0

        # 3. Binary score and a human-readable reason.
        # (Mojibake in the original reason strings repaired to real emoji.)
        if content_score == 1.0:
            composite_score = 1.0
            reason = "✅ Correct! Element number matches"
        else:
            composite_score = 0.0
            if pred_match and exp_match:
                reason = "❌ Wrong element number (predicted different element)"
            else:
                reason = "❌ Missing or invalid element number"

        pred_element = pred_match.group(1) if pred_match else "None"
        exp_element = exp_match.group(1) if exp_match else "None"

        # Detailed logging for transparency.
        logger.info(f"\n{'─' * 70}")
        logger.info("🔍 EVALUATION DETAILS")
        logger.info(f"{'─' * 70}")
        logger.info(f"  Expected:  '{expected_str}' (Element: {exp_element})")
        logger.info(f"  Predicted: '{predicted_str}' (Element: {pred_element})")
        logger.info(f"  {'─' * 66}")
        logger.info(f"  🎯 SCORE: {composite_score:.2f} - {reason}")
        logger.info(f"{'─' * 70}\n")

        return {
            "content_match": content_score,
            "output_match": composite_score,  # This is what GEPA uses
            "composite_score": composite_score,
            "predicted_output": predicted_str,
            "expected_output": expected_str,
            "predicted_element": pred_element,
            "expected_element": exp_element,
            "evaluation_reason": reason
        }

    def get_evaluation_summary(self, results: list) -> Dict[str, Any]:
        """
        Get summary statistics for a batch of evaluations.

        Args:
            results: List of evaluation result dictionaries.

        Returns:
            Summary statistics: totals, accuracy, correct/incorrect counts.
        """
        if not results:
            return {
                "total_samples": 0,
                "accuracy": 0.0,
                "correct_predictions": 0
            }

        total = len(results)
        # A prediction counts as correct only on an exact binary match.
        correct = sum(1 for r in results if r.get("output_match", 0.0) == 1.0)
        accuracy = correct / total if total > 0 else 0.0

        return {
            "total_samples": total,
            "accuracy": accuracy,
            "correct_predictions": correct,
            "incorrect_predictions": total - correct
        }
+
+
# Example usage and smoke test (run this file directly).
if __name__ == "__main__":
    print("Testing Scroll Element Evaluator...")

    evaluator = ScrollElementEvaluator()

    # (predicted, expected, should_match) triples covering supported formats.
    test_cases = [
        ("4", "4", True),
        ("Element: 4", "4", True),
        ("Element 4", "4", True),
        ("The element to interact with is 4", "4", True),
        ("Element ID: 4", "4", True),
        ("Click on element 4 to scroll", "4", True),
        ("5", "4", False),
        ("Element: 5", "4", False),
        ("No element found", "4", False),
        ("", "4", False),
    ]

    print("\nRunning test cases:")
    print("-" * 80)

    results = []
    for predicted, expected, should_match in test_cases:
        result = evaluator.evaluate(predicted, expected)
        match = result["composite_score"] == 1.0

        # Repaired mojibake: status markers were garbled emoji split across
        # two lines in the original (a syntax error as rendered).
        status = "✅" if match == should_match else "❌"
        print(f"{status} Predicted: '{predicted}' | Expected: '{expected}' | Match: {match}")

        results.append(result)

    # Aggregate statistics.
    print("\nSummary:")
    summary = evaluator.get_evaluation_summary(results)
    print(f"   Total: {summary['total_samples']}")
    print(f"   Correct: {summary['correct_predictions']}")
    print(f"   Accuracy: {summary['accuracy']:.1%}")
diff --git a/src/gepa_optimizer/evaluation/ui_evaluator.py b/src/gepa_optimizer/evaluation/ui_evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4aadf122212ad2d07ae1c7c4f23342eb5be88a3
--- /dev/null
+++ b/src/gepa_optimizer/evaluation/ui_evaluator.py
@@ -0,0 +1,297 @@
+"""
+UI Tree Evaluator for GEPA Optimizer
+"""
+
+import json
+import logging
+import difflib
+from typing import Any, Dict, List, Optional
+
+from .base_evaluator import BaseEvaluator
+
+logger = logging.getLogger(__name__)
+
class UITreeEvaluator(BaseEvaluator):
    """
    Comprehensive evaluator for UI tree extraction quality.

    Scores a predicted UI tree (nested dicts with "type", "text", "style"
    and "children" keys) against a ground-truth tree along five weighted
    dimensions and combines them into a composite score.
    """

    def __init__(self, metric_weights: Optional[Dict[str, float]] = None):
        """
        Initializes the UITreeEvaluator with configurable metric weights.

        Args:
            metric_weights: A dictionary of weights for different metrics.
                If None, default weights will be used.
        """
        # Default weights for UI tree evaluation (normalized to 1.0 below).
        default_weights = {
            "element_completeness": 0.3,    # How many elements are captured
            "element_type_accuracy": 0.25,  # Correct element types (Button, Text, etc.)
            "text_content_accuracy": 0.2,   # Text content matches
            "hierarchy_accuracy": 0.15,     # Parent-child relationships
            "style_accuracy": 0.1,          # Style properties captured
        }

        # Use provided weights or defaults.
        weights = metric_weights or default_weights

        # Initialize parent class.
        super().__init__(metric_weights=weights)

        # Normalize weights so they always sum to 1.0.
        self._normalize_weights()

    def _normalize_weights(self):
        """Normalize self.metric_weights in place so the weights sum to 1.0."""
        total_weight = sum(self.metric_weights.values())
        if total_weight > 0:
            self.metric_weights = {k: v / total_weight for k, v in self.metric_weights.items()}
        else:
            # Consistency fix: use the module-level logger like every other
            # method here; `self.logger` is not established anywhere in view.
            logger.warning("Total metric weight is zero. Scores will be zero.")

    def evaluate(self, predicted_json: Dict[str, Any], expected_json: Dict[str, Any]) -> Dict[str, float]:
        """
        Generates a weighted composite score from individual metrics.

        Args:
            predicted_json: The JSON generated by the LLM.
            expected_json: The ground truth JSON.

        Returns:
            A dictionary of individual metric scores and the composite score.
        """
        scores = {
            "element_completeness": self.calculate_element_completeness(predicted_json, expected_json),
            "element_type_accuracy": self.calculate_element_type_accuracy(predicted_json, expected_json),
            "text_content_accuracy": self.calculate_text_content_accuracy(predicted_json, expected_json),
            "hierarchy_accuracy": self.calculate_hierarchy_accuracy(predicted_json, expected_json),
            "style_accuracy": self.calculate_style_accuracy(predicted_json, expected_json),
        }

        composite_score = sum(scores[metric] * self.metric_weights.get(metric, 0) for metric in scores)

        # Small acceptance bonus so GEPA recognises even tiny improvements.
        # BUG FIX: previously the bonus was added to the local variable AFTER
        # scores["composite_score"] had already been stored, so callers never
        # saw it. Apply the bonus before publishing the score.
        if composite_score > 0.05:  # If we have any meaningful content
            composite_score = min(composite_score + 0.001, 1.0)

        scores["composite_score"] = composite_score

        # Detailed logging for debugging.
        logger.debug(f"Evaluation scores: {scores}")
        logger.debug(f"Composite score: {composite_score:.4f}")

        return scores

    def calculate_element_completeness(self, predicted: Dict, expected: Dict) -> float:
        """
        Calculates how many UI elements are captured in the predicted JSON.
        This is the most important metric for UI tree extraction.
        """
        def _count_elements(node):
            """Count total elements in the tree (current node + descendants)."""
            if not isinstance(node, dict):
                return 0
            count = 1  # Count current node
            for child in node.get("children", []):
                count += _count_elements(child)
            return count

        try:
            predicted_count = _count_elements(predicted)
            expected_count = _count_elements(expected)

            if expected_count == 0:
                return 1.0 if predicted_count == 0 else 0.0

            # Score based on how many elements are captured.
            completeness_ratio = predicted_count / expected_count

            # Full credit at or above parity; progressively heavier penalties
            # as coverage drops below 80% and 50%.
            if completeness_ratio >= 1.0:
                return 1.0  # Perfect or better
            elif completeness_ratio >= 0.8:
                return completeness_ratio  # Good coverage
            elif completeness_ratio >= 0.5:
                return completeness_ratio * 0.8  # Moderate coverage with penalty
            else:
                return completeness_ratio * 0.5  # Poor coverage with heavy penalty

        except Exception as e:
            logger.warning(f"Error calculating element completeness: {e}")
            return 0.0

    def calculate_element_type_accuracy(self, predicted: Dict, expected: Dict) -> float:
        """
        Calculates element type accuracy by comparing the 'type' attribute of
        corresponding nodes. Focuses on common UI element types like Button,
        Text, Image, etc.
        """
        def _get_all_types(node):
            # Collect the "type" of every node in the tree, skipping None.
            if not isinstance(node, dict):
                return []
            types = [node.get("type")]
            for child in node.get("children", []):
                types.extend(_get_all_types(child))
            return [t for t in types if t is not None]

        try:
            predicted_types = _get_all_types(predicted)
            expected_types = _get_all_types(expected)

            if not expected_types:
                return 1.0 if not predicted_types else 0.5

            if not predicted_types:
                return 0.0

            # Count matching types with frequency consideration (multiset
            # intersection): each expected occurrence can be matched at most
            # once by a predicted occurrence of the same type.
            expected_type_counts = {}
            for t in expected_types:
                expected_type_counts[t] = expected_type_counts.get(t, 0) + 1

            predicted_type_counts = {}
            for t in predicted_types:
                predicted_type_counts[t] = predicted_type_counts.get(t, 0) + 1

            total_matches = 0
            for type_name, expected_count in expected_type_counts.items():
                predicted_count = predicted_type_counts.get(type_name, 0)
                # Count matches up to the expected count.
                total_matches += min(predicted_count, expected_count)

            return total_matches / len(expected_types) if expected_types else 0.0

        except Exception as e:
            logger.warning(f"Error calculating element type accuracy: {e}")
            return 0.0

    def calculate_hierarchy_accuracy(self, predicted: Dict, expected: Dict) -> float:
        """
        Calculates hierarchy accuracy by comparing parent-child relationships.
        """
        def _get_hierarchy_structure(node, parent_type="ROOT"):
            """Extract hierarchy structure as (parent_type, child_type) pairs."""
            if not isinstance(node, dict):
                return []

            current_type = node.get("type", "unknown")
            hierarchy = [(parent_type, current_type)]

            for child in node.get("children", []):
                hierarchy.extend(_get_hierarchy_structure(child, current_type))

            return hierarchy

        try:
            predicted_hierarchy = _get_hierarchy_structure(predicted)
            expected_hierarchy = _get_hierarchy_structure(expected)

            if not expected_hierarchy:
                return 1.0 if not predicted_hierarchy else 0.5

            if not predicted_hierarchy:
                return 0.0

            # Set-based overlap of distinct (parent, child) relationships;
            # duplicates are deliberately collapsed.
            expected_hierarchy_set = set(expected_hierarchy)
            predicted_hierarchy_set = set(predicted_hierarchy)

            matches = len(expected_hierarchy_set.intersection(predicted_hierarchy_set))
            total_expected = len(expected_hierarchy_set)

            return matches / total_expected if total_expected > 0 else 0.0

        except Exception as e:
            logger.warning(f"Error calculating hierarchy accuracy: {e}")
            return 0.0

    def calculate_text_content_accuracy(self, predicted: Dict, expected: Dict) -> float:
        """
        Calculates text content accuracy by comparing the 'text' attribute of
        corresponding nodes (best fuzzy match per predicted text).
        """
        def _get_all_texts(node):
            # Collect non-empty "text" values from every node in the tree.
            if not isinstance(node, dict):
                return []
            texts = [node.get("text")]
            for child in node.get("children", []):
                texts.extend(_get_all_texts(child))
            return [t for t in texts if t is not None and str(t).strip()]

        try:
            predicted_texts = _get_all_texts(predicted)
            expected_texts = _get_all_texts(expected)

            if not expected_texts:
                return 1.0 if not predicted_texts else 0.5  # Partial credit if predicted has texts but expected doesn't

            if not predicted_texts:
                return 0.0  # No predicted texts, so no match

            # For each predicted text, take the best fuzzy similarity against
            # any expected text, then average.
            total_similarity = 0.0
            for p_text in predicted_texts:
                best_similarity = 0.0
                for e_text in expected_texts:
                    similarity = difflib.SequenceMatcher(None, str(p_text).strip(), str(e_text).strip()).ratio()
                    best_similarity = max(best_similarity, similarity)
                total_similarity += best_similarity

            # predicted_texts is guaranteed non-empty here (early return
            # above), so averaging is safe. (Removed unreachable branches
            # that re-checked emptiness after the guards.)
            return total_similarity / len(predicted_texts)
        except Exception as e:
            logger.warning(f"Error calculating text content accuracy: {e}")
            return 0.0

    def calculate_style_accuracy(self, predicted: Dict, expected: Dict) -> float:
        """
        Calculates style accuracy by comparing style properties.
        """
        def _get_all_styles(node):
            """Extract all style dicts from the tree."""
            if not isinstance(node, dict):
                return []

            styles = []
            if "style" in node and isinstance(node["style"], dict):
                styles.append(node["style"])

            for child in node.get("children", []):
                styles.extend(_get_all_styles(child))

            return styles

        try:
            predicted_styles = _get_all_styles(predicted)
            expected_styles = _get_all_styles(expected)

            if not expected_styles:
                return 1.0 if not predicted_styles else 0.5

            if not predicted_styles:
                return 0.0

            # Fraction of expected (property, value) pairs that appear
            # verbatim in at least one predicted style dict.
            total_style_properties = 0
            matching_properties = 0

            for exp_style in expected_styles:
                for prop_name, prop_value in exp_style.items():
                    total_style_properties += 1

                    # Find a matching property in any predicted style.
                    for pred_style in predicted_styles:
                        if prop_name in pred_style and pred_style[prop_name] == prop_value:
                            matching_properties += 1
                            break

            return matching_properties / total_style_properties if total_style_properties > 0 else 0.0

        except Exception as e:
            logger.warning(f"Error calculating style accuracy: {e}")
            return 0.0
diff --git a/src/gepa_optimizer/evaluation/universal_evaluator.py b/src/gepa_optimizer/evaluation/universal_evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..16714de4ecad31a7a44a0551c3a7b45918a34b2a
--- /dev/null
+++ b/src/gepa_optimizer/evaluation/universal_evaluator.py
@@ -0,0 +1,911 @@
+"""
+Universal Semantic Evaluator for ANY prompt optimization use case.
+
+This evaluator uses LLM-powered semantic analysis to compare predicted vs expected outputs,
+enabling prompt optimization for ANY task without requiring custom evaluator code.
+
+Key Features:
+- Semantic understanding (not just string matching)
+- Works with text, JSON, numbers, structured outputs
+- Provides rich feedback for GEPA reflection
+- No task-specific assumptions
+"""
+
+import json
+import re
+import logging
+from typing import Dict, Any, Optional, List
+from difflib import SequenceMatcher
+
+from .base_evaluator import BaseEvaluator
+
+logger = logging.getLogger(__name__)
+
+
+class UniversalSemanticEvaluator(BaseEvaluator):
+ """
+ Universal evaluator using LLM for semantic comparison.
+
+ Works for ANY task without hardcoded assumptions:
+ - Text outputs: "The answer is 42" vs "42"
+ - JSON outputs: {"count": 23} vs {"count": 22}
+ - Structured data: Lists, nested objects
+ - Multi-modal: Image descriptions, analysis results
+
+ Evaluation Strategy:
+ 1. Quick checks (exact match, empty handling)
+ 2. Structural comparison (for JSON/structured data)
+ 3. LLM semantic analysis (for meaning understanding)
+ 4. Combine into composite score with rich feedback
+ """
+
+ def __init__(
+ self,
+ llm_client=None,
+ use_llm_analysis: bool = True,
+ semantic_weight: float = 0.6,
+ structural_weight: float = 0.25,
+ exact_match_bonus: float = 0.15,
+ metric_weights: Optional[Dict[str, float]] = None
+ ):
+ """
+ Initialize Universal Semantic Evaluator.
+
+ Args:
+ llm_client: LLM client for semantic analysis (optional, falls back to heuristics)
+ use_llm_analysis: Whether to use LLM for semantic comparison
+ semantic_weight: Weight for semantic similarity (0.0-1.0)
+ structural_weight: Weight for structural similarity (0.0-1.0)
+ exact_match_bonus: Bonus weight for exact matches (0.0-1.0)
+ metric_weights: Optional custom weights (overrides above)
+ """
+ default_weights = metric_weights or {
+ "semantic_similarity": semantic_weight,
+ "structural_similarity": structural_weight,
+ "exact_match": exact_match_bonus
+ }
+ super().__init__(metric_weights=default_weights)
+
+ self.llm_client = llm_client
+ self.use_llm_analysis = use_llm_analysis and llm_client is not None
+
+ # Cache for LLM analysis to reduce API calls
+ self._analysis_cache: Dict[str, Dict] = {}
+
+ logger.info(f"๐ฏ Universal Semantic Evaluator initialized")
+ logger.info(f" LLM analysis: {'enabled' if self.use_llm_analysis else 'disabled (using heuristics)'}")
+ logger.info(f" Weights: semantic={semantic_weight}, structural={structural_weight}, exact={exact_match_bonus}")
+
+ def evaluate(self, predicted: Any, expected: Any) -> Dict[str, float]:
+ """
+ Evaluate predicted output against expected output using semantic understanding.
+
+ Args:
+ predicted: The model's predicted output (string, dict, or any serializable type)
+ expected: The ground truth expected output
+
+ Returns:
+ Dictionary with metrics including 'composite_score' (required for GEPA)
+ """
+ # Convert to strings for comparison
+ predicted_str = self._to_string(predicted)
+ expected_str = self._to_string(expected)
+
+ # Initialize result
+ result = {
+ "composite_score": 0.0,
+ "exact_match": 0.0,
+ "semantic_similarity": 0.0,
+ "structural_similarity": 0.0,
+ "predicted_output": predicted_str[:500], # Truncate for logging
+ "expected_output": expected_str[:500],
+ "analysis": {},
+ "improvement_feedback": ""
+ }
+
+ # Handle empty/missing outputs
+ if not predicted_str or not predicted_str.strip():
+ result["improvement_feedback"] = "โ Output is EMPTY. The prompt must instruct the model to produce output."
+ result["analysis"] = {"status": "empty_predicted"}
+ return result
+
+ if not expected_str or not expected_str.strip():
+ result["improvement_feedback"] = "โ ๏ธ Expected output is empty - cannot evaluate."
+ result["analysis"] = {"status": "empty_expected"}
+ result["composite_score"] = 0.5 # Neutral score
+ return result
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # STEP 1: Exact Match Check (Fast Path)
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ normalized_pred = self._normalize(predicted_str)
+ normalized_exp = self._normalize(expected_str)
+
+ if normalized_pred == normalized_exp:
+ result["exact_match"] = 1.0
+ result["semantic_similarity"] = 1.0
+ result["structural_similarity"] = 1.0
+ result["composite_score"] = 1.0
+ result["improvement_feedback"] = "โ
Perfect match! Output exactly matches expected."
+ result["analysis"] = {"status": "exact_match"}
+ return result
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # STEP 1.5: FORMAT MISMATCH DETECTION (CRITICAL FIX)
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # ๐ฅ CRITICAL: Detect when expected is JSON but predicted is narrative text
+ # This causes catastrophically low scores and needs explicit handling
+ expected_is_json = self._try_parse_json(expected_str) is not None
+ predicted_is_json = self._try_parse_json(predicted_str) is not None
+
+ format_mismatch = expected_is_json and not predicted_is_json
+ if format_mismatch:
+ # Expected JSON but got narrative - this is a CRITICAL format error
+ # Give partial credit for semantic content but penalize heavily for format
+ result["analysis"]["format_mismatch"] = True
+ result["improvement_feedback"] = (
+ "โ FORMAT ERROR: Expected JSON output but received narrative text. "
+ "The prompt MUST enforce JSON output format. "
+ "Add explicit instructions like: 'Output ONLY valid JSON, no explanations.' "
+ "Consider adding: 'Do NOT write prose or explanations.'"
+ )
+ # Still evaluate semantic content but cap the score
+ # This gives feedback for improving the prompt
+ logger.warning(f"โ ๏ธ Format mismatch: expected JSON ({len(expected_str)} chars), got narrative ({len(predicted_str)} chars)")
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # STEP 2: Structural Comparison (for JSON/structured data)
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ structural_result = self._compare_structure(predicted_str, expected_str)
+ result["structural_similarity"] = structural_result["score"]
+ result["analysis"]["structural"] = structural_result.get("details", {})
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # STEP 3: Semantic Analysis
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ if self.use_llm_analysis:
+ semantic_result = self._llm_semantic_analysis(predicted_str, expected_str)
+ else:
+ semantic_result = self._heuristic_semantic_analysis(predicted_str, expected_str)
+
+ result["semantic_similarity"] = semantic_result["score"]
+ result["analysis"]["semantic"] = semantic_result.get("details", {})
+ result["improvement_feedback"] = semantic_result.get("feedback", "")
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # STEP 4: Compute Composite Score
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ weights = self.metric_weights
+ composite = (
+ result["semantic_similarity"] * weights.get("semantic_similarity", 0.6) +
+ result["structural_similarity"] * weights.get("structural_similarity", 0.25) +
+ result["exact_match"] * weights.get("exact_match", 0.15)
+ )
+
+ # ๐ฅ CRITICAL FIX: Apply format mismatch penalty
+ # If expected JSON but got narrative, cap the score to encourage format compliance
+ if result.get("analysis", {}).get("format_mismatch"):
+ # Cap at 0.3 to indicate "partial semantic match but wrong format"
+ # This ensures format-correct outputs always score higher
+ composite = min(composite, 0.30)
+ logger.debug(f"๐ Format mismatch penalty applied: score capped at {composite:.3f}")
+
+ result["composite_score"] = min(max(composite, 0.0), 1.0)
+
+ # Add score breakdown to feedback
+ if not result["improvement_feedback"]:
+ result["improvement_feedback"] = self._generate_default_feedback(result)
+
+ # Log evaluation
+ logger.debug(f"๐ Evaluation: composite={result['composite_score']:.3f}, "
+ f"semantic={result['semantic_similarity']:.3f}, "
+ f"structural={result['structural_similarity']:.3f}")
+
+ # #region agent log
+ try:
+ import json as _json_debug
+ import time as _time_debug
+ import os as _os_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "G", "location": "universal_evaluator.py:final_score", "message": "Final evaluation score breakdown", "data": {"composite": result["composite_score"], "semantic": result["semantic_similarity"], "structural": result["structural_similarity"], "exact_match": result["exact_match"], "format_mismatch": result.get("analysis", {}).get("format_mismatch", False), "predicted_preview": predicted_str[:150] if predicted_str else "EMPTY", "expected_preview": expected_str[:150] if expected_str else "EMPTY"}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ except Exception as _e:
+ pass # Silent fail for instrumentation
+ # #endregion
+
+ return result
+
+ def _to_string(self, value: Any) -> str:
+ """Convert any value to string for comparison."""
+ if value is None:
+ return ""
+ if isinstance(value, str):
+ return value.strip()
+ if isinstance(value, dict):
+ try:
+ return json.dumps(value, sort_keys=True, indent=2)
+ except (TypeError, ValueError):
+ return str(value)
+ if isinstance(value, (list, tuple)):
+ try:
+ return json.dumps(list(value), sort_keys=True)
+ except (TypeError, ValueError):
+ return str(value)
+ return str(value).strip()
+
+ def _normalize(self, text: str) -> str:
+ """Normalize text for comparison (lowercase, whitespace)."""
+ # Lowercase and normalize whitespace
+ normalized = ' '.join(text.lower().split())
+ # Remove common punctuation that doesn't affect meaning
+ normalized = re.sub(r'[.,;:!?\'"]+$', '', normalized)
+ return normalized
+
+ def _compare_structure(self, predicted: str, expected: str) -> Dict[str, Any]:
+ """
+ Compare structural similarity (especially for JSON/structured outputs).
+
+ Returns:
+ Dict with 'score' (0.0-1.0) and 'details'
+ """
+ result = {"score": 0.0, "details": {}}
+
+ # Try to parse as JSON
+ pred_json = self._try_parse_json(predicted)
+ exp_json = self._try_parse_json(expected)
+
+ if pred_json is not None and exp_json is not None:
+ # Both are valid JSON - do structural comparison
+ return self._compare_json_structures(pred_json, exp_json)
+
+ # Fallback: Compare as text structure
+ return self._compare_text_structure(predicted, expected)
+
+ def _try_parse_json(self, text: str) -> Optional[Any]:
+ """
+ Try to parse text as JSON with robust extraction.
+
+ ๐ฅ FIX: LLMs often wrap JSON in markdown code blocks or add extra text.
+ This method now handles multiple formats:
+ - Direct JSON
+ - ```json ... ``` blocks
+ - ``` ... ``` blocks (no language tag)
+ - JSON embedded in prose
+ - Escaped newlines and quotes
+ """
+ if not text or not isinstance(text, str):
+ return None
+
+ # ๐ฅ PREPROCESSING: Clean common LLM output issues
+ cleaned = text.strip()
+
+ # Remove BOM and other invisible characters
+ cleaned = cleaned.lstrip('\ufeff\u200b\u200c\u200d')
+
+ # Strategy 1: Try direct parse (cleanest case)
+ try:
+ return json.loads(cleaned)
+ except json.JSONDecodeError:
+ pass
+
+ # Strategy 2: Extract JSON from markdown code block (```json ... ```)
+ # More permissive regex that handles optional language tags
+ json_match = re.search(r'```(?:json|JSON)?\s*([\{|\[].*?[\}|\]])\s*```', cleaned, re.DOTALL)
+ if json_match:
+ try:
+ return json.loads(json_match.group(1))
+ except json.JSONDecodeError:
+ pass
+
+ # Strategy 3: Find JSON using balanced brace matching (handles nested objects)
+ def extract_balanced_json(s: str, start_char: str, end_char: str) -> Optional[str]:
+ """Extract JSON with balanced braces/brackets."""
+ count = 0
+ start_idx = -1
+ for i, char in enumerate(s):
+ if char == start_char:
+ if count == 0:
+ start_idx = i
+ count += 1
+ elif char == end_char:
+ count -= 1
+ if count == 0 and start_idx >= 0:
+ return s[start_idx:i+1]
+ return None
+
+ # Try to find JSON object
+ json_obj = extract_balanced_json(cleaned, '{', '}')
+ if json_obj:
+ try:
+ return json.loads(json_obj)
+ except json.JSONDecodeError:
+ # Try to repair common issues
+ repaired = self._repair_json(json_obj)
+ try:
+ return json.loads(repaired)
+ except json.JSONDecodeError:
+ pass
+
+ # Try to find JSON array
+ json_arr = extract_balanced_json(cleaned, '[', ']')
+ if json_arr:
+ try:
+ return json.loads(json_arr)
+ except json.JSONDecodeError:
+ repaired = self._repair_json(json_arr)
+ try:
+ return json.loads(repaired)
+ except json.JSONDecodeError:
+ pass
+
+ return None
+
+ def _repair_json(self, json_str: str) -> str:
+ """
+ Attempt to repair common JSON issues from LLM output.
+
+ Fixes:
+ - Trailing commas before } or ]
+ - Single quotes instead of double quotes
+ - Unquoted keys
+ - Comments (// and /* */)
+ """
+ repaired = json_str
+
+ # Remove trailing commas
+ repaired = re.sub(r',\s*}', '}', repaired)
+ repaired = re.sub(r',\s*]', ']', repaired)
+
+ # Remove single-line comments
+ repaired = re.sub(r'//[^\n]*', '', repaired)
+
+ # Remove multi-line comments
+ repaired = re.sub(r'/\*.*?\*/', '', repaired, flags=re.DOTALL)
+
+ # Replace single quotes with double quotes (but be careful with apostrophes)
+ # Only replace when it looks like a JSON delimiter
+ def replace_single_quotes(match):
+ content = match.group(0)
+ # Skip if it looks like an apostrophe in a word
+ if re.match(r"'\w+'\s*:", content) or re.match(r":\s*'[^']*'", content):
+ return content.replace("'", '"')
+ return content
+
+ # Basic single quote replacement for keys
+ repaired = re.sub(r"'([^']+)'\s*:", r'"\1":', repaired)
+
+ return repaired
+
+ def _compare_json_structures(self, pred: Any, exp: Any) -> Dict[str, Any]:
+ """Compare two JSON structures."""
+ result = {"score": 0.0, "details": {"type": "json", "matches": [], "mismatches": []}}
+
+ if type(pred) != type(exp):
+ result["details"]["mismatches"].append(f"Type mismatch: predicted={type(pred).__name__}, expected={type(exp).__name__}")
+ result["score"] = 0.2 # Some credit for being JSON
+ return result
+
+ if isinstance(pred, dict) and isinstance(exp, dict):
+ return self._compare_dicts(pred, exp)
+ elif isinstance(pred, list) and isinstance(exp, list):
+ return self._compare_lists(pred, exp)
+ else:
+ # Primitive types
+ if pred == exp:
+ result["score"] = 1.0
+ result["details"]["matches"].append(f"Values match: {pred}")
+ else:
+ result["score"] = self._value_similarity(pred, exp)
+ result["details"]["mismatches"].append(f"Value mismatch: predicted={pred}, expected={exp}")
+ return result
+
+ def _compare_dicts(self, pred: dict, exp: dict) -> Dict[str, Any]:
+ """
+ Compare two dictionaries with CASE-INSENSITIVE key matching.
+
+ ๐ฅ FIX: LLMs often produce keys like 'Category' when expected is 'category'.
+ This method now normalizes keys before comparison for fair scoring.
+ """
+ result = {"score": 0.0, "details": {"type": "dict", "matches": [], "mismatches": [], "missing_keys": [], "extra_keys": []}}
+
+ # ๐ฅ NORMALIZE: Convert all keys to lowercase for comparison
+ # Also handle common variations like underscores vs camelCase
+ def normalize_key(key: str) -> str:
+ """Normalize key: lowercase, underscores to nothing, strip spaces."""
+ return re.sub(r'[_\s-]', '', str(key).lower())
+
+ # Build normalized key mappings
+ pred_normalized = {normalize_key(k): (k, v) for k, v in pred.items()}
+ exp_normalized = {normalize_key(k): (k, v) for k, v in exp.items()}
+
+ pred_norm_keys = set(pred_normalized.keys())
+ exp_norm_keys = set(exp_normalized.keys())
+
+ # Check for missing/extra keys (using normalized comparison)
+ missing_norm = exp_norm_keys - pred_norm_keys
+ extra_norm = pred_norm_keys - exp_norm_keys
+ common_norm = pred_norm_keys & exp_norm_keys
+
+ # Convert back to original key names for reporting
+ missing = [exp_normalized[k][0] for k in missing_norm]
+ extra = [pred_normalized[k][0] for k in extra_norm]
+
+ result["details"]["missing_keys"] = missing
+ result["details"]["extra_keys"] = extra
+
+ if not exp_norm_keys:
+ result["score"] = 1.0 if not pred_norm_keys else 0.5
+ return result
+
+ # Score based on key overlap (normalized)
+ key_score = len(common_norm) / len(exp_norm_keys) if exp_norm_keys else 1.0
+
+ # Score based on value matches
+ value_scores = []
+ for norm_key in common_norm:
+ pred_orig_key, pred_val = pred_normalized[norm_key]
+ exp_orig_key, exp_val = exp_normalized[norm_key]
+
+ if pred_val == exp_val:
+ value_scores.append(1.0)
+ result["details"]["matches"].append(f"{exp_orig_key}: {exp_val}")
+ else:
+ sim = self._value_similarity(pred_val, exp_val)
+ value_scores.append(sim)
+ if sim < 0.8:
+ result["details"]["mismatches"].append(f"{exp_orig_key}: predicted={pred_val}, expected={exp_val}")
+
+ value_score = sum(value_scores) / len(value_scores) if value_scores else 0.0
+
+ # Combine scores
+ result["score"] = 0.3 * key_score + 0.7 * value_score
+
+ # Penalty for missing keys (reduced from 0.1 to 0.05 per key)
+ if missing:
+ result["score"] *= (1 - 0.05 * len(missing))
+
+ result["score"] = max(0.0, min(1.0, result["score"]))
+ return result
+
+ def _compare_lists(self, pred: list, exp: list) -> Dict[str, Any]:
+ """Compare two lists."""
+ result = {"score": 0.0, "details": {"type": "list", "length_match": False, "item_matches": 0}}
+
+ if not exp:
+ result["score"] = 1.0 if not pred else 0.5
+ return result
+
+ result["details"]["length_match"] = len(pred) == len(exp)
+
+ # Compare items (order-sensitive)
+ matches = 0
+ for i, exp_item in enumerate(exp):
+ if i < len(pred):
+ if pred[i] == exp_item:
+ matches += 1
+ else:
+ # Check if item exists elsewhere
+ if exp_item in pred:
+ matches += 0.5 # Partial credit for wrong position
+
+ result["details"]["item_matches"] = matches
+ result["score"] = matches / len(exp)
+
+ # Penalty for length mismatch
+ if len(pred) != len(exp):
+ len_ratio = min(len(pred), len(exp)) / max(len(pred), len(exp))
+ result["score"] *= (0.7 + 0.3 * len_ratio)
+
+ return result
+
+ def _value_similarity(self, pred: Any, exp: Any) -> float:
+ """
+ Calculate similarity between two values.
+
+ ๐ฅ ENHANCED: Now handles:
+ - Case-insensitive string comparison
+ - Semantic similarity for common variations
+ - Underscore/space/dash normalization
+ - Numeric comparison with tolerance
+ """
+ # Same value (exact match)
+ if pred == exp:
+ return 1.0
+
+ # Numeric comparison
+ try:
+ pred_num = float(pred)
+ exp_num = float(exp)
+ if exp_num == 0:
+ return 1.0 if pred_num == 0 else 0.0
+ # Relative error with tolerance
+ error = abs(pred_num - exp_num) / abs(exp_num)
+ return max(0.0, 1.0 - error)
+ except (ValueError, TypeError):
+ pass
+
+ # String comparison with normalization
+ pred_str = str(pred).strip()
+ exp_str = str(exp).strip()
+
+ # Case-insensitive exact match
+ if pred_str.lower() == exp_str.lower():
+ return 0.98 # Slight penalty for case mismatch
+
+ # Normalize strings (remove underscores, spaces, dashes for comparison)
+ def normalize_str(s: str) -> str:
+ return re.sub(r'[_\s\-]+', '', s.lower())
+
+ pred_norm = normalize_str(pred_str)
+ exp_norm = normalize_str(exp_str)
+
+ if pred_norm == exp_norm:
+ return 0.95 # Good match despite formatting differences
+
+ # Check if one contains the other (partial match)
+ if pred_norm in exp_norm or exp_norm in pred_norm:
+ ratio = min(len(pred_norm), len(exp_norm)) / max(len(pred_norm), len(exp_norm))
+ return 0.7 + (0.2 * ratio) # 0.7-0.9 for partial matches
+
+ # ๐ฅ SEMANTIC SIMILARITY: Check for common equivalent terms
+ semantic_equivalents = {
+ # Priority levels
+ 'low': ['low', 'minor', 'trivial', 'p3', 'p4'],
+ 'medium': ['medium', 'normal', 'moderate', 'p2'],
+ 'high': ['high', 'important', 'major', 'p1', 'critical', 'urgent'],
+ # Boolean variations
+ 'true': ['true', 'yes', '1', 'on', 'enabled'],
+ 'false': ['false', 'no', '0', 'off', 'disabled'],
+ # Status variations
+ 'success': ['success', 'succeeded', 'completed', 'done', 'passed'],
+ 'failure': ['failure', 'failed', 'error', 'crashed'],
+ 'pending': ['pending', 'waiting', 'queued', 'in_progress', 'processing'],
+ }
+
+ for canonical, equivalents in semantic_equivalents.items():
+ pred_match = any(eq in pred_norm for eq in equivalents)
+ exp_match = any(eq in exp_norm for eq in equivalents)
+ if pred_match and exp_match:
+ return 0.85 # Semantic match
+
+ # Sequence matching (character-level similarity)
+ ratio = SequenceMatcher(None, pred_str.lower(), exp_str.lower()).ratio()
+
+ # ๐ฅ WORD-LEVEL SIMILARITY: Check word overlap
+ pred_words = set(re.findall(r'\w+', pred_str.lower()))
+ exp_words = set(re.findall(r'\w+', exp_str.lower()))
+
+ if pred_words and exp_words:
+ word_overlap = len(pred_words & exp_words) / max(len(pred_words), len(exp_words))
+ # Combine character and word similarity
+ return max(ratio, word_overlap * 0.9)
+
+ def _compare_text_structure(self, predicted: str, expected: str) -> Dict[str, Any]:
+ """Compare text structure when not JSON."""
+ result = {"score": 0.0, "details": {"type": "text"}}
+
+ # Word overlap
+ pred_words = set(predicted.lower().split())
+ exp_words = set(expected.lower().split())
+
+ if not exp_words:
+ result["score"] = 1.0 if not pred_words else 0.5
+ return result
+
+ overlap = len(pred_words & exp_words)
+ result["details"]["word_overlap"] = overlap
+ result["details"]["expected_words"] = len(exp_words)
+
+ # Jaccard similarity
+ union = len(pred_words | exp_words)
+ result["score"] = overlap / union if union > 0 else 0.0
+
+ return result
+
    def _llm_semantic_analysis(self, predicted: str, expected: str) -> Dict[str, Any]:
        """
        Use LLM for semantic analysis of predicted vs expected.

        Uses XML-delimited prompt structure to prevent context bleeding
        and Multi-Dimensional Scoring (Semantics vs. Syntax).

        Falls back to :meth:`_heuristic_semantic_analysis` when the LLM
        response cannot be parsed as JSON or when any exception occurs.

        Returns:
            Dict with 'score' (0.0-1.0), 'details', and 'feedback'
        """
        # Check cache.
        # NOTE(review): hash() is salted per process (PYTHONHASHSEED), so this
        # cache key is only stable within a single run; a key collision between
        # different (predicted, expected) pairs is unlikely but possible.
        cache_key = f"{hash(predicted)}:{hash(expected)}"
        if cache_key in self._analysis_cache:
            return self._analysis_cache[cache_key]

        result = {"score": 0.0, "details": {}, "feedback": ""}

        try:
            # Truncate for token limits but preserve enough context.
            expected_truncated = expected[:10000]
            predicted_truncated = predicted[:10000]

            # Penalty-based scoring with self-verification: the judge starts at
            # 1.0 and deducts for failures — more consistent than subjective scoring.
            analysis_prompt = f"""
You are a **Semantic Logic Engine** tasked with grading AI performance.
You must compare a [PREDICTED] output against a [EXPECTED] truth.




{expected_truncated}



{predicted_truncated}




Calculate the score based on these STRICT rules. Start with 1.0 and deduct penalties.

1. **Information Completeness (Max -0.5)**:
   - If key facts/fields are missing, deduct proportional to importance.
   - If a nested JSON field is missing, deduct 0.1 per field.

2. **Accuracy & Hallucination (Max -1.0)**:
   - If factual numbers/IDs are wrong: Score = 0 immediately.
   - If the model invents information NOT in the input: Deduct 0.3.

3. **Format Compliance (Max -0.3)**:
   - If JSON is requested but Markdown is returned: Deduct 0.3.
   - If keys are lowercase instead of snake_case: Deduct 0.1.

4. **Semantic Equivalence (No Penalty)**:
   - Synonyms are ACCEPTED (e.g., "Purchase" == "Buy").
   - Formatting differences (whitespace) are IGNORED.



Before finalizing the score, ask: "If I used the predicted output in code expecting the original output, would the code crash?"
- If YES (Crash) -> Score must be < 0.5.
- If NO (Safe) -> Score can be high.



Return JSON ONLY:
{{
  "semantic_similarity": 0.0-1.0,
  "structural_similarity": 0.0-1.0,
  "verdict": "PERFECT" | "ACCEPTABLE" | "FORMAT_ERROR" | "DATA_CORRUPTION",
  "critical_failures": ["List specific failures that caused score < 1.0"],
  "penalty_breakdown": {{"completeness": -0.0, "accuracy": -0.0, "format": -0.0}},
  "fix_directive": "Imperative command to fix the prompt"
}}

"""

            response = self.llm_client.generate(
                system_prompt="You are a Semantic Logic Engine. Calculate scores using penalty-based deduction from 1.0. Respond only with valid JSON.",
                user_prompt=analysis_prompt,
                image_base64=""
            )

            # The client may return a dict with a "content" field or a bare string.
            content = response.get("content", str(response)) if isinstance(response, dict) else str(response)

            # Parse JSON response.
            analysis = self._extract_json_from_response(content)

            if analysis:
                # Extract semantic similarity (primary score); structural
                # similarity defaults to the semantic value when absent.
                semantic_sim = float(analysis.get("semantic_similarity", 0.5))
                structural_sim = float(analysis.get("structural_similarity", semantic_sim))

                # Compute weighted score based on verdict (updated for new schema).
                verdict = analysis.get("verdict", "ACCEPTABLE")
                verdict_multiplier = {
                    "PERFECT": 1.0,
                    "ACCEPTABLE": 0.85,
                    "FORMAT_ERROR": 0.6,  # New: was WRONG_FORMAT
                    "DATA_CORRUPTION": 0.1,  # New: replaces WRONG_CONTENT + HALLUCINATION
                    # Legacy support
                    "WRONG_FORMAT": 0.6,
                    "WRONG_CONTENT": 0.3,
                    "HALLUCINATION": 0.1
                }.get(verdict, 0.5)

                # Final score: weighted combination (semantic 60%, structural 30%,
                # verdict multiplier 10%), capped at 1.0.
                result["score"] = min(1.0, semantic_sim * 0.6 + structural_sim * 0.3 + verdict_multiplier * 0.1)

                # Extract penalty breakdown if available.
                penalty_breakdown = analysis.get("penalty_breakdown", {})
                critical_failures = analysis.get("critical_failures", [])

                result["details"] = {
                    "verdict": verdict,
                    "semantic_similarity": semantic_sim,
                    "structural_similarity": structural_sim,
                    "critical_failures": critical_failures,
                    "penalty_breakdown": penalty_breakdown,
                    # Legacy field support
                    "key_matches": analysis.get("key_matches", []),
                    "key_differences": analysis.get("key_differences", critical_failures),
                    "value_errors": analysis.get("value_errors", {}),
                    "reasoning": analysis.get("reasoning", "")
                }
                result["feedback"] = analysis.get("fix_directive", "")
            else:
                # Fallback if JSON parsing fails.
                result = self._heuristic_semantic_analysis(predicted, expected)

            # Cache result (heuristic fallbacks inside the try are cached too).
            self._analysis_cache[cache_key] = result

        except Exception as e:
            # Best-effort: never let judge errors break evaluation.
            logger.warning(f"LLM semantic analysis failed: {e}, falling back to heuristics")
            result = self._heuristic_semantic_analysis(predicted, expected)

        return result
+
+ def _extract_json_from_response(self, content: str) -> Optional[Dict]:
+ """Extract JSON from LLM response."""
+ # Try to find JSON in response
+ json_match = re.search(r'\{[\s\S]*\}', content)
+ if json_match:
+ try:
+ return json.loads(json_match.group(0))
+ except json.JSONDecodeError:
+ pass
+ return None
+
+ def _heuristic_semantic_analysis(self, predicted: str, expected: str) -> Dict[str, Any]:
+ """
+ Heuristic-based semantic analysis when LLM is not available.
+
+ Uses multiple signals:
+ - Word overlap (Jaccard)
+ - Sequence matching (SequenceMatcher)
+ - Number extraction and comparison
+ - Key phrase matching
+ """
+ result = {"score": 0.0, "details": {}, "feedback": ""}
+
+ pred_lower = predicted.lower()
+ exp_lower = expected.lower()
+
+ # 1. Sequence similarity
+ seq_sim = SequenceMatcher(None, pred_lower, exp_lower).ratio()
+
+ # 2. Word overlap (Jaccard)
+ pred_words = set(pred_lower.split())
+ exp_words = set(exp_lower.split())
+ jaccard = len(pred_words & exp_words) / len(pred_words | exp_words) if (pred_words | exp_words) else 0.0
+
+ # 3. Number comparison
+ pred_nums = re.findall(r'-?\d+\.?\d*', predicted)
+ exp_nums = re.findall(r'-?\d+\.?\d*', expected)
+
+ num_score = 1.0
+ num_errors = []
+ if exp_nums:
+ matches = 0
+ for exp_num in exp_nums:
+ if exp_num in pred_nums:
+ matches += 1
+ else:
+ # Check for close matches
+ try:
+ exp_val = float(exp_num)
+ for pred_num in pred_nums:
+ pred_val = float(pred_num)
+ if abs(pred_val - exp_val) <= 1: # Off by 1
+ matches += 0.9
+ num_errors.append(f"Number close: expected {exp_num}, got {pred_num}")
+ break
+ else:
+ num_errors.append(f"Number missing: expected {exp_num}")
+ except ValueError:
+ pass
+ num_score = matches / len(exp_nums) if exp_nums else 1.0
+
+ # 4. Key entity extraction (simple approach)
+ # Look for capitalized words, quoted strings, etc.
+ pred_entities = set(re.findall(r'\b[A-Z][a-z]+\b', predicted))
+ exp_entities = set(re.findall(r'\b[A-Z][a-z]+\b', expected))
+ entity_overlap = len(pred_entities & exp_entities) / len(exp_entities) if exp_entities else 1.0
+
+ # Combine scores
+ result["score"] = (
+ 0.3 * seq_sim +
+ 0.25 * jaccard +
+ 0.25 * num_score +
+ 0.2 * entity_overlap
+ )
+
+ result["details"] = {
+ "sequence_similarity": seq_sim,
+ "word_overlap": jaccard,
+ "number_accuracy": num_score,
+ "entity_overlap": entity_overlap,
+ "number_errors": num_errors
+ }
+
+ # Generate feedback
+ feedback_parts = []
+ if jaccard < 0.5:
+ feedback_parts.append("Low word overlap - output may be missing key terms.")
+ if num_errors:
+ feedback_parts.append(f"Number issues: {'; '.join(num_errors[:3])}")
+ if entity_overlap < 0.5 and exp_entities:
+ missing = exp_entities - pred_entities
+ feedback_parts.append(f"Missing entities: {', '.join(list(missing)[:3])}")
+
+ if feedback_parts:
+ result["feedback"] = " | ".join(feedback_parts)
+ else:
+ result["feedback"] = "Output is semantically similar but not exact match."
+
+ return result
+
    def _generate_default_feedback(self, result: Dict) -> str:
        """Generate default feedback based on scores.

        Thresholds on the composite score: >=0.9 excellent, >=0.7 good,
        >=0.5 partial, otherwise poor.

        NOTE(review): the leading markers in the returned strings are
        mojibake of status emoji (check mark / warning / cross) from a bad
        encoding round-trip — one multi-byte character even splits the first
        literal across lines. Confirm and restore the intended characters;
        left untouched here because they are runtime strings.
        """
        score = result["composite_score"]
        semantic = result["semantic_similarity"]
        structural = result["structural_similarity"]

        if score >= 0.9:
            return "โ
 Excellent match! Minor differences only."
        elif score >= 0.7:
            return f"โ ๏ธ Good match (semantic={semantic:.0%}, structural={structural:.0%}). Some differences to address."
        elif score >= 0.5:
            return f"โ ๏ธ Partial match (semantic={semantic:.0%}, structural={structural:.0%}). Significant differences found."
        else:
            return f"โ Poor match (semantic={semantic:.0%}, structural={structural:.0%}). Major issues to fix."
+
+ def get_evaluation_summary(self, results: List[Dict]) -> Dict[str, Any]:
+ """
+ Get summary statistics for a batch of evaluations.
+
+ Args:
+ results: List of evaluation result dictionaries
+
+ Returns:
+ Summary statistics
+ """
+ if not results:
+ return {
+ "total_samples": 0,
+ "accuracy": 0.0,
+ "avg_semantic_similarity": 0.0,
+ "avg_structural_similarity": 0.0
+ }
+
+ total = len(results)
+ scores = [r.get("composite_score", 0.0) for r in results]
+ semantic_scores = [r.get("semantic_similarity", 0.0) for r in results]
+ structural_scores = [r.get("structural_similarity", 0.0) for r in results]
+
+ return {
+ "total_samples": total,
+ "accuracy": sum(1 for s in scores if s >= 0.8) / total,
+ "avg_composite_score": sum(scores) / total,
+ "avg_semantic_similarity": sum(semantic_scores) / total,
+ "avg_structural_similarity": sum(structural_scores) / total,
+ "min_score": min(scores),
+ "max_score": max(scores)
+ }
+
+
+# Convenience function to create evaluator
def create_universal_evaluator(llm_client=None) -> UniversalSemanticEvaluator:
    """
    Build a Universal Semantic Evaluator.

    LLM-backed analysis is enabled exactly when a client is supplied;
    otherwise the evaluator falls back to heuristic-based analysis.

    Args:
        llm_client: Optional LLM client for semantic analysis.

    Returns:
        Configured UniversalSemanticEvaluator instance.
    """
    use_llm = llm_client is not None
    return UniversalSemanticEvaluator(llm_client=llm_client, use_llm_analysis=use_llm)
+
diff --git a/src/gepa_optimizer/evaluation/validation_evaluator.py b/src/gepa_optimizer/evaluation/validation_evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d7f95765f060e9f1cc181550a28e9d48f9b368b
--- /dev/null
+++ b/src/gepa_optimizer/evaluation/validation_evaluator.py
@@ -0,0 +1,495 @@
+"""
+Validation Evaluator for UI Validation Use Case
+
+Evaluates predicted validation results (true/false) against expected results.
+Extracts reasoning from both predicted and expected outputs for LLM-as-judge feedback.
+"""
+
+from typing import Dict, Any, Optional
+import re
+import logging
+
+try:
+ from .base_evaluator import BaseEvaluator
+except ImportError:
+ # For standalone testing
+ import sys
+ from pathlib import Path
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+ from gepa_optimizer.evaluation.base_evaluator import BaseEvaluator
+
+
+class ValidationEvaluator(BaseEvaluator):
+ """
+ Evaluator for validation use case (true/false results).
+
+ Features:
+ - Normalizes boolean formats ("true"/"True"/"1" โ True, "false"/"False"/"0" โ False)
+ - Extracts reasoning from both predicted and expected outputs (REQUIRED for LLM-as-judge)
+ - Binary scoring: correct boolean = 1.0, wrong = 0.0
+ - Returns reasoning in evaluation results for LLM-as-judge feedback
+ """
+
+ def __init__(self, metric_weights: Optional[Dict[str, float]] = None):
+ """
+ Initialize validation evaluator.
+
+ Args:
+ metric_weights: Weights for evaluation metrics
+ Default: {"output_match": 1.0}
+ """
+ default_weights = {
+ "output_match": 1.0 # Binary boolean comparison
+ }
+
+ weights = metric_weights or default_weights
+ super().__init__(metric_weights=weights)
+
    def evaluate(self, predicted: str, expected: str) -> Dict[str, float]:
        """
        Evaluate predicted validation result against expected result.

        Scoring Strategy:
        1. Normalize both predicted and expected to boolean
        2. Compare booleans (exact match required)
        3. Extract reasoning from both (for LLM-as-judge)
        4. Return 1.0 if match, 0.0 otherwise (binary scoring)

        Args:
            predicted: LLM's output (may include "true"/"false" + reasoning)
            expected: Expected output (should be "true" or "false", may include reasoning)

        Returns:
            Dictionary with evaluation metrics, extracted booleans, and reasoning:
            {
                "output_match": 1.0 or 0.0,
                "composite_score": 1.0 or 0.0,
                "predicted_output": str,
                "expected_output": str,
                "predicted_boolean": True/False,
                "expected_boolean": True/False,
                "predicted_reasoning": str,  # REQUIRED for LLM-as-judge
                "expected_reasoning": str,  # REQUIRED for LLM-as-judge
                "evaluation_reason": str
            }

        NOTE(review): several runtime strings below carry mojibake of emoji
        (check marks, box-drawing separators) from an encoding round-trip —
        one even splits a literal across lines; left byte-identical here.
        """
        # Guard clause: an empty side cannot be scored — report 0.0 with a reason.
        if not predicted or not expected:
            return {
                "output_match": 0.0,
                "composite_score": 0.0,
                "predicted_output": str(predicted).strip() if predicted else "",
                "expected_output": str(expected).strip() if expected else "",
                "predicted_boolean": None,
                "expected_boolean": None,
                "predicted_reasoning": "",
                "expected_reasoning": "",
                "evaluation_reason": "โ Empty or missing input/output"
            }

        predicted_str = str(predicted).strip()
        expected_str = str(expected).strip()

        # 1. Extract boolean from predicted output
        pred_bool = self._normalize_to_bool(predicted_str)
        pred_reasoning = self._extract_reasoning(predicted_str)

        # 2. Extract boolean from expected output
        exp_bool = self._normalize_to_bool(expected_str)
        exp_reasoning = self._extract_reasoning(expected_str)

        # Detect output structure for both expected and predicted
        expected_structure = self._detect_output_structure(expected_str)
        predicted_structure = self._detect_output_structure(predicted_str)

        # Compare structures (format labels, e.g. boolean_only vs boolean_with_reasoning)
        structure_match = (expected_structure['format'] == predicted_structure['format'])

        # 3. Compare booleans (binary scoring)
        if pred_bool is None or exp_bool is None:
            # Could not extract boolean from one or both
            score = 0.0
            reason = "โ Could not extract boolean value"
            if pred_bool is None:
                reason += " from predicted output"
            if exp_bool is None:
                reason += " from expected output"
        else:
            # Both booleans extracted successfully - compare
            score = 1.0 if pred_bool == exp_bool else 0.0
            if score == 1.0:
                reason = f"โ
 Correct! Result matches (both are {exp_bool})"
                # Add note if structure doesn't match
                if not structure_match:
                    reason += f" (but format differs: expected {expected_structure['format']}, got {predicted_structure['format']})"
            else:
                reason = f"โ Wrong result (predicted: {pred_bool}, expected: {exp_bool})"

        # 4. Log evaluation details (banner-style block for readability)
        self.logger.info(f"\n{'โ'*70}")
        self.logger.info(f"๐ VALIDATION EVALUATION")
        self.logger.info(f"{'โ'*70}")
        self.logger.info(f" Expected: '{expected_str[:100]}...' โ {exp_bool}")
        self.logger.info(f" Predicted: '{predicted_str[:100]}...' โ {pred_bool}")
        self.logger.info(f" {'โ'*66}")
        self.logger.info(f" ๐ฏ SCORE: {score:.2f} - {reason}")
        if pred_reasoning:
            self.logger.info(f" ๐ Predicted Reasoning: {pred_reasoning[:150]}...")
        if exp_reasoning:
            self.logger.info(f" ๐ Expected Reasoning: {exp_reasoning[:150]}...")
        # Log structure comparison
        self.logger.info(f" ๐ Expected Format: {expected_structure['format']} (reasoning: {expected_structure['reasoning_quality']})")
        self.logger.info(f" ๐ Predicted Format: {predicted_structure['format']} (reasoning: {predicted_structure['reasoning_quality']})")
        if not structure_match:
            self.logger.warning(f" โ ๏ธ OUTPUT STRUCTURE MISMATCH!")
        self.logger.info(f"{'โ'*70}\n")

        return {
            "output_match": score,
            "composite_score": score,  # This is what GEPA uses
            "predicted_output": predicted_str,
            "expected_output": expected_str,
            "predicted_boolean": pred_bool,
            "expected_boolean": exp_bool,
            "predicted_reasoning": pred_reasoning,  # REQUIRED for LLM-as-judge
            "expected_reasoning": exp_reasoning,  # REQUIRED for LLM-as-judge
            "evaluation_reason": reason,
            # Structure metadata for LLM-as-judge
            "expected_structure": expected_structure,
            "predicted_structure": predicted_structure,
            "output_structure_match": structure_match,
            "expected_has_reasoning": expected_structure['has_reasoning'],
            "predicted_has_reasoning": predicted_structure['has_reasoning'],
            "reasoning_quality_gap": expected_structure['reasoning_quality'] + " โ " + predicted_structure['reasoning_quality']
        }
+
+ def _normalize_to_bool(self, value: str) -> Optional[bool]:
+ """
+ Normalize various formats to boolean.
+
+ Handles:
+ - "true", "True", "TRUE" โ True
+ - "false", "False", "FALSE" โ False
+ - "1", "0" โ True, False
+ - "yes", "no" โ True, False
+ - "correct", "incorrect" โ True, False
+ - JSON: {"result": true} โ True
+ - Text with boolean: "The result is true because..." โ True
+
+ Args:
+ value: String that may contain a boolean value
+
+ Returns:
+ Boolean value or None if cannot be determined
+ """
+ if not value:
+ return None
+
+ value_lower = value.lower().strip()
+
+ # Direct boolean strings
+ if value_lower in ("true", "1", "yes", "correct", "valid", "pass"):
+ return True
+ if value_lower in ("false", "0", "no", "incorrect", "invalid", "fail"):
+ return False
+
+ # JSON format: {"action": "TRUE"} or {"action": "FALSE"} or {"action": "LOADING"}
+ # This handles the production prompt's JSON output format
+ # Match both quoted and unquoted values, case-insensitive
+ action_match = re.search(r'["\']?action["\']?\s*:\s*["\']?(true|false|loading)["\']?', value_lower)
+ if action_match:
+ action_value = action_match.group(1).lower()
+ if action_value == "true":
+ return True
+ elif action_value == "false":
+ return False
+ elif action_value == "loading":
+ # Treat LOADING as False for validation purposes (screen not ready)
+ return False
+
+ # Also try to parse full JSON structure if present (more robust)
+ try:
+ import json
+ # Try to find and parse JSON object
+ json_start = value.find('{')
+ if json_start != -1:
+ # Try to extract JSON from the response
+ for end_idx in range(len(value), json_start, -1):
+ try:
+ json_str = value[json_start:end_idx]
+ data = json.loads(json_str)
+ # Check for "action" field (production prompt format)
+ if "action" in data:
+ action_val = str(data["action"]).upper()
+ if action_val == "TRUE":
+ return True
+ elif action_val == "FALSE":
+ return False
+ elif action_val == "LOADING":
+ return False # Treat as False
+ # Check for "result" field (alternative format)
+ if "result" in data:
+ result_val = data["result"]
+ if isinstance(result_val, bool):
+ return result_val
+ elif isinstance(result_val, str):
+ return result_val.lower() in ("true", "1", "yes")
+ except (json.JSONDecodeError, KeyError, ValueError):
+ continue
+ except Exception:
+ pass # Fall through to other extraction methods
+
+ # JSON format: {"result": true} or {"result": false}
+ json_match = re.search(r'["\']?result["\']?\s*:\s*(true|false)', value_lower)
+ if json_match:
+ return json_match.group(1) == "true"
+
+ # Pattern: "result is true" or "result: true"
+ pattern_match = re.search(r'result[:\s]+(true|false)', value_lower)
+ if pattern_match:
+ return pattern_match.group(1) == "true"
+
+ # Pattern: "is true" or "is false" (standalone)
+ is_match = re.search(r'\b(is|are)\s+(true|false)\b', value_lower)
+ if is_match:
+ return is_match.group(2) == "true"
+
+ # Pattern: "true" or "false" as standalone word (not in other words)
+ standalone_match = re.search(r'\b(true|false)\b', value_lower)
+ if standalone_match:
+ return standalone_match.group(1) == "true"
+
+ # Last resort: check if "true" appears before "false" in text
+ true_pos = value_lower.find("true")
+ false_pos = value_lower.find("false")
+
+ if true_pos != -1 and false_pos != -1:
+ # Both found - use the one that appears first
+ return true_pos < false_pos
+ elif true_pos != -1:
+ return True
+ elif false_pos != -1:
+ return False
+
+ # Cannot determine
+ return None
+
+ def _detect_output_structure(self, output: str) -> Dict[str, Any]:
+ """
+ Dynamically detect the structure/components of the output.
+
+ This detects:
+ - Boolean result presence
+ - Reasoning/explanation presence and quality
+ - Output format (boolean only, boolean+reasoning, etc.)
+
+ Args:
+ output: Output string to analyze
+
+ Returns:
+ Dictionary with structure information:
+ {
+ "has_boolean": bool,
+ "has_reasoning": bool,
+ "reasoning_length": int,
+ "reasoning_quality": str, # "missing", "minimal", "adequate", "detailed"
+ "format": str # "boolean_only", "boolean_with_reasoning", "unknown"
+ }
+ """
+ if not output:
+ return {
+ "has_boolean": False,
+ "has_reasoning": False,
+ "reasoning_length": 0,
+ "reasoning_quality": "missing",
+ "format": "empty"
+ }
+
+ output_clean = output.strip()
+
+ # Detect boolean
+ has_boolean = self._normalize_to_bool(output_clean) is not None
+
+ # Extract reasoning
+ reasoning = self._extract_reasoning(output_clean)
+ has_reasoning = len(reasoning) > 15 # Minimum 15 chars to count as reasoning
+ reasoning_length = len(reasoning)
+
+ # Classify reasoning quality
+ if reasoning_length == 0:
+ reasoning_quality = "missing"
+ elif reasoning_length < 30:
+ reasoning_quality = "minimal" # Just a few words
+ elif reasoning_length < 100:
+ reasoning_quality = "adequate" # Brief explanation
+ else:
+ reasoning_quality = "detailed" # Full explanation
+
+ # Determine format
+ if has_boolean and has_reasoning:
+ output_format = "boolean_with_reasoning"
+ elif has_boolean and not has_reasoning:
+ output_format = "boolean_only"
+ elif not has_boolean and has_reasoning:
+ output_format = "reasoning_only"
+ else:
+ output_format = "unknown"
+
+ return {
+ "has_boolean": has_boolean,
+ "has_reasoning": has_reasoning,
+ "reasoning_length": reasoning_length,
+ "reasoning_quality": reasoning_quality,
+ "format": output_format
+ }
+
+ def _extract_reasoning(self, output: str) -> str:
+ """
+ Extract reasoning/explanation from output string.
+
+ This is REQUIRED for LLM-as-judge feedback. The reasoning helps
+ the judge understand why the result was true/false and compare
+ predicted vs expected reasoning.
+
+ Args:
+ output: Full output string that may contain reasoning
+
+ Returns:
+ Extracted reasoning text, or empty string if not found
+ """
+ if not output:
+ return ""
+
+ # Patterns to find reasoning sections
+ reasoning_patterns = [
+ r'[Rr]eason[:\s]+(.*?)(?:\n\n|\Z)', # "Reason: ..."
+ r'[Ee]xplanation[:\s]+(.*?)(?:\n\n|\Z)', # "Explanation: ..."
+ r'[Bb]ecause[:\s]+(.*?)(?:\n\n|\Z)', # "Because: ..."
+ r'[Ww]hy[:\s]+(.*?)(?:\n\n|\Z)', # "Why: ..."
+ r'[Dd]etails[:\s]+(.*?)(?:\n\n|\Z)', # "Details: ..."
+ ]
+
+ # Try each pattern
+ for pattern in reasoning_patterns:
+ match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
+ if match:
+ reasoning = match.group(1).strip()
+ if len(reasoning) > 20: # Only return if substantial
+ return reasoning
+
+ # If no explicit reasoning section, check if output has substantial text
+ # after boolean (likely contains reasoning)
+ bool_match = re.search(r'\b(true|false)\b', output.lower())
+ if bool_match:
+ # Get text after the boolean
+ bool_pos = bool_match.end()
+ remaining = output[bool_pos:].strip()
+
+ # If remaining text is substantial (more than just punctuation), use it
+ if len(remaining) > 30:
+ # Clean up common prefixes
+ remaining = re.sub(r'^[:\s.,;!?-]+', '', remaining)
+ if remaining:
+ return remaining
+
+ # If output is long and doesn't start with boolean, might be all reasoning
+ if len(output) > 100 and not re.match(r'^\s*(true|false)\s*$', output, re.IGNORECASE):
+ # Return first 500 chars as reasoning
+ return output[:500].strip()
+
+ # No reasoning found
+ return ""
+
+ def get_evaluation_summary(self, results: list) -> Dict[str, Any]:
+ """
+ Get summary statistics for a batch of evaluations.
+
+ Args:
+ results: List of evaluation result dictionaries
+
+ Returns:
+ Summary statistics including accuracy, true/false distribution
+ """
+ if not results:
+ return {
+ "total_samples": 0,
+ "accuracy": 0.0,
+ "correct_predictions": 0,
+ "incorrect_predictions": 0,
+ "true_predictions": 0,
+ "false_predictions": 0
+ }
+
+ total = len(results)
+ correct = sum(1 for r in results if r.get("output_match", 0.0) == 1.0)
+ accuracy = correct / total if total > 0 else 0.0
+
+ # Count true/false predictions
+ true_preds = sum(1 for r in results if r.get("predicted_boolean") is True)
+ false_preds = sum(1 for r in results if r.get("predicted_boolean") is False)
+
+ return {
+ "total_samples": total,
+ "accuracy": accuracy,
+ "correct_predictions": correct,
+ "incorrect_predictions": total - correct,
+ "true_predictions": true_preds,
+ "false_predictions": false_preds
+ }
+
+
# Example usage and testing
# Manual smoke test: runs the evaluator over mixed-format samples and prints
# a per-case verdict plus summary statistics. Not executed on import.
# NOTE(review): the print strings contain mojibake emoji from an encoding
# round-trip (one splits a literal across lines); left byte-identical.
if __name__ == "__main__":
    print("๐ Testing Validation Evaluator...")

    evaluator = ValidationEvaluator()

    # Test cases: (predicted, expected, should_match)
    test_cases = [
        # (predicted, expected, should_match)
        ("true", "true", True),
        ("false", "false", True),
        ("True", "true", True),
        ("FALSE", "false", True),
        ("1", "true", True),
        ("0", "false", True),
        ("true", "false", False),
        ("false", "true", False),
        ("The result is true because the button is visible", "true", True),
        ("The result is false because the element is not found", "false", True),
        ('{"result": true, "reasoning": "Button is visible"}', "true", True),
        ("Result: true\n\nReasoning: The submit button is clearly visible at the bottom of the screen.", "true", True),
        ("", "true", False),
        ("invalid", "true", False),
    ]

    print("\n๐ Running test cases:")
    print("-" * 80)

    results = []
    for predicted, expected, should_match in test_cases:
        result = evaluator.evaluate(predicted, expected)
        match = result["composite_score"] == 1.0

        # Status marker: pass when the observed match equals the expectation.
        status = "โ
" if match == should_match else "โ"
        pred_bool = result.get("predicted_boolean", "?")
        exp_bool = result.get("expected_boolean", "?")
        pred_reason = result.get("predicted_reasoning", "")[:50]

        print(f"{status} Predicted: '{predicted[:40]}...' โ {pred_bool}")
        print(f" Expected: '{expected}' โ {exp_bool}")
        print(f" Match: {match} (should be {should_match})")
        if pred_reason:
            print(f" Reasoning: {pred_reason}...")
        print()

        results.append(result)

    # Summary
    print("\n๐ Summary:")
    summary = evaluator.get_evaluation_summary(results)
    print(f" Total: {summary['total_samples']}")
    print(f" Correct: {summary['correct_predictions']}")
    print(f" Accuracy: {summary['accuracy']:.1%}")
    print(f" True predictions: {summary['true_predictions']}")
    print(f" False predictions: {summary['false_predictions']}")
diff --git a/src/gepa_optimizer/infrastructure/__init__.py b/src/gepa_optimizer/infrastructure/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b3d27aeb3cf77dcd0d6f4cd1b4e5565580b3df9
--- /dev/null
+++ b/src/gepa_optimizer/infrastructure/__init__.py
@@ -0,0 +1,15 @@
+"""
+Infrastructure module for cross-cutting concerns.
+
+This module contains infrastructure components that are used across
+the entire application, including logging, metrics, and configuration.
+"""
+
+from .logging import get_logger, configure_logging, LogContext
+
+__all__ = [
+ "get_logger",
+ "configure_logging",
+ "LogContext",
+]
+
diff --git a/src/gepa_optimizer/infrastructure/logging/__init__.py b/src/gepa_optimizer/infrastructure/logging/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6086e4bb54f61fc14cb384dc87db2d639f9dca85
--- /dev/null
+++ b/src/gepa_optimizer/infrastructure/logging/__init__.py
@@ -0,0 +1,43 @@
+"""
+Centralized Logging Infrastructure for GEPA Optimizer.
+
+This module provides a unified logging system with:
+- Structured logging with context
+- Consistent formatting across all modules
+- Log level configuration
+- Operation tracking with timing
+- Contextual logging for debugging
+
+Usage:
+ from gepa_optimizer.infrastructure.logging import get_logger, LogContext
+
+ logger = get_logger(__name__)
+ logger.info("Starting optimization", extra={"iteration": 1})
+
+ with LogContext(logger, "evaluation", sample_id=123):
+ logger.info("Evaluating sample")
+"""
+
+from .logger import (
+ get_logger,
+ configure_logging,
+ LogLevel,
+ GEPA_LOGGER_NAME,
+)
+from .context import LogContext, log_operation
+from .formatters import GepaFormatter, JsonFormatter
+
+__all__ = [
+ # Core logging
+ "get_logger",
+ "configure_logging",
+ "LogLevel",
+ "GEPA_LOGGER_NAME",
+ # Context management
+ "LogContext",
+ "log_operation",
+ # Formatters
+ "GepaFormatter",
+ "JsonFormatter",
+]
+
diff --git a/src/gepa_optimizer/infrastructure/logging/context.py b/src/gepa_optimizer/infrastructure/logging/context.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2a305e5ac78fd50311331a62713ceff09f4315d
--- /dev/null
+++ b/src/gepa_optimizer/infrastructure/logging/context.py
@@ -0,0 +1,257 @@
+"""
+Logging Context Management.
+
+Provides context managers and decorators for:
+- Operation tracking with timing
+- Contextual logging with nested contexts
+- Automatic exception logging
+"""
+
+import logging
+import time
+import functools
+from contextlib import contextmanager
+from typing import Any, Callable, Dict, Optional, TypeVar, ParamSpec
+
+P = ParamSpec('P')
+R = TypeVar('R')
+
+
class LogContext:
    """
    Context manager that brackets an operation with start/end log records.

    Responsibilities:
    - optionally log a "Starting <operation>" record on entry
    - measure wall-clock duration with time.perf_counter()
    - on normal exit, optionally log "Completed <operation>" with duration_ms
    - on exception, log "Failed <operation>: ..." with traceback and re-raise
    - merge ``context_fields`` into every record emitted through it

    Example:
        logger = get_logger(__name__)

        with LogContext(logger, "optimization", iteration=5):
            # ... optimization code ...
            logger.info("Processing sample")  # Inherits context

        # Output:
        # INFO | Starting optimization | iteration=5
        # INFO | Processing sample | iteration=5
        # INFO | Completed optimization | iteration=5 duration_ms=1234
    """

    def __init__(
        self,
        logger: logging.Logger,
        operation: str,
        log_start: bool = True,
        log_end: bool = True,
        log_level: int = logging.INFO,
        **context_fields: Any
    ):
        """
        Initialize log context.

        Args:
            logger: Logger instance to use
            operation: Name of the operation being performed
            log_start: Whether to log when entering context
            log_end: Whether to log when exiting context
            log_level: Log level for start/end messages
            **context_fields: Additional fields to include in all logs
        """
        self.logger = logger
        self.operation = operation
        self.log_start = log_start
        self.log_end = log_end
        self.log_level = log_level
        self.context_fields = context_fields
        # Populated by __enter__ / __exit__ respectively.
        self.start_time: Optional[float] = None
        self.exception: Optional[Exception] = None

    def __enter__(self) -> "LogContext":
        """Record the start time and optionally emit the start record."""
        self.start_time = time.perf_counter()
        if self.log_start:
            self.logger.log(
                self.log_level,
                f"Starting {self.operation}",
                extra=self.context_fields
            )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> bool:
        """Log completion (or failure) with elapsed time; never suppress."""
        elapsed_ms = (time.perf_counter() - self.start_time) * 1000
        merged = dict(self.context_fields)
        merged["duration_ms"] = round(elapsed_ms, 2)

        if exc_type is not None:
            # Keep the exception around for post-mortem inspection by callers.
            self.exception = exc_val
            self.logger.error(
                f"Failed {self.operation}: {exc_type.__name__}: {exc_val}",
                extra=merged,
                exc_info=True
            )
            # Returning False propagates the exception to the caller.
            return False

        if self.log_end:
            self.logger.log(
                self.log_level,
                f"Completed {self.operation}",
                extra=merged
            )
        return False

    def log(self, level: int, message: str, **extra_fields: Any) -> None:
        """Log a message within this context, inheriting context fields."""
        combined = dict(self.context_fields)
        combined.update(extra_fields)
        self.logger.log(level, message, extra=combined)

    def info(self, message: str, **extra_fields: Any) -> None:
        """Log info message within context."""
        self.log(logging.INFO, message, **extra_fields)

    def debug(self, message: str, **extra_fields: Any) -> None:
        """Log debug message within context."""
        self.log(logging.DEBUG, message, **extra_fields)

    def warning(self, message: str, **extra_fields: Any) -> None:
        """Log warning message within context."""
        self.log(logging.WARNING, message, **extra_fields)

    def error(self, message: str, **extra_fields: Any) -> None:
        """Log error message within context."""
        self.log(logging.ERROR, message, **extra_fields)
+
+
+def log_operation(
+ logger: Optional[logging.Logger] = None,
+ operation: Optional[str] = None,
+ log_args: bool = False,
+ log_result: bool = False,
+ log_level: int = logging.INFO,
+) -> Callable[[Callable[P, R]], Callable[P, R]]:
+ """
+ Decorator for logging function execution.
+
+ Automatically logs:
+ - Function entry (with arguments if configured)
+ - Function exit (with result if configured)
+ - Execution duration
+ - Exceptions
+
+ Args:
+ logger: Logger to use (defaults to logger named after module)
+ operation: Operation name (defaults to function name)
+ log_args: Whether to log function arguments
+ log_result: Whether to log function result
+ log_level: Log level for messages
+
+ Example:
+ @log_operation(log_args=True)
+ def process_batch(batch_id: int, items: List[str]) -> int:
+ return len(items)
+
+ # Output:
+ # INFO | Starting process_batch | batch_id=123 items=['a', 'b']
+ # INFO | Completed process_batch | duration_ms=45.2 result=2
+ """
+ def decorator(func: Callable[P, R]) -> Callable[P, R]:
+ nonlocal logger, operation
+
+ if logger is None:
+ logger = logging.getLogger(func.__module__)
+ if operation is None:
+ operation = func.__name__
+
+ @functools.wraps(func)
+ def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
+ start_time = time.perf_counter()
+
+ # Build context fields
+ extra: Dict[str, Any] = {}
+ if log_args:
+ # Include positional args (skip self for methods)
+ arg_names = func.__code__.co_varnames[:func.__code__.co_argcount]
+ for i, (name, value) in enumerate(zip(arg_names, args)):
+ if name != 'self':
+ extra[name] = _safe_repr(value)
+ # Include keyword args
+ for key, value in kwargs.items():
+ extra[key] = _safe_repr(value)
+
+ logger.log(log_level, f"Starting {operation}", extra=extra)
+
+ try:
+ result = func(*args, **kwargs)
+
+ duration_ms = (time.perf_counter() - start_time) * 1000
+ result_extra: Dict[str, Any] = {"duration_ms": round(duration_ms, 2)}
+
+ if log_result:
+ result_extra["result"] = _safe_repr(result)
+
+ logger.log(log_level, f"Completed {operation}", extra=result_extra)
+
+ return result
+
+ except Exception as e:
+ duration_ms = (time.perf_counter() - start_time) * 1000
+ logger.error(
+ f"Failed {operation}: {type(e).__name__}: {e}",
+ extra={"duration_ms": round(duration_ms, 2)},
+ exc_info=True
+ )
+ raise
+
+ return wrapper
+
+ return decorator
+
+
@contextmanager
def timed_block(logger: logging.Logger, description: str, log_level: int = logging.DEBUG):
    """
    Lightweight context manager that times a block of code.

    Emits exactly one log line when the block finishes — even if it raises —
    and is less verbose than LogContext for quick measurements.

    Example:
        with timed_block(logger, "data processing"):
            process_data()
        # Output: DEBUG | data processing completed in 123.45ms
    """
    started = time.perf_counter()
    try:
        yield
    finally:
        # finally guarantees the timing line is emitted on both success and error
        elapsed_ms = (time.perf_counter() - started) * 1000
        logger.log(log_level, f"{description} completed in {elapsed_ms:.2f}ms")
+
+
+def _safe_repr(value: Any, max_length: int = 100) -> str:
+ """
+ Create a safe string representation of a value for logging.
+
+ Truncates long strings and handles non-serializable objects.
+ """
+ try:
+ repr_str = repr(value)
+ if len(repr_str) > max_length:
+ return repr_str[:max_length] + "..."
+ return repr_str
+ except Exception:
+ return f"<{type(value).__name__}>"
+
diff --git a/src/gepa_optimizer/infrastructure/logging/formatters.py b/src/gepa_optimizer/infrastructure/logging/formatters.py
new file mode 100644
index 0000000000000000000000000000000000000000..2387fe8deac3f641e363aab02067a55ed29a4474
--- /dev/null
+++ b/src/gepa_optimizer/infrastructure/logging/formatters.py
@@ -0,0 +1,259 @@
+"""
+Custom Log Formatters for GEPA Optimizer.
+
+Provides formatters for:
+- Console output with colors and emoji
+- JSON structured logging for production
+- Plain text for file logging
+"""
+
+import json
+import logging
+from datetime import datetime
+from typing import Any, Dict, Optional
+
+
+# ANSI color codes for terminal output
class Colors:
    """ANSI color codes for terminal coloring.

    NOTE(review): assumes an ANSI-capable terminal; plain Windows consoles
    may need VT mode enabled for these escapes to render — confirm.
    """
    RESET = "\033[0m"   # clears all attributes
    BOLD = "\033[1m"
    DIM = "\033[2m"

    # Log level colors
    DEBUG = "\033[36m"     # Cyan
    INFO = "\033[32m"      # Green
    WARNING = "\033[33m"   # Yellow
    ERROR = "\033[31m"     # Red
    CRITICAL = "\033[35m"  # Magenta

    # Semantic colors
    TIMESTAMP = "\033[90m"  # Gray
    MODULE = "\033[34m"     # Blue
    MESSAGE = "\033[0m"     # Default
+
+
# Emoji prefixes for visual log scanning (keyed by numeric level).
# NOTE(review): the literals below appear mojibake-garbled in this copy of
# the file — verify they are the intended emoji in the canonical source.
LEVEL_EMOJI = {
    logging.DEBUG: "๐",
    logging.INFO: "โน๏ธ ",
    logging.WARNING: "โ ๏ธ ",
    logging.ERROR: "โ",
    logging.CRITICAL: "๐จ",
}

# Map each numeric level to its ANSI color from Colors above.
LEVEL_COLORS = {
    logging.DEBUG: Colors.DEBUG,
    logging.INFO: Colors.INFO,
    logging.WARNING: Colors.WARNING,
    logging.ERROR: Colors.ERROR,
    logging.CRITICAL: Colors.CRITICAL,
}
+
+
class GepaFormatter(logging.Formatter):
    """
    Custom formatter for GEPA Optimizer logs.

    Features:
    - Optional color output for console
    - Optional emoji prefixes for visual scanning
    - Structured extra fields support
    - Clean, readable format

    Example output:
        2024-01-15 10:30:45 | INFO | gepa_optimizer.core.optimizer | Starting optimization iteration=5
    """

    def __init__(
        self,
        fmt: Optional[str] = None,
        datefmt: Optional[str] = None,
        use_colors: bool = True,
        include_emoji: bool = True,
    ):
        """
        Initialize the formatter.

        Args:
            fmt: Format string (uses default if not provided)
            datefmt: Date format string
            use_colors: Whether to use ANSI colors
            include_emoji: Whether to include emoji prefixes
        """
        super().__init__(fmt=fmt, datefmt=datefmt)
        self.use_colors = use_colors
        self.include_emoji = include_emoji

    def format(self, record: logging.LogRecord) -> str:
        """Format a log record with colors and emoji.

        The record is temporarily mutated (msg/levelname/name) and restored
        in ``finally`` so other handlers see the original values.
        """
        # Store original values before mutating the (shared) record.
        original_msg = record.msg
        original_levelname = record.levelname
        # BUG FIX: record.name was colored below but never restored, so ANSI
        # codes leaked to other handlers and accumulated on repeated formats.
        original_name = record.name

        try:
            # Add emoji prefix if enabled
            if self.include_emoji:
                emoji = LEVEL_EMOJI.get(record.levelno, "")
                record.levelname = f"{emoji} {record.levelname}"

            # Add colors if enabled
            if self.use_colors:
                color = LEVEL_COLORS.get(record.levelno, Colors.RESET)
                record.levelname = f"{color}{record.levelname}{Colors.RESET}"
                record.name = f"{Colors.MODULE}{record.name}{Colors.RESET}"

            # Append structured extra fields to the message, if any
            extra_str = self._format_extra(record)
            if extra_str:
                record.msg = f"{record.msg} | {extra_str}"

            return super().format(record)

        finally:
            # Restore original values so the record is safe to reuse.
            record.msg = original_msg
            record.levelname = original_levelname
            record.name = original_name

    def _format_extra(self, record: logging.LogRecord) -> str:
        """
        Format extra fields from the log record as "key=value" pairs.

        Extra fields are passed via the 'extra' parameter to logging calls:
            logger.info("Message", extra={"key": "value"})
        """
        # Standard LogRecord attributes to exclude
        standard_attrs = {
            'name', 'msg', 'args', 'created', 'filename', 'funcName',
            'levelname', 'levelno', 'lineno', 'module', 'msecs',
            'pathname', 'process', 'processName', 'relativeCreated',
            'stack_info', 'exc_info', 'exc_text', 'thread', 'threadName',
            'taskName', 'message'
        }

        # Collect everything the caller injected via `extra`.
        extra_fields = {
            k: v for k, v in record.__dict__.items()
            if k not in standard_attrs and not k.startswith('_')
        }

        if not extra_fields:
            return ""

        parts = []
        for key, value in extra_fields.items():
            # BUG FIX: bool must be tested before int — bool is a subclass of
            # int, so the original's bool branch was unreachable and True/False
            # rendered as "True"/"False" instead of the intended "true"/"false".
            if isinstance(value, bool):
                parts.append(f"{key}={str(value).lower()}")
            elif isinstance(value, (str, int, float)):
                parts.append(f"{key}={value}")
            else:
                parts.append(f"{key}={repr(value)}")

        return " ".join(parts)
+
+
class JsonFormatter(logging.Formatter):
    """
    JSON formatter for structured logging.

    Outputs each log record as a single JSON line, suitable for:
    - Log aggregation systems (ELK, Splunk)
    - Cloud logging (CloudWatch, Stackdriver)
    - Log parsing and analysis

    Example output:
        {"timestamp": "2024-01-15T10:30:45.123Z", "level": "INFO", "logger": "gepa_optimizer.core", "message": "Starting optimization", "iteration": 5}
    """

    def __init__(
        self,
        include_timestamp: bool = True,
        include_location: bool = False,
    ):
        """
        Initialize JSON formatter.

        Args:
            include_timestamp: Include ISO timestamp
            include_location: Include file/line information
        """
        super().__init__()
        self.include_timestamp = include_timestamp
        self.include_location = include_location

    def format(self, record: logging.LogRecord) -> str:
        """Format record as a single JSON line string."""
        # Local import keeps the module's dependency surface unchanged.
        from datetime import timezone

        log_dict: Dict[str, Any] = {}

        # Timestamp in UTC with trailing "Z"
        if self.include_timestamp:
            # FIX: datetime.utcfromtimestamp() is deprecated since Python 3.12.
            # Build an aware UTC datetime, then drop tzinfo so isoformat()
            # yields the same naive "...Z" string as before.
            ts = datetime.fromtimestamp(record.created, tz=timezone.utc)
            log_dict["timestamp"] = ts.replace(tzinfo=None).isoformat() + "Z"

        # Core fields
        log_dict["level"] = record.levelname
        log_dict["logger"] = record.name
        log_dict["message"] = record.getMessage()

        # Location info
        if self.include_location:
            log_dict["file"] = record.filename
            log_dict["line"] = record.lineno
            log_dict["function"] = record.funcName

        # Exception info
        if record.exc_info:
            log_dict["exception"] = self.formatException(record.exc_info)

        # Extra fields — everything not in the standard LogRecord attribute set
        standard_attrs = {
            'name', 'msg', 'args', 'created', 'filename', 'funcName',
            'levelname', 'levelno', 'lineno', 'module', 'msecs',
            'pathname', 'process', 'processName', 'relativeCreated',
            'stack_info', 'exc_info', 'exc_text', 'thread', 'threadName',
            'taskName', 'message'
        }

        for key, value in record.__dict__.items():
            if key not in standard_attrs and not key.startswith('_'):
                try:
                    # Ensure value is JSON serializable; stringify otherwise.
                    json.dumps(value)
                    log_dict[key] = value
                except (TypeError, ValueError):
                    log_dict[key] = str(value)

        # default=str is a safety net for values injected after the check above
        return json.dumps(log_dict, default=str)
+
+
class CompactFormatter(logging.Formatter):
    """
    Compact formatter for minimal log output.

    Useful for:
    - CI/CD pipelines
    - Reduced log verbosity
    - Quick debugging

    Example output:
        10:30:45 INFO  optimizer: Starting optimization
    """

    def format(self, record: logging.LogRecord) -> str:
        """Render the record as '<HH:MM:SS> <LEVEL> <module>: <message>'."""
        # Time-of-day only (local time), no date component.
        clock = datetime.fromtimestamp(record.created).strftime("%H:%M:%S")
        # Keep just the trailing segment of the dotted logger name.
        leaf = record.name.rsplit(".", 1)[-1]
        # Level name padded to 5 columns so messages line up.
        return f"{clock} {record.levelname:5s} {leaf}: {record.getMessage()}"
+
diff --git a/src/gepa_optimizer/infrastructure/logging/logger.py b/src/gepa_optimizer/infrastructure/logging/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5cd5e11a86747139d3238cb77d1813aa712baf8
--- /dev/null
+++ b/src/gepa_optimizer/infrastructure/logging/logger.py
@@ -0,0 +1,260 @@
+"""
+Core Logger Factory and Configuration.
+
+This module provides the centralized logger factory that should be used
+across all GEPA Optimizer modules. It ensures consistent logging behavior
+and formatting throughout the application.
+
+Design Principles:
+- Single source of truth for logger configuration
+- Lazy initialization (loggers created on first use)
+- Thread-safe logger access
+- Configurable log levels per module
+"""
+
+import logging
+import sys
+from enum import Enum
+from typing import Optional, Dict, Any
+from functools import lru_cache
+
+from .formatters import GepaFormatter
+
# Root logger name for GEPA Optimizer.
# All package loggers are children of this name, so configuring handlers on
# it once covers the whole "gepa_optimizer.*" hierarchy.
GEPA_LOGGER_NAME = "gepa_optimizer"

# Default log format and timestamp format used when none is supplied.
DEFAULT_FORMAT = "%(asctime)s | %(levelname)-8s | %(name)s | %(message)s"
DEFAULT_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
+
+
class LogLevel(str, Enum):
    """Log levels as string-valued enum members (e.g. ``LogLevel.INFO == "INFO"``)."""
    DEBUG = "DEBUG"
    INFO = "INFO"
    WARNING = "WARNING"
    ERROR = "ERROR"
    CRITICAL = "CRITICAL"

    @classmethod
    def from_string(cls, level: str) -> "LogLevel":
        """Parse *level* case-insensitively; unknown names fall back to INFO."""
        normalized = level.upper()
        for member in cls:
            if member.value == normalized:
                return member
        return cls.INFO
+
+
class LoggerConfig:
    """
    Configuration class for GEPA logging.

    This class holds all logging configuration and can be modified
    before calling configure_logging() to customize behavior.

    All attributes are class-level, so this class behaves as a process-wide
    singleton: every module reads and writes the same shared state.
    """

    # Default configuration
    level: LogLevel = LogLevel.INFO
    format: str = DEFAULT_FORMAT
    date_format: str = DEFAULT_DATE_FORMAT

    # Module-specific log levels (for fine-grained control).
    # NOTE: deliberately a shared class-level mutable dict (singleton config).
    module_levels: Dict[str, LogLevel] = {}

    # Output configuration
    log_to_console: bool = True
    log_to_file: Optional[str] = None  # path to log file, or None to disable

    # Formatting options
    use_colors: bool = True
    include_emoji: bool = True  # For visual clarity in development

    @classmethod
    def reset(cls) -> None:
        """Reset configuration to defaults."""
        cls.level = LogLevel.INFO
        cls.format = DEFAULT_FORMAT
        cls.date_format = DEFAULT_DATE_FORMAT
        cls.module_levels = {}
        cls.log_to_console = True
        cls.log_to_file = None
        cls.use_colors = True
        cls.include_emoji = True
+
+
# Process-wide flag: True once configure_logging() has run. While False,
# get_logger() auto-configures the system with defaults on first use.
_logging_configured = False
+
+
def configure_logging(
    level: Optional[str] = None,
    log_file: Optional[str] = None,
    use_colors: bool = True,
    include_emoji: bool = True,
    format_string: Optional[str] = None,
    module_levels: Optional[Dict[str, str]] = None,
) -> None:
    """
    Configure the GEPA logging system.

    This should be called once at application startup. Subsequent calls
    will update the configuration.

    Args:
        level: Global log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        log_file: Optional path to log file
        use_colors: Whether to use colored output in console
        include_emoji: Whether to include emoji prefixes for visual clarity
        format_string: Custom format string (optional)
        module_levels: Dict mapping module names to their specific log levels

    Example:
        configure_logging(
            level="DEBUG",
            log_file="optimization.log",
            module_levels={
                "gepa_optimizer.core.optimizer": "INFO",
                "gepa_optimizer.llms": "DEBUG"
            }
        )
    """
    global _logging_configured

    # Update configuration.
    # NOTE(review): `if level:` / `if log_file:` ignore falsy values, so a
    # previously set log_file cannot be cleared by passing None here;
    # use_colors/include_emoji, by contrast, are ALWAYS overwritten.
    if level:
        LoggerConfig.level = LogLevel.from_string(level)
    if log_file:
        LoggerConfig.log_to_file = log_file
    LoggerConfig.use_colors = use_colors
    LoggerConfig.include_emoji = include_emoji
    if format_string:
        LoggerConfig.format = format_string
    if module_levels:
        LoggerConfig.module_levels = {
            k: LogLevel.from_string(v) for k, v in module_levels.items()
        }

    # Get or create the root GEPA logger; all package loggers inherit from it.
    root_logger = logging.getLogger(GEPA_LOGGER_NAME)
    root_logger.setLevel(getattr(logging, LoggerConfig.level.value))

    # Remove existing handlers to avoid duplicate output on reconfiguration.
    root_logger.handlers.clear()

    # Console handler
    if LoggerConfig.log_to_console:
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(getattr(logging, LoggerConfig.level.value))

        # Use custom formatter (colors/emoji per caller's arguments)
        formatter = GepaFormatter(
            fmt=LoggerConfig.format,
            datefmt=LoggerConfig.date_format,
            use_colors=use_colors,
            include_emoji=include_emoji,
        )
        console_handler.setFormatter(formatter)
        root_logger.addHandler(console_handler)

    # File handler (if configured)
    if LoggerConfig.log_to_file:
        file_handler = logging.FileHandler(LoggerConfig.log_to_file)
        file_handler.setLevel(getattr(logging, LoggerConfig.level.value))

        # File logs don't use colors (ANSI escapes would pollute the file)
        file_formatter = GepaFormatter(
            fmt=LoggerConfig.format,
            datefmt=LoggerConfig.date_format,
            use_colors=False,
            include_emoji=False,
        )
        file_handler.setFormatter(file_formatter)
        root_logger.addHandler(file_handler)

    # Apply module-specific levels on top of the global level.
    for module_name, module_level in LoggerConfig.module_levels.items():
        module_logger = logging.getLogger(module_name)
        module_logger.setLevel(getattr(logging, module_level.value))

    _logging_configured = True

    # Log that configuration is complete (DEBUG so it is quiet by default).
    root_logger.debug(
        f"Logging configured: level={LoggerConfig.level.value}, "
        f"file={LoggerConfig.log_to_file}"
    )
+
+
@lru_cache(maxsize=128)
def get_logger(name: str) -> logging.Logger:
    """
    Get a logger instance for the given module name.

    This is the primary factory function for obtaining loggers.
    All GEPA modules should use this instead of logging.getLogger().
    Results are memoized via lru_cache, so repeated calls with the same
    name skip the configuration checks below.

    Args:
        name: Module name (typically __name__)

    Returns:
        Configured Logger instance

    Example:
        from gepa_optimizer.infrastructure.logging import get_logger

        logger = get_logger(__name__)
        logger.info("Starting process")
        logger.error("Failed to connect", exc_info=True)
    """
    # Auto-configure with defaults on first use so standalone scripts get
    # sensible output without an explicit configure_logging() call.
    # (FIX: dropped the original's dead `if not name.startswith(...): pass`
    # branch and the unnecessary `global` declaration — the flag is only read.)
    if not _logging_configured:
        configure_logging()

    logger = logging.getLogger(name)

    # Honor any module-specific level registered via configure_logging()
    # or set_log_level().
    if name in LoggerConfig.module_levels:
        logger.setLevel(getattr(logging, LoggerConfig.module_levels[name].value))

    return logger
+
+
def set_log_level(level: str, module: Optional[str] = None) -> None:
    """
    Dynamically change log level at runtime.

    Args:
        level: New log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        module: Optional module name. If None, changes global level.

    Example:
        # Enable debug for specific module
        set_log_level("DEBUG", "gepa_optimizer.core.optimizer")

        # Change global level
        set_log_level("WARNING")
    """
    resolved = LogLevel.from_string(level)
    numeric = getattr(logging, resolved.value)

    if module:
        # Module-specific: adjust that logger and remember the override.
        logging.getLogger(module).setLevel(numeric)
        LoggerConfig.module_levels[module] = resolved
        return

    # Global: update the stored config, the root GEPA logger, and every
    # attached handler so the change takes effect immediately.
    LoggerConfig.level = resolved
    root_logger = logging.getLogger(GEPA_LOGGER_NAME)
    root_logger.setLevel(numeric)
    for handler in root_logger.handlers:
        handler.setLevel(numeric)
+
diff --git a/src/gepa_optimizer/llms/__init__.py b/src/gepa_optimizer/llms/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..352f7a77e882131acec1c3084b72bb5e502a8ef0
--- /dev/null
+++ b/src/gepa_optimizer/llms/__init__.py
@@ -0,0 +1,10 @@
+"""
+LLM module for GEPA Optimizer
+"""
+
+from .base_llm import BaseLLMClient
+from .vision_llm import VisionLLMClient
+from .batch_llm import BatchLLMClient
+from .llego_enhanced_llm import LLEGOEnhancedLLMClient
+
+__all__ = ["BaseLLMClient", "VisionLLMClient", "BatchLLMClient", "LLEGOEnhancedLLMClient"]
diff --git a/src/gepa_optimizer/llms/base_llm.py b/src/gepa_optimizer/llms/base_llm.py
new file mode 100644
index 0000000000000000000000000000000000000000..15ffb4e83212922f30edad4d9e18a4b248af6234
--- /dev/null
+++ b/src/gepa_optimizer/llms/base_llm.py
@@ -0,0 +1,56 @@
+"""
+Base LLM client class for all LLM providers.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional, Union
+import logging
+
+logger = logging.getLogger(__name__)
+
class BaseLLMClient(ABC):
    """
    Abstract base class for all LLM clients.

    Defines the minimal contract every provider-specific client must satisfy,
    so callers can swap providers without changing call sites.
    """

    def __init__(self, provider: str, model_name: str, **kwargs):
        """
        Initialize LLM client.

        Args:
            provider: LLM provider (e.g., 'openai', 'anthropic')
            model_name: Specific model name
            **kwargs: Additional provider-specific parameters
        """
        self.provider = provider
        self.model_name = model_name
        # Per-instance logger named "<module>.<ConcreteClass>" for filtering.
        self.logger = logging.getLogger(f"{__name__}.{type(self).__name__}")
        # Provider-specific options are kept verbatim for subclasses to use.
        self.config = kwargs

    @abstractmethod
    def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> Dict[str, Any]:
        """
        Generate response from LLM.

        Args:
            system_prompt: System-level instructions
            user_prompt: User's input prompt
            **kwargs: Additional generation parameters (e.g., image_base64)

        Returns:
            Dictionary with 'content' key containing the generated response
            and additional metadata
        """
        ...

    def get_model_info(self) -> Dict[str, str]:
        """Return identifying metadata (provider, model, class) for logging/debugging."""
        info = {
            'provider': self.provider,
            'model_name': self.model_name,
        }
        info['class'] = type(self).__name__
        return info
diff --git a/src/gepa_optimizer/llms/batch_llm.py b/src/gepa_optimizer/llms/batch_llm.py
new file mode 100644
index 0000000000000000000000000000000000000000..dae4cf1462d0fe037442b59e9190cac09fe54975
--- /dev/null
+++ b/src/gepa_optimizer/llms/batch_llm.py
@@ -0,0 +1,712 @@
+"""
+Batch LLM Client for cost-effective processing using Gemini Batch API.
+
+This client provides 50% cost savings by using Google's Gemini Batch API
+instead of real-time API calls. Ideal for large-scale prompt optimization
+where latency is acceptable.
+
+Features:
+- 50% cost reduction compared to standard API
+- Automatic batching and job management
+- Built-in retry and polling logic
+- Thread-safe operation
+- Comprehensive error handling
+
+Author: GEPA Optimizer Team
+"""
+
+import os
+import json
+import time
+import logging
+import tempfile
+import io
+from pathlib import Path
+from typing import Dict, List, Any, Optional, Tuple
+from .base_llm import BaseLLMClient
+
# Optional dependency: Pillow is used to sniff image formats before upload;
# when absent, format detection is skipped (see PIL_AVAILABLE checks below).
try:
    from PIL import Image
    PIL_AVAILABLE = True
except ImportError:
    PIL_AVAILABLE = False
    Image = None

# Optional dependency: google-genai provides the Gemini Batch API client.
# BatchLLMClient refuses to construct when this is unavailable.
try:
    from google import genai
    from google.genai import types
    GENAI_AVAILABLE = True
except ImportError:
    GENAI_AVAILABLE = False
    genai = None
    types = None
+
+logger = logging.getLogger(__name__)
+
+
+class BatchLLMClient(BaseLLMClient):
+ """
+ Batch LLM client that uses Gemini Batch API for cost-effective processing.
+
+ This client processes multiple requests together in batch jobs, providing:
+ - 50% cost savings vs standard API
+ - No rate limit impact
+ - Automatic job management and polling
+
+ Usage:
+ >>> from gepa_optimizer.llms import BatchLLMClient
+ >>>
+ >>> client = BatchLLMClient(
+ ... provider="google",
+ ... model_name="gemini-2.5-flash",
+ ... api_key="your-key",
+ ... batch_size=20,
+ ... polling_interval=30
+ ... )
+ >>>
+ >>> # Use just like VisionLLMClient - adapter handles the rest!
+ >>> result = client.generate(
+ ... system_prompt="You are a helpful assistant",
+ ... user_prompt="Analyze this image",
+ ... image_base64="..."
+ ... )
+
+ Performance Note:
+ Batch processing adds latency (30s+ polling time) but reduces costs by 50%.
+ Choose this mode for large-scale optimization where cost > speed.
+ """
+
    def __init__(
        self,
        provider: str,
        model_name: str,
        api_key: Optional[str] = None,
        batch_size: int = 20,
        polling_interval: int = 30,
        max_polling_time: int = 3600,
        temp_dir: str = ".gepa_batch_temp",
        **kwargs
    ):
        """
        Initialize Batch LLM Client.

        Args:
            provider: Must be "google" or "gemini"
            model_name: Gemini model (e.g., "gemini-2.5-flash", "gemini-1.5-flash")
            api_key: Google API key (defaults to GEMINI_API_KEY env var)
            batch_size: Number of samples to process per batch job (1-100)
            polling_interval: Seconds between job status checks (default: 30)
            max_polling_time: Maximum seconds to wait for job completion (default: 3600)
            temp_dir: Directory for temporary files (default: ".gepa_batch_temp")
            **kwargs: Additional parameters

        Raises:
            ValueError: If provider is not Google/Gemini, or no API key is found
            ImportError: If google-genai is not installed
        """
        super().__init__(provider=provider, model_name=model_name, **kwargs)

        # Validate provider — the Batch API implementation is Gemini-only.
        if provider.lower() not in ["google", "gemini"]:
            raise ValueError(
                f"BatchLLMClient only supports Google/Gemini provider. Got: {provider}"
            )

        # Check dependencies (google-genai is an optional import at module top)
        if not GENAI_AVAILABLE:
            raise ImportError(
                "google-genai not installed. Install with: pip install google-genai"
            )

        # Configuration
        self.batch_size = batch_size
        self.polling_interval = polling_interval
        self.max_polling_time = max_polling_time
        self.temp_dir = Path(temp_dir)
        # exist_ok tolerates re-use of the same temp dir across runs
        self.temp_dir.mkdir(exist_ok=True)

        # Initialize Gemini client; explicit api_key wins over the key manager.
        # NOTE(review): local import presumably avoids a circular dependency
        # with the utils package — confirm.
        from ..utils.api_keys import APIKeyManager
        self.api_key = api_key or APIKeyManager().get_api_key("google")

        if not self.api_key:
            raise ValueError(
                "Google API key required. Provide via api_key parameter or "
                "set GEMINI_API_KEY environment variable."
            )

        self.client = genai.Client(api_key=self.api_key)

        logger.info(
            f"โ BatchLLMClient initialized: {model_name} "
            f"(batch_size={batch_size}, polling={polling_interval}s)"
        )
+
+ def generate(
+ self,
+ system_prompt: str,
+ user_prompt: str,
+ image_base64: Optional[str] = None,
+ **kwargs
+ ) -> Dict[str, Any]:
+ """
+ Generate response using batch API.
+
+ Note: This method is primarily for compatibility. For batch optimization,
+ the adapter will call generate_batch() directly with multiple requests.
+
+ Args:
+ system_prompt: System-level instructions
+ user_prompt: User's input prompt
+ image_base64: Optional base64 encoded image
+ **kwargs: Additional generation parameters
+
+ Returns:
+ Dict with 'content' key containing generated text
+ """
+ # Single request - process as a batch of 1
+ requests = [{
+ 'system_prompt': system_prompt,
+ 'user_prompt': user_prompt,
+ 'image_base64': image_base64
+ }]
+
+ results = self.generate_batch(requests)
+ return results[0] if results else {"content": "", "error": "No results"}
+
+ def generate_batch(
+ self,
+ requests: List[Dict[str, Any]],
+ timeout_override: Optional[int] = None
+ ) -> List[Dict[str, Any]]:
+ """
+ Process multiple requests in a single batch job.
+
+ This is the main method called by UniversalGepaAdapter during GEPA optimization.
+
+ Args:
+ requests: List of request dicts with keys:
+ - system_prompt: System instructions
+ - user_prompt: User input
+ - image_base64: Optional base64 image
+ timeout_override: Override max_polling_time for this batch
+
+ Returns:
+ List of response dicts with 'content' key
+
+ Raises:
+ RuntimeError: If batch job fails
+ TimeoutError: If polling exceeds timeout
+ """
+ logger.info(f"๐ฆ Processing batch of {len(requests)} requests via Gemini Batch API...")
+
+ start_time = time.time()
+
+ try:
+ # Step 1: Upload images if needed
+ file_uris, mime_types = self._upload_images_for_batch(requests)
+
+ # Step 2: Create JSONL file
+ jsonl_path = self._create_batch_jsonl(requests, file_uris, mime_types)
+
+ # Step 3: Submit batch job
+ batch_job_name = self._submit_batch_job(jsonl_path)
+
+ # Step 4: Wait for completion
+ timeout = timeout_override or self.max_polling_time
+ self._wait_for_batch_completion(batch_job_name, timeout)
+
+ # Step 5: Retrieve results
+ results = self._retrieve_batch_results(batch_job_name)
+
+ # Cleanup
+ jsonl_path.unlink(missing_ok=True)
+
+ elapsed_time = time.time() - start_time
+ logger.info(
+ f"โ Batch processing complete: {len(results)} results in {elapsed_time:.1f}s "
+ f"(~{elapsed_time/len(results):.1f}s per request)"
+ )
+
+ return results
+
+ except Exception as e:
+ elapsed_time = time.time() - start_time
+ logger.error(f"โ Batch processing failed after {elapsed_time:.1f}s: {e}")
+ raise
+
+ def _upload_images_for_batch(self, requests: List[Dict]) -> Tuple[List[Optional[str]], List[Optional[str]]]:
+ """
+ Upload images to Gemini and return file URIs and MIME types.
+
+ Args:
+ requests: List of request dicts
+
+ Returns:
+ Tuple of (file_uris, mime_types) - both are lists with None for requests without images
+ """
+ file_uris = []
+ mime_types = []
+ images_to_upload = sum(1 for r in requests if r.get('image_base64'))
+
+ if images_to_upload > 0:
+ logger.info(f" โฌ๏ธ Uploading {images_to_upload} images to Gemini...")
+
+ for i, request in enumerate(requests):
+ image_base64 = request.get('image_base64')
+
+ if not image_base64:
+ file_uris.append(None)
+ mime_types.append(None)
+ continue
+
+ try:
+ # Decode image data
+ import base64
+ image_data = base64.b64decode(image_base64)
+
+ # Detect image format using Pillow
+ image_format = None
+ if PIL_AVAILABLE:
+ try:
+ img = Image.open(io.BytesIO(image_data))
+ image_format = img.format.lower() if img.format else None
+ except Exception as e:
+ logger.warning(f" โ ๏ธ Could not detect image format: {e}")
+
+ # Map format to extension and MIME type
+ format_map = {
+ 'jpeg': ('.jpg', 'image/jpeg'),
+ 'jpg': ('.jpg', 'image/jpeg'),
+ 'png': ('.png', 'image/png'),
+ 'gif': ('.gif', 'image/gif'),
+ 'webp': ('.webp', 'image/webp'),
+ 'bmp': ('.bmp', 'image/bmp'),
+ 'tiff': ('.tiff', 'image/tiff'),
+ 'tif': ('.tiff', 'image/tiff'),
+ }
+
+ # Get extension and MIME type (default to PNG if unknown)
+ ext, mime_type = format_map.get(image_format, ('.png', 'image/png'))
+
+ if image_format and image_format not in format_map:
+ logger.warning(f" โ ๏ธ Unknown image format '{image_format}' for image {i}, defaulting to PNG")
+ elif not image_format:
+ logger.debug(f" โน๏ธ Could not detect format for image {i}, using PNG")
+
+ # Save to temp file with correct extension
+ temp_file = tempfile.NamedTemporaryFile(
+ delete=False,
+ suffix=ext,
+ dir=self.temp_dir
+ )
+ temp_file.write(image_data)
+ temp_file.close()
+
+ # Upload to Gemini with correct MIME type
+ uploaded_file = self.client.files.upload(
+ file=temp_file.name,
+ config=types.UploadFileConfig(
+ display_name=f"batch_image_{i}_{int(time.time())}{ext}",
+ mime_type=mime_type
+ )
+ )
+
+ logger.debug(f" โ Uploaded image {i} as {mime_type}")
+
+ # Wait for file to be active
+ self._wait_for_file_active(uploaded_file)
+ file_uris.append(uploaded_file.uri)
+ mime_types.append(mime_type)
+
+ # Cleanup temp file
+ Path(temp_file.name).unlink()
+
+ except Exception as e:
+ logger.error(f" โ Failed to upload image {i}: {e}")
+ file_uris.append(None)
+ mime_types.append(None)
+
+ if images_to_upload > 0:
+ successful = sum(1 for uri in file_uris if uri is not None)
+ logger.info(f" โ Uploaded {successful}/{images_to_upload} images successfully")
+
+ return file_uris, mime_types
+
+ def _create_batch_jsonl(
+ self,
+ requests: List[Dict],
+ file_uris: List[Optional[str]],
+ mime_types: List[Optional[str]]
+ ) -> Path:
+ """
+ Create JSONL file for batch job.
+
+ Args:
+ requests: List of request dicts
+ file_uris: List of uploaded file URIs
+ mime_types: List of MIME types for uploaded files
+
+ Returns:
+ Path to created JSONL file
+ """
+ timestamp = int(time.time())
+ jsonl_path = self.temp_dir / f"batch_{timestamp}.jsonl"
+
+ with open(jsonl_path, 'w', encoding='utf-8') as f:
+ for i, (request, file_uri, mime_type) in enumerate(zip(requests, file_uris, mime_types)):
+ # Combine system and user prompts
+ system_prompt = request.get('system_prompt', '')
+ user_prompt = request.get('user_prompt', '')
+ full_prompt = f"{system_prompt}\n\n{user_prompt}".strip()
+
+ # Build request parts
+ parts = [{"text": full_prompt}]
+
+ if file_uri:
+ parts.append({
+ "file_data": {
+ "file_uri": file_uri,
+ "mime_type": mime_type or "image/png" # Use actual MIME type
+ }
+ })
+
+ # Gemini Batch API format according to official docs
+ # Reference: https://ai.google.dev/gemini-api/docs/batch-inference
+ # NOTE: The "request" wrapper is REQUIRED for Gemini 2.5 batch API
+ batch_request = {
+ "custom_id": f"request-{i}",
+ "request": {
+ "contents": [{
+ "role": "user",
+ "parts": parts
+ }]
+ }
+ }
+
+ f.write(json.dumps(batch_request, ensure_ascii=False) + '\n')
+
+ logger.info(f" ๐ Created JSONL file: {jsonl_path.name} ({len(requests)} requests)")
+ return jsonl_path
+
+    def _submit_batch_job(self, jsonl_path: Path) -> str:
+        """
+        Submit batch job to Gemini.
+
+        Validates the JSONL file line-by-line, uploads it (with a string-path
+        fallback when the Path-object upload fails), waits for the upload to
+        become ACTIVE, then creates the batch job.
+
+        Args:
+            jsonl_path: Path to JSONL file
+
+        Returns:
+            Batch job name
+
+        Raises:
+            RuntimeError: If upload or job creation fails (validation errors
+                are also re-wrapped as RuntimeError by the outer handler)
+        """
+        # Upload JSONL file
+        # Try multiple methods as the google-genai SDK can be finicky
+        try:
+            logger.info(f" ๐ค Uploading JSONL file: {jsonl_path.name}")
+
+            # Read and validate file content
+            with open(jsonl_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+            line_count = len(content.strip().split('\n'))
+            logger.debug(f" ๐ JSONL: {len(content)} bytes, {line_count} lines")
+
+            # Validate JSONL format (every line must parse on its own)
+            for line_num, line in enumerate(content.strip().split('\n'), 1):
+                try:
+                    json.loads(line)
+                except json.JSONDecodeError as e:
+                    logger.error(f" โ Invalid JSON at line {line_num}: {e}")
+                    logger.error(f" Content: {line[:100]}...")
+                    raise ValueError(f"Invalid JSONL format at line {line_num}") from e
+
+            # Method 1: Try uploading with Path object
+            logger.info(f" ๐ Upload method 1: Using Path object...")
+            try:
+                jsonl_file = self.client.files.upload(
+                    file=jsonl_path,
+                    config=types.UploadFileConfig(
+                        display_name=f'gepa-batch-{int(time.time())}',
+                        mime_type='application/json'  # Try application/json instead of application/jsonl
+                    )
+                )
+                logger.info(f" โ JSONL file uploaded: {jsonl_file.name}")
+
+            except Exception as e1:
+                logger.warning(f" โ ๏ธ Method 1 failed: {e1}")
+                logger.info(f" ๐ Upload method 2: Using string path...")
+
+                # Method 2: Fallback to string path
+                try:
+                    jsonl_file = self.client.files.upload(
+                        file=str(jsonl_path.absolute()),
+                        config=types.UploadFileConfig(
+                            display_name=f'gepa-batch-{int(time.time())}',
+                            mime_type='application/json'
+                        )
+                    )
+                    logger.info(f" โ JSONL file uploaded (method 2): {jsonl_file.name}")
+                except Exception as e2:
+                    logger.error(f" โ Method 2 also failed: {e2}")
+                    raise e2
+
+        except KeyError as e:
+            # KeyError has historically meant the SDK's response schema changed.
+            logger.error(f"โ KeyError during JSONL upload: {e}")
+            logger.error(f" This suggests the Gemini API response format changed")
+            logger.error(f" Try updating google-genai: pip install --upgrade google-genai")
+            raise RuntimeError(f"Gemini Batch API response format error: {e}") from e
+        except Exception as e:
+            logger.error(f"โ Failed to upload JSONL file: {e}")
+            logger.error(f" File path: {jsonl_path}")
+            logger.error(f" File exists: {jsonl_path.exists()}")
+            logger.error(f" File size: {jsonl_path.stat().st_size if jsonl_path.exists() else 'N/A'} bytes")
+            raise RuntimeError(f"Gemini Batch API file upload failed: {e}") from e
+
+        # Wait for JSONL to be active before the batch job can reference it
+        try:
+            logger.info(f" โณ Waiting for JSONL file to be processed...")
+            self._wait_for_file_active(jsonl_file)
+        except Exception as e:
+            logger.error(f"โ JSONL file processing failed: {e}")
+            raise
+
+        # Create batch job
+        try:
+            logger.info(f" ๐ Creating batch job...")
+            batch_job = self.client.batches.create(
+                model=self.model_name,
+                src=jsonl_file.name,
+                config={'display_name': f'gepa-opt-{int(time.time())}'}
+            )
+
+            logger.info(f" โ Batch job submitted: {batch_job.name}")
+            return batch_job.name
+
+        except Exception as e:
+            logger.error(f"โ Failed to create batch job: {e}")
+            raise RuntimeError(f"Batch job creation failed: {e}") from e
+
+ def _wait_for_batch_completion(self, job_name: str, timeout: int):
+ """
+ Poll batch job until completion.
+
+ Args:
+ job_name: Batch job name
+ timeout: Maximum seconds to wait
+
+ Raises:
+ TimeoutError: If polling exceeds timeout
+ RuntimeError: If batch job fails
+ """
+ logger.info(f" โณ Polling for completion (checking every {self.polling_interval}s)...")
+
+ start_time = time.time()
+ poll_count = 0
+
+ while True:
+ elapsed = time.time() - start_time
+
+ if elapsed > timeout:
+ raise TimeoutError(
+ f"Batch job timeout after {elapsed:.0f}s "
+ f"(max: {timeout}s)"
+ )
+
+ try:
+ batch_job = self.client.batches.get(name=job_name)
+ state = batch_job.state.name
+
+ # Success states
+ if state in ['JOB_STATE_SUCCEEDED', 'SUCCEEDED']:
+ logger.info(f" โ Batch job completed in {elapsed:.0f}s")
+ return
+
+ # Failure states
+ if state in ['JOB_STATE_FAILED', 'FAILED']:
+ raise RuntimeError(f"Batch job failed with state: {state}")
+
+ if state in ['JOB_STATE_CANCELLED', 'CANCELLED']:
+ raise RuntimeError(f"Batch job was cancelled: {state}")
+
+ # Still processing
+ poll_count += 1
+ if poll_count % 5 == 0: # Log every 5 polls
+ logger.info(f" ... still processing ({elapsed:.0f}s elapsed, state: {state})")
+
+ time.sleep(self.polling_interval)
+
+ except (TimeoutError, RuntimeError):
+ raise
+ except Exception as e:
+ logger.warning(f" โ ๏ธ Error checking job status: {e}, retrying...")
+ time.sleep(5)
+
+ def _retrieve_batch_results(self, job_name: str) -> List[Dict[str, Any]]:
+ """
+ Retrieve and parse batch results.
+
+ Args:
+ job_name: Batch job name
+
+ Returns:
+ List of result dicts
+ """
+ batch_job = self.client.batches.get(name=job_name)
+
+ # Check for inline responses (preferred)
+ if hasattr(batch_job.dest, 'inlined_responses') and batch_job.dest.inlined_responses:
+ logger.info(f" ๐ฅ Processing inline responses...")
+ return self._parse_inline_results(batch_job.dest.inlined_responses)
+
+ # Download results file (fallback)
+ if hasattr(batch_job.dest, 'file_name') and batch_job.dest.file_name:
+ logger.info(f" ๐ฅ Downloading results file: {batch_job.dest.file_name}")
+ file_data = self.client.files.download(file=batch_job.dest.file_name)
+ return self._parse_file_results(file_data)
+
+ raise RuntimeError("No results available from batch job")
+
+ def _parse_inline_results(self, inline_responses) -> List[Dict[str, Any]]:
+ """Parse inline batch results."""
+ results = []
+
+ for response_obj in inline_responses:
+ if hasattr(response_obj, 'response') and response_obj.response:
+ text = self._extract_text_from_response(response_obj.response)
+ results.append({
+ "content": text,
+ "role": "assistant",
+ "model": self.model_name,
+ "provider": "google"
+ })
+ else:
+ error_msg = str(getattr(response_obj, 'error', 'Unknown error'))
+ logger.warning(f" โ ๏ธ Response error: {error_msg}")
+ results.append({
+ "content": "",
+ "error": error_msg
+ })
+
+ return results
+
+ def _parse_file_results(self, file_data) -> List[Dict[str, Any]]:
+ """Parse JSONL results file."""
+ if isinstance(file_data, bytes):
+ jsonl_content = file_data.decode('utf-8')
+ else:
+ jsonl_content = file_data
+
+ results = []
+
+ for line_num, line in enumerate(jsonl_content.strip().split('\n'), 1):
+ if not line.strip():
+ continue
+
+ try:
+ result = json.loads(line)
+
+ if 'response' in result:
+ text = self._extract_text_from_dict(result['response'])
+ results.append({
+ "content": text,
+ "role": "assistant",
+ "model": self.model_name,
+ "provider": "google"
+ })
+ else:
+ error_msg = result.get('error', 'Unknown error')
+ logger.warning(f" โ ๏ธ Line {line_num} error: {error_msg}")
+ results.append({
+ "content": "",
+ "error": error_msg
+ })
+
+ except json.JSONDecodeError as e:
+ logger.error(f" โ Line {line_num}: JSON decode error: {e}")
+ results.append({"content": "", "error": f"JSON decode error: {e}"})
+
+ return results
+
+ def _extract_text_from_response(self, response_obj) -> str:
+ """Extract text from response object."""
+ try:
+ # Direct text attribute
+ if hasattr(response_obj, 'text'):
+ return response_obj.text
+
+ # Navigate through candidates
+ if hasattr(response_obj, 'candidates') and response_obj.candidates:
+ candidate = response_obj.candidates[0]
+ if hasattr(candidate, 'content'):
+ content = candidate.content
+ if hasattr(content, 'parts') and content.parts:
+ part = content.parts[0]
+ if hasattr(part, 'text'):
+ return part.text
+
+ # Fallback to string representation
+ return str(response_obj)
+
+ except Exception as e:
+ logger.error(f"Error extracting text from response: {e}")
+ return ""
+
+ def _extract_text_from_dict(self, response_dict: Dict) -> str:
+ """Extract text from response dictionary."""
+ try:
+ # Direct text key
+ if 'text' in response_dict:
+ return response_dict['text']
+
+ # Navigate through candidates
+ if 'candidates' in response_dict and response_dict['candidates']:
+ candidate = response_dict['candidates'][0]
+ if 'content' in candidate and 'parts' in candidate['content']:
+ parts = candidate['content']['parts']
+ if parts and 'text' in parts[0]:
+ return parts[0]['text']
+
+ # Fallback to JSON string
+ return json.dumps(response_dict)
+
+ except Exception as e:
+ logger.error(f"Error extracting text from dict: {e}")
+ return ""
+
+ def _wait_for_file_active(self, uploaded_file, timeout: int = 60):
+ """
+ Wait for uploaded file to become active.
+
+ Args:
+ uploaded_file: Uploaded file object
+ timeout: Maximum seconds to wait
+
+ Raises:
+ TimeoutError: If file processing exceeds timeout
+ RuntimeError: If file processing fails
+ """
+ start_time = time.time()
+
+ while uploaded_file.state.name == "PROCESSING":
+ if time.time() - start_time > timeout:
+ raise TimeoutError(f"File processing timeout: {uploaded_file.name}")
+
+ time.sleep(1)
+ uploaded_file = self.client.files.get(name=uploaded_file.name)
+
+ if uploaded_file.state.name != "ACTIVE":
+ raise RuntimeError(
+ f"File processing failed: {uploaded_file.name} "
+ f"(state: {uploaded_file.state.name})"
+ )
+
+ def get_model_info(self) -> Dict[str, str]:
+ """Get model information for logging and debugging."""
+ return {
+ 'provider': self.provider,
+ 'model_name': self.model_name,
+ 'class': self.__class__.__name__,
+ 'mode': 'batch',
+ 'batch_size': str(self.batch_size),
+ 'polling_interval': f'{self.polling_interval}s'
+ }
+
diff --git a/src/gepa_optimizer/llms/llego_enhanced_llm.py b/src/gepa_optimizer/llms/llego_enhanced_llm.py
new file mode 100644
index 0000000000000000000000000000000000000000..8785cffea3eb2ef716139f07e92b0f62d3d5cde3
--- /dev/null
+++ b/src/gepa_optimizer/llms/llego_enhanced_llm.py
@@ -0,0 +1,1625 @@
+"""
+LLEGO-Enhanced LLM Client Wrapper
+
+This wrapper intercepts LLM calls and uses LLEGO genetic operators
+when generating new prompt candidates during GEPA's reflection phase.
+"""
+
+import logging
+import re
+from typing import Optional, Dict, Any, Callable, List
+from .base_llm import BaseLLMClient
+
+logger = logging.getLogger(__name__)
+
+# Fallback system prompt for sequential generation (when JSON parsing fails)
+# Uses Linear Command structure for reliability when complex JSON generation fails
+# NOTE(review): the blank runs inside this literal look like section delimiters
+# may have been stripped during extraction -- verify against the original file
+# before editing the string itself.
+_FALLBACK_SYSTEM_PROMPT: str = """You are a Prompt Optimization Engine operating in **SAFE MODE**.
+
+
+Rewrite the prompt based on the feedback provided below.
+
+
+
+1. Output **ONLY** the new prompt text.
+2. No JSON. No Explanations. No "Here is the prompt".
+3. The prompt must be fully functional and self-contained.
+4. START directly with the prompt content (e.g., "You are a..." or task instructions).
+5. Preserve the core task/domain - only improve HOW it's described.
+
+
+
+- Be specific and concrete (no vague instructions)
+- Use clear, imperative language
+- Include edge case handling if feedback identifies confusion
+- Ensure the prompt is self-contained and unambiguous
+- Add explicit constraints for format/output if needed
+
+
+
+- Analysis of what went wrong
+- Explanations of your changes
+- Meta-text like "Here's an improved version..."
+- Anything other than the raw prompt text
+
+
+Start of New Prompt:"""
+
+
+class LLEGOEnhancedLLMClient(BaseLLMClient):
+ """
+ Wrapper around BaseLLMClient that uses LLEGO for candidate generation.
+
+ This wrapper detects when GEPA is asking for new prompt candidates
+ and routes those requests through LLEGO's genetic operators instead
+ of standard LLM generation.
+ """
+
+    def __init__(
+        self,
+        base_llm: BaseLLMClient,
+        llego_layer,
+        config=None,
+        verbose: bool = True
+    ):
+        """
+        Initialize LLEGO-enhanced LLM client.
+
+        Args:
+            base_llm: The underlying LLM client (VisionLLMClient, etc.)
+            llego_layer: LLEGOIntegrationLayer instance
+            config: Optional OptimizationConfig for hybrid mode settings
+            verbose: Whether to log LLEGO operations
+        """
+        self.base_llm = base_llm    # wrapped client that performs actual generation
+        self.llego = llego_layer    # genetic-operator layer (may be None)
+        self.config = config
+        self.verbose = verbose
+
+        # Get log level from config (default to INFO)
+        self.log_level = getattr(config, 'log_level', 'INFO') if config else 'INFO'
+
+        # Track context for detecting reflection calls
+        self.reflection_context = {
+            'current_prompt': None,
+            'feedback': None,
+            'in_reflection': False
+        }
+
+        # Queue for hybrid mode candidates (GEPA will call generate() multiple times)
+        self._candidate_queue = []
+        self._hybrid_generation_complete = False
+
+        # ๐ฅ CRITICAL: Queue for adapter-generated candidates (from make_reflective_dataset)
+        # When adapter generates candidates at adapter level, they're stored here
+        # GEPA will call generate() for proposals, and we'll return these candidates
+        self._adapter_generated_candidates = []
+
+        # ๐ฅ FORMAT AWARENESS: Store format info from adapter for use in candidate generation
+        self._detected_format = None  # Will be set by adapter after format detection
+
+        # FIX #5: Circuit breaker for LLEGO failures
+        self._llego_failures = 0           # consecutive-failure counter
+        self._llego_disabled = False       # set True once the breaker trips
+        self._llego_failure_threshold = 3  # Disable after 3 consecutive failures
+
+        logger.info("๐งฌ LLEGO-Enhanced LLM Client initialized")
+        logger.info(f" Base LLM: {base_llm.__class__.__name__}")
+        logger.info(f" LLEGO enabled: {llego_layer is not None}")
+        if config and hasattr(config, 'enable_gepa_reflection_with_llego'):
+            logger.info(f" Hybrid mode: {config.enable_gepa_reflection_with_llego}")
+        logger.debug(f" Log level: {self.log_level}")
+
+ def _should_log_debug(self) -> bool:
+ """
+ Check if DEBUG logging is enabled.
+
+ Returns:
+ True if DEBUG level logging is enabled, False otherwise
+ """
+ return self.log_level == "DEBUG" or (
+ hasattr(logging, 'getLogger') and
+ logging.getLogger().isEnabledFor(logging.DEBUG)
+ )
+
+    def _extract_clean_prompt_from_reflection(self, reflection_output: str) -> str:
+        """
+        ๐ก๏ธ DEFENSIVE FALLBACK: Extract clean prompt if LLM adds analysis despite system prompt instructions.
+
+        NOTE: The system prompt now explicitly instructs the LLM to output ONLY the prompt text.
+        However, this extraction logic serves as a safety net in case the LLM still adds:
+        "Based on the performance analysis...
+        ### Recommendations...
+        ### Revised Prompt Example:
+        [THE ACTUAL PROMPT HERE]
+        ### Conclusion..."
+
+        This is now a defensive measure, not the primary mechanism.
+
+        Args:
+            reflection_output: Full reflection output (should be clean prompt, but may contain analysis)
+
+        Returns:
+            str: Clean, extracted prompt (or original if extraction fails or not needed)
+        """
+        # Non-string / empty input: nothing to extract, return unchanged.
+        if not reflection_output or not isinstance(reflection_output, str):
+            return reflection_output
+
+        # Pattern 1: Look for "Revised Prompt Example:" or "### Revised Prompt Example:"
+        patterns = [
+            r'(?:###\s*)?Revised\s+Prompt\s+(?:Example|:)?\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
+            r'(?:###\s*)?Revised\s+Prompt\s*:\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
+            r'(?:###\s*)?Optimized\s+Prompt\s*:\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
+            r'(?:###\s*)?New\s+Prompt\s*:\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
+            r'(?:Here\s+is|Here\'s)\s+a?\s*refined?\s+(?:version\s+of\s+)?(?:the\s+)?prompt\s*[:\n](.*?)(?:\n###|\n##|\n---|\Z)',
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, reflection_output, re.IGNORECASE | re.DOTALL)
+            if match:
+                extracted = match.group(1).strip()
+                # Clean up common artifacts (markdown code fences around the prompt)
+                extracted = re.sub(r'^```(?:plaintext|markdown|text)?\s*\n', '', extracted, flags=re.MULTILINE)
+                extracted = re.sub(r'\n```\s*$', '', extracted, flags=re.MULTILINE)
+                extracted = extracted.strip()
+
+                if len(extracted) > 50:  # Reasonable minimum length for a prompt
+                    logger.debug(f"โ Extracted clean prompt using pattern: {pattern[:50]}...")
+                    logger.debug(f" Original length: {len(reflection_output)} chars")
+                    logger.debug(f" Extracted length: {len(extracted)} chars")
+                    return extracted
+
+        # Pattern 2: If output starts with a quote or prompt-like structure
+        # Look for text that starts with "You are..." and is substantial
+        if 'You are' in reflection_output:
+            # Find the longest continuous block that starts with "You are"
+            prompt_match = re.search(r'(You are[^#]*?)(?:\n###|\n##|###|##|Conclusion|\Z)',
+                                     reflection_output, re.IGNORECASE | re.DOTALL)
+            if prompt_match:
+                extracted = prompt_match.group(1).strip()
+                if len(extracted) > 50:
+                    logger.debug(f"โ Extracted prompt starting with 'You are...'")
+                    return extracted
+
+        # Pattern 3: If the reflection output is actually just a clean prompt (no analysis)
+        # Check if it's relatively short and doesn't contain analysis keywords
+        analysis_keywords = ['recommendation', 'suggestion', 'improvement', 'conclusion',
+                             'optimization', 'analysis', 'feedback']
+        if (len(reflection_output) < 2000 and
+                not any(keyword in reflection_output.lower() for keyword in analysis_keywords)):
+            # Likely a clean prompt, return as-is
+            logger.debug(f"โ Reflection output appears to be a clean prompt (no analysis detected)")
+            return reflection_output.strip()
+
+        # Fallback: Try to extract ANY valid prompt-like text
+        # Look for text that might be a prompt even if not perfectly formatted
+        if 'You are' in reflection_output:
+            # Try to find a substantial block starting with "You are"
+            potential_prompt = re.search(
+                r'(You are(?:[^\.]|\.(?!\s*(?:Here|This|These|The above)))*?)(?:\n\n|\n###|Conclusion|\Z)',
+                reflection_output,
+                re.IGNORECASE | re.DOTALL
+            )
+            if potential_prompt and len(potential_prompt.group(1)) > 100:
+                extracted = potential_prompt.group(1).strip()
+                logger.warning(f"โ ๏ธ Could not extract clean prompt using standard patterns")
+                logger.warning(f" Falling back to 'You are...' block (length: {len(extracted)} chars)")
+                logger.warning(f" This may still contain some analysis text")
+                return extracted
+
+        # Final fallback: If still nothing, return original but log strongly
+        logger.warning(f"โ ๏ธ Could not extract clean prompt from reflection output")
+        logger.warning(f" Output length: {len(reflection_output)} chars")
+        logger.warning(f" Output preview: {reflection_output[:200]}...")
+        logger.warning(f" โ ๏ธ WARNING: Returning original output (may contain analysis text or be invalid)")
+        logger.warning(f" This candidate may perform poorly - consider improving extraction logic")
+        return reflection_output.strip()
+
+ def _parse_json_variations(self, response_text: str, num_expected: int) -> List[str]:
+ """
+ ๐ฅ OPTIMIZED: Parse N prompt variations from JSON format response.
+
+ Uses robust JSON parsing with multiple fallback strategies.
+
+ Handles common LLM output issues:
+ - Markdown code blocks (```json ... ```)
+ - Extra text before/after JSON
+ - Trailing commas
+ - Comments in JSON
+ - Newlines in strings
+ """
+ import json
+ import re
+
+ if not response_text or not isinstance(response_text, str):
+ raise ValueError("Empty or invalid response text")
+
+ # ๐ฅ PREPROCESSING: Clean LLM output
+ cleaned = response_text.strip()
+
+ # Remove BOM and invisible chars
+ cleaned = cleaned.lstrip('\ufeff\u200b\u200c\u200d')
+
+ # Strategy 0: Handle Python dict syntax (single quotes -> double quotes)
+ # LLMs sometimes return Python dict syntax {'key': 'value'} instead of JSON {"key": "value"}
+ if "'variations'" in cleaned or (cleaned.startswith("{'") or cleaned.startswith("{'variations'")):
+ try:
+ import ast
+ # Try to parse as Python literal (handles single quotes, True/False, None)
+ python_dict = ast.literal_eval(cleaned)
+ if isinstance(python_dict, dict) and 'variations' in python_dict:
+ # Convert to JSON-compatible format
+ json_str = json.dumps(python_dict)
+ data = json.loads(json_str)
+ if 'variations' in data:
+ # #region agent log
+ import json as _json_debug
+ import time as _time_debug
+ import os as _os_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "JSON_FIX", "location": "llego_enhanced_llm.py:python_dict_parse", "message": "Successfully parsed Python dict syntax", "data": {"num_expected": num_expected, "parsed_variations": len(data.get('variations', []))}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+ return self._extract_variations_from_json(data, num_expected)
+ except (ValueError, SyntaxError, TypeError) as e:
+ # If ast.literal_eval fails, try string replacement as fallback
+ try:
+ # Simple conversion: replace single quotes with double quotes (with escaping)
+ # This is a heuristic and may not work for all cases
+ converted = cleaned.replace("'", '"')
+ data = json.loads(converted)
+ if 'variations' in data:
+ # #region agent log
+ import json as _json_debug
+ import time as _time_debug
+ import os as _os_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "JSON_FIX", "location": "llego_enhanced_llm.py:python_dict_string_replace", "message": "Parsed Python dict via string replacement", "data": {"num_expected": num_expected, "parsed_variations": len(data.get('variations', []))}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError:
+ pass
+
+ # Strategy 1: Direct JSON parse (cleanest case)
+ try:
+ data = json.loads(cleaned)
+ if 'variations' in data:
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError:
+ pass
+
+ # Strategy 2: Extract from markdown code block
+ # More permissive regex that handles various formats
+ code_block_patterns = [
+ r'```(?:json|JSON)?\s*(\{[\s\S]*?\})\s*```', # Standard markdown
+ r'```\s*(\{[\s\S]*"variations"[\s\S]*\})\s*```', # With "variations" keyword
+ ]
+
+ for pattern in code_block_patterns:
+ json_match = re.search(pattern, cleaned)
+ if json_match:
+ json_str = json_match.group(1)
+ try:
+ data = json.loads(json_str)
+ if 'variations' in data:
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError:
+ # Try repair
+ repaired = self._repair_json_string(json_str)
+ try:
+ data = json.loads(repaired)
+ if 'variations' in data:
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError:
+ pass
+
+ # Strategy 3: Balanced brace extraction (handles nested objects)
+ json_str = self._extract_balanced_json(cleaned)
+ if json_str:
+ try:
+ data = json.loads(json_str)
+ if 'variations' in data:
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError:
+ repaired = self._repair_json_string(json_str)
+ try:
+ data = json.loads(repaired)
+ if 'variations' in data:
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError:
+ pass
+
+ # Strategy 4: Find JSON object with "variations" keyword
+ # Use greedy matching to get the full object
+ json_match = re.search(r'(\{[\s\S]*"variations"[\s\S]*\})', cleaned)
+ if json_match:
+ json_str = json_match.group(1)
+ # Find the balanced JSON within
+ balanced = self._extract_balanced_json(json_str)
+ if balanced:
+ try:
+ data = json.loads(balanced)
+ if 'variations' in data:
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError:
+ repaired = self._repair_json_string(balanced)
+ try:
+ data = json.loads(repaired)
+ if 'variations' in data:
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError:
+ pass
+
+ # Strategy 5: Fallback to numbered sections
+ logger.warning(f"JSON parsing failed, trying numbered section fallback...")
+ try:
+ return self._parse_numbered_section_variations(response_text, num_expected)
+ except ValueError:
+ pass
+
+ # #region agent log
+ import json as _json_debug
+ import time as _time_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "D", "location": "llego_enhanced_llm.py:json_parse_fail", "message": "JSON parsing failed completely", "data": {"num_expected": num_expected, "response_preview": response_text[:500] if response_text else "EMPTY", "response_length": len(response_text) if response_text else 0}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+
+ raise ValueError(f"Could not parse {num_expected} variations from response")
+
+ def _extract_balanced_json(self, text: str) -> Optional[str]:
+ """Extract JSON with balanced braces."""
+ brace_count = 0
+ start_idx = -1
+ in_string = False
+ escape_next = False
+
+ for i, char in enumerate(text):
+ # Handle string escaping
+ if escape_next:
+ escape_next = False
+ continue
+ if char == '\\' and in_string:
+ escape_next = True
+ continue
+ if char == '"' and not escape_next:
+ in_string = not in_string
+ continue
+
+ # Skip characters inside strings
+ if in_string:
+ continue
+
+ if char == '{':
+ if brace_count == 0:
+ start_idx = i
+ brace_count += 1
+ elif char == '}':
+ brace_count -= 1
+ if brace_count == 0 and start_idx >= 0:
+ return text[start_idx:i+1]
+
+ return None
+
+ def _repair_json_string(self, json_str: str) -> str:
+ """
+ Repair common JSON issues from LLM output.
+
+ Fixes:
+ - Trailing commas
+ - Comments
+ - Unescaped newlines in strings
+ """
+ repaired = json_str
+
+ # Remove trailing commas before } or ]
+ repaired = re.sub(r',\s*}', '}', repaired)
+ repaired = re.sub(r',\s*]', ']', repaired)
+
+ # Remove single-line comments
+ repaired = re.sub(r'//[^\n]*\n', '\n', repaired)
+
+ # Remove multi-line comments
+ repaired = re.sub(r'/\*[\s\S]*?\*/', '', repaired)
+
+ return repaired
+
+ def _extract_variations_from_json(self, data: Dict[str, Any], num_expected: int) -> List[str]:
+ """Extract and validate variations from parsed JSON data."""
+
+ if not isinstance(data, dict):
+ raise ValueError("JSON data is not a dictionary")
+
+ variations_list = data.get('variations', [])
+ if not isinstance(variations_list, list):
+ raise ValueError("'variations' field is not a list")
+
+ # Extract and sort by index
+ variations_with_index = []
+ for var in variations_list:
+ if not isinstance(var, dict):
+ continue
+ index = var.get('index', 0)
+ prompt = var.get('prompt', '')
+ if prompt and isinstance(prompt, str):
+ variations_with_index.append((index, prompt.strip()))
+
+ variations_with_index.sort(key=lambda x: x[0])
+ variations = [v[1] for v in variations_with_index]
+
+ # Validate count
+ if len(variations) < num_expected:
+ logger.warning(f"Only {len(variations)} valid variations found, expected {num_expected}")
+ while len(variations) < num_expected:
+ variations.append(variations[-1] if variations else "")
+
+ variations = variations[:num_expected]
+
+ if not all(v for v in variations):
+ raise ValueError(f"Some variations are empty after parsing")
+
+ return variations
+
+ def _parse_numbered_section_variations(self, response_text: str, num_expected: int) -> List[str]:
+ """Fallback parser: Extract variations from numbered sections."""
+ import re
+
+ variations = []
+
+ pattern1 = r'---\s*VARIATION\s+(\d+)\s*---\s*\n(.*?)(?=\n---\s*VARIATION|\Z)'
+ matches1 = re.findall(pattern1, response_text, re.DOTALL | re.IGNORECASE)
+
+ pattern2 = r'Variation\s+(\d+)\s*:?\s*\n(.*?)(?=\nVariation\s+\d+|$)'
+ matches2 = re.findall(pattern2, response_text, re.DOTALL | re.IGNORECASE)
+
+ pattern3 = r'(\d+)\.\s*\n(.*?)(?=\n\d+\.|$)'
+ matches3 = re.findall(pattern3, response_text, re.DOTALL)
+
+ matches = matches1 if len(matches1) >= num_expected else (matches2 if len(matches2) >= num_expected else matches3)
+
+ if len(matches) >= num_expected:
+ matches.sort(key=lambda x: int(x[0]))
+ variations = [match[1].strip() for match in matches[:num_expected]]
+
+ if len(variations) != num_expected:
+ raise ValueError(f"Numbered section parsing found {len(variations)} variations, expected {num_expected}")
+
+ return variations
+
+ def _is_valid_prompt(self, prompt: str) -> bool:
+ """
+ Validate that extracted text is actually a valid system prompt.
+
+ Uses minimal, conservative filtering: only rejects OBVIOUSLY wrong text.
+ Let evaluation decide on quality - false negatives (rejecting good prompts)
+ are worse than false positives (accepting bad prompts).
+
+ Args:
+ prompt: Extracted text to validate
+
+ Returns:
+ True if appears to be a valid prompt, False if obviously wrong
+ """
+ if not prompt or not prompt.strip():
+ return False
+
+ prompt_lower = prompt.lower().strip()
+
+ # STRONG indicators of analysis text (high confidence rejection)
+ # These are phrases that almost never appear in actual prompts
+ strong_analysis_patterns = [
+ 'in conclusion',
+ 'to summarize',
+ 'based on the analysis',
+ 'the analysis shows',
+ 'here are some suggestions',
+ 'it seems you\'re looking for',
+ ]
+
+ # Check first 200 characters for strong patterns
+ first_200 = prompt_lower[:200]
+ for pattern in strong_analysis_patterns:
+ if pattern in first_200:
+ if self._should_log_debug():
+ logger.debug(f"Rejected prompt: contains analysis pattern '{pattern}'")
+ return False
+
+ # POSITIVE indicators of valid prompt (high confidence acceptance)
+ # These are common prompt starters
+ valid_starters = [
+ 'you are',
+ 'you\'re',
+ 'your task',
+ 'your role',
+ 'analyze',
+ 'identify',
+ 'select',
+ 'determine',
+ 'given',
+ 'when',
+ ]
+
+ # If starts with valid prompt pattern, accept immediately
+ first_100 = prompt_lower[:100]
+ if any(first_100.startswith(starter) for starter in valid_starters):
+ return True
+
+ # DEFAULT: Accept everything else and let evaluation decide
+ # This is conservative - we'd rather evaluate a bad prompt than reject a good one
+ return True
+
+ def set_reflection_context(
+ self,
+ current_prompt: Optional[str] = None,
+ feedback: Optional[Any] = None,
+ in_reflection: bool = False
+ ):
+ """
+ Set context for the next generate() call.
+
+ Args:
+ current_prompt: The prompt being reflected upon
+ feedback: Evaluation feedback
+ in_reflection: Whether we're in reflection mode
+ """
+ self.reflection_context = {
+ 'current_prompt': current_prompt,
+ 'feedback': feedback,
+ 'in_reflection': in_reflection
+ }
+
+ # Reset candidate queue when entering new reflection phase
+ if in_reflection:
+ self._candidate_queue = []
+ self._hybrid_generation_complete = False
+ if self._should_log_debug():
+ logger.debug("๐ Entering LLEGO reflection mode (queue reset)")
+ else:
+ logger.info("๐ Entering LLEGO reflection mode")
+
+ def generate(
+ self,
+ system_prompt: str = "",
+ user_prompt: str = "",
+ image_base64: str = "",
+ **kwargs
+ ) -> Dict[str, Any]:
+ """
+ Generate response, using LLEGO for reflection calls.
+
+ ๐ฅ CRITICAL: This method intercepts ALL LLM calls. For candidate generation,
+ it checks if we have pre-generated candidates from hybrid mode and returns those.
+
+ Args:
+ system_prompt: System prompt
+ user_prompt: User prompt
+ image_base64: Base64-encoded image (if any)
+ **kwargs: Additional arguments
+
+ Returns:
+ Dict with 'content' key containing the generated text
+ """
+ # ๐ DEBUG: Log generate calls (full details at DEBUG level)
+ if self._should_log_debug():
+ logger.debug(f"๐ LLEGO Wrapper: generate() called")
+ logger.debug(f" system_prompt: '{system_prompt[:100]}...' (truncated)")
+ logger.debug(f" user_prompt length: {len(user_prompt)} chars")
+ logger.debug(f" in_reflection: {self.reflection_context['in_reflection']}")
+ logger.debug(f" has_image: {bool(image_base64)}")
+
+ # #region agent log
+ try:
+ import json as _json_debug
+ import time as _time_debug
+ import os as _os_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "INTERCEPTION", "location": "llego_enhanced_llm.py:generate", "message": "Generate called", "data": {"system_prompt_len": len(system_prompt), "user_prompt_len": len(user_prompt), "has_image": bool(image_base64), "has_candidates": len(getattr(self, '_adapter_generated_candidates', [])), "in_reflection": self.reflection_context.get('in_reflection', False)}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ except Exception:
+ pass
+ # #endregion
+
+ # ๐ฅ CRITICAL: Check if we have pre-generated candidates from adapter-level generation
+ # This happens when GEPA calls adapter.llm_client to generate candidates
+ # We intercept and return our pre-generated candidates instead
+ # ๐ฅ NEW: Select BEST candidate instead of FIFO
+ # ๐ฅ FIX: DON'T intercept evaluation calls (those have images!)
+ # Only intercept proposal calls (no images, just asking for new candidate)
+ # ๐ฅ FIX 2: DON'T intercept TEST EVALUATION calls!
+ # Test evaluation has no images but uses the OPTIMIZED prompt to execute tasks
+ # We detect test evaluation by checking if this is a TASK EXECUTION call (not reflection)
+ is_task_execution = (
+ # Task execution prompts contain task instructions, not optimization requests
+ not any(kw in system_prompt.lower() for kw in ['evolutionary', 'mutation', 'variation', 'optimize', 'improve prompt', 'rewrite', 'generate variations']) and
+ # Short prompts are usually task prompts, not optimization prompts
+ len(system_prompt) < 1000 and
+ # User prompt is the actual task input (short), not feedback (long)
+ len(user_prompt) < 2000
+ )
+
+ # Log task execution detection for debugging
+ if is_task_execution and hasattr(self, '_adapter_generated_candidates') and self._adapter_generated_candidates:
+ logger.info(f"๐ NOT intercepting: Task execution detected (not optimization)")
+ logger.debug(f" system_prompt_len={len(system_prompt)}, user_prompt_len={len(user_prompt)}")
+
+ if hasattr(self, '_adapter_generated_candidates') and self._adapter_generated_candidates and not image_base64 and not is_task_execution:
+ # ๐ฅ BEST-CANDIDATE SELECTION: Find candidate with highest Dpareto score
+ # This ensures we use the best candidate for the current iteration
+ best_candidate = None
+ best_score = -float('inf')
+ best_idx = -1
+
+ # Check if candidates have scores stored
+ for idx, cand in enumerate(self._adapter_generated_candidates):
+ if isinstance(cand, dict):
+ # Try to get score from candidate dict
+ score = cand.get('score', -float('inf'))
+
+ # If score not in dict, try to get from Pareto logger
+ if score == -float('inf'):
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+
+ # Look up score in Pareto front or evaluated candidates
+ cand_prompt = cand.get('prompt', '')
+ if cand_prompt:
+ normalized = cand_prompt.strip().strip('"\'')
+ # Check in Pareto front
+ for front_cand in pareto_log.pareto_front:
+ if front_cand.get('prompt', '').strip().strip('"\'') == normalized:
+ score = front_cand.get('score', -float('inf'))
+ break
+
+ # If not in front, check evaluated candidates
+ if score == -float('inf'):
+ for eval_cand in pareto_log.candidates_evaluated:
+ if eval_cand.get('prompt', '').strip().strip('"\'') == normalized:
+ score = eval_cand.get('score', -float('inf'))
+ break
+
+ if score > best_score:
+ best_score = score
+ best_candidate = cand
+ best_idx = idx
+
+ # If no scores found, fall back to FIFO (first candidate)
+ if best_candidate is None and self._adapter_generated_candidates:
+ best_candidate = self._adapter_generated_candidates[0]
+ best_idx = 0
+ logger.info(f"โ ๏ธ No scores found for candidates - using FIFO selection")
+
+ # Remove selected candidate from queue
+ if best_idx >= 0:
+ self._adapter_generated_candidates.pop(best_idx)
+
+ # Important event - keep at INFO
+ if best_score > -float('inf'):
+ logger.info(f"๐ฏ INTERCEPTING GEPA PROPOSAL CALL - Returning BEST candidate (score: {best_score:.4f})!")
+ logger.info(f"๐ฏ Remaining candidates: {len(self._adapter_generated_candidates)}")
+ else:
+ logger.info(f"๐ฏ INTERCEPTING GEPA PROPOSAL CALL - Returning pre-generated candidate!")
+ logger.info(f"๐ฏ Remaining candidates: {len(self._adapter_generated_candidates)}")
+
+ if isinstance(best_candidate, dict) and 'prompt' in best_candidate:
+ prompt = best_candidate['prompt']
+
+ # Detailed logging only in DEBUG mode
+ if self._should_log_debug():
+ logger.debug(f"โ
Pre-generated candidate details:")
+ logger.debug(f"{'โ'*80}")
+ logger.debug(f"{prompt}")
+ logger.debug(f"{'โ'*80}")
+ else:
+ source = best_candidate.get('source', 'unknown')
+ score_info = f" (score: {best_score:.4f})" if best_score > -float('inf') else ""
+ logger.info(f"โ
Candidate length: {len(prompt)} chars, Source: {source}{score_info}")
+
+ return {'content': prompt, 'source': best_candidate.get('source', 'adapter_generated')}
+ elif isinstance(best_candidate, str):
+ if self._should_log_debug():
+ logger.debug(f"โ
Pre-generated candidate (string format):")
+ logger.debug(f"{'โ'*80}")
+ logger.debug(f"{best_candidate}")
+ logger.debug(f"{'โ'*80}")
+ else:
+ logger.info(f"โ
Candidate length: {len(best_candidate)} chars")
+ return {'content': best_candidate, 'source': 'adapter_generated'}
+
+ # ๐ฅ ENHANCED CALL TYPE DETECTION
+ # We need to distinguish between 4 types of calls:
+ # 1. Evaluation calls: Image + task command โ identify element (pass through)
+ # 2. Judge calls: Image + "prompt engineer" โ analyze feedback (pass through)
+ # 3. Proposal calls: No image + feedback โ generate candidate (intercept)
+ # 4. JSON batch calls: JSON generation request (pass through)
+
+ # FIX: DON'T intercept JSON batch generation calls
+ is_json_batch_request = (
+ '"variations"' in system_prompt or
+ 'MUST BE VALID JSON' in system_prompt or
+ 'Output ONLY the JSON object' in system_prompt or
+ '```json' in system_prompt.lower()
+ )
+
+ # FIX: DON'T intercept LLM-as-Judge calls (they analyze feedback with images)
+ is_judge_call = (
+ 'prompt engineer' in system_prompt.lower() or
+ 'analyzing mobile ui automation' in system_prompt.lower() or
+ 'expert prompt engineer' in system_prompt.lower() or
+ ('analyze' in system_prompt.lower() and 'screenshot with numbered bounding boxes' in system_prompt.lower() and image_base64)
+ )
+
+ # Check if this is a reflection call (GEPA asking for new candidate)
+ is_reflection_call = (
+ self.reflection_context['in_reflection'] or
+ self._detect_reflection_call(system_prompt, user_prompt)
+ )
+
+ # Proposal calls are reflection calls WITHOUT images and NOT judge/JSON calls
+ # These are the calls we want to intercept with LLEGO
+ is_proposal_call = (
+ not is_json_batch_request and # Not a JSON generation request
+ not is_judge_call and # Not an LLM-as-Judge analysis
+ not image_base64 and # No image = not an evaluation/judge call
+ (
+ is_reflection_call or
+ 'improve' in system_prompt.lower() or
+ 'optimize' in system_prompt.lower() or
+ 'suggest' in system_prompt.lower() or
+ 'feedback' in system_prompt.lower() or
+ 'reflection' in system_prompt.lower()
+ ) and
+ len(user_prompt) > 100 # Proposal calls have substantial feedback
+ )
+
+ # Detailed call detection logging only in DEBUG mode
+ if self._should_log_debug():
+ logger.debug(f" is_json_batch_request: {is_json_batch_request}")
+ logger.debug(f" is_judge_call: {is_judge_call}")
+ logger.debug(f" is_reflection_call: {is_reflection_call}")
+ logger.debug(f" is_proposal_call: {is_proposal_call}")
+ logger.debug(f" has_image: {bool(image_base64)}")
+ logger.debug(f" has_llego: {self.llego is not None}")
+
+ # Only intercept proposal calls (not judge, not evaluation, not JSON)
+ if is_proposal_call and self.llego:
+ # FIX #5: Check if LLEGO is disabled due to repeated failures
+ if self._llego_disabled:
+ logger.warning("โ ๏ธ LLEGO is disabled (circuit breaker), using base LLM")
+ return self.base_llm.generate(
+ system_prompt=system_prompt,
+ user_prompt=user_prompt,
+ image_base64=image_base64,
+ **kwargs
+ )
+
+ # Important event - keep at INFO
+ logger.info("๐ฅ INTERCEPTING REFLECTION/PROPOSAL CALL FOR CANDIDATE GENERATION")
+ return self._llego_generate(system_prompt, user_prompt, image_base64=image_base64, **kwargs)
+ else:
+ # Standard LLM call (for evaluation, not reflection)
+ if self._should_log_debug():
+ logger.debug(" โ Standard LLM call (evaluation, not reflection)")
+ return self.base_llm.generate(
+ system_prompt=system_prompt,
+ user_prompt=user_prompt,
+ image_base64=image_base64,
+ **kwargs
+ )
+
+ def _clean_reflection_feedback(self, feedback_text: str, max_length: int = 50000) -> str:
+ """
+ Clean reflection feedback by removing base64 images and truncating.
+
+ ๐ฅ CRITICAL: GEPA's feedback can include massive base64 images (7MB+).
+ This function removes them and keeps feedback concise.
+
+ Args:
+ feedback_text: Original feedback (may contain base64)
+ max_length: Maximum length after cleaning (default: 50K chars)
+
+ Returns:
+ Cleaned feedback without base64, within size limits
+ """
+ if not feedback_text:
+ return feedback_text
+
+ # Step 1: Remove very long base64-like sequences (50K+ chars of alphanumeric)
+ base64_pattern = r'[A-Za-z0-9+/=]{5000,}'
+ cleaned = re.sub(base64_pattern, '[IMAGE_DATA_REMOVED]', feedback_text)
+
+ # Step 2: Remove explicit image_base64 references and their values
+ cleaned = re.sub(r'image_base64["\']?\s*[:=]\s*["\']?[A-Za-z0-9+/=]+["\']?',
+ 'image_base64: [REMOVED]', cleaned, flags=re.IGNORECASE)
+
+ # Step 3: Remove detailed_scores sections that might contain base64
+ cleaned = re.sub(r'##\s+detailed_scores[^\n]*\n[^#]*(?:image_base64|base64)[^\n]*(?:\n[^#]*)*',
+ '## detailed_scores: [REMOVED_FOR_BREVITY]', cleaned, flags=re.IGNORECASE | re.MULTILINE)
+
+ # Step 4: Remove any remaining very long strings (likely base64)
+ cleaned = re.sub(r'"[A-Za-z0-9+/=]{10000,}"', '[LARGE_DATA_STRING_REMOVED]', cleaned)
+
+ # Step 5: Truncate if still too long (keep beginning which has most important info)
+ if len(cleaned) > max_length:
+ truncated_size = len(cleaned) - max_length
+ cleaned = cleaned[:max_length] + f"\n\n[TRUNCATED {truncated_size} characters - keeping essential feedback only]"
+ logger.warning(f"โ ๏ธ Reflection feedback truncated: {len(feedback_text)} โ {len(cleaned)} chars")
+
+ return cleaned
+
+ def _detect_reflection_call(self, system_prompt: str, user_prompt: str) -> bool:
+ """
+ Heuristic to detect if this is a reflection call from GEPA.
+
+ GEPA's reflection calls typically contain feedback/error analysis.
+ """
+ reflection_keywords = [
+ 'improve', 'feedback', 'error', 'failure', 'reflection',
+ 'better prompt', 'modify', 'enhance', 'optimize'
+ ]
+
+ combined = (system_prompt + " " + user_prompt).lower()
+ return any(keyword in combined for keyword in reflection_keywords)
+
+ def _llego_generate(
+ self,
+ system_prompt: str,
+ user_prompt: str,
+ image_base64: str = "",
+ **kwargs
+ ) -> Dict[str, Any]:
+ """
+ Use LLEGO (or Hybrid mode) to generate new prompt candidates.
+
+ Args:
+ system_prompt: System prompt
+ user_prompt: User prompt (contains reflection feedback)
+ image_base64: Image data (for reflection, always empty)
+ **kwargs: Additional arguments (may contain image_base64, will be removed)
+
+ Returns:
+ Dict with 'content' key containing a new prompt candidate
+ """
+ try:
+ # ๐ฅ CRITICAL: Remove image_base64 from kwargs to avoid duplicate argument error
+ kwargs.pop('image_base64', None) # Remove if present to avoid conflict
+
+ # ๐ฅ HYBRID MODE: Generate from BOTH GEPA reflection AND LLEGO
+ if (self.config and
+ hasattr(self.config, 'enable_gepa_reflection_with_llego') and
+ self.config.enable_gepa_reflection_with_llego):
+
+ return self._hybrid_generate(system_prompt, user_prompt, image_base64=image_base64, **kwargs)
+
+ # STANDARD LLEGO MODE (LLEGO only)
+ return self._llego_only_generate(system_prompt, user_prompt, image_base64=image_base64, **kwargs)
+
+ except Exception as e:
+ # FIX #5: Circuit breaker - track failures and disable LLEGO if needed
+ self._llego_failures += 1
+
+ logger.error(f"โ LLEGO generation failed ({self._llego_failures}/{self._llego_failure_threshold}): {e}")
+ logger.error("โ ๏ธ Falling back to base LLM")
+
+ if self._llego_failures >= self._llego_failure_threshold:
+ self._llego_disabled = True
+ logger.error(f"๐ซ LLEGO DISABLED - {self._llego_failures} consecutive failures detected")
+ logger.error(" All future requests will use base LLM only")
+
+ import traceback
+ logger.debug(traceback.format_exc())
+
+ # Fallback to base LLM - ensure image_base64 is not in kwargs
+ kwargs.pop('image_base64', None)
+ return self.base_llm.generate(
+ system_prompt=system_prompt,
+ user_prompt=user_prompt,
+ image_base64=image_base64,
+ **kwargs
+ )
+
+ def _hybrid_generate(
+ self,
+ system_prompt: str,
+ user_prompt: str,
+ image_base64: str = "",
+ **kwargs
+ ) -> Dict[str, Any]:
+ """
+ ๐ฅ HYBRID MODE: Generate candidates from BOTH GEPA reflection AND LLEGO operators.
+
+ Smart Compensation Strategy:
+ - When crossover can't run (< 2 parents), compensates with extra GEPA reflection
+ - GEPA is smarter than mutation (uses semantic understanding of feedback)
+ - Crossover only runs when we have 2+ scored parents to combine
+
+ GEPA will call generate() multiple times. On first call, we generate all candidates
+ and queue them. Subsequent calls return from the queue.
+ """
+ # If we already generated candidates, return next from queue
+ if self._hybrid_generation_complete and self._candidate_queue:
+ candidate = self._candidate_queue.pop(0)
+ source = candidate.get('source', 'unknown')
+ logger.info(f"๐ฆ Returning queued candidate (source: {source}, {len(self._candidate_queue)} remaining)")
+ return {'content': candidate['prompt'], 'source': source}
+
+ # First call: Generate ALL candidates
+ from ..utils.clean_logger import get_clean_logger
+ clean_log = get_clean_logger()
+
+ all_candidates = []
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # PHASE 0: Check if crossover will be possible
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+ gepa_pareto_front = pareto_log.pareto_front
+
+ # Determine if we need to compensate for crossover
+ crossover_possible = len(gepa_pareto_front) >= 2
+ n_crossover_config = self.config.n_crossover if hasattr(self.config, 'n_crossover') else 2
+ crossover_compensation = 0 if crossover_possible else n_crossover_config
+
+ if not crossover_possible:
+ logger.info(f"โ ๏ธ Crossover NOT possible (have {len(gepa_pareto_front)} parents, need 2+)")
+ logger.info(f" โ Smart compensation: +{crossover_compensation} extra GEPA reflection candidates")
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # PHASE 1: GEPA REFLECTION (Semantic Understanding)
+ # More GEPA = better, it understands WHY things fail
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ base_gepa_count = self.config.num_gepa_reflection_candidates if hasattr(self.config, 'num_gepa_reflection_candidates') else 3
+
+ # ๐ฅ SMART COMPENSATION: More GEPA when crossover can't run
+ num_gepa = base_gepa_count + crossover_compensation
+
+ logger.info("โ" * 80)
+ logger.info("PHASE 1: GEPA REFLECTION (Semantic Understanding)")
+ if crossover_compensation > 0:
+ logger.info(f"Generating {num_gepa} candidates ({base_gepa_count} base + {crossover_compensation} compensation for skipped crossover)")
+ else:
+ logger.info(f"Generating {num_gepa} candidates")
+ logger.info("โ" * 80)
+
+ # ๐ฅ OPTIMIZED: Single call with JSON format for multiple variations
+ try:
+ # Clean user_prompt before sending to LLM
+ cleaned_user_prompt = self._clean_reflection_feedback(user_prompt)
+
+ # Build diversity requirements based on num_gepa
+ diversity_requirements = self._build_diversity_requirements(num_gepa)
+
+ # ๐ฅ FORMAT AWARENESS: Get format constraint if available
+ format_constraint = ""
+ if self._detected_format and self._detected_format.get('format_constraint'):
+ format_constraint = self._detected_format['format_constraint']
+ logger.info(f"๐ Injecting format constraint into candidate generation")
+ # #region agent log
+ import json as _json_debug
+ import time as _time_debug
+ import os as _os_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "FORMAT_CONSTRAINT", "location": "llego_enhanced_llm.py:format_injection", "message": "Format constraint injected", "data": {"format_type": self._detected_format.get('format_type', 'unknown'), "constraint_length": len(format_constraint), "avg_length": self._detected_format.get('avg_length', 0)}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+ else:
+ format_constraint = "No specific format detected - ensure output is CONCISE and matches expected examples."
+ # #region agent log
+ import json as _json_debug
+ import time as _time_debug
+ import os as _os_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "FORMAT_CONSTRAINT", "location": "llego_enhanced_llm.py:format_injection", "message": "No format constraint available", "data": {"has_detected_format": bool(self._detected_format)}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+
+ # ๐ฅ EVOLUTIONARY PROMPT ENGINEER: Forces radically different mutations
+ # Each variation MUST use a distinct genetic strategy to maximize search space
+ optimization_system_prompt = f"""
+You are an **Evolutionary Prompt Engineer**. Your task is to mutate a [FAILING_PROMPT] into a high-performance instruction set using genetic strategies.
+You must generate {num_gepa} radically different prompt variations based on the [FAILURE_FEEDBACK].
+
+
+
+
+{cleaned_user_prompt}
+
+
+
+
+You MUST use a different strategy for each variation. Assign strategies in order:
+
+1. **STRATEGY A: The Strict Auditor (Constraints)**
+ - Focus: Add "Negative Constraints" (e.g., "Do NOT...", "NEVER...", "FORBIDDEN:").
+ - Use strict XML tagging for the output schema.
+ - Goal: Fix hallucinations and formatting errors.
+
+2. **STRATEGY B: The Reasoning Expert (Chain of Thought)**
+ - Focus: Add a "Reasoning Steps" section.
+ - Instruct the model to "Think step-by-step" before generating the final output.
+ - Goal: Fix logic errors and complex multi-step reasoning failures.
+
+3. **STRATEGY C: The Few-Shot Teacher (Examples)**
+ - Focus: Generate a *synthetic* example of Input -> Correct Output within the prompt.
+ - Goal: Fix understanding of abstract concepts or strict schema requirements.
+
+4. **STRATEGY D: The Role-Player (Persona)**
+ - Focus: Change the persona to a hyper-specific expert (e.g., "Senior Data Engineer at Fortune 500" vs "Coder").
+ - Add domain-specific vocabulary and expertise markers.
+ - Goal: Fix domain-specific terminology errors.
+
+5. **STRATEGY E: The Structure Architect (Format)**
+ - Focus: Add explicit output schema with field-by-field instructions.
+ - Use markdown or XML headers to organize the prompt.
+ - Goal: Fix output structure and field naming errors.
+
+
+
+1. **Self-Contained**: Each variation must be the FULL prompt text (100-500 words), ready to run.
+2. **No Meta-Talk**: Do not explain your strategy inside the prompt. Just output the optimized prompt.
+3. **Preserve Core Task**: Keep the original task/domain - only improve HOW it's described.
+4. **JSON Output**: Follow the schema below exactly.
+5. **ENFORCE OUTPUT FORMAT**: The generated prompt MUST instruct the model to output in the EXACT format shown in examples.
+
+
+
+๐จ THE GENERATED PROMPTS MUST INCLUDE EXPLICIT OUTPUT FORMAT INSTRUCTIONS!
+Common failure: The model generates explanations/prose instead of the required concise format.
+
+{format_constraint}
+
+Your generated prompts MUST include:
+- Explicit instruction to output ONLY in the required format
+- "Do NOT explain", "No reasoning", "Output ONLY [format]" constraints
+- Length constraint to prevent verbose responses
+
+
+
+You MUST output ONLY valid JSON. No comments, no explanations, no markdown code blocks.
+
+Generate exactly {num_gepa} variations in this exact format:
+
+{{
+ "variations": [
+ {{
+ "index": 1,
+ "strategy": "Strict Auditor",
+ "prompt": "[FULL PROMPT TEXT - Complete, self-contained, ready to use]"
+ }},
+ {{
+ "index": 2,
+ "strategy": "Reasoning Expert",
+ "prompt": "[FULL PROMPT TEXT - Complete, self-contained, ready to use]"
+ }}
+ ]
+}}
+
+CRITICAL RULES:
+1. Output ONLY the JSON object - no text before or after
+2. Do NOT use markdown code blocks (no ```json)
+3. Do NOT include comments (no // or /* */)
+4. Ensure all strings are properly escaped
+5. Generate exactly {num_gepa} variations
+6. Each variation must have: index (number), strategy (string), prompt (string)
+
+"""
+
+ # Standard GEPA reflection call
+ call_kwargs = {k: v for k, v in kwargs.items() if k != 'image_base64'}
+ result = self.base_llm.generate(
+ system_prompt=optimization_system_prompt,
+ user_prompt=cleaned_user_prompt,
+ image_base64=image_base64,
+ **call_kwargs
+ )
+
+ if isinstance(result, dict):
+ response_text = result.get("content", str(result))
+ else:
+ response_text = str(result)
+
+ # Parse JSON variations
+ gepa_variations = self._parse_json_variations(response_text, num_gepa)
+
+ # Add all variations to candidates
+ for idx, variation_prompt in enumerate(gepa_variations, 1):
+ # ๐ก๏ธ DEFENSIVE FALLBACK: Extract clean prompt if LLM adds analysis
+ gepa_candidate = self._extract_clean_prompt_from_reflection(variation_prompt)
+
+ # Validate extracted prompt before adding
+ if not self._is_valid_prompt(gepa_candidate):
+ logger.warning(f" โ ๏ธ Variation {idx} appears invalid, skipping")
+ continue
+
+ # ๐ DIAGNOSTIC: Log candidate length to help diagnose scoring issues
+ if self._should_log_debug():
+ logger.debug(f" Candidate {idx} length: {len(gepa_candidate)} chars")
+ logger.debug(f" Candidate {idx} preview: {gepa_candidate[:100]}...")
+
+ all_candidates.append({
+ 'prompt': gepa_candidate,
+ 'source': 'gepa_reflection',
+ 'index': idx
+ })
+
+ clean_log.log_gepa_reflection_candidate(idx, gepa_candidate)
+
+ gepa_count = len(all_candidates)
+ logger.info(f"โ
GEPA Reflection: {gepa_count} candidates generated in single optimized call")
+
+ except Exception as e:
+ logger.error(f"โ Error generating GEPA reflection candidates: {e}")
+ logger.warning(f" Falling back to sequential generation...")
+ import traceback
+ logger.debug(traceback.format_exc())
+
+ # Fallback: Sequential generation (when JSON parsing fails)
+ gepa_count = self._fallback_sequential_gepa_generation(
+ num_gepa, user_prompt, image_base64, kwargs, all_candidates, clean_log
+ )
+
+ if gepa_count > 0:
+ logger.info(f"GEPA Reflection Complete: {gepa_count} candidates")
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # PHASE 2: LLEGO GENETIC OPERATORS
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ logger.info("โ" * 80)
+ logger.info("PHASE 2: LLEGO GENETIC OPERATORS")
+ logger.info("โ" * 80)
+
+ # Extract current prompt from context
+ current_prompt = self.reflection_context.get('current_prompt', '')
+ if not current_prompt:
+ current_prompt = self._extract_prompt_from_feedback(user_prompt)
+
+ if not current_prompt and self.llego.population:
+ current_prompt = self.llego.population[0].prompt
+ logger.info(f" Using population prompt (length: {len(current_prompt)})")
+
+ # Convert GEPA Pareto front to PromptCandidate format (already fetched in Phase 0)
+ pareto_candidates = self.llego._convert_gepa_pareto_to_candidates(gepa_pareto_front)
+ pareto_front = pareto_candidates
+
+ logger.info(f" Pareto front: {len(pareto_front)} candidates with scores")
+ for idx, p in enumerate(pareto_front, 1):
+ notation = p.metadata.get('notation', 'S') if p.metadata else 'S'
+ logger.info(f" {notation}: fitness={p.fitness:.3f}")
+
+ # Create LLM callable for LLEGO genetic operations (crossover/mutation)
+ call_kwargs = {k: v for k, v in kwargs.items() if k != 'image_base64'}
+
+ # LLEGO genetic prompt with SAFETY LOCKS to prevent task drift
+ # Directed mutations ensure prompts improve without losing core functionality
+ genetic_operator_system_prompt = """
+You are a **Prompt Mutation Engine**. Your input is a [PARENT_PROMPT]. Your output is a [MUTATED_CHILD].
+
+
+
+Apply ONE of the following micro-mutations to improve the prompt:
+
+1. **COMPRESS**: Remove fluff words ("please", "ensure to", "kindly"). Make it telegraphic and efficient.
+2. **INTENSIFY**: Capitalize key constraints (e.g., "must return JSON" -> "**MUST** return **VALID JSON**").
+3. **STRUCTURIZE**: Add markdown headers or XML tags to organize a messy prompt.
+4. **CLARIFY**: Expand vague nouns (e.g., "code" -> "production-ready Python code with type hints").
+5. **CONSTRAIN**: Add negative constraints ("Do NOT include explanations", "NEVER output markdown").
+
+
+
+1. **IMMUTABLE CORE**: You MUST NOT change the core task (e.g., do not change "Extract JSON" to "Write a Summary").
+2. **NO EXPLANATION**: Output ONLY the new prompt string. No meta-commentary.
+3. **VALIDITY**: The output must remain a functional system prompt.
+4. **LENGTH LIMIT**: Keep mutations within 20% of original length (no excessive expansion).
+"""
+
+ def llm_callable(genetic_prompt: str) -> str:
+ result = self.base_llm.generate(
+ system_prompt=genetic_operator_system_prompt,
+ user_prompt=genetic_prompt,
+ image_base64="",
+ **call_kwargs
+ )
+ if isinstance(result, dict):
+ return result.get('content', str(result))
+ return str(result)
+
+ # Generate LLEGO offspring (crossover will be skipped if < 2 parents)
+ llego_prompts = self.llego.evolve_generation(
+ llm=llm_callable,
+ pareto_front=pareto_front
+ )
+
+ # Track actual crossover count from LLEGO (it tracks internally now)
+ actual_crossover = getattr(self.llego, '_actual_crossover_count', 0)
+ crossover_skipped = getattr(self.llego, '_crossover_skipped', False)
+
+ crossover_idx = 1
+ mutation_idx = 1
+
+ for i, prompt in enumerate(llego_prompts):
+ if i < actual_crossover:
+ source = 'llego_crossover'
+ clean_log.log_llego_crossover_candidate(crossover_idx, prompt)
+ crossover_idx += 1
+ else:
+ source = 'llego_mutation'
+ clean_log.log_llego_mutation_candidate(mutation_idx, prompt)
+ mutation_idx += 1
+
+ all_candidates.append({
+ 'prompt': prompt,
+ 'source': source,
+ 'index': i + 1
+ })
+
+ mutation_count = len(llego_prompts) - actual_crossover
+ logger.info(f"๐งฌ LLEGO: {actual_crossover} crossover + {mutation_count} mutation = {len(llego_prompts)} candidates")
+ if crossover_skipped:
+ logger.info(f" (Crossover was skipped - compensated with extra GEPA reflection)")
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # SUMMARY
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ total_gepa = len([c for c in all_candidates if c.get('source') == 'gepa_reflection'])
+ total_crossover = len([c for c in all_candidates if c.get('source') == 'llego_crossover'])
+ total_mutation = len([c for c in all_candidates if c.get('source') == 'llego_mutation'])
+
+ logger.info("โ" * 80)
+ logger.info("CANDIDATE GENERATION SUMMARY")
+ logger.info("โ" * 80)
+ logger.info(f" GEPA Reflection: {total_gepa} candidates (semantic understanding)")
+ logger.info(f" LLEGO Crossover: {total_crossover} candidates (combine best)")
+ logger.info(f" LLEGO Mutation: {total_mutation} candidates (exploration)")
+ logger.info(f" TOTAL: {len(all_candidates)} candidates")
+ if crossover_skipped:
+ logger.info(f" ๐ Note: Crossover skipped (waiting for 2+ scored parents)")
+ logger.info("โ" * 80)
+
+ clean_log.log_candidate_generation_summary()
+
+ # Store in queue (skip first one - return it now)
+ self._candidate_queue = all_candidates[1:] if len(all_candidates) > 1 else []
+ self._hybrid_generation_complete = True
+
+ # Return first candidate
+ if all_candidates:
+ first = all_candidates[0]
+ logger.info(f"๐ค Returning FIRST candidate (source: {first['source']})")
+ return {'content': first['prompt'], 'source': first['source']}
+ else:
+ logger.error("โ No candidates generated!")
+ return {'content': '', 'source': 'error'}
+
+ def _llego_only_generate(
+ self,
+ system_prompt: str,
+ user_prompt: str,
+ image_base64: str = "",
+ **kwargs
+ ) -> Dict[str, Any]:
+ """
+ STANDARD LLEGO MODE: Generate candidates using only LLEGO operators.
+ """
+ # ๐ฅ CRITICAL: Remove image_base64 from kwargs to avoid duplicate argument error
+ kwargs.pop('image_base64', None)
+
+ # ๐ฅ FIX: Clean user_prompt if it contains feedback (might have base64)
+ cleaned_user_prompt = self._clean_reflection_feedback(user_prompt)
+
+ # Extract current prompt from context or user_prompt
+ current_prompt = self.reflection_context.get('current_prompt', '')
+
+ if not current_prompt:
+ # Try to extract from cleaned user_prompt
+ current_prompt = self._extract_prompt_from_feedback(cleaned_user_prompt)
+
+ logger.info(f"๐งฌ LLEGO: Evolving prompt...")
+ if self._should_log_debug():
+ logger.debug(f" Current prompt: '{current_prompt[:100]}...' (length: {len(current_prompt)} chars)")
+ else:
+ logger.info(f" Prompt length: {len(current_prompt)} chars")
+
+ # ๐ฅ FIX 2: Get Pareto front from GEPA (not LLEGO population)
+ # This ensures LLEGO operators use true non-dominated solutions
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+ gepa_pareto_front = pareto_log.pareto_front
+
+ # Convert GEPA Pareto front to PromptCandidate format
+ pareto_candidates = self.llego._convert_gepa_pareto_to_candidates(gepa_pareto_front)
+ pareto_front = pareto_candidates
+
+ logger.info(f" Using GEPA Pareto front (size: {len(gepa_pareto_front)})")
+ logger.info(f" Converted to {len(pareto_front)} PromptCandidate objects")
+
+ # Create LLM callable for LLEGO genetic operations
+ # Uses Genetic Mutation Engine prompt for micro-mutations
+ call_kwargs = {k: v for k, v in kwargs.items() if k != 'image_base64'}
+
+ genetic_system_prompt = """You are a **Genetic Mutation Engine** for Text Prompts.
+
+
+Apply a specific micro-mutation to the provided prompt to increase its clarity, strictness, or effectiveness.
+
+
+
+1. **Compress**: Shorten verbose instructions without losing meaning.
+2. **Expand**: Add detail to vague nouns (e.g., "code" -> "production-ready Python 3.10 code").
+3. **Emphasize**: Highlight CRITICAL constraints using caps, bold, or explicit markers.
+4. **Constrain**: Add explicit boundaries (what NOT to do, format rules, length limits).
+5. **Exemplify**: Add a brief example if the task is ambiguous.
+
+
+
+1. Output ONLY the mutated prompt text.
+2. Do NOT change the core intent or task domain.
+3. Do NOT add explanations or meta-commentary.
+4. Apply ONE primary mutation type while preserving all existing strengths.
+"""
+
+ def llm_callable(prompt: str) -> str:
+ # Clean prompt before sending (might contain base64 if from feedback)
+ cleaned_prompt = self._clean_reflection_feedback(prompt)
+ result = self.base_llm.generate(
+ system_prompt=genetic_system_prompt,
+ user_prompt=cleaned_prompt,
+ image_base64="", # Always empty for LLEGO genetic operations
+ **call_kwargs
+ )
+ if isinstance(result, dict):
+ return result.get('content', str(result))
+ return str(result)
+
+ # Generate offspring using LLEGO
+ new_prompts = self.llego.evolve_generation(
+ llm=llm_callable,
+ pareto_front=pareto_front
+ )
+
+ if new_prompts:
+ new_prompt = new_prompts[0]
+ logger.info(f"โ
LLEGO generated new candidate (length: {len(new_prompt)} chars)")
+
+ if self._should_log_debug():
+ logger.debug(f" Full prompt:")
+ logger.debug(f" '{new_prompt}'")
+
+ return {
+ 'content': new_prompt,
+ 'source': 'llego',
+ 'num_candidates': len(new_prompts)
+ }
+ else:
+ logger.warning("โ ๏ธ LLEGO returned no candidates, falling back to base LLM")
+ return self.base_llm.generate(
+ system_prompt=system_prompt,
+ user_prompt=user_prompt,
+ image_base64="",
+ **kwargs
+ )
+
+ def _build_diversity_requirements(self, num_gepa: int) -> str:
+ """
+ Build diversity requirements using research-backed Prompt Design Patterns.
+
+ These are proven strategies from prompt engineering literature:
+ - Chain-of-Thought (CoT)
+ - Few-Shot Learning
+ - Negative Constraints
+ - Persona Pattern
+
+ Args:
+ num_gepa: Number of GEPA variations to generate
+
+ Returns:
+ String with diversity requirements for the optimization prompt
+ """
+ # Research-backed Prompt Design Patterns that solve specific classes of problems
+ strategies = [
+ """
+
+ **STRATEGY: COGNITIVE DECOMPOSITION (Chain-of-Thought)**
+ - **Goal**: Fixes logic/reasoning errors.
+ - **Action**: Add a thinking process section that forces step-by-step reasoning.
+ - **Implementation**: Include instructions like "First analyze..., then identify..., finally conclude..."
+ - **Pattern**: Force the model to "Plan before executing".
+
+ """,
+
+ """
+
+ **STRATEGY: FEW-SHOT SIMULATION (In-Context Learning)**
+ - **Goal**: Fixes formatting/syntax errors and output structure issues.
+ - **Action**: Invent 1-2 realistic "Input -> Output" examples that mirror the expected format.
+ - **Implementation**: Add "Example: Given [input], respond with: [expected output format]"
+ - **Pattern**: Show, don't just tell. Demonstrate the gold standard.
+
+ """,
+
+ """
+
+ **STRATEGY: SEMANTIC CONSTRAINING (Negative Constraints)**
+ - **Goal**: Fixes hallucinations, verbosity, and off-topic responses.
+ - **Action**: Add explicit forbidden actions and boundaries.
+ - **Implementation**: Include "Do NOT explain your reasoning", "Do NOT add preambles", "Do NOT include information not asked for"
+ - **Pattern**: Define the walls, not just the path.
+
+ """,
+
+ """
+
+ **STRATEGY: PERSONA & ROLE HARDENING**
+ - **Goal**: Fixes tone, domain knowledge gaps, and inconsistent behavior.
+ - **Action**: Define a hyper-specific expert role with clear responsibilities.
+ - **Implementation**: Instead of "You are a helpful assistant", use "You are a Senior Data Analyst with 10 years of experience in [domain]"
+ - **Pattern**: Adopt the mental model and rigorous standards of a real expert.
+
+ """,
+
+ """
+
+ **STRATEGY: OUTPUT SCHEMA ENFORCEMENT**
+ - **Goal**: Fixes structural and format compliance issues.
+ - **Action**: Define an explicit output schema with field names and types.
+ - **Implementation**: Include "Your response MUST follow this exact format: {field1: type, field2: type}"
+ - **Pattern**: Leave no ambiguity about what the output should look like.
+
+ """,
+
+ """
+
+ **STRATEGY: SELF-VERIFICATION LOOP**
+ - **Goal**: Fixes errors that could be caught by double-checking.
+ - **Action**: Add instructions for the model to verify its own output.
+ - **Implementation**: Include "Before responding, verify: 1) Does this match the required format? 2) Did I include all requested information?"
+ - **Pattern**: Build in quality control before submission.
+
+ """,
+
+ """
+
+ **STRATEGY: TASK DECOMPOSITION**
+ - **Goal**: Fixes complex tasks that overwhelm the model.
+ - **Action**: Break the task into numbered sub-tasks.
+ - **Implementation**: "Step 1: [subtask]. Step 2: [subtask]. Step 3: Combine results."
+ - **Pattern**: Divide and conquer complexity.
+
+ """
+ ]
+
+ # Select strategies based on num_gepa
+ selected = strategies[:min(num_gepa, len(strategies))]
+
+ requirements = "\n"
+ requirements += "Each variation MUST use a DIFFERENT strategy from the list below:\n"
+ requirements += "\n".join(selected)
+ requirements += "\n"
+
+ requirements += """
+
+
+ 1. Each variation must apply its assigned strategy comprehensively.
+ 2. Each variation must ALSO address ALL issues mentioned in the feedback.
+ 3. The strategies are not mutually exclusive - but the PRIMARY focus of each variation should be its assigned strategy.
+ 4. Do not just add a single line - transform the prompt structure according to the strategy.
+
+"""
+
+ return requirements
+
+ def _fallback_sequential_gepa_generation(
+ self,
+ num_gepa: int,
+ user_prompt: str,
+ image_base64: str,
+ kwargs: dict,
+ all_candidates: list,
+ clean_log
+ ) -> int:
+ """
+ Fallback to sequential generation when JSON parsing fails.
+
+ Args:
+ num_gepa: Number of candidates to generate
+ user_prompt: The feedback/context
+ image_base64: Image data (if any)
+ kwargs: Additional kwargs
+ all_candidates: List to append candidates to
+ clean_log: Logger for clean output
+
+ Returns:
+ Number of candidates generated
+ """
+ generated_count = 0
+
+ for i in range(num_gepa):
+ logger.debug(f"Generating Reflection Candidate #{i+1}/{num_gepa} (fallback mode)...")
+ try:
+ cleaned_user_prompt = self._clean_reflection_feedback(user_prompt)
+
+ # Use research-backed strategy for each variation
+ strategy_prompts = [
+ "\nApply CHAIN-OF-THOUGHT: Add step-by-step reasoning instructions. Force the model to 'think before answering'.\n",
+ "\nApply FEW-SHOT LEARNING: Add 1-2 concrete input/output examples within the prompt. Show, don't just tell.\n",
+ "\nApply NEGATIVE CONSTRAINTS: Add explicit 'Do NOT' rules. Define what the model must avoid.\n",
+ "\nApply PERSONA HARDENING: Define a specific expert role with clear responsibilities and standards.\n",
+ "\nApply OUTPUT SCHEMA: Define the exact output format with field names and types. Leave no ambiguity.\n",
+ ]
+
+ strategy = strategy_prompts[i % len(strategy_prompts)]
+
+ fallback_prompt = f"""You are a Prompt Optimization Engine in **SAFE MODE**.
+
+{strategy}
+
+{_FALLBACK_SYSTEM_PROMPT}"""
+
+ call_kwargs = {k: v for k, v in kwargs.items() if k != 'image_base64'}
+ result = self.base_llm.generate(
+ system_prompt=fallback_prompt,
+ user_prompt=cleaned_user_prompt,
+ image_base64=image_base64,
+ **call_kwargs
+ )
+
+ if isinstance(result, dict):
+ gepa_candidate_raw = result.get("content", str(result))
+ else:
+ gepa_candidate_raw = str(result)
+
+ gepa_candidate = self._extract_clean_prompt_from_reflection(gepa_candidate_raw)
+
+ if not self._is_valid_prompt(gepa_candidate):
+ logger.warning(f" โ ๏ธ Fallback candidate #{i+1} appears invalid, skipping")
+ continue
+
+ all_candidates.append({
+ 'prompt': gepa_candidate,
+ 'source': 'gepa_reflection',
+ 'index': i + 1
+ })
+
+ clean_log.log_gepa_reflection_candidate(i + 1, gepa_candidate)
+ generated_count += 1
+
+ except Exception as fallback_error:
+ logger.error(f"โ Error in fallback generation #{i+1}: {fallback_error}")
+
+ return generated_count
+
+ def _extract_prompt_from_feedback(self, user_prompt: str) -> str:
+ """
+ Try to extract the current prompt from GEPA's reflection feedback.
+
+ Args:
+ user_prompt: The feedback text from GEPA
+
+ Returns:
+ Extracted prompt or empty string
+ """
+ # Look for common patterns in GEPA's feedback
+ if "current prompt:" in user_prompt.lower():
+ lines = user_prompt.split('\n')
+ for i, line in enumerate(lines):
+ if "current prompt:" in line.lower():
+ # Return the next line(s) as the prompt
+ return '\n'.join(lines[i+1:i+10])
+
+ return ""
+
+ # Forward other methods to base LLM
+ def get_model_info(self) -> str:
+ """Get model information."""
+ return f"LLEGO({self.base_llm.get_model_info()})"
+
+ def __getattr__(self, name):
+ """Forward unknown attributes to base LLM."""
+ return getattr(self.base_llm, name)
+
diff --git a/src/gepa_optimizer/llms/vision_llm.py b/src/gepa_optimizer/llms/vision_llm.py
new file mode 100644
index 0000000000000000000000000000000000000000..79f84dac705b901341c722f2955af1b8b473561c
--- /dev/null
+++ b/src/gepa_optimizer/llms/vision_llm.py
@@ -0,0 +1,813 @@
+"""
+Vision LLM Client for GEPA Optimizer
+"""
+
+import json
+import logging
+import time
+from enum import Enum
+import requests
+from typing import Dict, Optional, Any, TYPE_CHECKING, Union
+
+# Assuming APIKeyManager is available from utils
+from ..utils.api_keys import APIKeyManager
+
+# Import ModelConfig only for type checking to avoid circular imports
+if TYPE_CHECKING:
+ from ..models.config import ModelConfig
+
+from .base_llm import BaseLLMClient
+
class ProviderType(str, Enum):
    """Supported LLM API providers.

    Mixes in ``str`` so members compare equal to their lowercase string
    values (e.g. ``ProviderType.OPENAI == "openai"``), which the client
    relies on when matching provider names.
    """
    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    HUGGINGFACE = "huggingface"
    VLLM = "vllm"
    GOOGLE = "google"
    GEMINI = "gemini"  # handled together with GOOGLE by generate()/_get_api_key()
+
class ErrorType(str, Enum):
    """Failure categories attached to GepaLLMError so callers can branch."""
    API_ERROR = "api_error"                # provider-side error or unexpected payload
    VALIDATION_ERROR = "validation_error"  # bad local configuration or arguments
    NETWORK_ERROR = "network_error"        # transport-level request failure
    RATE_LIMIT = "rate_limit"              # HTTP 429 responses
    TIMEOUT = "timeout"                    # request exceeded the configured timeout
+
class GepaLLMError(Exception):
    """Base exception for GEPA LLM related errors"""

    def __init__(self, message: str, error_type: ErrorType, status_code: Optional[int] = None):
        # Keep the raw pieces around so callers can branch on category/status.
        self.message = message
        self.error_type = error_type
        self.status_code = status_code
        super().__init__(self.message)

    def __str__(self):
        category = self.error_type.value
        if not self.status_code:
            return f"{category}: {self.message}"
        return f"{category} (HTTP {self.status_code}): {self.message}"
+
# Module-level logger; VisionLLMClient additionally creates a per-class child.
logger = logging.getLogger(__name__)

# Default endpoint (OpenAI chat completions) used when no base_url is supplied.
OPENAI_API_URL = "https://api.openai.com/v1/chat/completions"
+
+class VisionLLMClient(BaseLLMClient):
+ """
+ A client for interacting with multi-modal Vision LLMs (e.g., OpenAI GPT-4 Vision).
+
+ Example:
+ ```python
+ # Basic usage
+ client = VisionLLMClient(
+ provider="openai",
+ model_name="gpt-4-vision-preview",
+ temperature=0.7,
+ max_tokens=2048
+ )
+
+ # With custom configuration
+ config = ModelConfig(
+ provider="openai",
+ model_name="gpt-4-vision-preview",
+ temperature=0.5,
+ max_tokens=1024
+ )
+ client = VisionLLMClient.from_config(config)
+ ```
+ """
+
+ def __init__(
+ self,
+ provider: Union[str, ProviderType],
+ model_name: str,
+ api_key: Optional[str] = None,
+ base_url: Optional[str] = None,
+ temperature: float = 0.7,
+ max_tokens: int = 2048,
+ top_p: float = 1.0,
+ frequency_penalty: float = 0.0,
+ presence_penalty: float = 0.0,
+ timeout: int = 120, # Increase to 2 minutes for large prompts
+ max_retries: int = 3
+ ):
+ """
+ Initializes the VisionLLMClient with model configuration.
+
+ Args:
+ provider: The provider of the model (e.g., 'openai', 'anthropic')
+ model_name: The name of the multi-modal LLM model to use (e.g., "gpt-4-vision-preview").
+ api_key: Optional API key. If not provided, it will be fetched from APIKeyManager.
+ base_url: Optional base URL for the API endpoint.
+ temperature: Controls randomness in the response generation.
+ max_tokens: Maximum number of tokens to generate.
+ top_p: Controls diversity via nucleus sampling.
+ frequency_penalty: Penalizes repeated tokens.
+ presence_penalty: Penalizes new tokens based on their presence in the text so far.
+ """
+ # Initialize parent class
+ super().__init__(provider=str(provider), model_name=model_name, **{
+ 'api_key': api_key,
+ 'base_url': base_url,
+ 'temperature': temperature,
+ 'max_tokens': max_tokens,
+ 'top_p': top_p,
+ 'frequency_penalty': frequency_penalty,
+ 'presence_penalty': presence_penalty,
+ 'timeout': timeout,
+ 'max_retries': max_retries
+ })
+
+ # Initialize the actual client
+ self._initialize_client(provider, model_name, api_key, base_url, temperature,
+ max_tokens, top_p, frequency_penalty, presence_penalty,
+ timeout, max_retries)
+
    def _initialize_client(self, provider, model_name, api_key, base_url, temperature,
                           max_tokens, top_p, frequency_penalty, presence_penalty,
                           timeout, max_retries):
        """Validate configuration and set up provider, API-key, and HTTP-session state.

        Arguments mirror ``__init__``; this method performs the actual
        validation and attribute assignment.

        Raises:
            GepaLLMError: VALIDATION_ERROR for an unknown provider, empty
                model name, or out-of-range temperature/max_tokens;
                API_ERROR if API-key resolution fails.
        """
        # Input validation — provider must map onto the ProviderType enum.
        try:
            self.provider = ProviderType(provider.lower())
        except ValueError:
            raise GepaLLMError(
                f"Unsupported provider: {provider}. "
                f"Supported providers: {[p.value for p in ProviderType]}",
                ErrorType.VALIDATION_ERROR
            )

        if not model_name:
            raise GepaLLMError("model_name cannot be empty", ErrorType.VALIDATION_ERROR)

        # Accept ints or floats; OpenAI-style range [0, 2].
        if not isinstance(temperature, (int, float)) or not 0 <= temperature <= 2:
            raise GepaLLMError(
                f"temperature must be between 0 and 2, got {temperature}",
                ErrorType.VALIDATION_ERROR
            )

        if not isinstance(max_tokens, int) or max_tokens <= 0:
            raise GepaLLMError(
                f"max_tokens must be a positive integer, got {max_tokens}",
                ErrorType.VALIDATION_ERROR
            )

        # Initialize API key: an explicitly passed key wins, otherwise look
        # one up via APIKeyManager for this provider.
        # NOTE(review): the "No API key found" VALIDATION_ERROR raised inside
        # this try is immediately caught by the `except Exception` below and
        # re-wrapped as API_ERROR, so callers never observe it as a
        # VALIDATION_ERROR — confirm whether that is intended.
        try:
            self.api_key = api_key or APIKeyManager().get_api_key(self.provider.value)
            if not self.api_key:
                raise GepaLLMError(
                    f"No API key found for provider: {self.provider}",
                    ErrorType.VALIDATION_ERROR
                )
        except Exception as e:
            raise GepaLLMError(
                f"Failed to initialize API key: {str(e)}",
                ErrorType.API_ERROR
            ) from e

        self.model_name = model_name
        # Defaults to the OpenAI endpoint even for other providers; SDK-based
        # providers (e.g. Gemini) simply never use base_url.
        self.base_url = base_url or OPENAI_API_URL
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.top_p = top_p
        self.frequency_penalty = frequency_penalty
        self.presence_penalty = presence_penalty
        self.timeout = timeout
        self.max_retries = max_retries
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")

        # Configure session with retry: transparently retries POSTs on
        # transient failures (429/5xx) with exponential backoff.
        self.session = requests.Session()
        retry_strategy = requests.adapters.Retry(
            total=max_retries,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["POST"]
        )
        adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("https://", adapter)
        self.session.mount("http://", adapter)

        # No hardcoded model restrictions - user can specify any model name
        # The API provider will validate if the model exists and supports vision
+
+ def _get_api_key(self) -> Optional[str]:
+ """Get API key based on provider"""
+ if self.provider == 'openai':
+ return APIKeyManager().get_api_key('openai')
+ elif self.provider == 'anthropic':
+ return APIKeyManager().get_api_key('anthropic')
+ elif self.provider in ['google', 'gemini']:
+ return APIKeyManager().get_api_key('google')
+ # Add other providers as needed
+ return None
+
+ @classmethod
+ def from_config(cls, config: 'ModelConfig') -> 'VisionLLMClient':
+ """Create a VisionLLMClient from a ModelConfig object.
+
+ Args:
+ config: ModelConfig instance with provider and model settings
+
+ Returns:
+ Configured VisionLLMClient instance
+
+ Example:
+ ```python
+ config = ModelConfig(
+ provider="openai",
+ model_name="gpt-4-vision-preview",
+ temperature=0.7
+ )
+ client = VisionLLMClient.from_config(config)
+ ```
+ """
+ return cls(
+ provider=config.provider,
+ model_name=config.model_name,
+ api_key=config.api_key,
+ base_url=config.base_url,
+ temperature=config.temperature,
+ max_tokens=config.max_tokens,
+ top_p=config.top_p,
+ frequency_penalty=config.frequency_penalty,
+ presence_penalty=config.presence_penalty
+ )
+
+ @classmethod
+ def from_model_string(cls, model_string: str, **kwargs) -> 'VisionLLMClient':
+ """Create a VisionLLMClient from a model string like "provider/model-name".
+
+ Args:
+ model_string: Model identifier in format "provider/model-name" or just "model-name"
+ Examples: "google/gemini-2.0-flash", "openai/gpt-4o", "gemini-1.5-pro"
+ **kwargs: Additional configuration options (temperature, max_tokens, etc.)
+
+ Returns:
+ Configured VisionLLMClient instance
+
+ Example:
+ ```python
+ # With provider
+ client = VisionLLMClient.from_model_string("google/gemini-2.0-flash")
+
+ # Without provider (defaults to openai)
+ client = VisionLLMClient.from_model_string("gpt-4o")
+
+ # With additional options
+ client = VisionLLMClient.from_model_string(
+ "google/gemini-2.0-flash",
+ temperature=0.5,
+ max_tokens=4096
+ )
+ ```
+ """
+ import os
+
+ # Parse "provider/model-name" format
+ if "/" in model_string:
+ provider, model_name = model_string.split("/", 1)
+ else:
+ # Default to openai if no provider specified
+ provider = "openai"
+ model_name = model_string
+
+ # Normalize provider names
+ provider = provider.lower()
+ if provider == "gemini":
+ provider = "google"
+
+ # Get API key from environment if not provided
+ api_key = kwargs.pop('api_key', None)
+ if not api_key:
+ env_var_map = {
+ "openai": "OPENAI_API_KEY",
+ "anthropic": "ANTHROPIC_API_KEY",
+ "google": "GOOGLE_API_KEY",
+ }
+ env_var = env_var_map.get(provider, f"{provider.upper()}_API_KEY")
+ api_key = os.getenv(env_var)
+
+ return cls(
+ provider=provider,
+ model_name=model_name,
+ api_key=api_key,
+ **kwargs
+ )
+
+ def generate(
+ self,
+ system_prompt: str,
+ user_prompt: str,
+ image_base64: Optional[str] = None,
+ **generation_kwargs
+ ) -> Dict[str, Any]:
+ """
+ Generates a response from the Vision LLM.
+
+ Args:
+ system_prompt: The system-level instructions for the LLM.
+ user_prompt: The user's query or task.
+ image_base64: Optional Base64 encoded image string.
+ **generation_kwargs: Additional model-specific generation parameters
+
+ Returns:
+ A dictionary containing the generated response and metadata.
+
+ Raises:
+ GepaLLMError: If there's an error during generation
+
+ Example:
+ ```python
+ response = client.generate(
+ system_prompt="You are a helpful assistant.",
+ user_prompt="What's in this image?",
+ image_base64="base64_encoded_image"
+ )
+ ```
+ """
+ if not system_prompt or not user_prompt:
+ raise GepaLLMError(
+ "system_prompt and user_prompt are required",
+ ErrorType.VALIDATION_ERROR
+ )
+
+ try:
+ if self.provider == ProviderType.OPENAI:
+ return self._generate_openai(system_prompt, user_prompt, image_base64, **generation_kwargs)
+ elif self.provider in [ProviderType.GOOGLE, ProviderType.GEMINI]:
+ return self._generate_google(system_prompt, user_prompt, image_base64, **generation_kwargs)
+ else:
+ raise GepaLLMError(
+ f"Provider {self.provider} is not yet supported",
+ ErrorType.VALIDATION_ERROR
+ )
+ except requests.exceptions.RequestException as e:
+ self.logger.error(f"Network error during generation: {str(e)}")
+ raise GepaLLMError(
+ f"Network error: {str(e)}",
+ ErrorType.NETWORK_ERROR,
+ getattr(e.response, 'status_code', None) if hasattr(e, 'response') else None
+ ) from e
+ except GepaLLMError:
+ raise
+ except Exception as e:
+ self.logger.error(f"Unexpected error during generation: {str(e)}")
+ raise GepaLLMError(
+ f"Generation failed: {str(e)}",
+ ErrorType.API_ERROR
+ ) from e
+
+ def _generate_openai(
+ self,
+ system_prompt: str,
+ user_prompt: str,
+ image_base64: Optional[str] = None,
+ **generation_kwargs
+ ) -> Dict[str, Any]:
+ """
+ Generate response using OpenAI's API with configured parameters.
+
+ Args:
+ system_prompt: System instructions for the model
+ user_prompt: User's input prompt
+ image_base64: Optional base64 encoded image
+
+ Returns:
+ Dictionary containing the API response
+
+ Raises:
+ GepaDependencyError: If API call fails
+ """
+ headers = {
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {self.api_key}",
+ "User-Agent": "GepaOptimizer/1.0 (Python)"
+ }
+
+ messages = [
+ {"role": "system", "content": system_prompt},
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": user_prompt}
+ ]
+ }
+ ]
+
+ if image_base64:
+ # #region agent log
+ import json as _json_debug
+ import time as _time_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ try:
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({
+ "id": f"log_{int(_time_debug.time() * 1000)}",
+ "timestamp": int(_time_debug.time() * 1000),
+ "location": "vision_llm.py:_generate_openai",
+ "message": "Image base64 BEFORE processing",
+ "data": {
+ "image_base64_length": len(image_base64) if image_base64 else 0,
+ "has_data_uri_prefix": image_base64.startswith("data:image") if image_base64 else False,
+ "prefix": image_base64[:50] if image_base64 and len(image_base64) > 50 else image_base64,
+ "is_none": image_base64 is None,
+ "is_empty": image_base64 == "" if image_base64 else True
+ },
+ "sessionId": "debug-session",
+ "runId": "run1",
+ "hypothesisId": "A,C,D"
+ }) + "\n")
+ except Exception:
+ pass
+ # #endregion
+
+ # Detect and extract image format
+ detected_format = "jpeg" # Default fallback
+ clean_base64 = image_base64
+
+ # Extract format from data URI prefix if present
+ if image_base64.startswith("data:image"):
+ # Parse format from prefix: data:image/png;base64,...
+ if "," in image_base64:
+ prefix_part = image_base64.split(",", 1)[0]
+ clean_base64 = image_base64.split(",", 1)[1]
+ # Extract format from "data:image/PNG;base64" or "data:image/png"
+ if "/" in prefix_part and ";" in prefix_part:
+ detected_format = prefix_part.split("/")[1].split(";")[0].lower()
+ elif "/" in prefix_part:
+ detected_format = prefix_part.split("/")[1].lower()
+ else:
+ # Fallback: try to extract format
+ if "/" in image_base64:
+ detected_format = image_base64.split("/")[1].split(";")[0].lower() if ";" in image_base64 else "jpeg"
+ clean_base64 = image_base64.replace("data:image/", "").replace(";base64", "")
+
+ # If no format detected from prefix, try to detect from image data
+ if detected_format == "jpeg" or not detected_format:
+ try:
+ import base64 as b64
+ from PIL import Image
+ import io
+ image_data = b64.b64decode(clean_base64)
+ img = Image.open(io.BytesIO(image_data))
+ if img.format:
+ detected_format = img.format.lower()
+ # Normalize format names
+ if detected_format in ["jpg", "jpeg"]:
+ detected_format = "jpeg"
+ except Exception:
+ # If detection fails, keep default
+ pass
+
+ # Normalize format for data URI (OpenAI accepts: jpeg, png, gif, webp)
+ format_map = {
+ "jpg": "jpeg",
+ "jpeg": "jpeg",
+ "png": "png",
+ "gif": "gif",
+ "webp": "webp",
+ "bmp": "png", # Convert BMP to PNG (OpenAI doesn't support BMP)
+ "tiff": "png", # Convert TIFF to PNG
+ "tif": "png"
+ }
+ final_format = format_map.get(detected_format, "jpeg")
+
+ final_url = f"data:image/{final_format};base64,{clean_base64}"
+
+ # #region agent log
+ try:
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({
+ "id": f"log_{int(_time_debug.time() * 1000)}",
+ "timestamp": int(_time_debug.time() * 1000),
+ "location": "vision_llm.py:_generate_openai",
+ "message": "Image URL AFTER processing",
+ "data": {
+ "detected_format": detected_format,
+ "final_format": final_format,
+ "clean_base64_length": len(clean_base64),
+ "final_url_length": len(final_url),
+ "final_url_prefix": final_url[:60]
+ },
+ "sessionId": "debug-session",
+ "runId": "run1",
+ "hypothesisId": "A,B"
+ }) + "\n")
+ except Exception:
+ pass
+ # #endregion
+
+ messages[1]["content"].append({
+ "type": "image_url",
+ "image_url": {
+ "url": final_url
+ }
+ })
+
+ payload = {
+ "model": self.model_name,
+ "messages": messages,
+ # "temperature": self.temperature,
+ # "max_tokens": self.max_tokens,
+ "top_p": self.top_p,
+ "frequency_penalty": self.frequency_penalty,
+ "presence_penalty": self.presence_penalty
+ }
+
+ self.logger.debug(f"Sending request to {self.base_url} with model {self.model_name}")
+
+ try:
+ self.logger.debug(f"Sending request to {self.model_name}")
+
+ # Make the API request with retry
+ response = self.session.post(
+ self.base_url,
+ headers=headers,
+ json=payload,
+ timeout=300
+ )
+
+ # Handle rate limiting
+ if response.status_code == 429:
+ retry_after = int(response.headers.get('Retry-After', 5))
+ self.logger.warning(f"Rate limited. Retrying after {retry_after} seconds...")
+ time.sleep(retry_after)
+ return self._generate_openai(system_prompt, user_prompt, image_base64, **generation_kwargs)
+
+ response.raise_for_status()
+
+ result = response.json()
+ self.logger.debug(f"Received response from {self.model_name}")
+
+ # Extract and validate the response
+ try:
+ message = result["choices"][0]["message"]
+ llm_response_content = message["content"]
+
+ # Log token usage if available
+ if "usage" in result:
+ usage = result["usage"]
+ self.logger.info(
+ f"Tokens used - Prompt: {usage.get('prompt_tokens', 'N/A')}, "
+ f"Completion: {usage.get('completion_tokens', 'N/A')}, "
+ f"Total: {usage.get('total_tokens', 'N/A')}"
+ )
+
+ # Try to parse JSON if the response looks like JSON
+ if isinstance(llm_response_content, str) and (
+ llm_response_content.startswith('{') or
+ llm_response_content.startswith('[')
+ ):
+ try:
+ return json.loads(llm_response_content)
+ except json.JSONDecodeError:
+ pass
+
+ # Default response format
+ return {
+ "content": llm_response_content,
+ "role": message.get("role", "assistant"),
+ "model": self.model_name,
+ "provider": self.provider.value
+ }
+
+ except (KeyError, IndexError) as e:
+ self.logger.error(f"Unexpected response format: {result}")
+ raise GepaLLMError(
+ f"Unexpected response format from {self.provider} API",
+ ErrorType.API_ERROR,
+ response.status_code
+ ) from e
+
+ except requests.exceptions.HTTPError as e:
+ status_code = e.response.status_code if hasattr(e, 'response') else None
+ error_msg = f"HTTP error {status_code} from {self.provider} API"
+
+ try:
+ error_data = e.response.json()
+ error_msg = error_data.get('error', {}).get('message', error_msg)
+ except Exception:
+ error_msg = str(e)
+
+ self.logger.error(f"{error_msg}: {error_data if 'error_data' in locals() else str(e)}")
+ raise GepaLLMError(
+ error_msg,
+ ErrorType.RATE_LIMIT if status_code == 429 else ErrorType.API_ERROR,
+ status_code
+ ) from e
+
+ except requests.exceptions.Timeout:
+ self.logger.error(f"Request to {self.provider} API timed out after {self.timeout} seconds")
+ raise GepaLLMError(
+ f"Request timed out after {self.timeout} seconds",
+ ErrorType.TIMEOUT
+ )
+
+ except requests.exceptions.RequestException as e:
+ self.logger.error(f"Network error: {str(e)}")
+ raise GepaLLMError(
+ f"Network error: {str(e)}",
+ ErrorType.NETWORK_ERROR
+ ) from e
+
+ except Exception as e:
+ self.logger.error(f"Unexpected error: {str(e)}", exc_info=True)
+ raise GepaLLMError(
+ f"Unexpected error: {str(e)}",
+ ErrorType.API_ERROR
+ ) from e
+
+ def _generate_google(
+ self,
+ system_prompt: str,
+ user_prompt: str,
+ image_base64: Optional[str] = None,
+ **generation_kwargs
+ ) -> Dict[str, Any]:
+ """
+ Generate response using Google Gemini API with configured parameters.
+
+ Args:
+ system_prompt: System instructions for the model
+ user_prompt: User's input prompt
+ image_base64: Optional base64 encoded image
+
+ Returns:
+ Dictionary containing the API response
+
+ Raises:
+ GepaLLMError: If API call fails
+ """
+ try:
+ import google.generativeai as genai
+ import base64
+ from PIL import Image
+ import io
+ except ImportError as e:
+ raise GepaLLMError(
+ f"Required dependencies for Google Gemini not installed: {str(e)}. "
+ f"Please install: pip install google-generativeai Pillow",
+ ErrorType.VALIDATION_ERROR
+ ) from e
+
+ # Configure Gemini
+ genai.configure(api_key=self.api_key)
+
+ # Use the model name directly as specified by the user
+ # No hardcoded mappings or restrictions - fully configurable
+ # The Gemini API will validate if the model exists
+ gemini_model_name = self.model_name
+
+ try:
+ model = genai.GenerativeModel(gemini_model_name)
+ except Exception as e:
+ raise GepaLLMError(
+ f"Failed to initialize Gemini model {gemini_model_name}: {str(e)}",
+ ErrorType.API_ERROR
+ ) from e
+
+ # Prepare content
+ content_parts = []
+
+ # Add system prompt and user prompt
+ full_prompt = f"{system_prompt}\n\n{user_prompt}"
+ content_parts.append(full_prompt)
+
+ # Add image if provided
+ if image_base64:
+ # #region agent log
+ import json as _json_debug
+ import time as _time_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ try:
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({
+ "id": f"log_{int(_time_debug.time() * 1000)}",
+ "timestamp": int(_time_debug.time() * 1000),
+ "location": "vision_llm.py:_generate_google",
+ "message": "Image base64 BEFORE processing (Google)",
+ "data": {
+ "image_base64_length": len(image_base64) if image_base64 else 0,
+ "has_data_uri_prefix": image_base64.startswith("data:image") if image_base64 else False,
+ "prefix": image_base64[:50] if image_base64 and len(image_base64) > 50 else image_base64,
+ "is_none": image_base64 is None,
+ "is_empty": image_base64 == "" if image_base64 else True
+ },
+ "sessionId": "debug-session",
+ "runId": "run1",
+ "hypothesisId": "A,C,D"
+ }) + "\n")
+ except Exception:
+ pass
+ # #endregion
+
+ try:
+ # Strip data URI prefix if present (hypothesis A fix)
+ clean_base64 = image_base64
+ if image_base64.startswith("data:image"):
+ # Extract just the base64 part after the comma
+ if "," in image_base64:
+ clean_base64 = image_base64.split(",", 1)[1]
+ else:
+ clean_base64 = image_base64.replace("data:image/", "").replace(";base64", "")
+
+ # Decode base64 image
+ image_data = base64.b64decode(clean_base64)
+ image = Image.open(io.BytesIO(image_data))
+ content_parts.append(image)
+ self.logger.debug(f"Added image to Gemini request")
+ except Exception as e:
+ self.logger.warning(f"Failed to process image for Gemini: {str(e)}")
+ # Continue without image rather than failing
+
+ self.logger.debug(f"Sending request to Gemini model {gemini_model_name}")
+
+ try:
+ # Generate response with retry logic
+ max_retries = 3
+ for attempt in range(max_retries):
+ try:
+ # Configure generation parameters
+ generation_config = genai.types.GenerationConfig(
+ temperature=self.temperature,
+ max_output_tokens=self.max_tokens,
+ top_p=self.top_p,
+ )
+
+ response = model.generate_content(
+ content_parts,
+ generation_config=generation_config
+ )
+
+ # Check if response was blocked
+ if response.prompt_feedback and response.prompt_feedback.block_reason:
+ raise GepaLLMError(
+ f"Gemini blocked the prompt: {response.prompt_feedback.block_reason}",
+ ErrorType.VALIDATION_ERROR
+ )
+
+ # Check if response was blocked
+ if not response.text:
+ if response.candidates and response.candidates[0].finish_reason:
+ finish_reason = response.candidates[0].finish_reason
+ if finish_reason == genai.types.FinishReason.SAFETY:
+ raise GepaLLMError(
+ "Gemini response blocked due to safety concerns",
+ ErrorType.VALIDATION_ERROR
+ )
+ elif finish_reason == genai.types.FinishReason.RECITATION:
+ raise GepaLLMError(
+ "Gemini response blocked due to recitation concerns",
+ ErrorType.VALIDATION_ERROR
+ )
+ raise GepaLLMError(
+ "Gemini returned empty response",
+ ErrorType.API_ERROR
+ )
+
+ self.logger.debug(f"Received response from Gemini model {gemini_model_name}")
+
+ # Log usage information if available
+ if hasattr(response, 'usage_metadata') and response.usage_metadata:
+ usage = response.usage_metadata
+ self.logger.info(
+ f"Tokens used - Prompt: {usage.prompt_token_count}, "
+ f"Completion: {usage.candidates_token_count}, "
+ f"Total: {usage.total_token_count}"
+ )
+
+ # Try to parse JSON if the response looks like JSON
+ response_text = response.text
+ if isinstance(response_text, str) and (
+ response_text.startswith('{') or
+ response_text.startswith('[')
+ ):
+ try:
+ return json.loads(response_text)
+ except json.JSONDecodeError:
+ pass
+
+ # Default response format
+ return {
+ "content": response_text,
+ "role": "assistant",
+ "model": gemini_model_name,
+ "provider": "google"
+ }
+
+ except Exception as e:
+ if attempt < max_retries - 1:
+ self.logger.warning(f"Gemini API attempt {attempt + 1} failed: {str(e)}. Retrying...")
+ time.sleep(2 ** attempt) # Exponential backoff
+ continue
+ else:
+ raise
+
+ except GepaLLMError:
+ raise
+ except Exception as e:
+ self.logger.error(f"Unexpected error with Gemini API: {str(e)}")
+ raise GepaLLMError(
+ f"Gemini API error: {str(e)}",
+ ErrorType.API_ERROR
+ ) from e
diff --git a/src/gepa_optimizer/models/__init__.py b/src/gepa_optimizer/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1afaf5b607bb0fdddf1d62b08d6348a034e5f8a0
--- /dev/null
+++ b/src/gepa_optimizer/models/__init__.py
@@ -0,0 +1,15 @@
+"""
+Models module for GEPA Optimizer
+"""
+
+from .config import ModelConfig, OptimizationConfig
+from .dataset import DatasetItem
+from .result import OptimizationResult, OptimizedResult
+
+__all__ = [
+ "ModelConfig",
+ "OptimizationConfig",
+ "DatasetItem",
+ "OptimizationResult",
+ "OptimizedResult"
+]
diff --git a/src/gepa_optimizer/models/config.py b/src/gepa_optimizer/models/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f6193d450b891b1d66255966b843fecbc45d25c
--- /dev/null
+++ b/src/gepa_optimizer/models/config.py
@@ -0,0 +1,488 @@
+"""
+Configuration models for GEPA Optimizer
+"""
+
+import os
+from dataclasses import dataclass, field
+from typing import List, Optional, Dict, Any, Union, Tuple
+
@dataclass
class ModelConfig:
    """Provider-agnostic configuration for a single LLM endpoint."""
    provider: str  # Required, e.g. "openai", "anthropic", "huggingface", "vllm"
    model_name: str  # Required: the provider's model identifier
    api_key: str  # Required: credential for the provider
    base_url: Optional[str] = None  # Optional custom endpoint URL
    temperature: float = 0.7
    max_tokens: int = 2048
    top_p: float = 1.0
    frequency_penalty: float = 0.0
    presence_penalty: float = 0.0

    def __post_init__(self):
        """Reject configurations missing any of the mandatory fields."""
        if not self.provider:
            raise ValueError("Provider is required (e.g., 'openai', 'anthropic', 'huggingface')")
        if not self.model_name:
            raise ValueError("Model name is required (e.g., 'gpt-4', 'claude-3-opus')")
        if not self.api_key:
            raise ValueError(f"API key is required for {self.provider} provider")

    @classmethod
    def from_string(cls, model_string: str) -> 'ModelConfig':
        """Build a config from 'provider/model' (bare model names default to OpenAI)."""
        provider, sep, remainder = model_string.partition("/")
        if not sep:
            # No provider prefix given — assume OpenAI.
            provider, remainder = "openai", model_string

        # Resolve the credential from the environment.
        key = cls._get_api_key_for_provider(provider)
        if not key:
            raise ValueError(
                f"No API key found for {provider}. Please set {provider.upper()}_API_KEY environment variable"
            )

        return cls(provider=provider, model_name=remainder, api_key=key)

    @classmethod
    def from_dict(cls, config_dict: dict) -> 'ModelConfig':
        """Build a config from a plain dictionary of field values."""
        return cls(**config_dict)

    def to_dict(self) -> dict:
        """Serialize every field (including the API key) to a dictionary."""
        return {
            name: getattr(self, name)
            for name in (
                'provider', 'model_name', 'api_key', 'base_url',
                'temperature', 'max_tokens', 'top_p',
                'frequency_penalty', 'presence_penalty',
            )
        }

    @staticmethod
    def _get_api_key_for_provider(provider: str) -> Optional[str]:
        """Look up the provider's API key in the environment (None if unset)."""
        known_env_vars = {
            "openai": "OPENAI_API_KEY",
            "anthropic": "ANTHROPIC_API_KEY",
            "huggingface": "HUGGINGFACE_API_KEY",
            "cohere": "COHERE_API_KEY",
            "ai21": "AI21_API_KEY",
            "together": "TOGETHER_API_KEY",
            "replicate": "REPLICATE_API_TOKEN",
            "groq": "GROQ_API_KEY",
            "ollama": "OLLAMA_API_KEY",
        }
        # Unknown providers fall back to the generic <PROVIDER>_API_KEY pattern.
        env_var = known_env_vars.get(provider.lower(), f"{provider.upper()}_API_KEY")
        return os.getenv(env_var)
+
@dataclass
class DataSplitConfig:
    """Configuration for splitting a dataset into train/val/test sets.

    Adaptive splitting: with the default ``small_dataset_strategy='adaptive'``
    the ratios below are overridden based on dataset size:

    - fewer than 15 samples:  70/25/5  (prioritize validation for reliable ranking)
    - 15-49 samples:          60/20/20 (balanced)
    - 50 or more samples:     70/15/15 (more training data)
    """

    # Default split ratios (must sum to 1.0); the adaptive strategy may override.
    train_ratio: float = 0.6  # training share (reflection / feedback examples)
    val_ratio: float = 0.2    # validation share (Pareto candidate selection)
    test_ratio: float = 0.2   # held-out final evaluation share

    # Lower bounds on the number of samples per split.
    min_train_samples: int = 3
    min_val_samples: int = 3  # validation needs enough samples for reliable scores
    min_test_samples: int = 1  # test set is used only once, so one sample suffices

    # How to handle datasets too small for the configured ratios:
    # one of 'adaptive', 'duplicate_val', 'no_test', 'error'.
    small_dataset_strategy: str = 'adaptive'

    def __post_init__(self):
        """Validate the ratios and the small-dataset strategy."""
        ratio_sum = self.train_ratio + self.val_ratio + self.test_ratio
        if not (0.99 <= ratio_sum <= 1.01):  # tolerate small floating point error
            raise ValueError(
                f"Split ratios must sum to 1.0, got {ratio_sum:.3f} "
                f"(train={self.train_ratio}, val={self.val_ratio}, test={self.test_ratio})"
            )

        if self.train_ratio <= 0 or self.val_ratio <= 0 or self.test_ratio < 0:
            raise ValueError("Split ratios must be positive (test_ratio can be 0 to disable)")

        if self.small_dataset_strategy not in {'adaptive', 'duplicate_val', 'no_test', 'error'}:
            raise ValueError(
                f"Invalid small_dataset_strategy: {self.small_dataset_strategy}. "
                f"Must be 'adaptive', 'duplicate_val', 'no_test', or 'error'"
            )

    def get_adaptive_ratios(self, dataset_size: int) -> Tuple[float, float, float]:
        """Return (train, val, test) ratios appropriate for *dataset_size*.

        The validation set is critical — it is used for every candidate
        evaluation — so small datasets allocate proportionally more to it.
        """
        if dataset_size < 15:
            return (0.70, 0.25, 0.05)
        if dataset_size < 50:
            return (0.60, 0.20, 0.20)
        return (0.70, 0.15, 0.15)

    def get_split_indices(self, dataset_size: int) -> Tuple[int, int, int, int]:
        """Compute (train_end, val_end, test_end, dataset_size) slice boundaries.

        Uses adaptive ratios when the strategy is 'adaptive', otherwise the
        configured ratios, and enforces the per-split minimums.

        Raises:
            ValueError: If the dataset is too small for the configured splits
                and the strategy is 'error'.
        """
        if self.small_dataset_strategy == 'adaptive':
            train_ratio, val_ratio, test_ratio = self.get_adaptive_ratios(dataset_size)
        else:
            train_ratio, val_ratio, test_ratio = (
                self.train_ratio, self.val_ratio, self.test_ratio
            )

        minimum_needed = self.min_train_samples + self.min_val_samples
        if dataset_size < minimum_needed and self.small_dataset_strategy == 'error':
            raise ValueError(
                f"Dataset too small ({dataset_size} samples). "
                f"Need at least {minimum_needed} samples."
            )

        # Ideal cut points given the chosen ratios, bounded by the minimums.
        train_end = max(self.min_train_samples, int(dataset_size * train_ratio))
        val_end = train_end + max(self.min_val_samples, int(dataset_size * val_ratio))

        if val_end >= dataset_size:
            # Not enough samples remain for a test split; shrink per strategy.
            if self.small_dataset_strategy in {'adaptive', 'duplicate_val'}:
                # Keep the minimum validation samples, remainder (if any) is test.
                val_end = min(dataset_size, train_end + self.min_val_samples)
                test_end = dataset_size
            elif self.small_dataset_strategy == 'no_test':
                # Skip the test split entirely for small datasets.
                val_end = dataset_size
                test_end = dataset_size
            else:  # 'error'
                raise ValueError(
                    f"Dataset too small ({dataset_size} samples) for train/val/test split. "
                    f"Need at least {self.min_train_samples + self.min_val_samples + self.min_test_samples} samples."
                )
        else:
            test_end = dataset_size

        return train_end, val_end, test_end, dataset_size
+
@dataclass
class OptimizationConfig:
    """Configuration class for the GEPA optimization process.

    The core models and budget parameters are deliberately required (no
    defaults): the user must choose models, iteration/metric-call budgets
    and batch size. String model specs such as "openai/gpt-4" are resolved
    to :class:`ModelConfig` objects in ``__post_init__``.
    """

    # Core models - REQUIRED. Either "provider/model" strings (API key read
    # from the environment) or explicit ModelConfig instances.
    model: Union[str, ModelConfig]
    reflection_model: Union[str, ModelConfig]

    # Optimization budget - REQUIRED; the user decides based on cost/memory.
    max_iterations: int
    max_metric_calls: int
    batch_size: int

    # Dataset splitting configuration (train/val/test).
    data_split: DataSplitConfig = field(default_factory=DataSplitConfig)

    # Number of examples shown per reflection step (separate from the
    # evaluation batch_size; intentionally small).
    reflection_examples: int = 3

    # Optional optimization settings with sensible fallbacks.
    early_stopping: bool = True
    learning_rate: float = 0.01

    # Multi-objective optimization.
    multi_objective: bool = False
    objectives: List[str] = field(default_factory=lambda: ["accuracy"])

    # Advanced settings.
    custom_metrics: Optional[Dict[str, Any]] = None
    use_cache: bool = True
    parallel_evaluation: bool = False

    # Backwards compatibility (deprecated): use data_split instead.
    train_split_ratio: Optional[float] = None
    min_dataset_size: int = 2

    # Cost and budget - user controlled.
    max_cost_usd: Optional[float] = None
    timeout_seconds: Optional[int] = None

    # GEPA-specific optimization parameters (mirroring the GEPA library).
    candidate_selection_strategy: str = 'pareto'  # Pareto candidate selection
    skip_perfect_score: bool = False  # set True to stop early on perfect scores
    reflection_minibatch_size: Optional[int] = None  # defaults to reflection_examples
    perfect_score: float = 1.0  # score treated as "perfect"
    module_selector: str = 'round_robin'  # component selection strategy
    verbose: bool = True  # enable detailed GEPA logging

    # Evaluate the final prompt on the held-out test set.
    evaluate_on_test: bool = True

    # LLEGO genetic operator parameters (optional - for faster convergence).
    # Based on the ICLR 2025 paper "Decision Tree Induction Through LLMs via
    # Semantically-Aware Evolution"; tuned for small datasets (6-10 samples).
    use_llego_operators: bool = False

    # Hybrid mode: combine GEPA reflection + LLEGO operators. When both are
    # enabled, candidates are generated from BOTH sources for diversity.
    enable_gepa_reflection_with_llego: bool = False
    num_gepa_reflection_candidates: int = 3  # GEPA reflection candidates per iteration (valid range 1-5)

    # Fitness-guided crossover parameters.
    alpha: float = 0.05  # fitness extrapolation: target 5% above the best parent
    n_crossover: int = 2  # offspring produced by crossover per iteration

    # Diversity-guided mutation parameters.
    tau: float = 8.0  # diversity temperature (moderate exploration/exploitation)
    nu: int = 3  # parent arity (3 parents suits small populations)
    n_mutation: int = 2  # offspring produced by mutation per iteration

    # Population management (for genetic operators).
    population_size: int = 8  # size of the prompt population

    # LLM-as-Judge configuration.
    use_llm_as_judge: bool = True  # enable LLM-as-Judge feedback
    llm_as_judge_threshold: float = 0.8  # judge is used for scores below this
    llm_as_judge_model: Optional[ModelConfig] = None  # defaults to reflection_model

    # Logging configuration.
    log_level: str = "INFO"  # one of "DEBUG", "INFO", "WARNING", "ERROR"

    def __post_init__(self):
        """Validate and normalize the configuration after initialization.

        Handles the deprecated ``train_split_ratio``, converts string model
        specs to ModelConfig, defaults ``reflection_minibatch_size``, and
        runs requirement/range validation.
        """
        # Backwards compatibility: convert the old 2-way split ratio into a
        # 3-way DataSplitConfig (remainder shared equally by val and test).
        if self.train_split_ratio is not None and self.train_split_ratio != 0.8:
            import warnings
            warnings.warn(
                "train_split_ratio is deprecated. Use data_split=DataSplitConfig(...) instead. "
                "Converting to 3-way split with your ratio.",
                DeprecationWarning,
                stacklevel=2
            )
            remainder = 1.0 - self.train_split_ratio
            self.data_split = DataSplitConfig(
                train_ratio=self.train_split_ratio,
                val_ratio=remainder * 0.5,
                test_ratio=remainder * 0.5
            )

        # Convert string models to ModelConfig objects.
        self.model = self._parse_model_config(self.model, "model")
        self.reflection_model = self._parse_model_config(self.reflection_model, "reflection_model")

        # Default the reflection minibatch size to reflection_examples.
        if self.reflection_minibatch_size is None:
            self.reflection_minibatch_size = self.reflection_examples

        # Validate required parameters.
        self._validate_required_params()

        # Validate value ranges.
        self._validate_ranges()

    def _parse_model_config(self, model: Union[str, ModelConfig], field_name: str) -> ModelConfig:
        """Parse a string model specification into a ModelConfig.

        Accepts "provider/model-name" (a bare model name defaults to OpenAI)
        and resolves the API key from the environment; ModelConfig inputs
        are returned unchanged.
        """
        if isinstance(model, ModelConfig):
            return model

        if isinstance(model, str):
            # Parse "provider/model-name" format.
            if "/" in model:
                provider, model_name = model.split("/", 1)
            else:
                # Default to openai if no provider specified.
                provider = "openai"
                model_name = model

            # Try to get the API key from the environment.
            api_key = self._get_api_key_for_provider(provider)
            if not api_key:
                raise ValueError(
                    f"No API key found for {provider}. Please set environment variable "
                    f"or provide ModelConfig with api_key for {field_name}"
                )

            return ModelConfig(
                provider=provider,
                model_name=model_name,
                api_key=api_key
            )

        raise ValueError(f"{field_name} must be either a string or ModelConfig object")

    def _get_api_key_for_provider(self, provider: str) -> Optional[str]:
        """Get the API key for *provider* from environment variables."""
        return ModelConfig._get_api_key_for_provider(provider)

    def _validate_required_params(self):
        """Ensure the user explicitly supplied every required budget parameter."""
        required_fields = {
            "max_iterations": self.max_iterations,
            "max_metric_calls": self.max_metric_calls,
            "batch_size": self.batch_size,
        }

        for field_name, value in required_fields.items():
            if value is None:
                raise ValueError(f"{field_name} is required and must be specified by user")

    def _validate_ranges(self):
        """Validate numeric parameter ranges and option values."""
        if self.max_iterations <= 0:
            raise ValueError("max_iterations must be positive")

        if self.max_metric_calls <= 0:
            raise ValueError("max_metric_calls must be positive")

        if self.batch_size <= 0:
            raise ValueError("batch_size must be positive")

        if self.reflection_examples <= 0 or self.reflection_examples > 10:
            raise ValueError("reflection_examples must be between 1 and 10 (recommended: 2-5)")

        if self.reflection_minibatch_size <= 0:
            raise ValueError("reflection_minibatch_size must be positive")

        if hasattr(self.model, 'max_tokens') and self.model.max_tokens <= 0:
            raise ValueError("model.max_tokens must be a positive integer")

        # Validate hybrid mode parameters: hybrid mode requires the LLEGO
        # operators to be enabled as well.
        if self.enable_gepa_reflection_with_llego and not self.use_llego_operators:
            raise ValueError("enable_gepa_reflection_with_llego requires use_llego_operators=True")

        if self.num_gepa_reflection_candidates <= 0 or self.num_gepa_reflection_candidates > 5:
            raise ValueError("num_gepa_reflection_candidates must be between 1 and 5 (recommended: 3 for balanced exploration)")

        # Validate log_level.
        valid_log_levels = ["DEBUG", "INFO", "WARNING", "ERROR"]
        if self.log_level.upper() not in valid_log_levels:
            raise ValueError(f"log_level must be one of {valid_log_levels}, got: {self.log_level}")

    def validate_api_connectivity(self) -> Dict[str, bool]:
        """Check that both models carry the information needed to call their APIs.

        NOTE(review): this does not actually hit the network yet — it only
        verifies that provider, model name and API key are present.
        """
        results = {}

        for model_name, model_config in [("model", self.model), ("reflection_model", self.reflection_model)]:
            try:
                # This would be implemented to actually test the API.
                # For now, just check if we have the required info.
                if model_config.api_key and model_config.provider and model_config.model_name:
                    results[model_name] = True
                else:
                    results[model_name] = False
            except Exception:
                results[model_name] = False

        return results

    def get_estimated_cost(self) -> Dict[str, Any]:
        """Return a rough cost-estimation breakdown for this configuration.

        Placeholder: provider pricing is not looked up yet; only the call
        counts that drive cost are reported.
        """
        return {
            "max_calls": self.max_metric_calls,
            "estimated_cost_range": "To be calculated based on provider pricing",
            "cost_factors": {
                "model_calls": self.max_metric_calls,
                "reflection_calls": self.max_iterations,
                "batch_size": self.batch_size
            }
        }

    @classmethod
    def create_example_config(cls, provider: str = "openai") -> str:
        """Return example configuration code for *provider* as a string.

        Known providers: "openai", "anthropic", "mixed"; any other value
        falls back to the OpenAI example.
        """
        examples = {
            "openai": '''
# Example OpenAI Configuration
config = OptimizationConfig(
    model="openai/gpt-4-turbo",  # or ModelConfig(...)
    reflection_model="openai/gpt-4-turbo",
    max_iterations=50,        # Your choice based on budget
    max_metric_calls=300,     # Your choice based on budget
    batch_size=8,             # Your choice based on memory
    early_stopping=True,
    learning_rate=0.01
)
''',
            "anthropic": '''
# Example Anthropic Configuration
config = OptimizationConfig(
    model=ModelConfig(
        provider="anthropic",
        model_name="claude-3-opus-20240229",
        api_key="your-anthropic-key",
        temperature=0.7
    ),
    reflection_model="anthropic/claude-3-sonnet-20240229",
    max_iterations=30,
    max_metric_calls=200,
    batch_size=4
)
''',
            "mixed": '''
# Example Mixed Providers Configuration
config = OptimizationConfig(
    model="openai/gpt-4-turbo",                 # Main model
    reflection_model="anthropic/claude-3-opus", # Reflection model
    max_iterations=25,
    max_metric_calls=250,
    batch_size=6,
    max_cost_usd=100.0,      # Budget limit
    timeout_seconds=3600     # 1 hour limit
)
'''
        }

        return examples.get(provider, examples["openai"])
diff --git a/src/gepa_optimizer/models/dataset.py b/src/gepa_optimizer/models/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..82bf5264c258f4e6cd5c89dbde63f78693761c45
--- /dev/null
+++ b/src/gepa_optimizer/models/dataset.py
@@ -0,0 +1,89 @@
+"""
+Dataset models for GEPA Optimizer
+"""
+
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+import uuid
+
@dataclass
class DatasetItem:
    """A single example in a dataset, with metadata and quality tracking."""

    # Unique identifier for the item.
    item_id: str = field(default_factory=lambda: str(uuid.uuid4()))

    # Core data.
    input_data: Any = ""
    expected_output: Optional[str] = None
    image_base64: Optional[str] = None

    # Free-form metadata and labels.
    metadata: Dict[str, Any] = field(default_factory=dict)
    tags: List[str] = field(default_factory=list)

    # Paths of any files this item references.
    file_paths: List[str] = field(default_factory=list)

    # Quality tracking.
    quality_score: float = 1.0  # must lie in [0, 1]
    is_validated: bool = False
    validation_notes: List[str] = field(default_factory=list)

    def __post_init__(self):
        """Reject out-of-range quality scores."""
        if self.quality_score < 0 or self.quality_score > 1:
            raise ValueError("quality_score must be between 0 and 1")

    def add_tag(self, tag: str):
        """Attach *tag* to the item unless it is already present."""
        if tag not in self.tags:
            self.tags.append(tag)

    def mark_validated(self, notes: Optional[List[str]] = None):
        """Flag the item as validated, optionally recording review notes."""
        self.is_validated = True
        if notes:
            self.validation_notes.extend(notes)
+
@dataclass
class ProcessedDataset:
    """A dataset that has been prepared for GEPA optimization."""

    # Identity.
    dataset_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    name: str = "Untitled Dataset"

    # Items and their train/validation partitions.
    items: List[DatasetItem] = field(default_factory=list)
    train_split: List[DatasetItem] = field(default_factory=list)
    val_split: List[DatasetItem] = field(default_factory=list)

    # Provenance and processing details.
    source_info: Dict[str, Any] = field(default_factory=dict)
    processing_stats: Dict[str, Any] = field(default_factory=dict)

    # Aggregate quality metrics (derived in __post_init__).
    total_items: int = 0
    validated_items: int = 0
    avg_quality_score: float = 0.0

    def __post_init__(self):
        """Recompute the aggregate counters from ``items``."""
        self.total_items = len(self.items)

        if self.items:
            self.validated_items = len([item for item in self.items if item.is_validated])
            scores = [item.quality_score for item in self.items]
            self.avg_quality_score = sum(scores) / len(scores)

    def get_stats(self) -> Dict[str, Any]:
        """Summarize size, validation coverage and quality of the dataset."""
        rate = self.validated_items / self.total_items if self.total_items > 0 else 0
        return {
            'total_items': self.total_items,
            'validated_items': self.validated_items,
            'validation_rate': rate,
            'avg_quality_score': self.avg_quality_score,
            'train_size': len(self.train_split),
            'val_size': len(self.val_split),
            'has_expected_outputs': sum(1 for item in self.items if item.expected_output),
        }
diff --git a/src/gepa_optimizer/models/result.py b/src/gepa_optimizer/models/result.py
new file mode 100644
index 0000000000000000000000000000000000000000..95d11cda56bf40a7faae1fea211bd572a9c4dbe5
--- /dev/null
+++ b/src/gepa_optimizer/models/result.py
@@ -0,0 +1,204 @@
+"""
+Result models for GEPA Optimizer
+"""
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Dict, Any, Optional, List
+import uuid
+
@dataclass
class OptimizationResult:
    """Full record of one optimization run, including all metadata."""

    # Unique identifier for the optimization session.
    session_id: str = field(default_factory=lambda: str(uuid.uuid4()))

    # The prompts before and after optimization.
    original_prompt: str = ""
    optimized_prompt: str = ""

    # Performance metrics.
    improvement_data: Dict[str, Any] = field(default_factory=dict)
    baseline_metrics: Dict[str, float] = field(default_factory=dict)
    final_metrics: Dict[str, float] = field(default_factory=dict)

    # Process metadata.
    optimization_time: float = 0.0  # seconds
    dataset_size: int = 0
    total_iterations: int = 0

    # Lifecycle: pending -> running -> completed | failed.
    status: str = "pending"
    error_message: Optional[str] = None

    # Timestamps.
    created_at: datetime = field(default_factory=datetime.now)
    completed_at: Optional[datetime] = None

    # Reflection history entries recorded during optimization.
    reflection_history: List[Dict[str, Any]] = field(default_factory=list)

    # Cost and resource usage.
    estimated_cost: Optional[float] = None
    api_calls_made: int = 0

    def mark_completed(self):
        """Transition to the 'completed' state and stamp the finish time."""
        self.completed_at = datetime.now()
        self.status = "completed"

    def mark_failed(self, error: str):
        """Transition to the 'failed' state, recording the error message."""
        self.completed_at = datetime.now()
        self.status = "failed"
        self.error_message = error
+
class OptimizedResult:
    """
    User-facing result class that provides a clean, read-only interface
    over an internal :class:`OptimizationResult`.
    """

    def __init__(self,
                 original_prompt: str = "",
                 optimized_prompt: str = "",
                 improvement_data: Optional[Dict[str, Any]] = None,
                 optimization_time: float = 0.0,
                 dataset_size: int = 0,
                 total_iterations: int = 0,
                 status: str = "pending",
                 error_message: Optional[str] = None,
                 detailed_result: Optional[OptimizationResult] = None,
                 session_id: Optional[str] = None):
        """
        Initialize OptimizedResult with individual parameters.

        Args:
            original_prompt: Original seed prompt
            optimized_prompt: Optimized prompt
            improvement_data: Performance improvement data
            optimization_time: Time taken for optimization (seconds)
            dataset_size: Size of dataset used
            total_iterations: Number of optimization iterations
            status: Optimization status ("pending", "running", "completed", "failed")
            error_message: Error message if failed
            detailed_result: Optional detailed OptimizationResult; when given,
                it takes precedence over the individual parameters above
            session_id: Optional session ID (a UUID is generated if omitted)
        """
        if improvement_data is None:
            improvement_data = {}

        if detailed_result is not None:
            # A full result was supplied: wrap it directly.
            self._result = detailed_result
        else:
            # Build an internal OptimizationResult from the scalar arguments.
            self._result = OptimizationResult(
                session_id=session_id or str(uuid.uuid4()),
                original_prompt=original_prompt,
                optimized_prompt=optimized_prompt,
                improvement_data=improvement_data,
                optimization_time=optimization_time,
                dataset_size=dataset_size,
                total_iterations=total_iterations,
                status=status,
                error_message=error_message
            )

    @property
    def prompt(self) -> str:
        """The optimized prompt ready for production use."""
        return self._result.optimized_prompt

    @property
    def original_prompt(self) -> str:
        """The original seed prompt for reference."""
        return self._result.original_prompt

    @property
    def session_id(self) -> str:
        """Unique session identifier."""
        return self._result.session_id

    @property
    def improvement_data(self) -> Dict[str, Any]:
        """Performance improvement data."""
        return self._result.improvement_data

    @property
    def status(self) -> str:
        """Optimization status."""
        return self._result.status

    @property
    def error_message(self) -> Optional[str]:
        """Error message if optimization failed."""
        return self._result.error_message

    @property
    def is_successful(self) -> bool:
        """Whether optimization completed successfully."""
        return (
            self._result.status == "completed" and
            self._result.error_message is None
        )

    @property
    def optimization_time(self) -> float:
        """Time taken for optimization in seconds."""
        return self._result.optimization_time

    @property
    def dataset_size(self) -> int:
        """Size of dataset used for optimization."""
        return self._result.dataset_size

    @property
    def total_iterations(self) -> int:
        """Total optimization iterations performed."""
        return self._result.total_iterations

    @property
    def estimated_cost(self) -> Optional[float]:
        """Estimated cost in USD."""
        return self._result.estimated_cost

    def get_improvement_summary(self) -> Dict[str, Any]:
        """Get a summary of improvements made."""
        summary = {
            'has_improvement': bool(self._result.improvement_data),
            'optimization_time': self.optimization_time,
            'iterations': self.total_iterations,
            'dataset_size': self.dataset_size
        }

        # Add the improvement percentage only if it was recorded.
        if 'improvement_percent' in self._result.improvement_data:
            summary['improvement_percent'] = self._result.improvement_data['improvement_percent']

        return summary

    def get_reflection_summary(self) -> Dict[str, Any]:
        """Get a summary of the reflection process (first 3 entries)."""
        if not self._result.reflection_history:
            return {'total_reflections': 0}

        return {
            'total_reflections': len(self._result.reflection_history),
            'reflection_points': [
                r.get('summary', 'No summary')
                for r in self._result.reflection_history[:3]  # First 3
            ]
        }

    def get_detailed_result(self) -> OptimizationResult:
        """Get the full detailed result for advanced users."""
        return self._result

    def __str__(self) -> str:
        """One-line summary with a status emoji."""
        # Fixed: the original emoji literal was mis-encoded and split across
        # lines, which broke the string. Restore proper status emojis.
        status_emoji = "\u2705" if self.is_successful else "\u274c" if self.status == "failed" else "\u23f3"
        return f"OptimizedResult({status_emoji} {self.status}, time={self.optimization_time:.2f}s)"

    def __repr__(self) -> str:
        return self.__str__()
diff --git a/src/gepa_optimizer/operators/__init__.py b/src/gepa_optimizer/operators/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2f3b6b5092b597b55a76fbabb6654f58e51f04d
--- /dev/null
+++ b/src/gepa_optimizer/operators/__init__.py
@@ -0,0 +1,45 @@
+"""
+LLEGO Genetic Operators for GEPA.
+
+This module provides genetic operators for prompt optimization:
+- FitnessGuidedCrossover: Combines high-performing prompts
+- DiversityGuidedMutation: Explores diverse variations
+- LLEGOIntegrationLayer: Manages the genetic algorithm workflow
+
+Based on: Decision Tree Induction Through LLMs via Semantically-Aware Evolution (ICLR 2025)
+"""
+
+# Base interfaces (SOLID: Interface Segregation)
+from .base_operator import (
+ BaseGeneticOperator,
+ BaseCrossoverOperator,
+ BaseMutationOperator,
+)
+
+# Data models
+from .models import (
+ PromptCandidate,
+ PromptMetadata,
+)
+
+# Concrete operators (SOLID: Single Responsibility)
+from .crossover import FitnessGuidedCrossover
+from .mutation import DiversityGuidedMutation
+
+# Integration layer
+from .llego_operators import LLEGOIntegrationLayer
+
+__all__ = [
+ # Base interfaces
+ 'BaseGeneticOperator',
+ 'BaseCrossoverOperator',
+ 'BaseMutationOperator',
+ # Data models
+ 'PromptCandidate',
+ 'PromptMetadata',
+ # Operators
+ 'FitnessGuidedCrossover',
+ 'DiversityGuidedMutation',
+ # Integration
+ 'LLEGOIntegrationLayer',
+]
diff --git a/src/gepa_optimizer/operators/base_operator.py b/src/gepa_optimizer/operators/base_operator.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff3a2772abec80e3800aeec1857dc371cccdb608
--- /dev/null
+++ b/src/gepa_optimizer/operators/base_operator.py
@@ -0,0 +1,107 @@
+"""
+Base Genetic Operator Interface.
+
+Defines the abstract interface for all genetic operators following
+the Interface Segregation Principle (ISP) of SOLID.
+"""
+
+from abc import ABC, abstractmethod
+from typing import List, Callable
+import logging
+
+logger = logging.getLogger(__name__)
+
+
class BaseGeneticOperator(ABC):
    """
    Abstract interface shared by every genetic operator.

    Concrete operators (crossover, mutation, ...) subclass this and provide
    both ``__call__`` (the operation itself) and ``_build_prompt`` (the LLM
    prompt that drives the operation).

    Design notes (SOLID):
    - Single Responsibility: one operator, one operation
    - Open/Closed: new behavior arrives via subclassing, not modification
    - Liskov Substitution: any subclass is a drop-in replacement
    - Interface Segregation: only two required methods
    - Dependency Inversion: operators depend on an abstract LLM callable
    """

    @abstractmethod
    def __call__(self, *args, **kwargs) -> str:
        """Run the genetic operation and return the new prompt text."""
        ...

    @abstractmethod
    def _build_prompt(self, *args, **kwargs) -> str:
        """Compose the LLM prompt that implements this operation."""
        ...
+
+
class BaseCrossoverOperator(BaseGeneticOperator):
    """
    Abstract interface for crossover operators.

    A crossover operator merges two or more parent prompts into an offspring
    prompt that inherits desirable traits from each parent.
    """

    @abstractmethod
    def __call__(
        self,
        parents: List,  # List[PromptCandidate]
        target_fitness: float,
        llm: Callable[[str], str]
    ) -> str:
        """
        Produce an offspring prompt from the given parents.

        Args:
            parents: Parent PromptCandidate objects to combine
            target_fitness: Fitness level the offspring should aim for
            llm: Callable mapping a prompt string to generated text

        Returns:
            str: Offspring prompt
        """
        ...
+
+
class BaseMutationOperator(BaseGeneticOperator):
    """
    Abstract interface for mutation operators.

    A mutation operator perturbs a single parent prompt to explore new
    regions of the search space.
    """

    @abstractmethod
    def __call__(
        self,
        parent,  # PromptCandidate
        population: List,  # List[PromptCandidate]
        llm: Callable[[str], str]
    ) -> str:
        """
        Create a variation of the parent prompt.

        Args:
            parent: PromptCandidate to mutate
            population: Current population, used for diversity guidance
            llm: Callable mapping a prompt string to generated text

        Returns:
            str: Mutated prompt
        """
        ...
+
diff --git a/src/gepa_optimizer/operators/crossover.py b/src/gepa_optimizer/operators/crossover.py
new file mode 100644
index 0000000000000000000000000000000000000000..fff82d7d619550b72e33299a8f78c2ccd28b7e48
--- /dev/null
+++ b/src/gepa_optimizer/operators/crossover.py
@@ -0,0 +1,120 @@
+"""
+Fitness-Guided Crossover Operator.
+
+Adapts LLEGO's fitness-guided crossover for text prompts.
+Based on: Decision Tree Induction Through LLMs via Semantically-Aware Evolution (ICLR 2025)
+"""
+
+from typing import List, Callable, TYPE_CHECKING
+import logging
+
+from .base_operator import BaseCrossoverOperator
+
+if TYPE_CHECKING:
+ from .models import PromptCandidate
+
+logger = logging.getLogger(__name__)
+
+
class FitnessGuidedCrossover(BaseCrossoverOperator):
    """
    Fitness-guided crossover for text prompts.

    Merges the highest-performing parent prompts into one offspring aimed at
    a target fitness level, relying on the LLM's semantic understanding of
    both parents.

    From the LLEGO paper: fitness-guided crossover exploits high-performing
    regions of the search space by combining parents toward a desired
    fitness level f* = f_max + alpha * (f_max - f_min).

    Reference: https://github.com/nicolashuynh/LLEGO
    """

    def __init__(self, alpha: float = 0.1):
        """
        Args:
            alpha: Fitness extrapolation parameter; larger values push the
                offspring target further above the best parent. Default 0.1
                per the LLEGO paper (aim ~10% beyond the best parent).
        """
        self.alpha = alpha
        logger.debug(f"FitnessGuidedCrossover initialized with ฮฑ={alpha}")

    def __call__(
        self,
        parents: List["PromptCandidate"],
        target_fitness: float,
        llm: Callable[[str], str]
    ) -> str:
        """
        Produce an offspring prompt from two or more parents.

        Args:
            parents: Parent candidates (at least 2 required)
            target_fitness: Fitness level the offspring should target
            llm: Language model callable

        Returns:
            str: Offspring prompt

        Raises:
            ValueError: If fewer than 2 parents provided
        """
        if len(parents) < 2:
            raise ValueError("Crossover requires at least 2 parents")

        # Rank best-first so the prompt template sees the strongest parents.
        ranked = sorted(parents, key=lambda cand: cand.fitness, reverse=True)

        logger.debug(f"Crossover: {len(parents)} parents, target fitness={target_fitness:.3f}")

        return llm(self._build_prompt(ranked, target_fitness))

    def _build_prompt(
        self,
        parents: List["PromptCandidate"],
        target_fitness: float
    ) -> str:
        """
        Compose the crossover instruction for the LLM.

        Args:
            parents: Parent candidates sorted best-first
            target_fitness: Fitness the offspring should target

        Returns:
            str: Prompt to send to the LLM
        """
        # Keep parent excerpts short to avoid tripping safety filters.
        MAX_PARENT_LENGTH = 350

        # Describe at most the top two parents.
        parent_descriptions = []
        for i, candidate in enumerate(parents[:2]):
            excerpt = candidate.prompt[:MAX_PARENT_LENGTH]
            if len(candidate.prompt) > MAX_PARENT_LENGTH:
                excerpt += "..."
            parent_descriptions.append(
                f"P{i+1} (f={candidate.fitness:.2f}): {excerpt}\n"
            )

        return f"""Combine these prompts into ONE improved version (target fitness: {target_fitness:.2f}).

{' '.join(parent_descriptions)}
Instructions:
1. Merge the best rules/principles from both parents
2. Organize logic clearly (e.g., "For X tasks: do Y", "If Z: then A")
3. Add structure to handle different cases systematically
4. Keep output format (Element: X, Description:, Reason:)
5. Max 600 chars

Output ONLY the combined prompt:"""
+
diff --git a/src/gepa_optimizer/operators/llego_operators.py b/src/gepa_optimizer/operators/llego_operators.py
new file mode 100644
index 0000000000000000000000000000000000000000..6be082eea011484ab510e6e63789db43a0c06ff6
--- /dev/null
+++ b/src/gepa_optimizer/operators/llego_operators.py
@@ -0,0 +1,364 @@
+"""
+LLEGO Integration Layer for GEPA.
+
+This module provides the integration layer that wraps LLEGO genetic operators
+for use with the GEPA optimization framework.
+
+Based on: Decision Tree Induction Through LLMs via Semantically-Aware Evolution (ICLR 2025)
+GitHub: https://github.com/nicolashuynh/LLEGO
+"""
+
+from typing import List, Callable, Dict, Any, Optional, Literal
+import numpy as np
+import logging
+
+# Import from modular files (SOLID: Single Responsibility)
+from .models import PromptCandidate, PromptMetadata
+from .crossover import FitnessGuidedCrossover
+from .mutation import DiversityGuidedMutation
+
+logger = logging.getLogger(__name__)
+
+
class LLEGOIntegrationLayer:
    """
    Integration layer that wraps LLEGO operators for GEPA.

    This class manages the genetic algorithm workflow:
    - Population initialization
    - Parent selection (fitness-based)
    - Crossover and mutation operations
    - Population management

    Design Principles:
    - Composition over inheritance (uses crossover_op, mutation_op)
    - Single Responsibility: Only manages GA workflow
    - Open/Closed: New operators can be added without modifying this class
    """

    def __init__(
        self,
        alpha: float = 0.05,
        tau: float = 10.0,
        nu: int = 4,
        population_size: int = 10,
        n_crossover: int = 2,
        n_mutation: int = 3
    ):
        """
        Initialize LLEGO integration layer.

        Args:
            alpha: Fitness extrapolation for crossover (default 0.05)
            tau: Diversity temperature for mutation
            nu: Parent arity for diversity sampling
            population_size: Maximum population size
            n_crossover: Number of crossover offspring per generation
            n_mutation: Number of mutation offspring per generation
        """
        # Operators are held by composition so alternatives can be swapped in.
        self.crossover_op = FitnessGuidedCrossover(alpha=alpha)
        self.mutation_op = DiversityGuidedMutation(tau=tau, nu=nu)
        self.population_size = population_size
        self.n_crossover = n_crossover
        self.n_mutation = n_mutation
        self.population: List[PromptCandidate] = []
        self.current_generation = 0

        # Track metadata for prompts generated in current generation
        # NOTE(review): keyed by prompt text, so two identical offspring in
        # one generation share (overwrite) a single metadata entry.
        self._generation_metadata: Dict[str, PromptMetadata] = {}

        logger.debug(f"LLEGO initialized: pop_size={population_size}, crossover={n_crossover}, mutation={n_mutation}")

    def initialize_population(self, seed_prompt: str, initial_fitness: float = 0.5):
        """Initialize population with seed prompt."""
        # Seed starts with a neutral default fitness (0.5) until evaluated.
        seed_candidate = PromptCandidate(
            prompt=seed_prompt,
            fitness=initial_fitness,
            metadata={
                'generation': 0,
                'operator': 'seed',
                'parent_indices': None,
                'parent_prompts': None,
                'target_fitness': None,
                'diversity_score': None,
                'sample_scores': None,
                'num_diverse_parents': None
            }
        )
        # Replaces any existing population (does not append).
        self.population = [seed_candidate]
        logger.debug(f"Population initialized with seed prompt ({len(seed_prompt)} chars)")

    def create_candidate_with_metadata(
        self,
        prompt: str,
        fitness: float,
        generation: int,
        operator: Literal['crossover', 'mutation'],
        parent_indices: Optional[List[int]] = None,
        parent_prompts: Optional[List[str]] = None,
        target_fitness: Optional[float] = None,
        diversity_score: Optional[float] = None,
        sample_scores: Optional[List[float]] = None,
        num_diverse_parents: Optional[int] = None
    ) -> PromptCandidate:
        """Create a PromptCandidate with properly populated metadata."""
        # Every metadata key is written explicitly (None when unknown) so
        # downstream consumers can rely on a uniform schema.
        return PromptCandidate(
            prompt=prompt,
            fitness=fitness,
            metadata={
                'generation': generation,
                'operator': operator,
                'parent_indices': parent_indices,
                'parent_prompts': parent_prompts,
                'target_fitness': target_fitness,
                'diversity_score': diversity_score,
                'sample_scores': sample_scores,
                'num_diverse_parents': num_diverse_parents
            }
        )

    def evolve_generation(
        self,
        llm: Callable[[str], str],
        pareto_front: List[PromptCandidate]
    ) -> List[str]:
        """
        Evolve one generation using LLEGO operators.

        When crossover cannot run (< 2 parents with scores), it is skipped.
        The caller should compensate by generating extra GEPA reflection candidates.

        Args:
            llm: Language model callable
            pareto_front: Current Pareto front (non-dominated prompts with scores)

        Returns:
            List of new prompt candidates to evaluate
        """
        new_prompts = []
        self.current_generation += 1
        # Metadata is reset each generation; get_prompt_metadata() only works
        # for prompts produced by the most recent evolve_generation() call.
        self._generation_metadata = {}

        # Track crossover status for caller to handle compensation
        self._crossover_skipped = False
        self._crossover_deficit = 0
        self._actual_crossover_count = 0

        logger.info(f"๐งฌ LLEGO Generation {self.current_generation}: pareto_front={len(pareto_front)}, population={len(self.population)}")

        # Crossover: Combine BEST parents (requires >= 2 parents WITH SCORES)
        if len(pareto_front) >= 2:
            # Sort by fitness - always use TOP scored parents for crossover
            sorted_front = sorted(pareto_front, key=lambda p: p.fitness, reverse=True)

            for i in range(self.n_crossover):
                # Always use top 2 highest-scored parents
                # NOTE(review): identical parents each round means offspring
                # may be near-duplicates; diversity comes from the mutations.
                parents = sorted_front[:2]
                target_fitness = self._calculate_target_fitness(parents)

                offspring = self.crossover_op(parents, target_fitness, llm)
                new_prompts.append(offspring)
                self._actual_crossover_count += 1

                # Store metadata with parent fitness info
                self._generation_metadata[offspring] = {
                    'generation': self.current_generation,
                    'operator': 'crossover',
                    'parent_indices': [self.population.index(p) for p in parents if p in self.population],
                    'parent_prompts': [p.prompt for p in parents],
                    'parent_fitnesses': [p.fitness for p in parents],
                    'target_fitness': target_fitness,
                    'diversity_score': None,
                    'sample_scores': None,
                    'num_diverse_parents': len(parents)
                }

                logger.info(f"  Oโโ{i+1}: Crossed top parents (f={parents[0].fitness:.3f} ร f={parents[1].fitness:.3f}) โ target f*={target_fitness:.3f}")
        else:
            # Signal that crossover was skipped - caller should compensate with GEPA
            self._crossover_skipped = True
            self._crossover_deficit = self.n_crossover
            logger.info(f"โ ๏ธ Crossover SKIPPED: need 2+ scored parents, have {len(pareto_front)}")
            logger.info(f"   โ Caller should compensate with {self._crossover_deficit} extra GEPA reflection candidates")

        # Mutation: Explore diverse variations (requires >= 1 parent)
        # Use pareto_front if available, otherwise fall back to population
        mutation_source = pareto_front if pareto_front else self.population

        if len(mutation_source) >= 1:
            for i in range(self.n_mutation):
                parent = self._select_parent_for_mutation(mutation_source)

                offspring = self.mutation_op(parent, self.population, llm)
                new_prompts.append(offspring)

                # Parent may come from the Pareto front without being in the
                # local population, in which case no index is recorded.
                parent_idx = self.population.index(parent) if parent in self.population else -1
                self._generation_metadata[offspring] = {
                    'generation': self.current_generation,
                    'operator': 'mutation',
                    'parent_indices': [parent_idx] if parent_idx >= 0 else None,
                    'parent_prompts': [parent.prompt],
                    'parent_fitness': parent.fitness,
                    'target_fitness': None,
                    'diversity_score': None,
                    'sample_scores': None,
                    'num_diverse_parents': min(self.mutation_op.nu, len(self.population))
                }

        # Counts are derived from the metadata map rather than tracked inline.
        crossover_count = len([p for p in new_prompts if self._generation_metadata.get(p, {}).get('operator') == 'crossover'])
        mutation_count = len([p for p in new_prompts if self._generation_metadata.get(p, {}).get('operator') == 'mutation'])

        logger.info(f"๐งฌ LLEGO Generated {len(new_prompts)} candidates: {crossover_count} crossover, {mutation_count} mutation")

        return new_prompts

    def get_prompt_metadata(self, prompt: str) -> Optional[PromptMetadata]:
        """Retrieve metadata for a prompt generated in the current generation."""
        # Only valid until the next evolve_generation() call resets the map.
        return self._generation_metadata.get(prompt)

    def _convert_gepa_pareto_to_candidates(
        self,
        gepa_pareto_front: List[Dict[str, Any]]
    ) -> List[PromptCandidate]:
        """
        Convert GEPA Pareto front entries to PromptCandidate format.

        Args:
            gepa_pareto_front: List of dicts with 'prompt', 'score', 'type', 'notation'

        Returns:
            List of PromptCandidate objects
        """
        if not gepa_pareto_front:
            return []

        # De-duplicate Pareto front
        # First occurrence of each prompt text wins.
        seen_prompts = set()
        deduplicated_front = []

        for entry in gepa_pareto_front:
            if isinstance(entry, dict) and 'prompt' in entry:
                prompt_text = entry['prompt']
                if prompt_text not in seen_prompts:
                    seen_prompts.add(prompt_text)
                    deduplicated_front.append(entry)

        candidates = []

        for idx, entry in enumerate(deduplicated_front):
            try:
                # Malformed entries are silently skipped (best-effort import).
                if not isinstance(entry, dict):
                    continue

                prompt = entry.get('prompt')
                if not prompt or not isinstance(prompt, str):
                    continue

                score = entry.get('score')
                if score is None:
                    continue

                try:
                    fitness = float(score)
                except (ValueError, TypeError):
                    continue

                candidate_type = entry.get('type', 'unknown')
                notation = entry.get('notation', 'S')

                metadata: PromptMetadata = {
                    'generation': self.current_generation,
                    'operator': 'gepa_pareto_front',
                    'parent_indices': None,
                    'parent_prompts': None,
                    'target_fitness': None,
                    'diversity_score': None,
                    'sample_scores': None,
                    'num_diverse_parents': None,
                    'candidate_type': candidate_type,
                    'notation': notation,
                    'prompt_length': len(prompt),
                    'word_count': len(prompt.split()),
                    'from_gepa_pareto': True
                }

                candidate = PromptCandidate(
                    prompt=prompt,
                    fitness=fitness,
                    metadata=metadata
                )

                candidates.append(candidate)

            except Exception as e:
                logger.error(f"Error converting Pareto entry #{idx+1}: {e}")
                continue

        return candidates

    def update_population(self, new_candidates: List[PromptCandidate]):
        """Update population with new evaluated candidates."""
        self.population.extend(new_candidates)

        # Remove duplicates
        # Prompts differing only by surrounding whitespace/quotes are treated
        # as the same candidate; the earliest (higher-ranked) copy is kept.
        seen_prompts = set()
        unique_population = []
        for p in self.population:
            normalized = p.prompt.strip().strip('"\'')
            if normalized not in seen_prompts:
                seen_prompts.add(normalized)
                unique_population.append(p)
        self.population = unique_population

        # Keep top population_size by fitness
        self.population.sort(key=lambda p: p.fitness, reverse=True)
        self.population = self.population[:self.population_size]

        if self.population:
            logger.debug(f"Population updated: {len(self.population)} candidates, best={self.population[0].fitness:.3f}")

    def _select_parents_for_crossover(self, pareto_front: List[PromptCandidate], k: int = 2) -> List[PromptCandidate]:
        """Select top-k parents for crossover."""
        sorted_front = sorted(pareto_front, key=lambda p: p.fitness, reverse=True)
        return sorted_front[:k]

    def _select_parent_for_mutation(self, pareto_front: List[PromptCandidate]) -> PromptCandidate:
        """Select a parent for mutation (fitness-proportionate)."""
        if len(pareto_front) == 1:
            return pareto_front[0]

        # Floor at 0.01 so zero/negative-fitness candidates keep a nonzero
        # selection probability (and the sum stays positive).
        fitnesses = np.array([p.fitness for p in pareto_front])
        fitnesses = np.maximum(fitnesses, 0.01)
        probs = fitnesses / fitnesses.sum()

        idx = np.random.choice(len(pareto_front), p=probs)
        return pareto_front[idx]

    def _calculate_target_fitness(self, parents: List[PromptCandidate]) -> float:
        """Calculate target fitness for crossover using LLEGO formula: f* = f_max + ฮฑ(f_max - f_min)"""
        fitnesses = [p.fitness for p in parents]
        f_max = max(fitnesses)
        f_min = min(fitnesses)

        # Extrapolate above the best parent, capped at the max score of 1.0.
        target_fitness = f_max + self.crossover_op.alpha * (f_max - f_min)
        return min(target_fitness, 1.0)

    def get_best_candidate(self) -> Optional[PromptCandidate]:
        """Get current best prompt."""
        if not self.population:
            return None
        return max(self.population, key=lambda p: p.fitness)

    def get_stats(self) -> Dict[str, Any]:
        """Get population statistics."""
        if not self.population:
            return {"population_size": 0, "best_fitness": 0.0, "avg_fitness": 0.0}

        fitnesses = [p.fitness for p in self.population]
        return {
            "population_size": len(self.population),
            "best_fitness": max(fitnesses),
            "avg_fitness": np.mean(fitnesses),
            "min_fitness": min(fitnesses),
            "fitness_std": np.std(fitnesses)
        }
diff --git a/src/gepa_optimizer/operators/models.py b/src/gepa_optimizer/operators/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..45d92fb0843e843dc205fcd4da3ea64ec87bbe1c
--- /dev/null
+++ b/src/gepa_optimizer/operators/models.py
@@ -0,0 +1,60 @@
+"""
+Data models for LLEGO genetic operators.
+
+Contains the core data structures used across all genetic operators.
+"""
+
+from typing import List, Optional, Literal
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import TypedDict
+
+
class PromptMetadata(TypedDict, total=False):
    """
    Metadata for tracking prompt evolution history and performance.

    This enables debugging, analysis, and visualization of the genetic
    algorithm's evolution process by tracking how each prompt was created
    and its characteristics. All keys are optional (``total=False``).
    """
    generation: int  # Which iteration created this prompt
    # How the prompt was created. 'gepa_pareto_front' covers candidates
    # imported from GEPA's Pareto front (the integration layer stores it),
    # which the original Literal did not declare.
    operator: Literal['seed', 'crossover', 'mutation', 'gepa_pareto_front']
    parent_indices: Optional[List[int]]  # Indices of parent prompts
    parent_prompts: Optional[List[str]]  # Actual parent prompt texts
    parent_fitness: Optional[float]  # Fitness of the single mutation parent
    parent_fitnesses: Optional[List[float]]  # Fitness of each crossover parent
    target_fitness: Optional[float]  # Target fitness for crossover
    diversity_score: Optional[float]  # Diversity from population
    sample_scores: Optional[List[float]]  # Performance per sample
    num_diverse_parents: Optional[int]  # Diverse parents count (mutation)
    created_at: str  # Creation timestamp (ISO 8601)
    prompt_length: int  # Character count
    word_count: int  # Word count
    candidate_type: Optional[str]  # Type for GEPA notation
    notation: Optional[str]  # GEPA notation label for the candidate
    from_gepa_pareto: bool  # True when imported from GEPA's Pareto front
+
+
@dataclass
class PromptCandidate:
    """
    A prompt together with its fitness score and evolution metadata.

    Attributes:
        prompt: The prompt text itself
        fitness: Evaluation fitness score in [0, 1]
        metadata: Bookkeeping about how the prompt was created and performs
    """
    prompt: str
    fitness: float
    metadata: Optional[PromptMetadata] = field(default_factory=dict)

    def __post_init__(self):
        """Normalize metadata and fill in derived statistics."""
        if self.metadata is None:
            self.metadata = {}

        # Derived stats are only written when absent, so values supplied
        # explicitly by the caller always take precedence.
        self.metadata.setdefault('prompt_length', len(self.prompt))
        self.metadata.setdefault('word_count', len(self.prompt.split()))
        self.metadata.setdefault('created_at', datetime.now().isoformat())
+
diff --git a/src/gepa_optimizer/operators/mutation.py b/src/gepa_optimizer/operators/mutation.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc850d93bb20c0e770f48f857e932bf01d6d7394
--- /dev/null
+++ b/src/gepa_optimizer/operators/mutation.py
@@ -0,0 +1,185 @@
+"""
+Diversity-Guided Mutation Operator.
+
+Adapts LLEGO's diversity-guided mutation for text prompts.
+Based on: Decision Tree Induction Through LLMs via Semantically-Aware Evolution (ICLR 2025)
+"""
+
+from typing import List, Callable, TYPE_CHECKING
+import numpy as np
+import logging
+
+from .base_operator import BaseMutationOperator
+
+if TYPE_CHECKING:
+ from .models import PromptCandidate
+
+logger = logging.getLogger(__name__)
+
+
class DiversityGuidedMutation(BaseMutationOperator):
    """
    Diversity-guided mutation for text prompts.

    Generates variations of a parent prompt, steering exploration with
    temperature-controlled sampling of population members that differ most
    from the parent.

    From the LLEGO paper: diversity-guided mutation enables efficient global
    exploration by sampling diverse parents with temperature parameter tau.

    Reference: https://github.com/nicolashuynh/LLEGO
    """

    def __init__(self, tau: float = 10.0, nu: int = 4):
        """
        Args:
            tau: Diversity temperature; larger values flatten the sampling
                distribution (more exploration). Default 10.0 (LLEGO paper).
            nu: Parent arity - number of diverse parents sampled for context.
                Default 4 (LLEGO paper).
        """
        self.tau = tau
        self.nu = nu
        logger.debug(f"DiversityGuidedMutation initialized with ฯ={tau}, ฮฝ={nu}")

    def __call__(
        self,
        parent: "PromptCandidate",
        population: List["PromptCandidate"],
        llm: Callable[[str], str]
    ) -> str:
        """
        Create a mutated variant of the parent prompt.

        Args:
            parent: Candidate to mutate
            population: Current population, used to pick diverse context
            llm: Language model callable

        Returns:
            str: Mutated prompt
        """
        logger.debug(f"Mutation: parent fitness={parent.fitness:.3f}")

        # Diverse population members provide contrastive context for the LLM.
        context_parents = self._sample_diverse_parents(parent, population)
        return llm(self._build_prompt(parent, context_parents))

    def _sample_diverse_parents(
        self,
        parent: "PromptCandidate",
        population: List["PromptCandidate"]
    ) -> List["PromptCandidate"]:
        """
        Pick up to ``nu`` population members, biased toward those most
        different from the parent (softmax over diversity, temperature tau).

        Returns the parent itself when no distinct candidates exist.
        """
        # Score every other candidate by how different it is from the parent.
        scored = [
            (candidate, self._calculate_diversity(parent.prompt, candidate.prompt))
            for candidate in population
            if candidate.prompt != parent.prompt
        ]

        if not scored:
            return [parent]

        # Softmax over diversity scores, tempered by tau.
        weights = np.exp(np.array([d for _, d in scored]) / self.tau)
        weights /= weights.sum()

        picked = np.random.choice(
            len(scored),
            size=min(self.nu, len(scored)),
            replace=False,
            p=weights
        )

        return [scored[i][0] for i in picked]

    def _calculate_diversity(self, prompt1: str, prompt2: str) -> float:
        """
        Jaccard distance between the word sets of two prompts.

        Args:
            prompt1: First prompt
            prompt2: Second prompt

        Returns:
            float: 0 = identical vocabulary, 1 = no words in common
        """
        vocab_a = set(prompt1.lower().split())
        vocab_b = set(prompt2.lower().split())

        overlap = len(vocab_a & vocab_b)
        combined = len(vocab_a | vocab_b)

        similarity = overlap / combined if combined > 0 else 0
        return 1 - similarity

    def _build_prompt(
        self,
        parent: "PromptCandidate",
        diverse_parents: List["PromptCandidate"]
    ) -> str:
        """
        Compose the mutation instruction for the LLM.

        Args:
            parent: Candidate being mutated
            diverse_parents: Diverse candidates shown as contrastive context

        Returns:
            str: Prompt to send to the LLM
        """
        # Truncation keeps the instruction compact and filter-safe.
        MAX_PARENT_LENGTH = 350
        MAX_DIVERSE_LENGTH = 200

        parent_truncated = parent.prompt[:MAX_PARENT_LENGTH]
        if len(parent.prompt) > MAX_PARENT_LENGTH:
            parent_truncated += "..."

        # Show at most two diverse variants as context.
        diversity_context = []
        for i, other in enumerate(diverse_parents[:2]):
            truncated = other.prompt[:MAX_DIVERSE_LENGTH]
            if len(other.prompt) > MAX_DIVERSE_LENGTH:
                truncated += "..."
            diversity_context.append(f"V{i+1}: {truncated}")

        return f"""Create a variation of this prompt with different decision logic (fitness: {parent.fitness:.2f}).

Parent: {parent_truncated}

{chr(10).join(diversity_context) if diversity_context else ""}

Instructions:
1. Explore NEW ways to categorize tasks (e.g., by element type, by action, by hierarchy)
2. Add handling for edge cases the parent might miss
3. Keep the structured, logical approach
4. Keep format (Element: X, Description:, Reason:)
5. Max 600 chars

Output ONLY the new prompt:"""
+
diff --git a/src/gepa_optimizer/types.py b/src/gepa_optimizer/types.py
new file mode 100644
index 0000000000000000000000000000000000000000..5df1de9d2dbdaf50a0e048ded5e2345cf72fc1b6
--- /dev/null
+++ b/src/gepa_optimizer/types.py
@@ -0,0 +1,245 @@
+"""
+Type definitions for GEPA Optimizer.
+
+This module contains type aliases, TypedDicts, and Protocol classes
+used throughout the GEPA Optimizer codebase for strict typing.
+"""
+
+from typing import (
+ Any,
+ Callable,
+ Dict,
+ List,
+ Literal,
+ Optional,
+ Protocol,
+ Tuple,
+ TypedDict,
+ TypeVar,
+ Union,
+)
+
+
+# ============================================================================
+# Dataset Types
+# ============================================================================
+
class DatasetItem(TypedDict, total=False):
    """One sample in GEPA's standard dataset format (all keys optional)."""
    input: str  # Input text/command
    output: str  # Expected output
    image_base64: str  # Base64-encoded image payload, when present
    metadata: Dict[str, Any]  # Arbitrary extra sample information
    reflection_input: str  # Simplified input used during reflection, when present
+
+
# Name of a dataset split; both short and "-set"-suffixed spellings accepted.
DatasetSplit = Literal["train", "val", "test", "trainset", "valset", "testset"]
# A dataset is simply a list of DatasetItem dicts.
DatasetList = List[DatasetItem]

# Train/Val/Test split tuple
DatasetSplitTuple = Tuple[DatasetList, DatasetList, DatasetList]
+
+
+# ============================================================================
+# Evaluation Types
+# ============================================================================
+
class EvaluationResult(TypedDict, total=False):
    """Outcome of evaluating a single sample (all keys optional)."""
    score: float  # Primary score in [0.0, 1.0]
    composite_score: float  # Weighted combination of the individual metrics
    is_match: bool  # True when prediction matches the expected output
    predicted: str  # Model's output
    expected: str  # Ground-truth output
    metrics: Dict[str, float]  # Per-metric score breakdown
    feedback: str  # Human-readable explanation of the score
+
+
class EvaluationSummary(TypedDict):
    """Aggregate statistics over a full evaluation run (all keys required)."""
    total_samples: int
    correct_predictions: int
    accuracy: float
    average_score: float
+
+
+# ============================================================================
+# LLM Types
+# ============================================================================
+
class LLMResponse(TypedDict, total=False):
    """Structured response from an LLM generation call (all keys optional)."""
    content: str  # Generated text
    usage: Dict[str, int]  # Token usage statistics
    model: str  # Model identifier used for the call
    finish_reason: str  # Why generation stopped
    source: str  # Producer tag (gepa_reflection, llego_crossover, etc.)
+
+
class LLMClientProtocol(Protocol):
    """Structural (duck-typed) interface for synchronous LLM clients."""

    def generate(
        self,
        system_prompt: str,
        user_prompt: str,
        image_base64: str = "",
        **kwargs: Any
    ) -> Union[str, Dict[str, Any]]:
        """Generate a response from the LLM."""
        ...
+
+
class BatchLLMClientProtocol(Protocol):
    """Structural interface for LLM clients that support batched requests."""

    def submit_batch(
        self,
        tasks: List[Dict[str, Any]],
        **kwargs: Any
    ) -> str:
        """Submit a batch of tasks. Returns batch ID."""
        ...

    def get_batch_results(
        self,
        batch_id: str,
        **kwargs: Any
    ) -> List[Dict[str, Any]]:
        """Get results for a submitted batch."""
        ...
+
+
+# ============================================================================
+# Evaluator Types
+# ============================================================================
+
class EvaluatorProtocol(Protocol):
    """Structural interface for evaluator implementations."""

    def evaluate(
        self,
        predicted: str,
        expected: str,
        **kwargs: Any
    ) -> Dict[str, float]:
        """Evaluate a prediction against expected output."""
        ...

    def get_composite_score(
        self,
        metrics: Dict[str, float]
    ) -> float:
        """Calculate composite score from individual metrics."""
        ...
+
+
+# ============================================================================
+# Optimization Types
+# ============================================================================
+
class CandidateDict(TypedDict, total=False):
    """A prompt candidate in the optimization process (all keys optional).

    Some producers use 'prompt'/'score'/'type' as aliases for
    'system_prompt'/'fitness'/'source'; both spellings are declared.
    """
    system_prompt: str  # The prompt text
    prompt: str  # Alias for system_prompt
    fitness: float  # Fitness score
    score: float  # Alias for fitness
    source: str  # Where the candidate came from (seed, gepa_reflection, llego_crossover, ...)
    type: str  # Alias for source
    notation: str  # GEPA notation label for the candidate's origin
    index: int  # Candidate index
+
+
class ParetoCandidate(TypedDict):
    """One entry in the Pareto front (all keys required)."""
    prompt: str
    score: float
    type: str
    notation: str
+
+
+class OptimizationState(TypedDict, total=False):
+ """Current state of optimization."""
+ iteration: int
+ best_score: float
+ best_prompt: str
+ pareto_front: List[ParetoCandidate]
+ baseline_score: Optional[float]
+
+
+# ============================================================================
+# Configuration Types
+# ============================================================================
+
class DataSplitConfig(TypedDict, total=False):
    """How to split a dataset into train/val/test (all keys optional)."""
    train_ratio: float
    val_ratio: float
    test_ratio: float
    shuffle: bool  # Whether to shuffle before splitting
    seed: Optional[int]  # RNG seed for reproducible shuffling
+
+
class LLEGOConfig(TypedDict, total=False):
    """Settings for the LLEGO genetic operators (all keys optional)."""
    mode: Literal["hybrid", "llego_only", "disabled"]  # How LLEGO participates
    population_size: int
    num_crossover_candidates: int
    num_mutation_candidates: int
    crossover_enabled: bool
    mutation_enabled: bool
+
+
+# ============================================================================
+# Type Variables
+# ============================================================================
+
# Generic placeholder type.
T = TypeVar("T")
# Dataset-like type: any mapping with string keys.
DatasetT = TypeVar("DatasetT", bound=Dict[str, Any])
# Generic result type for optimization return values.
ResultT = TypeVar("ResultT")
+
+
+# ============================================================================
+# Callback Types
+# ============================================================================
+
# (predicted, expected) -> per-sample EvaluationResult.
EvaluationCallback = Callable[[str, str], EvaluationResult]
# prompt -> generated text.
GenerationCallback = Callable[[str], str]
# (int, int, float) -> None; presumably (current_step, total_steps, score) — confirm at call sites.
ProgressCallback = Callable[[int, int, float], None]
+
+
+# ============================================================================
+# Export
+# ============================================================================
+
# Public API of this module; keep in sync with the definitions above.
__all__ = [
    # Dataset
    "DatasetItem",
    "DatasetSplit",
    "DatasetList",
    "DatasetSplitTuple",
    # Evaluation
    "EvaluationResult",
    "EvaluationSummary",
    "EvaluatorProtocol",
    # LLM
    "LLMResponse",
    "LLMClientProtocol",
    "BatchLLMClientProtocol",
    # Optimization
    "CandidateDict",
    "ParetoCandidate",
    "OptimizationState",
    # Configuration
    "DataSplitConfig",
    "LLEGOConfig",
    # Callbacks
    "EvaluationCallback",
    "GenerationCallback",
    "ProgressCallback",
    # Type Variables
    "T",
    "DatasetT",
    "ResultT",
]
+
diff --git a/src/gepa_optimizer/utils/__init__.py b/src/gepa_optimizer/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec5d3d6813c17c86cf77e1ad00d5c9bff4aae705
--- /dev/null
+++ b/src/gepa_optimizer/utils/__init__.py
@@ -0,0 +1,40 @@
+"""
+Utility functions for GEPA Optimizer
+"""
+
+from .helpers import sanitize_prompt
+from .logging import setup_logging
+from .metrics import calculate_metrics
+from .api_keys import APIKeyManager
+from .exceptions import GepaOptimizerError, GepaDependencyError, InvalidInputError, DatasetError
+from .universal_judge_prompt import (
+ build_universal_judge_prompt,
+ get_universal_judge_system_prompt,
+ format_universal_judge_feedback,
+ build_empty_output_feedback
+)
+from .format_detection import (
+ detect_output_format,
+ build_format_aware_reflection_prompt,
+ generate_format_feedback
+)
+
+# Names re-exported as the package's public utils API (see imports above).
+__all__ = [
+    "sanitize_prompt",
+    "setup_logging",
+    "calculate_metrics",
+    "APIKeyManager",
+    "GepaOptimizerError",
+    "GepaDependencyError",
+    "InvalidInputError",
+    "DatasetError",
+    # Universal judge prompt utilities
+    "build_universal_judge_prompt",
+    "get_universal_judge_system_prompt",
+    "format_universal_judge_feedback",
+    "build_empty_output_feedback",
+    # Format detection utilities
+    "detect_output_format",
+    "build_format_aware_reflection_prompt",
+    "generate_format_feedback"
+]
diff --git a/src/gepa_optimizer/utils/api_keys.py b/src/gepa_optimizer/utils/api_keys.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cec5c12d4768e9bf76240ef78f0a4ab3daa3f3
--- /dev/null
+++ b/src/gepa_optimizer/utils/api_keys.py
@@ -0,0 +1,109 @@
+"""
+API Key Management for GEPA Optimizer
+"""
+
+import os
+from dotenv import load_dotenv
+from typing import Optional, Dict, List
+
class APIKeyManager:
    """Handles API keys securely without hardcoding.

    Keys are discovered once from the environment (optionally populated
    from a ``.env`` file) and can be overridden at runtime. Provider names
    are case-insensitive; ``'google'`` and ``'gemini'`` are aliases for the
    same credential.
    """

    # Provider name -> environment variable consulted by _load_from_env().
    _ENV_VARS: Dict[str, str] = {
        'openai': 'OPENAI_API_KEY',
        'anthropic': 'ANTHROPIC_API_KEY',
        'huggingface': 'HUGGINGFACE_API_KEY',
        'cohere': 'COHERE_API_KEY',
        'ai21': 'AI21_API_KEY',
        'together': 'TOGETHER_API_KEY',
        'replicate': 'REPLICATE_API_TOKEN',
        'groq': 'GROQ_API_KEY',
        'ollama': 'OLLAMA_API_KEY',
        'google': 'GEMINI_API_KEY',
        'gemini': 'GEMINI_API_KEY',
    }

    def __init__(self):
        # Load .env file if present
        load_dotenv()
        self._keys: Dict[str, str] = {}
        self._load_from_env()

    def _load_from_env(self):
        """Load API keys from environment variables"""
        discovered = {
            provider: os.getenv(env_var)
            for provider, env_var in self._ENV_VARS.items()
        }
        # Keep only providers whose variable is actually set and non-empty.
        self._keys.update({name: value for name, value in discovered.items() if value})

    def get_api_key(self, provider: str) -> Optional[str]:
        """Get API key for a specific provider"""
        return self._keys.get(provider.lower())

    def set_api_key(self, provider: str, key: str):
        """Set API key for a provider at runtime"""
        name = provider.lower()
        self._keys[name] = key
        # 'google' and 'gemini' refer to the same credential: keep both
        # entries in sync whenever either one is set.
        alias = {'google': 'gemini', 'gemini': 'google'}.get(name)
        if alias:
            self._keys[alias] = key

    def has_key(self, provider: str) -> bool:
        """Check if API key exists for provider"""
        return provider.lower() in self._keys

    def get_missing_keys(self, providers: List[str]) -> List[str]:
        """Get list of providers missing API keys"""
        return [name for name in providers if not self.has_key(name)]

    def validate_keys(self, providers: List[str]) -> Dict[str, bool]:
        """Validate API keys for multiple providers"""
        return {name: self.has_key(name) for name in providers}

    # Legacy methods for backward compatibility
    def set_openai_key(self, key: str):
        """Set OpenAI API key at runtime"""
        self.set_api_key('openai', key)

    def set_anthropic_key(self, key: str):
        """Set Anthropic API key at runtime"""
        self.set_api_key('anthropic', key)

    def set_google_key(self, key: str):
        """Set Google API key at runtime"""
        self.set_api_key('google', key)

    def set_gemini_key(self, key: str):
        """Set Gemini API key at runtime (alias for Google)"""
        self.set_api_key('google', key)

    def get_openai_key(self) -> str:
        """Get OpenAI key or raise error if missing"""
        key = self.get_api_key('openai')
        if key:
            return key
        raise RuntimeError(
            "OpenAI API key missing. Set via:\n"
            "1. Environment variable: OPENAI_API_KEY=your_key\n"
            "2. .env file: OPENAI_API_KEY=your_key\n"
            "3. Code: api_manager.set_api_key('openai', 'your_key')"
        )

    def get_anthropic_key(self) -> Optional[str]:
        """Get Anthropic key (optional)"""
        return self.get_api_key('anthropic')

    def get_google_key(self) -> Optional[str]:
        """Get Google key (optional)"""
        return self.get_api_key('google')

    def get_gemini_key(self) -> Optional[str]:
        """Get Gemini key (alias for Google)"""
        return self.get_api_key('google')

    def has_required_keys(self) -> bool:
        """Check if required keys are available"""
        return bool(self.get_api_key('openai'))
diff --git a/src/gepa_optimizer/utils/candidate_collector.py b/src/gepa_optimizer/utils/candidate_collector.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f2ce7836104778ec70239fd22c77ff2256100b3
--- /dev/null
+++ b/src/gepa_optimizer/utils/candidate_collector.py
@@ -0,0 +1,313 @@
+"""
+Candidate and Feedback Collector for Presentation
+
+This module collects all candidates generated during optimization along with
+their feedback, scores, and metadata for presentation purposes.
+"""
+
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Any, Optional
+from dataclasses import dataclass, asdict, field
+
+
@dataclass
class CandidateInfo:
    """Information about a single candidate prompt"""
    iteration: int  # optimization iteration the candidate was produced in
    candidate_id: str  # unique id used to attach scores/feedback later
    source: str  # "GEPA_Reflection", "LLEGO_Crossover", "LLEGO_Mutation", "Seed"
    prompt: str  # full candidate prompt text
    score: Optional[float] = None  # evaluation score, if already known
    feedback: Optional[str] = None  # feedback text, if already known
    feedback_details: Optional[Dict[str, Any]] = None  # structured feedback payload
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())


@dataclass
class IterationInfo:
    """Information about a single optimization iteration"""
    iteration: int
    candidates: List[CandidateInfo] = field(default_factory=list)
    best_candidate: Optional[CandidateInfo] = None  # highest-scoring candidate seen
    best_score: Optional[float] = None
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())


class CandidateCollector:
    """
    Collects all candidates and feedback during optimization for presentation.

    The same ``CandidateInfo`` objects are stored in ``all_candidates`` and in
    the per-iteration ``IterationInfo.candidates`` lists, so mutating a
    candidate found through either view updates both.
    """

    def __init__(self, output_dir: str = "presentation_data"):
        """
        Initialize the collector.

        Args:
            output_dir: Directory to save collected data (created on demand,
                including any missing parent directories)
        """
        self.output_dir = Path(output_dir)
        # FIX: parents=True so nested output paths (e.g. "runs/2024/pres")
        # do not raise FileNotFoundError on creation.
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.iterations: List[IterationInfo] = []
        self.current_iteration: Optional[IterationInfo] = None
        self.all_candidates: List[CandidateInfo] = []

        # Track seed prompt
        self.seed_prompt: Optional[str] = None

    def set_seed_prompt(self, seed_prompt: str):
        """Set the seed prompt for reference"""
        self.seed_prompt = seed_prompt

    def start_iteration(self, iteration: int):
        """Start tracking a new iteration"""
        self.current_iteration = IterationInfo(iteration=iteration)
        self.iterations.append(self.current_iteration)

    def add_candidate(
        self,
        iteration: int,
        candidate_id: str,
        source: str,
        prompt: str,
        score: Optional[float] = None,
        feedback: Optional[str] = None,
        feedback_details: Optional[Dict[str, Any]] = None
    ):
        """
        Add a candidate to the collection.

        Args:
            iteration: Iteration number
            candidate_id: Unique identifier for the candidate
            source: Source of the candidate ("GEPA_Reflection", "LLEGO_Crossover", etc.)
            prompt: The candidate prompt text
            score: Evaluation score (if available)
            feedback: Feedback text (if available)
            feedback_details: Additional feedback details (if available)
        """
        candidate = CandidateInfo(
            iteration=iteration,
            candidate_id=candidate_id,
            source=source,
            prompt=prompt,
            score=score,
            feedback=feedback,
            feedback_details=feedback_details
        )

        # Attach to the current iteration only when the caller's iteration
        # number matches it (out-of-order additions still land in
        # all_candidates below).
        if self.current_iteration and self.current_iteration.iteration == iteration:
            self.current_iteration.candidates.append(candidate)

            # Update best candidate if this is better
            if score is not None:
                if (self.current_iteration.best_score is None or
                    score > self.current_iteration.best_score):
                    self.current_iteration.best_candidate = candidate
                    self.current_iteration.best_score = score

        # Add to all candidates list
        self.all_candidates.append(candidate)

    def add_feedback(
        self,
        candidate_id: str,
        feedback: str,
        feedback_details: Optional[Dict[str, Any]] = None
    ):
        """
        Add feedback to an existing candidate.

        Args:
            candidate_id: ID of the candidate to update
            feedback: Feedback text
            feedback_details: Additional feedback details
        """
        # CandidateInfo objects are shared between all_candidates and the
        # per-iteration lists (add_candidate appends the same object to
        # both), so updating the first match here updates every view.
        for candidate in self.all_candidates:
            if candidate.candidate_id == candidate_id:
                candidate.feedback = feedback
                candidate.feedback_details = feedback_details
                break

    def add_score(
        self,
        candidate_id: str,
        score: float
    ):
        """
        Add score to an existing candidate.

        Args:
            candidate_id: ID of the candidate to update
            score: Evaluation score
        """
        # Update the shared candidate object (see add_feedback for why one
        # pass over all_candidates is sufficient).
        for candidate in self.all_candidates:
            if candidate.candidate_id == candidate_id:
                candidate.score = score
                break

        # Refresh the per-iteration best-candidate tracking.
        for iteration in self.iterations:
            for candidate in iteration.candidates:
                if candidate.candidate_id == candidate_id:
                    if (iteration.best_score is None or score > iteration.best_score):
                        iteration.best_candidate = candidate
                        iteration.best_score = score
                    break

    def save_to_json(self, filename: Optional[str] = None) -> Path:
        """
        Save collected data to JSON file.

        Args:
            filename: Optional filename (auto-generated if not provided)

        Returns:
            Path to saved file
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"candidates_and_feedback_{timestamp}.json"

        filepath = self.output_dir / filename

        data = {
            "seed_prompt": self.seed_prompt,
            "total_iterations": len(self.iterations),
            "total_candidates": len(self.all_candidates),
            "iterations": [asdict(iter_info) for iter_info in self.iterations],
            "all_candidates": [asdict(candidate) for candidate in self.all_candidates],
            "timestamp": datetime.now().isoformat()
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        return filepath

    def save_to_markdown(self, filename: Optional[str] = None) -> Path:
        """
        Save collected data to Markdown file (presentation-ready format).

        Args:
            filename: Optional filename (auto-generated if not provided)

        Returns:
            Path to saved file
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"candidates_and_feedback_{timestamp}.md"

        filepath = self.output_dir / filename

        with open(filepath, 'w', encoding='utf-8') as f:
            # Header
            f.write("# Optimization Candidates and Feedback\n\n")
            f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            f.write(f"**Total Iterations:** {len(self.iterations)}\n")
            f.write(f"**Total Candidates:** {len(self.all_candidates)}\n\n")

            # Seed Prompt
            if self.seed_prompt:
                f.write("---\n\n")
                f.write("## ๐ฑ Seed Prompt\n\n")
                f.write("```\n")
                f.write(self.seed_prompt)
                f.write("\n```\n\n")

            # Iterations
            for iter_info in self.iterations:
                f.write("---\n\n")
                f.write(f"## ๐ Iteration {iter_info.iteration}\n\n")

                # Best candidate for this iteration
                if iter_info.best_candidate:
                    f.write(f"### ๐ Best Candidate (Score: {iter_info.best_score:.4f})\n\n")
                    f.write(f"**Source:** {iter_info.best_candidate.source}\n\n")
                    f.write(f"**Prompt:**\n```\n")
                    f.write(iter_info.best_candidate.prompt)
                    f.write("\n```\n\n")

                    if iter_info.best_candidate.feedback:
                        f.write(f"**Feedback:**\n\n")
                        f.write(f"{iter_info.best_candidate.feedback}\n\n")

                # All candidates in this iteration
                f.write(f"### ๐ All Candidates ({len(iter_info.candidates)})\n\n")

                for idx, candidate in enumerate(iter_info.candidates, 1):
                    f.write(f"#### Candidate {idx}: {candidate.source}\n\n")
                    f.write(f"**ID:** `{candidate.candidate_id}`\n\n")

                    if candidate.score is not None:
                        f.write(f"**Score:** `{candidate.score:.4f}`\n\n")

                    f.write(f"**Prompt:**\n```\n")
                    f.write(candidate.prompt)
                    f.write("\n```\n\n")

                    if candidate.feedback:
                        f.write(f"**Feedback:**\n\n")
                        f.write(f"{candidate.feedback}\n\n")

                    if candidate.feedback_details:
                        f.write(f"**Feedback Details:**\n\n")
                        f.write("```json\n")
                        f.write(json.dumps(candidate.feedback_details, indent=2))
                        f.write("\n```\n\n")

                    f.write("---\n\n")

            # Summary by source
            f.write("---\n\n")
            f.write("## ๐ Summary by Source\n\n")

            sources = {}
            for candidate in self.all_candidates:
                if candidate.source not in sources:
                    sources[candidate.source] = []
                sources[candidate.source].append(candidate)

            for source, candidates in sources.items():
                f.write(f"### {source} ({len(candidates)} candidates)\n\n")
                for candidate in candidates:
                    # FIX: compare against None so a legitimate score of 0.0
                    # is rendered instead of being reported as "No score".
                    score_str = f"Score: {candidate.score:.4f}" if candidate.score is not None else "No score"
                    f.write(f"- **{candidate.candidate_id}** (Iteration {candidate.iteration}, {score_str})\n")
                f.write("\n")

        return filepath

    def get_summary(self) -> Dict[str, Any]:
        """Get a summary of collected data"""
        sources = {}
        for candidate in self.all_candidates:
            if candidate.source not in sources:
                sources[candidate.source] = 0
            sources[candidate.source] += 1

        # Average is computed over scored candidates only.
        scored_candidates = [c for c in self.all_candidates if c.score is not None]
        avg_score = sum(c.score for c in scored_candidates) / len(scored_candidates) if scored_candidates else None

        return {
            "total_iterations": len(self.iterations),
            "total_candidates": len(self.all_candidates),
            "candidates_by_source": sources,
            "candidates_with_scores": len(scored_candidates),
            "average_score": avg_score,
            "candidates_with_feedback": len([c for c in self.all_candidates if c.feedback])
        }
+
diff --git a/src/gepa_optimizer/utils/clean_logger.py b/src/gepa_optimizer/utils/clean_logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..167c596e412ca1649b214317cef94eaf17bbc75d
--- /dev/null
+++ b/src/gepa_optimizer/utils/clean_logger.py
@@ -0,0 +1,160 @@
+"""
+Clean Logger for GEPA + LLEGO Optimization
+Provides simple, visual logging similar to diagram format.
+
+Uses the centralized logging infrastructure with a custom handler
+for clean, user-friendly console output.
+"""
+
+import logging
+import sys
+from typing import List, Optional
+
# Dedicated logger for clean, user-facing console output.
_clean_output_logger = logging.getLogger("gepa_optimizer.clean_output")


def _setup_clean_logger():
    """Configure the clean-output logger for bare, message-only printing."""
    if _clean_output_logger.handlers:
        # Already configured (e.g. module re-imported) -- nothing to do.
        return
    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setLevel(logging.INFO)
    # No timestamps or level names: the message is the whole output line.
    stream_handler.setFormatter(logging.Formatter("%(message)s"))
    _clean_output_logger.addHandler(stream_handler)
    _clean_output_logger.setLevel(logging.INFO)
    # Keep records out of the root logger so lines are not printed twice.
    _clean_output_logger.propagate = False


# Initialize on module load
_setup_clean_logger()
+
+
class CleanLogger:
    """
    Simple, visual logging for optimization workflow.

    Uses a dedicated logger with minimal formatting to produce
    clean, user-friendly console output. Per-iteration candidate counters
    are kept on the instance and reset by log_iteration_start().
    """

    def __init__(self):
        self.current_iteration = 0
        # Candidates produced this iteration, broken down by source.
        self.gepa_reflection_count = 0
        self.llego_crossover_count = 0
        self.llego_mutation_count = 0
        self._logger = _clean_output_logger

    def _log_candidate(self, label: str, candidate_num: int, prompt: str) -> None:
        """Render one candidate: blank line, header, rule, prompt, rule.

        Extracted because the three public log_*_candidate methods were
        copy-paste duplicates differing only in label and counter.
        """
        self._logger.info("")
        self._logger.info(f"{label} Candidate #{candidate_num}:")
        self._logger.info("โ" * 80)
        if prompt and prompt.strip():
            self._logger.info(prompt)  # Show full prompt at INFO level
        else:
            self._logger.warning("โ ๏ธ Empty candidate prompt!")
        self._logger.info("โ" * 80)

    def log_iteration_start(self, iteration: int, seed_prompt: Optional[str] = None):
        """Log start of new iteration and reset the per-iteration counters."""
        self.current_iteration = iteration
        self.gepa_reflection_count = 0
        self.llego_crossover_count = 0
        self.llego_mutation_count = 0

        self._logger.info("")
        self._logger.info("โ" * 80)
        # FIX: More accurate description - we evaluate first, then generate
        if iteration == 1:
            self._logger.info(f" ITERATION {iteration}: EVALUATING SEED PROMPT")
        else:
            self._logger.info(f" ITERATION {iteration}: EVALUATING & GENERATING CANDIDATES")
        self._logger.info("โ" * 80)

        # NOTE(review): the banner above treats iteration 1 as the seed
        # evaluation, but the seed prompt body is only echoed for
        # iteration == 0 -- confirm which numbering scheme callers use.
        if seed_prompt and iteration == 0:
            self._logger.info("")
            self._logger.info("SEED PROMPT:")
            self._logger.info("โ" * 80)
            self._logger.info(seed_prompt)
            self._logger.info("โ" * 80)

    def log_candidate_generation_summary(self):
        """Log summary of candidates generated this iteration."""
        total = self.gepa_reflection_count + self.llego_crossover_count + self.llego_mutation_count

        self._logger.info("")
        self._logger.info("CANDIDATES GENERATED THIS ITERATION:")
        self._logger.info(f"  GEPA Reflection: {self.gepa_reflection_count}")
        self._logger.info(f"  LLEGO Crossover: {self.llego_crossover_count}")
        self._logger.info(f"  LLEGO Mutation: {self.llego_mutation_count}")
        self._logger.info(f"  TOTAL: {total}")

    def log_gepa_reflection_candidate(self, candidate_num: int, prompt: str):
        """Log a GEPA reflection candidate."""
        self.gepa_reflection_count += 1
        self._log_candidate("GEPA Reflection", candidate_num, prompt)

    def log_llego_crossover_candidate(self, candidate_num: int, prompt: str):
        """Log a LLEGO crossover candidate."""
        self.llego_crossover_count += 1
        self._log_candidate("LLEGO Crossover", candidate_num, prompt)

    def log_llego_mutation_candidate(self, candidate_num: int, prompt: str):
        """Log a LLEGO mutation candidate."""
        self.llego_mutation_count += 1
        self._log_candidate("LLEGO Mutation", candidate_num, prompt)

    def log_evaluation_results(self, candidate_prompts: List[str], scores: List[float]):
        """Log evaluation results (score + 100-char preview) per candidate."""
        self._logger.info("")
        self._logger.info("โ" * 80)
        self._logger.info(" EVALUATION RESULTS")
        self._logger.info("โ" * 80)

        for i, (prompt, score) in enumerate(zip(candidate_prompts, scores), 1):
            self._logger.info("")
            self._logger.info(f"Candidate #{i}:")
            self._logger.info(f"  Score: {score:.4f}")
            self._logger.info(f"  Prompt Preview: {prompt[:100]}...")

    def log_pareto_front_update(self, pareto_size: int, best_score: float):
        """Log Pareto front update."""
        self._logger.info("")
        self._logger.info("โ" * 80)
        self._logger.info(" PARETO FRONT UPDATE")
        self._logger.info("โ" * 80)
        self._logger.info(f"  Front Size: {pareto_size} candidates")
        self._logger.info(f"  Best Score: {best_score:.4f}")

    def log_iteration_summary(self, iteration: int, total_candidates: int, best_score: float):
        """Log iteration summary (counts by source plus the best score)."""
        self._logger.info("")
        self._logger.info("โ" * 80)
        self._logger.info(f" ITERATION {iteration} SUMMARY")
        self._logger.info("โ" * 80)
        self._logger.info(f"  Candidates Evaluated: {total_candidates}")
        self._logger.info(f"  Best Score: {best_score:.4f}")
        self._logger.info(f"  GEPA Reflection: {self.gepa_reflection_count}")
        self._logger.info(f"  LLEGO Crossover: {self.llego_crossover_count}")
        self._logger.info(f"  LLEGO Mutation: {self.llego_mutation_count}")
+
+
+# Global instance
+# Module-level singleton: all callers share one CleanLogger, so the
+# per-iteration counters stay consistent across modules.
+_clean_logger_instance = CleanLogger()
+
+
+def get_clean_logger() -> CleanLogger:
+    """Get global clean logger instance (module-level singleton)."""
+    return _clean_logger_instance
diff --git a/src/gepa_optimizer/utils/exceptions.py b/src/gepa_optimizer/utils/exceptions.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e4e043e6edafbab84261debe19151cf208cfdef
--- /dev/null
+++ b/src/gepa_optimizer/utils/exceptions.py
@@ -0,0 +1,27 @@
+"""
+Custom exceptions for GEPA Optimizer
+"""
+
# All concrete errors derive from GepaOptimizerError so callers can catch
# every library failure with a single except clause.
class GepaOptimizerError(Exception):
    """Base class for all GEPA Optimizer exceptions"""


class GepaDependencyError(GepaOptimizerError):
    """Exception raised for errors related to the GEPA library dependency"""


class InvalidInputError(GepaOptimizerError):
    """Exception raised for invalid user inputs"""


class DatasetError(GepaOptimizerError):
    """Exception raised for errors related to the dataset"""


class TestSetEvaluationError(GepaOptimizerError):
    """Exception raised when test set evaluation fails"""


class ConfigurationError(GepaOptimizerError):
    """Exception raised for invalid configuration"""
diff --git a/src/gepa_optimizer/utils/format_detection.py b/src/gepa_optimizer/utils/format_detection.py
new file mode 100644
index 0000000000000000000000000000000000000000..e574cd3f73c98eafb1cbf2cf5d07c30fdf2e4c65
--- /dev/null
+++ b/src/gepa_optimizer/utils/format_detection.py
@@ -0,0 +1,391 @@
+"""
+Format Detection Utilities for GEPA Optimizer.
+
+This module provides utilities to automatically detect output format patterns
+from expected outputs and generate format constraints for reflection prompts.
+
+Key Features:
+1. Auto-detect JSON, key-value, tabular, or free-text formats
+2. Generate format specifications from examples
+3. Create format constraint strings for prompt injection
+"""
+
+import re
+import json
+from typing import List, Dict, Any, Optional, Tuple
+
+
def detect_output_format(expected_outputs: List[str]) -> Dict[str, Any]:
    """
    Analyze expected outputs to detect the common format pattern.

    Args:
        expected_outputs: List of expected output strings from the dataset

    Returns:
        Dictionary containing:
        - format_type: 'json', 'key_value', 'list', 'structured_text',
          'free_text', or 'unknown'
        - format_spec: Human-readable format specification
        - format_example: Example showing the format (truncated to 200 chars)
        - format_constraint: Constraint text to add to prompts
        - detected_keys: List of keys/fields detected (for structured formats)
        - avg_length: Average length of outputs (to enforce conciseness)
        - max_length: Maximum length of outputs
    """
    # FIX: route the "no outputs at all" case through _create_format_result
    # too, so every return path has the same schema (the inline dict used
    # previously was missing the 'max_length' key).
    if not expected_outputs:
        return _create_format_result('unknown', 'Unknown format', '', [], 0)

    # Filter out empty outputs
    valid_outputs = [o for o in expected_outputs if o and o.strip()]
    if not valid_outputs:
        return _create_format_result('unknown', 'Unknown format', '', [], 0)

    # Length statistics drive the conciseness constraints below.
    avg_length = sum(len(o) for o in valid_outputs) // len(valid_outputs)
    max_length = max(len(o) for o in valid_outputs)

    # Try each detector from most to least specific; first match wins.
    # 1. JSON  2. key-value ("Department: X | Sentiment: Y")
    # 3. bullet/numbered list  4. multi-line structured text
    detectors = (
        _detect_json_format,
        _detect_key_value_format,
        _detect_list_format,
        _detect_structured_text,
    )
    for detector in detectors:
        result = detector(valid_outputs, avg_length, max_length)
        if result:
            return result

    # 5. Default to free text with a length constraint.
    return _create_format_result(
        'free_text',
        f'Free-form text response (typically {avg_length} characters)',
        valid_outputs[0][:100] if valid_outputs else '',
        [],
        avg_length,
        max_length
    )


def _detect_json_format(outputs: List[str], avg_length: int, max_length: int) -> Optional[Dict[str, Any]]:
    """Detect if outputs are JSON format (>=70% parse as JSON objects)."""
    json_count = 0
    all_keys = []

    for output in outputs:
        stripped = output.strip()
        if stripped.startswith('{') and stripped.endswith('}'):
            try:
                parsed = json.loads(stripped)
                if isinstance(parsed, dict):
                    json_count += 1
                    all_keys.extend(parsed.keys())
            except json.JSONDecodeError:
                pass

    # If majority are JSON
    if json_count >= len(outputs) * 0.7:
        # Keep keys that appear in at least half of the JSON outputs.
        key_counts = {}
        for key in all_keys:
            key_counts[key] = key_counts.get(key, 0) + 1

        common_keys = [k for k, v in key_counts.items() if v >= json_count * 0.5]

        # Build format spec
        format_spec = f"JSON object with keys: {', '.join(common_keys)}"
        format_example = outputs[0][:200] if outputs else '{}'

        return _create_format_result(
            'json',
            format_spec,
            format_example,
            common_keys,
            avg_length,
            max_length
        )

    return None


def _detect_key_value_format(outputs: List[str], avg_length: int, max_length: int) -> Optional[Dict[str, Any]]:
    """Detect key-value formats like 'Department: X | Sentiment: Y'."""
    # Common separators for key-value pairs
    separators = ['|', '\n', ';', ',']
    key_patterns = [
        r'([A-Za-z_][A-Za-z0-9_\s]*)\s*[:=]\s*([^|;\n,]+)',  # Key: Value or Key = Value
    ]

    all_keys = []
    kv_count = 0
    detected_separator = None

    for output in outputs:
        # An output counts as key-value when it holds at least 2 pairs.
        for pattern in key_patterns:
            matches = re.findall(pattern, output)
            if len(matches) >= 2:
                kv_count += 1
                for key, _ in matches:
                    all_keys.append(key.strip())

                # Remember the first separator character seen in the output.
                for sep in separators:
                    if sep in output:
                        detected_separator = sep
                        break
                break

    # If majority (>=60%) are key-value
    if kv_count >= len(outputs) * 0.6:
        # Count keys case-insensitively; keep the most frequent ones.
        key_counts = {}
        for key in all_keys:
            normalized = key.strip().lower()
            key_counts[normalized] = key_counts.get(normalized, 0) + 1

        common_keys = [k for k, v in sorted(key_counts.items(), key=lambda x: -x[1])
                      if v >= kv_count * 0.4][:5]  # Top 5 keys

        # Determine the exact format pattern
        sep_display = detected_separator if detected_separator else ' | '
        format_spec = f"Key-value pairs: {sep_display.join([f'{k}: [value]' for k in common_keys])}"
        format_example = outputs[0] if outputs else ''

        return _create_format_result(
            'key_value',
            format_spec,
            format_example,
            common_keys,
            avg_length,
            max_length
        )

    return None


def _detect_list_format(outputs: List[str], avg_length: int, max_length: int) -> Optional[Dict[str, Any]]:
    """Detect bullet/numbered list formats."""
    list_patterns = [
        r'^[-*โข]\s+',  # Bullet points
        r'^\d+[.)]\s+',  # Numbered list
    ]

    list_count = 0

    for output in outputs:
        lines = output.strip().split('\n')
        list_lines = 0
        for line in lines:
            for pattern in list_patterns:
                if re.match(pattern, line.strip()):
                    list_lines += 1
                    break

        if list_lines >= len(lines) * 0.5:  # Majority of lines are list items
            list_count += 1

    # Majority (>=60%) of outputs are list-shaped.
    if list_count >= len(outputs) * 0.6:
        return _create_format_result(
            'list',
            'Bullet or numbered list format',
            outputs[0][:200] if outputs else '',
            [],
            avg_length,
            max_length
        )

    return None


def _detect_structured_text(outputs: List[str], avg_length: int, max_length: int) -> Optional[Dict[str, Any]]:
    """Detect structured text: outputs that consistently span multiple lines."""
    line_counts = [len(o.strip().split('\n')) for o in outputs]
    avg_lines = sum(line_counts) // len(line_counts) if line_counts else 1

    if avg_lines >= 2:
        return _create_format_result(
            'structured_text',
            f'Structured text with ~{avg_lines} lines',
            outputs[0][:200] if outputs else '',
            [],
            avg_length,
            max_length
        )

    return None


def _create_format_result(
    format_type: str,
    format_spec: str,
    format_example: str,
    detected_keys: List[str],
    avg_length: int,
    max_length: int = 0
) -> Dict[str, Any]:
    """Create a standardized format detection result.

    All detectors (and the fallbacks in detect_output_format) funnel through
    this helper so the returned dict always has the same keys.
    """
    # Generate format constraint based on type
    if format_type == 'json':
        constraint = f"""OUTPUT FORMAT REQUIREMENT:
- Return ONLY a valid JSON object
- Required keys: {', '.join(detected_keys) if detected_keys else 'as shown in examples'}
- NO explanations, NO prose, NO markdown code blocks
- Maximum length: ~{max_length} characters
- Example format: {format_example[:150]}"""

    elif format_type == 'key_value':
        constraint = f"""OUTPUT FORMAT REQUIREMENT:
- Return ONLY in key-value format: {format_spec}
- NO explanations, NO reasoning, NO additional text
- Be CONCISE - output should be ~{avg_length} characters max
- Example: {format_example}"""

    elif format_type == 'list':
        constraint = f"""OUTPUT FORMAT REQUIREMENT:
- Return as a bullet or numbered list
- NO explanations before or after the list
- Keep it concise (~{avg_length} characters)"""

    elif format_type == 'structured_text':
        constraint = f"""OUTPUT FORMAT REQUIREMENT:
- Follow the structured format shown in examples
- NO additional explanations or commentary
- Keep output concise (~{avg_length} characters)"""

    else:
        constraint = f"""OUTPUT FORMAT REQUIREMENT:
- Keep response CONCISE and DIRECT
- NO lengthy explanations or reasoning
- Target length: ~{avg_length} characters (max {max_length})
- Match the format/style of the expected examples"""

    return {
        'format_type': format_type,
        'format_spec': format_spec,
        'format_example': format_example[:200] if format_example else '',
        'format_constraint': constraint,
        'detected_keys': detected_keys,
        'avg_length': avg_length,
        'max_length': max_length
    }
+
+
def build_format_aware_reflection_prompt(
    base_prompt: str,
    format_info: Dict[str, Any],
    include_example: bool = True
) -> str:
    """
    Enhance a reflection prompt with format awareness.

    Args:
        base_prompt: The original reflection prompt
        format_info: Format detection result from detect_output_format()
        include_example: Whether to include format example

    Returns:
        Enhanced prompt with format constraints appended; the base prompt
        unchanged when no usable format information is available
    """
    # Nothing reliable to enforce -- leave the prompt untouched.
    if not format_info or format_info.get('format_type') == 'unknown':
        return base_prompt

    parts = [base_prompt]
    parts.append(f"""

๐ฏ CRITICAL FORMAT REQUIREMENT:
The optimized prompt MUST produce outputs that match this EXACT format:

{format_info['format_constraint']}

โ ๏ธ COMMON FAILURE MODES TO AVOID:
1. Generating explanations when only the answer is needed
2. Adding "Here's the analysis..." or similar preambles
3. Producing verbose output when concise is required
4. Wrong structure (e.g., prose instead of key-value pairs)
""")

    if include_example and format_info.get('format_example'):
        parts.append(f"""
๐ EXAMPLE OF CORRECT OUTPUT FORMAT:
{format_info['format_example']}
""")

    # Format guidance goes at the end, after the base prompt's instructions.
    return "".join(parts)
+
+
def generate_format_feedback(
    predicted_output: str,
    expected_output: str,
    format_info: Dict[str, Any]
) -> str:
    """
    Generate specific feedback about format compliance.

    Args:
        predicted_output: What the model actually produced
        expected_output: The ground truth output
        format_info: Format detection result

    Returns:
        Specific format-related feedback, or an empty string when the
        output looks compliant
    """
    predicted_len = len(predicted_output) if predicted_output else 0
    # NOTE(review): expected_output's length is computed but not otherwise
    # used below -- kept for parity; confirm whether callers rely on it.
    expected_len = len(expected_output) if expected_output else 0

    issues = []

    # Length check: flag outputs far longer than the dataset's typical one.
    if format_info.get('avg_length', 0) > 0:
        if predicted_len > format_info['avg_length'] * 3:
            issues.append(f"OUTPUT TOO VERBOSE: Generated {predicted_len} chars, expected ~{format_info['avg_length']} chars")
        elif predicted_len > format_info.get('max_length', predicted_len) * 2:
            issues.append(f"OUTPUT TOO LONG: {predicted_len} chars vs max expected {format_info.get('max_length', 'unknown')}")

    # Structural check, specific to the detected format type.
    format_type = format_info.get('format_type', 'unknown')
    if format_type == 'json':
        try:
            json.loads(predicted_output.strip() if predicted_output else '{}')
        except json.JSONDecodeError:
            issues.append("FORMAT ERROR: Expected JSON but got non-JSON output")
    elif format_type == 'key_value':
        if predicted_output and ':' not in predicted_output:
            issues.append("FORMAT ERROR: Expected key-value pairs (Key: Value) but output lacks this structure")

    # Phrase check: preambles that usually signal an over-explained answer.
    verbose_indicators = [
        'let me', 'i will', 'here is', "here's", 'analysis:', 'step-by-step',
        'first,', 'to begin', 'in order to', 'the following', 'please note'
    ]
    if predicted_output:
        lowered = predicted_output.lower()
        matched = [phrase for phrase in verbose_indicators if phrase in lowered]
        if matched:
            issues.append(f"VERBOSITY WARNING: Output contains explanatory phrases: {', '.join(matched[:3])}")

    if not issues:
        return ""

    return "\n๐จ FORMAT ISSUES DETECTED:\n" + "\n".join(f"  โข {issue}" for issue in issues)
diff --git a/src/gepa_optimizer/utils/helpers.py b/src/gepa_optimizer/utils/helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbd57f5d498d68feecf2b293159ffdf665376162
--- /dev/null
+++ b/src/gepa_optimizer/utils/helpers.py
@@ -0,0 +1,23 @@
+"""
+Helper functions for GEPA Optimizer
+"""
+
def sanitize_prompt(prompt: str) -> str:
    """
    Coerce, trim, and default a prompt value.

    Non-string inputs are converted with ``str()``, surrounding whitespace
    is removed, and an empty result falls back to a generic assistant prompt.

    Args:
        prompt: Raw prompt value to clean.

    Returns:
        str: A non-empty, whitespace-trimmed prompt.
    """
    text = prompt if isinstance(prompt, str) else str(prompt)
    text = text.strip()
    # Empty prompts are replaced with a safe, generic default.
    return text or "You are a helpful assistant."
diff --git a/src/gepa_optimizer/utils/llm_judge_prompt.py b/src/gepa_optimizer/utils/llm_judge_prompt.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcab0497cde56ad02b9db146d556594add9881cd
--- /dev/null
+++ b/src/gepa_optimizer/utils/llm_judge_prompt.py
@@ -0,0 +1,322 @@
+"""
+LLM-as-Judge Prompt for Index Caching Use Case
+
+This module provides a specialized LLM-as-Judge prompt template for analyzing
+index caching evaluation results and generating actionable feedback for prompt improvement.
+"""
+
+from typing import Dict, Any, Optional
+
+
def build_index_caching_judge_prompt(
    task_command: str,
    predicted_dict: Dict[str, Any],
    expected_dict: Dict[str, Any],
    predicted_output: str,
    expected_output: str,
    current_prompt: Optional[str] = None,
    evaluation_results: Optional[Dict[str, Any]] = None,
    image_base64: Optional[str] = None
) -> str:
    """
    Build LLM-as-Judge prompt for index caching use case.

    This prompt analyzes why the LLM failed to correctly identify:
    - is_index_based (boolean)
    - index_value (int or null)
    - parent_element_id (string or null)
    - element_id_of_nth_child_of_parent (string or null)
    - selected_element_is_correct (boolean)

    Args:
        task_command: The natural language command
        predicted_dict: Parsed predicted JSON output
        expected_dict: Parsed expected JSON output
        predicted_output: Raw predicted output string
        expected_output: Raw expected output string
        current_prompt: Current system prompt being optimized
        evaluation_results: Full evaluation results with field scores
        image_base64: Optional base64 encoded screenshot

    Returns:
        Formatted judge prompt string
    """

    # Extract field values for comparison.
    # NOTE(review): .get() yields None for missing keys, so an absent field
    # compares as None below — confirm upstream parsing guarantees these keys.
    pred_is_index = predicted_dict.get("is_index_based")
    exp_is_index = expected_dict.get("is_index_based")
    pred_index_val = predicted_dict.get("index_value")
    exp_index_val = expected_dict.get("index_value")
    pred_parent = predicted_dict.get("parent_element_id")
    exp_parent = expected_dict.get("parent_element_id")
    pred_element = predicted_dict.get("element_id_of_nth_child_of_parent")
    exp_element = expected_dict.get("element_id_of_nth_child_of_parent")
    pred_selected = predicted_dict.get("selected_element_is_correct")
    exp_selected = expected_dict.get("selected_element_is_correct")

    # Extract notes/reasoning if available
    pred_notes = predicted_dict.get("notes", "")
    exp_notes = expected_dict.get("notes", "")

    # Get field scores from evaluation results (defaults to 0.0 per field
    # when evaluation_results is None or a key is missing).
    field_scores = {}
    if evaluation_results:
        field_scores = {
            "is_index_based": evaluation_results.get("is_index_based_match", 0.0),
            "index_value": evaluation_results.get("index_value_match", 0.0),
            "parent_element_id": evaluation_results.get("parent_element_id_match", 0.0),
            "element_id_of_nth_child": evaluation_results.get("element_id_of_nth_child_match", 0.0),
            "selected_element_is_correct": evaluation_results.get("selected_element_correct_match", 0.0),
        }

    # Build field-by-field comparison.
    # NOTE(review): the status glyphs below ('โ ...') appear mojibake-encoded —
    # verify this file is read/written as UTF-8 so the intended check/cross
    # marks survive.
    field_comparisons = []

    # 1. is_index_based
    is_index_match = pred_is_index == exp_is_index
    field_comparisons.append(f"""
1. **is_index_based** ({'โ CORRECT' if is_index_match else 'โ WRONG'}):
   - Expected: {exp_is_index}
   - Predicted: {pred_is_index}
   - Score: {field_scores.get('is_index_based', 0.0):.0%}
""")

    # 2. index_value
    index_val_match = pred_index_val == exp_index_val
    field_comparisons.append(f"""
2. **index_value** ({'โ CORRECT' if index_val_match else 'โ WRONG'}):
   - Expected: {exp_index_val}
   - Predicted: {pred_index_val}
   - Score: {field_scores.get('index_value', 0.0):.0%}
""")

    # 3. parent_element_id
    parent_match = pred_parent == exp_parent
    field_comparisons.append(f"""
3. **parent_element_id** ({'โ CORRECT' if parent_match else 'โ WRONG'}):
   - Expected: {exp_parent}
   - Predicted: {pred_parent}
   - Score: {field_scores.get('parent_element_id', 0.0):.0%}
""")

    # 4. element_id_of_nth_child_of_parent
    element_match = pred_element == exp_element
    field_comparisons.append(f"""
4. **element_id_of_nth_child_of_parent** ({'โ CORRECT' if element_match else 'โ WRONG'}):
   - Expected: {exp_element}
   - Predicted: {pred_element}
   - Score: {field_scores.get('element_id_of_nth_child', 0.0):.0%}
""")

    # 5. selected_element_is_correct
    selected_match = pred_selected == exp_selected
    field_comparisons.append(f"""
5. **selected_element_is_correct** ({'โ CORRECT' if selected_match else 'โ WRONG'}):
   - Expected: {exp_selected}
   - Predicted: {pred_selected}
   - Score: {field_scores.get('selected_element_is_correct', 0.0):.0%}
""")

    # Visual analysis instruction (only included when a screenshot is passed).
    visual_instruction = ""
    if image_base64:
        visual_instruction = """
๐ผ๏ธ VISUAL ANALYSIS (You can see the screenshot):
- Look at the annotated screenshot with bounding boxes
- Identify which element is highlighted (the target element)
- Understand the UI structure and hierarchy
- Analyze why the LLM might have misidentified the parent container or nth child
"""

    judge_prompt = f"""You are an expert prompt engineer specializing in mobile UI automation and index-based element selection prompts.

{"You can SEE the mobile app screenshot with annotated bounding boxes." if image_base64 else "You are analyzing text descriptions only (no image provided)."}

TASK: Improve the SYSTEM PROMPT to better guide the LLM in correctly identifying index-based element selection.

CONTEXT:
- Task Command: "{task_command}"

FULL EXPECTED OUTPUT (Ground Truth JSON):
```json
{expected_output}
```

FULL PREDICTED OUTPUT (What the LLM Actually Returned):
```json
{predicted_output}
```

FIELD-BY-FIELD COMPARISON:
{''.join(field_comparisons)}
{visual_instruction if image_base64 else ""}

EXPECTED REASONING (from notes):
{exp_notes if exp_notes else "N/A - No reasoning provided in expected output"}

PREDICTED REASONING (from notes):
{pred_notes if pred_notes else "N/A - No reasoning provided in predicted output"}

CURRENT SYSTEM PROMPT (being optimized):
{current_prompt if current_prompt else "N/A"}

ANALYSIS REQUIRED:

1. **is_index_based Analysis** (CRITICAL):
   - Why did the LLM classify this as {"index-based" if pred_is_index else "non-index-based"} when it should be {"index-based" if exp_is_index else "non-index-based"}?
   - What specific words or patterns in the command "{task_command}" should have led to the correct classification?
   - What instruction in the prompt failed to guide correct classification?
   - What edge case or ambiguity caused the misclassification?

2. **index_value Analysis** (if is_index_based should be true):
   - Why did the LLM extract index_value={pred_index_val} when it should be {exp_index_val}?
   - What ordinal word ("first", "second", "third", etc.) in "{task_command}" should have been converted to {exp_index_val}?
   - Did the LLM fail to recognize the ordinal, or did it count incorrectly?
   - What instruction would help the LLM correctly parse ordinals?

3. **parent_element_id Analysis** (if is_index_based should be true):
   - Why did the LLM identify parent_element_id="{pred_parent}" when it should be "{exp_parent}"?
   - What container in the XML hierarchy should have been identified as the parent?
   - Did the LLM fail to walk up the hierarchy correctly?
   - Did the LLM include non-item children (like headers) in the parent container?
   - What instruction would help the LLM identify the correct parent container?

4. **element_id_of_nth_child_of_parent Analysis** (if is_index_based should be true):
   - Why did the LLM identify element_id_of_nth_child_of_parent="{pred_element}" when it should be "{exp_element}"?
   - What is the outermost component representing the nth item?
   - Did the LLM select a nested child instead of the full item?
   - Did the LLM count items incorrectly (wrong nth position)?
   - What instruction would help the LLM identify the correct outermost item?

5. **selected_element_is_correct Analysis**:
   - Why did the LLM determine selected_element_is_correct={pred_selected} when it should be {exp_selected}?
   - Is the highlighted element actually the correct target for the command?
   - What visual or structural cue did the LLM miss or misinterpret?

6. **Prompt Weakness Identification**:
   - Which specific instruction in the current system prompt is missing, unclear, or misleading?
   - What concept from the expected reasoning should the prompt emphasize more?
   - What edge case handling is missing?
   - What example or clarification would help?

7. **Actionable Prompt Improvement**:
   - What exact instruction should be ADDED to fix each failing field?
   - What should be REMOVED or CLARIFIED?
   - What specific wording would guide the LLM to the correct field values?
   - How can the prompt help the LLM follow the same logic as the expected output?

OUTPUT FORMAT (JSON):
{{
    "is_index_based_error": "Specific explanation of why is_index_based classification was wrong. Reference the command and explain what pattern should have been recognized.",
    "index_value_error": "If index_value was wrong, explain why. What ordinal word should have been converted to which number?",
    "parent_element_id_error": "If parent_element_id was wrong, explain why. What container should have been identified and why?",
    "element_id_of_nth_child_error": "If element_id_of_nth_child_of_parent was wrong, explain why. What item should have been selected and why?",
    "selected_element_correct_error": "If selected_element_is_correct was wrong, explain why. Is the highlighted element actually correct?",
    "key_weakness": "The single most important prompt weakness that caused the most errors",
    "missing_instruction": "What specific instruction should be added to address the key weakness",
    "improvement_suggestion": "Specific, actionable prompt improvement that addresses all field errors",
    "example_instruction": "An example instruction that would help the LLM correctly identify all 5 fields"
}}

CRITICAL: Your analysis must focus on WHY each of the 5 fields was wrong. Be specific about:
- Command interpretation (for is_index_based)
- Ordinal parsing (for index_value)
- XML hierarchy traversal (for parent_element_id and element_id_of_nth_child_of_parent)
- Element correctness assessment (for selected_element_is_correct)

Reference the task command, expected vs predicted values, and provide actionable improvements to the system prompt."""

    return judge_prompt
+
+
def format_index_caching_judge_feedback(
    judge_output: str,
    predicted_dict: Dict[str, Any],
    expected_dict: Dict[str, Any],
    task_command: str,
    field_scores: Dict[str, float]
) -> str:
    """
    Format LLM-as-Judge output into structured feedback.

    Attempts to locate and parse a JSON object in the judge's raw reply; on
    success, renders a structured per-field report. On any parse failure the
    raw judge output is returned verbatim (prefixed), so callers always get
    a non-empty string.

    Args:
        judge_output: Raw output from LLM-as-Judge
        predicted_dict: Parsed predicted JSON
        expected_dict: Parsed expected JSON
        task_command: The task command
        field_scores: Field-by-field scores from evaluation

    Returns:
        Formatted feedback string
    """
    import json
    import re

    # Try to parse JSON from judge output.
    # NOTE(review): this regex tolerates only one level of nested braces, so a
    # judge reply with deeper nesting (or braces inside string values) falls
    # through to the raw-output fallback below.
    json_match = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', judge_output, re.DOTALL)
    if json_match:
        try:
            analysis = json.loads(json_match.group(0))

            # Build formatted feedback. Every analysis field degrades to 'N/A'
            # if the judge omitted it.
            feedback = f"""โ INDEX CACHING EVALUATION FAILURE

๐ FIELD-BY-FIELD ANALYSIS:

๐ is_index_based Error:
   Expected: {expected_dict.get('is_index_based')}, Predicted: {predicted_dict.get('is_index_based')}
   {analysis.get('is_index_based_error', 'N/A')}

๐ index_value Error:
   Expected: {expected_dict.get('index_value')}, Predicted: {predicted_dict.get('index_value')}
   {analysis.get('index_value_error', 'N/A')}

๐ parent_element_id Error:
   Expected: {expected_dict.get('parent_element_id')}, Predicted: {predicted_dict.get('parent_element_id')}
   {analysis.get('parent_element_id_error', 'N/A')}

๐ element_id_of_nth_child_of_parent Error:
   Expected: {expected_dict.get('element_id_of_nth_child_of_parent')}, Predicted: {predicted_dict.get('element_id_of_nth_child_of_parent')}
   {analysis.get('element_id_of_nth_child_error', 'N/A')}

๐ selected_element_is_correct Error:
   Expected: {expected_dict.get('selected_element_is_correct')}, Predicted: {predicted_dict.get('selected_element_is_correct')}
   {analysis.get('selected_element_correct_error', 'N/A')}

๐ KEY WEAKNESS:
{analysis.get('key_weakness', 'N/A')}

๐ก MISSING INSTRUCTION:
{analysis.get('missing_instruction', 'N/A')}

๐ก IMPROVEMENT SUGGESTION:
{analysis.get('improvement_suggestion', 'N/A')}

๐ EXAMPLE INSTRUCTION:
{analysis.get('example_instruction', 'N/A')}

๐ญ CONTEXT:
- Task: "{task_command}"
- Field Scores: is_index_based={field_scores.get('is_index_based', 0.0):.0%}, index_value={field_scores.get('index_value', 0.0):.0%}, parent_element_id={field_scores.get('parent_element_id', 0.0):.0%}, element_id_of_nth_child={field_scores.get('element_id_of_nth_child', 0.0):.0%}, selected_element_is_correct={field_scores.get('selected_element_is_correct', 0.0):.0%}"""

            return feedback
        except json.JSONDecodeError:
            # Malformed JSON from the judge — fall through to raw output.
            pass

    # Fallback to raw output
    return f"LLM-as-Judge Analysis (Index Caching):\n{judge_output}"
+
+
# System prompt for the LLM-as-Judge model. Presumably paired with the user
# message produced by build_index_caching_judge_prompt() — confirm at the
# call site.
INDEX_CACHING_JUDGE_SYSTEM_PROMPT = """You are an expert prompt engineer analyzing mobile UI automation prompts for index-based element selection.

Your task is to analyze why an LLM failed to correctly identify index-based element selection fields and provide actionable feedback to improve the system prompt.

Focus on:
- Command interpretation (is_index_based classification)
- Ordinal parsing (index_value extraction)
- XML hierarchy traversal (parent_element_id and element_id_of_nth_child_of_parent)
- Element correctness assessment (selected_element_is_correct)

You can see the screenshot with annotated bounding boxes if provided. Analyze the visual structure to understand why the LLM made errors."""
+
diff --git a/src/gepa_optimizer/utils/log_parser.py b/src/gepa_optimizer/utils/log_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9889a1e09f0bbbfd1378caf3b230bcc5d527a25
--- /dev/null
+++ b/src/gepa_optimizer/utils/log_parser.py
@@ -0,0 +1,298 @@
+"""
+Log Parser for Extracting Candidates and Feedback
+
+Parses optimization logs to extract candidate prompts, feedback, and scores.
+"""
+
+import re
+from typing import List, Dict, Optional, Tuple
+from pathlib import Path
+
+
class OptimizationLogParser:
    """Parse optimization logs to extract candidates and feedback.

    The parser is purely regex-based: it slices the raw log into
    per-iteration sections, then scans each section for candidate prompts,
    feedback blocks, and numeric scores, associating them by proximity.
    """

    def __init__(self, log_file: str):
        """
        Initialize parser with log file path.

        A missing file is tolerated: ``self.content`` stays empty and every
        extract_* method simply returns no results.

        Args:
            log_file: Path to log file
        """
        self.log_file = Path(log_file)
        self.content = ""
        if self.log_file.exists():
            with open(self.log_file, 'r', encoding='utf-8') as f:
                self.content = f.read()

    def extract_iterations(self) -> List[Dict]:
        """Extract iteration information from logs.

        Returns:
            List of dicts, one per iteration marker found, each with
            'iteration' (int), 'content' (log text from this marker up to
            the next one), and 'start_pos' (character offset in the log).
        """
        # Pattern to find iteration markers: explicit "Iteration N" lines or
        # the optimization-start banner (with or without its emoji prefix).
        iteration_pattern = r'Iteration\s+(\d+)|Starting GEPA optimization|๐ Starting GEPA optimization'

        # Scan for all markers once up front; iteration i's content is the
        # span between marker i and marker i+1. (Previously the full marker
        # list was recomputed inside the loop, making this O(n^2).)
        matches = list(re.finditer(iteration_pattern, self.content))

        iterations = []
        for idx, match in enumerate(matches):
            # Banner markers carry no number; default them to iteration 1.
            iter_num = int(match.group(1)) if match.group(1) else 1

            start_pos = match.start()
            end_pos = matches[idx + 1].start() if idx + 1 < len(matches) else len(self.content)

            iterations.append({
                'iteration': iter_num,
                'content': self.content[start_pos:end_pos],
                'start_pos': start_pos
            })

        return iterations

    def extract_candidates(self, iteration_content: str) -> List[Dict]:
        """
        Extract candidate prompts from iteration content.

        Args:
            iteration_content: Content for a single iteration

        Returns:
            List of candidate dictionaries with 'source', 'prompt', and
            'position' keys, sorted by position in the log section.
        """
        candidates = []

        # Pattern 1: GEPA Reflection candidates
        # Look for "PROPOSED PROMPT" or "๐ PROPOSED PROMPT"
        gepa_patterns = [
            r'๐ PROPOSED PROMPT.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐|๐|$)',
            r'PROPOSED PROMPT.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐|๐|$)',
            r'GEPA REFLECTION.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐|๐|$)',
        ]

        for pattern in gepa_patterns:
            for match in re.finditer(pattern, iteration_content, re.DOTALL):
                prompt = match.group(1).strip()
                # Very short captures are regex noise, not real prompts.
                if prompt and len(prompt) > 20:  # Valid prompt
                    candidates.append({
                        'source': 'GEPA_Reflection',
                        'prompt': prompt,
                        'position': match.start()
                    })

        # Pattern 2: LLEGO Crossover candidates
        crossover_patterns = [
            r'๐งฌ Crossover.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐|๐|$)',
            r'Crossover.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐|๐|$)',
        ]

        for pattern in crossover_patterns:
            for match in re.finditer(pattern, iteration_content, re.DOTALL):
                prompt = match.group(1).strip()
                if prompt and len(prompt) > 20:
                    candidates.append({
                        'source': 'LLEGO_Crossover',
                        'prompt': prompt,
                        'position': match.start()
                    })

        # Pattern 3: LLEGO Mutation candidates
        mutation_patterns = [
            r'๐ฒ Mutation.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐|๐|$)',
            r'Mutation.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐|๐|$)',
        ]

        for pattern in mutation_patterns:
            for match in re.finditer(pattern, iteration_content, re.DOTALL):
                prompt = match.group(1).strip()
                if prompt and len(prompt) > 20:
                    candidates.append({
                        'source': 'LLEGO_Mutation',
                        'prompt': prompt,
                        'position': match.start()
                    })

        # Pattern 4: Generic candidate markers
        # Look for prompts in quotes or code blocks
        generic_patterns = [
            r'"([^"]{50,})"',  # Quoted prompts
            r'```\s*(.*?)\s*```',  # Code blocks
        ]

        for pattern in generic_patterns:
            for match in re.finditer(pattern, iteration_content, re.DOTALL):
                prompt = match.group(1).strip()
                # Check if it looks like a prompt (contains task instructions)
                if (len(prompt) > 50 and
                    any(keyword in prompt.lower() for keyword in
                        ['you are', 'task', 'instruction', 'element', 'identify', 'select'])):
                    # De-duplicate against the explicitly-labeled patterns above.
                    if not any(c['prompt'] == prompt for c in candidates):
                        candidates.append({
                            'source': 'Unknown',
                            'prompt': prompt,
                            'position': match.start()
                        })

        # Sort by position so downstream proximity matching works.
        candidates.sort(key=lambda x: x['position'])

        return candidates

    def extract_feedback(self, iteration_content: str) -> List[Dict]:
        """
        Extract feedback from iteration content.

        Args:
            iteration_content: Content for a single iteration

        Returns:
            List of feedback dictionaries with 'feedback' and 'position'
            keys (plus 'source' for judge-produced feedback), sorted by
            position.
        """
        feedback_list = []

        # Pattern 1: Explicit feedback markers
        feedback_patterns = [
            r'๐ฌ FEEDBACK:\s*(.*?)(?=\n\n|\n๐|\n๐|\n๐ก|$)',
            r'FEEDBACK:\s*(.*?)(?=\n\n|\n๐|\n๐|\n๐ก|$)',
            r'Feedback:\s*(.*?)(?=\n\n|\n๐|\n๐|\n๐ก|$)',
        ]

        for pattern in feedback_patterns:
            for match in re.finditer(pattern, iteration_content, re.DOTALL):
                feedback_text = match.group(1).strip()
                # Skip trivially short captures.
                if feedback_text and len(feedback_text) > 10:
                    feedback_list.append({
                        'feedback': feedback_text,
                        'position': match.start()
                    })

        # Pattern 2: LLM-as-Judge feedback
        judge_patterns = [
            r'LLM-as-Judge.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐|๐|$)',
            r'Judge Feedback.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐|๐|$)',
        ]

        for pattern in judge_patterns:
            for match in re.finditer(pattern, iteration_content, re.DOTALL):
                feedback_text = match.group(1).strip()
                if feedback_text and len(feedback_text) > 10:
                    feedback_list.append({
                        'feedback': feedback_text,
                        'position': match.start(),
                        'source': 'LLM-as-Judge'
                    })

        # Sort by position
        feedback_list.sort(key=lambda x: x['position'])

        return feedback_list

    def extract_scores(self, iteration_content: str) -> List[Dict]:
        """
        Extract scores from iteration content.

        Args:
            iteration_content: Content for a single iteration

        Returns:
            List of score dictionaries with 'score' (float) and 'position'
            keys, sorted by position.
        """
        scores = []

        # Pattern for scores
        score_patterns = [
            r'Score:\s*([\d.]+)',
            r'Average score:\s*([\d.]+)',
            r'๐ฏ SCORE:\s*([\d.]+)',
            r'๐ Score:\s*([\d.]+)',
        ]

        for pattern in score_patterns:
            for match in re.finditer(pattern, iteration_content):
                score_value = float(match.group(1))
                scores.append({
                    'score': score_value,
                    'position': match.start()
                })

        # Sort by position
        scores.sort(key=lambda x: x['position'])

        return scores

    def parse_all(self) -> Dict:
        """
        Parse entire log file and extract all information.

        Each candidate is annotated in place with the nearest following
        score ('score' key) and the nearest following feedback within 5000
        characters ('feedback' key), when available.

        Returns:
            Dictionary with per-iteration details plus flattened
            'all_candidates' / 'all_feedback' lists.
        """
        iterations = self.extract_iterations()

        result = {
            'iterations': [],
            'total_iterations': len(iterations),
            'all_candidates': [],
            'all_feedback': []
        }

        for iter_info in iterations:
            iter_num = iter_info['iteration']
            iter_content = iter_info['content']

            candidates = self.extract_candidates(iter_content)
            feedback = self.extract_feedback(iter_content)
            scores = self.extract_scores(iter_content)

            # Try to associate scores with candidates
            for candidate in candidates:
                # Find nearest score after this candidate
                candidate_pos = candidate['position']
                nearest_score = None
                min_distance = float('inf')

                for score_info in scores:
                    if score_info['position'] > candidate_pos:
                        distance = score_info['position'] - candidate_pos
                        if distance < min_distance:
                            min_distance = distance
                            nearest_score = score_info['score']

                if nearest_score is not None:
                    candidate['score'] = nearest_score

                # Try to associate feedback (same nearest-following rule, but
                # capped so unrelated distant feedback is not attached).
                nearest_feedback = None
                min_distance = float('inf')

                for feedback_info in feedback:
                    if feedback_info['position'] > candidate_pos:
                        distance = feedback_info['position'] - candidate_pos
                        if distance < min_distance and distance < 5000:  # Within reasonable distance
                            min_distance = distance
                            nearest_feedback = feedback_info['feedback']

                if nearest_feedback:
                    candidate['feedback'] = nearest_feedback

            result['iterations'].append({
                'iteration': iter_num,
                'candidates': candidates,
                'feedback': feedback,
                'scores': scores
            })

            result['all_candidates'].extend(candidates)
            result['all_feedback'].extend(feedback)

        return result
+
diff --git a/src/gepa_optimizer/utils/logging.py b/src/gepa_optimizer/utils/logging.py
new file mode 100644
index 0000000000000000000000000000000000000000..401a95751534521d769f664e15639e8240d26be9
--- /dev/null
+++ b/src/gepa_optimizer/utils/logging.py
@@ -0,0 +1,107 @@
+"""
+Logging setup for GEPA Optimizer.
+
+This module provides backward-compatible logging functions that delegate
+to the centralized logging infrastructure.
+
+For new code, prefer importing directly from infrastructure.logging:
+ from gepa_optimizer.infrastructure.logging import get_logger, configure_logging
+"""
+
+import logging
+from pathlib import Path
+from datetime import datetime
+from typing import Optional, Union
+
+# Import from centralized infrastructure
+from ..infrastructure.logging import (
+ get_logger as _get_logger,
+ configure_logging as _configure_logging,
+ LogLevel,
+)
+
+
def setup_logging(
    level: str = "INFO",
    log_file: Optional[Union[str, bool]] = None,
    use_colors: bool = True,
    include_emoji: bool = True,
) -> None:
    """
    Configure logging for GEPA Optimizer with optional file logging.

    Backward-compatible wrapper around the centralized
    ``configure_logging()``; new code should call that directly.

    Args:
        level: Logging level name (e.g. "DEBUG", "INFO", "WARNING").
        log_file: Controls file logging:
            - None: auto-generate a timestamped file under ``logs/``
            - False: console-only logging
            - str: log to the given path (parent dirs created as needed)
        use_colors: Whether console output is colorized.
        include_emoji: Whether log messages include emoji.

    Example:
        setup_logging(level="DEBUG", log_file="optimization.log")
    """
    actual_log_file: Optional[str]

    if log_file is False:
        # File logging explicitly disabled.
        actual_log_file = None
    elif log_file is None:
        # Default: timestamped log file under ./logs
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        log_dir = Path("logs")
        log_dir.mkdir(exist_ok=True)
        actual_log_file = str(log_dir / f"optimization_{timestamp}.log")
    else:
        # Caller-specified path; make sure its directory exists.
        Path(log_file).parent.mkdir(parents=True, exist_ok=True)
        actual_log_file = str(log_file)

    # Delegate the real handler/formatter setup to the central infrastructure.
    _configure_logging(
        level=level,
        log_file=actual_log_file,
        use_colors=use_colors,
        include_emoji=include_emoji,
    )

    # Announce the resulting configuration through the freshly-set-up logger.
    logger = _get_logger(__name__)
    if actual_log_file:
        logger.info(f"Logging to file: {actual_log_file}")
        logger.info(f"Logging configured at {level} level (console + file)")
    else:
        logger.info(f"Logging configured at {level} level (console only)")
+
+
def get_logger(name: str) -> logging.Logger:
    """
    Get a logger for a specific module.

    Thin pass-through to the centralized infrastructure logger factory, kept
    for backward compatibility. New code should use:
        from gepa_optimizer.infrastructure.logging import get_logger

    Args:
        name: Module name (typically __name__)

    Returns:
        Configured Logger instance
    """
    return _get_logger(name)
+
+
# Re-export for convenience: the public surface of this backward-compat shim.
__all__ = [
    "setup_logging",
    "get_logger",
]
diff --git a/src/gepa_optimizer/utils/metrics.py b/src/gepa_optimizer/utils/metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e1bbfb9884fd93d548a93a199cbfafd3d8edd96
--- /dev/null
+++ b/src/gepa_optimizer/utils/metrics.py
@@ -0,0 +1,220 @@
+"""
+Comprehensive metrics calculations for GEPA Optimizer
+"""
+
+from typing import Dict, List, Optional, Any
+import re
+import time
+from collections import Counter
+
def calculate_metrics(original_prompt: str,
                      optimized_prompt: str,
                      performance_data: Optional[Dict[str, Any]] = None) -> Dict[str, float]:
    """
    Calculate comprehensive improvement metrics between original and optimized prompts.

    Covers character/word growth, complexity drift, Jaccard similarity, and
    any numeric GEPA performance data (re-keyed with a ``gepa_`` prefix).

    Args:
        original_prompt: Original seed prompt
        optimized_prompt: GEPA-optimized prompt
        performance_data: Optional performance metrics from GEPA

    Returns:
        Dict[str, float]: Comprehensive metrics dictionary
    """
    orig_len, opt_len = len(original_prompt), len(optimized_prompt)
    orig_words, opt_words = len(original_prompt.split()), len(optimized_prompt.split())

    def _pct_change(new, old):
        # Percent growth relative to the original; 0 when there is no baseline.
        return ((new - old) / old) * 100 if old > 0 else 0.0

    orig_complexity = calculate_text_complexity(original_prompt)
    opt_complexity = calculate_text_complexity(optimized_prompt)

    metrics: Dict[str, float] = {
        'length_change_percent': _pct_change(opt_len, orig_len),
        'original_length': orig_len,
        'optimized_length': opt_len,
        'word_change_percent': _pct_change(opt_words, orig_words),
        'original_words': orig_words,
        'optimized_words': opt_words,
        'original_complexity': orig_complexity,
        'optimized_complexity': opt_complexity,
        'complexity_change': opt_complexity - orig_complexity,
        'similarity_score': calculate_similarity(original_prompt, optimized_prompt),
    }

    # Fold in any numeric GEPA performance data under a 'gepa_' prefix.
    if performance_data:
        for key, value in performance_data.items():
            if isinstance(value, (int, float)):
                metrics[f'gepa_{key}'] = float(value)

    return metrics
+
def calculate_text_complexity(text: str) -> float:
    """
    Calculate a simple complexity score for text.

    Combines average word length, lexical diversity, and average sentence
    length into a single weighted value.

    Args:
        text: Text to analyze

    Returns:
        float: Complexity score rounded to 3 decimals (higher = more complex);
        0.0 for empty or whitespace-only input.
    """
    if not text:
        return 0.0

    words = text.lower().split()
    word_count = len(words)
    if word_count == 0:
        # Whitespace-only input has no measurable complexity.
        return 0.0

    # Sentence boundaries are runs of terminal punctuation.
    sentence_count = len(re.findall(r'[.!?]+', text))

    avg_word_length = len(text) / word_count
    lexical_diversity = len(set(words)) / word_count
    avg_sentence_length = word_count / max(sentence_count, 1)

    # Weighted blend of the three signals.
    score = (
        avg_word_length * 0.3 +
        lexical_diversity * 0.4 +
        avg_sentence_length * 0.3
    )
    return round(score, 3)
+
def calculate_similarity(text1: str, text2: str) -> float:
    """
    Calculate similarity between two texts using simple word overlap.

    Uses the Jaccard index over lowercased word sets.

    Args:
        text1: First text
        text2: Second text

    Returns:
        float: Similarity score between 0 and 1, rounded to 3 decimals.
    """
    # Either text empty -> no meaningful overlap.
    if not text1 or not text2:
        return 0.0

    vocab_a = set(text1.lower().split())
    vocab_b = set(text2.lower().split())

    union_size = len(vocab_a | vocab_b)
    if union_size == 0:
        return 0.0

    return round(len(vocab_a & vocab_b) / union_size, 3)
+
def track_optimization_progress(iteration: int,
                                score: float,
                                improvement: float,
                                time_elapsed: float) -> Dict[str, Any]:
    """
    Track progress during optimization iterations.

    Args:
        iteration: Current iteration number
        score: Current performance score
        improvement: Improvement over baseline
        time_elapsed: Time elapsed in seconds

    Returns:
        Dict[str, Any]: Rounded progress metrics, including score/second.
    """
    # Floor elapsed time to avoid division by zero on instantaneous iterations.
    elapsed_floor = max(time_elapsed, 0.001)
    progress = {
        'iteration': iteration,
        'score': round(score, 4),
        'improvement': round(improvement, 4),
        'time_elapsed': round(time_elapsed, 2),
        'score_per_second': round(score / elapsed_floor, 4),
    }
    return progress
+
def calculate_cost_efficiency(improvement_percent: float,
                              estimated_cost: float) -> Dict[str, float]:
    """
    Calculate cost efficiency metrics.

    Args:
        improvement_percent: Performance improvement percentage
        estimated_cost: Estimated cost in USD

    Returns:
        Dict[str, float]: Cost efficiency metrics; all-zero values (without
        the 'estimated_cost' key) when the cost is zero or negative.
    """
    # No meaningful ratio when cost is free/unknown.
    if estimated_cost <= 0:
        return {'improvement_per_dollar': 0.0, 'cost_efficiency': 0.0}

    per_dollar = improvement_percent / estimated_cost

    # Normalize to a 0-1 efficiency score (10%/dollar or better caps at 1.0).
    normalized = min(per_dollar / 10.0, 1.0)

    return {
        'improvement_per_dollar': round(per_dollar, 3),
        'cost_efficiency': round(normalized, 3),
        'estimated_cost': estimated_cost,
    }
+
def summarize_optimization_results(metrics: Dict[str, float]) -> str:
    """
    Create a human-readable summary of optimization results.

    Args:
        metrics: Metrics dictionary from calculate_metrics

    Returns:
        str: One-line summary covering length, complexity, and similarity.
    """
    # Length change: +/-5% is treated as "similar".
    length_change = metrics.get('length_change_percent', 0)
    if length_change > 5:
        length_note = f"Prompt expanded by {length_change:.1f}%"
    elif length_change < -5:
        length_note = f"Prompt condensed by {abs(length_change):.1f}%"
    else:
        length_note = "Prompt length remained similar"

    # Complexity change: +/-0.1 is treated as "similar".
    complexity_change = metrics.get('complexity_change', 0)
    if complexity_change > 0.1:
        complexity_note = "increased complexity"
    elif complexity_change < -0.1:
        complexity_note = "reduced complexity"
    else:
        complexity_note = "maintained similar complexity"

    # Similarity bands: >0.8 high, >0.5 moderate, else significant change.
    similarity = metrics.get('similarity_score', 1.0)
    if similarity > 0.8:
        similarity_note = f"high similarity to original ({similarity:.2f})"
    elif similarity > 0.5:
        similarity_note = f"moderate changes from original ({similarity:.2f})"
    else:
        similarity_note = f"significant changes from original ({similarity:.2f})"

    return f"Optimization results: {', '.join([length_note, complexity_note, similarity_note])}"
diff --git a/src/gepa_optimizer/utils/pareto_logger.py b/src/gepa_optimizer/utils/pareto_logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..923873788f0f1a9655b4274552ecd48ef6a136a7
--- /dev/null
+++ b/src/gepa_optimizer/utils/pareto_logger.py
@@ -0,0 +1,461 @@
+"""
+Pareto Front Logger - Tracks candidate comparisons and Pareto front updates
+"""
+
+from typing import Dict, List, Optional
+from collections import defaultdict
+import logging
+
+logger = logging.getLogger(__name__)
+
class ParetoLogger:
    """Tracks candidate evaluations and maintains the Pareto front P.

    Responsibilities:
      * Record every candidate evaluation (prompt, score, type, dataset).
      * For candidates evaluated on the 'dpareto' dataset, decide whether
        they enter the Pareto front of non-dominated solutions.
      * Emit detailed log output explaining every accept/reject decision.

    Baseline rule: the seed prompt S₀ is always admitted (it defines the
    baseline score f(S₀)); any other candidate must score strictly above
    the baseline before it is even compared against the front.

    Dominance is single-objective here: a strictly higher score dominates.

    NOTE(review): the decorative glyphs in the log messages were
    reconstructed from a mojibake-damaged original (the original literals
    contained bytes that rendered as broken multi-line strings); the
    information content of every message is preserved.
    """

    def __init__(self):
        # Audit trail of every evaluation seen, on any dataset.
        # Each entry: {'prompt', 'score', 'type', 'dataset'}.
        self.candidates_evaluated: List[Dict] = []
        # Current Pareto front, kept sorted by score (descending).
        # Each entry: {'prompt', 'score', 'type', 'notation'}.
        self.pareto_front: List[Dict] = []
        # f(S₀): seed prompt's score on 'dpareto'; None until the seed
        # (or set_baseline) provides it.
        self.baseline_score: Optional[float] = None
        # Instance-held logger keeps the class usable in isolation.
        self._log = logging.getLogger(__name__)

    @staticmethod
    def _notation_for(candidate_type: Optional[str]) -> str:
        """Map a candidate type to the display notation used in logs.

        startswith() covers the numbered variants (llego_crossover1/2, ...)
        that the original mapped individually.
        """
        if candidate_type == 'gepa_reflection':
            return 'Sᵣ'
        if candidate_type and candidate_type.startswith('llego_crossover'):
            return 'Oₓₒ'
        if candidate_type and candidate_type.startswith('llego_mutation'):
            return 'Oₘᵤₜ'
        if candidate_type == 'seed':
            return 'S₀'
        # Unknown / missing types fall back to the generic notation.
        return 'S'

    def _front_notations(self) -> List[str]:
        """Notations of the current front members, best first."""
        return [c.get('notation', 'S') for c in self.pareto_front]

    def _append_to_front(self, prompt: str, score: float,
                         candidate_type: str, notation: str):
        """Insert a candidate into the front and re-sort by score (desc)."""
        self.pareto_front.append({
            'prompt': prompt,
            'score': score,
            'type': candidate_type,
            'notation': notation
        })
        self.pareto_front.sort(key=lambda c: c['score'], reverse=True)

    def log_candidate_evaluation(self, prompt: str, score: float,
                                 candidate_type: str, dataset_type: str):
        """Record one candidate evaluation.

        Evaluations on the 'dpareto' dataset additionally trigger a
        Pareto-front membership check; other datasets are only recorded.
        """
        self.candidates_evaluated.append({
            'prompt': prompt,
            'score': score,
            'type': candidate_type,
            'dataset': dataset_type
        })
        if dataset_type == 'dpareto':
            self._check_pareto_update(prompt, score, candidate_type)

    def _check_pareto_update(self, prompt: str, score: float, candidate_type: str):
        """Decide whether a 'dpareto' candidate enters the Pareto front.

        CRITICAL RULE: a candidate must beat the baseline f(S₀) to enter
        the Pareto front.  Exception: the seed prompt S₀ itself is always
        added — it *is* the baseline.
        """
        log = self._log
        cand = self._notation_for(candidate_type)

        log.info("\n" + "=" * 80)
        log.info("PARETO FRONT P ANALYSIS - Evaluating %s", cand)
        log.info("=" * 80)
        log.info(f"\n  Evaluating: {cand} with f({cand}) = {score:.4f}")

        if candidate_type == 'seed':
            log.info(f"\n  [OK] {cand} is seed prompt - always added as baseline")
            # Safety net: the adapter should already have set the baseline.
            if self.baseline_score is None:
                self.baseline_score = score
                log.info(f"       Setting baseline: f(S₀) = {score:.4f}")
            # Seed enters unconditionally - no dominance check needed.
            self._append_to_front(prompt, score, candidate_type, cand)
            log.info("\n  [OK] ADDED to Pareto Front P (baseline)")
            log.info(f"       P = {{{', '.join(self._front_notations())}}}")
            self._display_pareto_front()
            return

        if self.baseline_score is None:
            # The baseline must exist before any non-seed candidate can be
            # judged; reject the candidate to preserve correctness.
            # (The original additionally wrote a JSON line to a hard-coded
            # absolute path under one developer's home directory, which
            # raises FileNotFoundError on any other machine - removed.)
            log.error(f"\n  [!!] CRITICAL ERROR: Baseline score not set!")
            log.error(f"       Cannot evaluate {cand} without baseline f(S₀)")
            log.error("       Seed prompt must be evaluated on Dpareto first")
            log.error("       Rejecting candidate to maintain correctness")
            return

        if score <= self.baseline_score:
            log.info(f"\n  [X] {cand} does NOT meet baseline requirement:")
            log.info(f"      f(S₀) = {self.baseline_score:.4f} (baseline)")
            log.info(f"      f({cand}) = {score:.4f}")
            log.info(f"      f({cand}) <= f(S₀) -> NOT ADDED to Pareto front")
            log.info(f"      Difference: {score - self.baseline_score:.4f} (needs to be > 0)")
            return

        log.info(f"\n  [OK] {cand} meets baseline requirement:")
        log.info(f"       f(S₀) = {self.baseline_score:.4f} (baseline)")
        log.info(f"       f({cand}) = {score:.4f} -> can be added to Pareto front")
        log.info(f"       Improvement over baseline: +{score - self.baseline_score:.4f}")

        # Single-objective dominance: a strictly higher score dominates.
        dominated_idx = []
        for i, member in enumerate(self.pareto_front):
            if score > member['score']:
                dominated_idx.append(i)
                log.info(f"\n  [OK] {cand} DOMINATES {member.get('notation', 'S')}: "
                         f"f={score:.4f} > f={member['score']:.4f} "
                         f"(improvement +{score - member['score']:.4f})")

        if dominated_idx:
            # Remove dominated members (reverse order keeps indices valid).
            for i in reversed(dominated_idx):
                removed = self.pareto_front.pop(i)
                log.info(f"    -> Removing {removed.get('notation', 'S')} from P "
                         f"(dominated by {cand})")
            self._append_to_front(prompt, score, candidate_type, cand)
            log.info("\n  [OK] ADDED to Pareto Front P")
            log.info(f"       P = {{{', '.join(self._front_notations())}}}")
        else:
            dominating = [m for m in self.pareto_front if m['score'] > score]
            if dominating:
                # The original logged the first dominator twice (once in the
                # scan, once in a second summary loop); log each once here.
                for member in dominating:
                    log.info(f"\n  [X] {cand} is DOMINATED by {member.get('notation', 'S')}: "
                             f"f={member['score']:.4f} > f={score:.4f}")
                log.info("\n  [X] NOT ADDED to Pareto Front P (dominated)")
            else:
                # Non-dominated (possibly tied with existing members): keep it.
                ties = [m.get('notation', 'S') for m in self.pareto_front
                        if abs(m['score'] - score) < 1e-6]
                self._append_to_front(prompt, score, candidate_type, cand)
                log.info("\n  [OK] ADDED to Pareto Front P (non-dominated)")
                if ties:
                    log.info(f"       f({cand}) = {score:.4f} (same score as {', '.join(ties)})")
                log.info(f"       P = {{{', '.join(self._front_notations())}}}")

        self._display_pareto_front()

    def _display_pareto_front(self):
        """Log the current Pareto front with per-candidate details."""
        log = self._log
        log.info(f"\nCURRENT PARETO FRONT P (Size: |P| = {len(self.pareto_front)}):")
        log.info("-" * 80)

        if not self.pareto_front:
            log.info("  P = {} (Empty - no candidates added yet)")
            log.info("  NOTATION: P = Pareto front (non-dominated solutions)")
            return

        log.info(f"  P = {{{', '.join(self._front_notations())}}}")

        # Human-readable labels for the known candidate types; unknown types
        # fall back to their raw type string.
        type_labels = {
            'seed': 'Seed Prompt',
            'gepa_reflection': 'GEPA Reflection Candidate',
            'llego_crossover': 'LLEGO Crossover Offspring',
            'llego_mutation': 'LLEGO Mutation Offspring',
            'unknown': 'Unknown Candidate',
        }
        for member in self.pareto_front:
            notation = member.get('notation', 'S')
            cand_type = member.get('type', 'unknown')
            label = type_labels.get(cand_type, cand_type)
            log.info(f"\n  {notation}: {label}")
            log.info(f"    f({notation}) = {member['score']:.4f}")
            prompt = member['prompt']
            suffix = '...' if len(prompt) > 150 else ''
            log.info(f"    Prompt ({len(prompt)} chars): {prompt[:150]}{suffix}")

        log.info("\n  NOTATION: P = Pareto front (non-dominated solutions); "
                 "S₀ = seed (baseline), Sᵣ = GEPA reflection, "
                 "Oₓₒ = LLEGO crossover, Oₘᵤₜ = LLEGO mutation")
        log.info("-" * 80)

    def set_baseline(self, score: float):
        """Record the baseline score f(S₀) used to gate front admission."""
        self.baseline_score = score
        # Annotate the seed entry (conventionally the first) if one exists.
        if self.pareto_front:
            self.pareto_front[0]['baseline_score'] = score

    def batch_update_pareto_front(self, candidates_with_scores: List[Dict]) -> List[Dict]:
        """Update the Pareto front with several candidates in one operation.

        Steps:
          1. Filter by baseline (seed always passes; others need
             score > baseline_score).
          2. Drop candidates dominated within the batch itself.
          3. Remove current front members dominated by the batch.
          4. Add surviving batch candidates that no remaining front member
             dominates.

        Args:
            candidates_with_scores: dicts with keys 'prompt', 'score',
                'type' and optionally 'notation' (generated if missing).

        Returns:
            The candidates that were actually added to the Pareto front.
        """
        log = self._log
        if not candidates_with_scores:
            return []

        log.info("\n" + "=" * 80)
        log.info(f"BATCH PARETO FRONT UPDATE - Processing {len(candidates_with_scores)} candidates")
        log.info("=" * 80)

        # Step 1: baseline filter.
        if self.baseline_score is None:
            log.error("Baseline score not set - cannot perform batch update")
            log.error("  Seed prompt must be evaluated on Dpareto first")
            return []

        baseline = self.baseline_score
        survivors = []
        for cand in candidates_with_scores:
            score = cand.get('score', 0.0)
            notation = cand.get('notation', 'S')
            if cand.get('type', 'unknown') == 'seed':
                # The seed *is* the baseline and always passes the filter.
                survivors.append(cand)
            elif score > baseline:
                survivors.append(cand)
                log.info(f"  [OK] {notation} passes baseline: f={score:.4f} > f(S₀)={baseline:.4f}")
            else:
                log.info(f"  [X] {notation} fails baseline: f={score:.4f} <= f(S₀)={baseline:.4f}")

        if not survivors:
            log.info(f"\n  No candidates pass baseline filter (baseline: {baseline:.4f})")
            log.info("  All candidates are worse than or equal to seed prompt")
            return []

        log.info(f"\n  After baseline filter: {len(survivors)}/{len(candidates_with_scores)} candidates remain")

        # Step 2: intra-batch dominance.  Sorted descending, so only an
        # earlier (higher-scored) entry can dominate a later one.
        ranked = sorted(survivors, key=lambda c: c.get('score', 0.0), reverse=True)
        non_dominated = []
        for i, cand in enumerate(ranked):
            score = cand.get('score', 0.0)
            notation = cand.get('notation', 'S')
            dominator = next(
                (o for o in ranked[:i] if o.get('score', 0.0) > score), None)
            if dominator is None:
                non_dominated.append(cand)
                log.info(f"  [OK] {notation} is non-dominated in batch: f={score:.4f}")
            else:
                log.info(f"  [X] {notation} dominated in batch by "
                         f"{dominator.get('notation', 'S')}: "
                         f"f={dominator.get('score', 0.0):.4f} > f={score:.4f}")

        log.info(f"\n  After batch dominance check: {len(non_dominated)}/{len(survivors)} non-dominated candidates")
        if not non_dominated:
            log.info("  No non-dominated candidates in batch")
            return []

        # Step 3: drop front members strictly dominated by the batch's best.
        # (Equivalent to "dominated by any batch survivor" in the
        # single-objective setting.)
        best_new = max(c.get('score', 0.0) for c in non_dominated)
        to_remove = [m for m in self.pareto_front if m.get('score', 0.0) < best_new]
        for member in to_remove:
            self.pareto_front.remove(member)
            log.info(f"  -> Removed {member.get('notation', 'S')} "
                     f"(dominated by batch, f={member.get('score', 0.0):.4f})")

        # Step 4: admit batch survivors not dominated by the remaining front.
        added = []
        for cand in non_dominated:
            score = cand.get('score', 0.0)
            ctype = cand.get('type', 'unknown')
            notation = (cand['notation'] if 'notation' in cand
                        else self._notation_for(ctype))

            blocker = next(
                (m for m in self.pareto_front if m.get('score', 0.0) > score), None)
            if blocker is not None:
                log.info(f"  [X] {notation} dominated by existing "
                         f"{blocker.get('notation', 'S')}: "
                         f"f={blocker.get('score', 0.0):.4f} > f={score:.4f}")
                continue

            self.pareto_front.append({
                'prompt': cand.get('prompt', ''),
                'score': score,
                'type': ctype,
                'notation': notation
            })
            added.append(cand)
            # Mirror the single-candidate path: record the evaluation too.
            self.candidates_evaluated.append({
                'prompt': cand.get('prompt', ''),
                'score': score,
                'type': ctype,
                'dataset': 'dpareto'
            })
            log.info(f"  [OK] {notation} ADDED to Pareto front: f={score:.4f}")

        self.pareto_front.sort(key=lambda c: c.get('score', 0.0), reverse=True)

        log.info(f"\n{'-' * 80}")
        log.info("BATCH UPDATE COMPLETE")
        log.info(f"  Added: {len(added)} candidates")
        log.info(f"  Removed: {len(to_remove)} dominated candidates")
        log.info(f"  Pareto front size: |P| = {len(self.pareto_front)}")
        log.info(f"  P = {{{', '.join(self._front_notations())}}}")
        self._display_pareto_front()
        log.info("-" * 80 + "\n")

        return added
+
# Global instance
# Module-level singleton shared by all callers via get_pareto_logger();
# reset_pareto_logger() swaps it for a fresh instance between runs.
_pareto_logger = ParetoLogger()
+
def get_pareto_logger() -> ParetoLogger:
    """Return the shared module-level ParetoLogger instance.

    All callers in a run share this singleton so their evaluations
    accumulate into one Pareto front; use reset_pareto_logger() to start
    a fresh run.
    """
    return _pareto_logger
+
def reset_pareto_logger() -> ParetoLogger:
    """Replace the global Pareto logger with a fresh instance (for new runs).

    Clears the baseline score, the Pareto front and the evaluation history.

    Returns:
        ParetoLogger: the new (empty) global instance.
    """
    global _pareto_logger
    _pareto_logger = ParetoLogger()
    # The original wrote a JSON debug line to a hard-coded absolute path
    # under one developer's home directory, which raises FileNotFoundError
    # on any other machine; a debug-level log line carries the same info.
    logger.debug(
        "Pareto logger reset: baseline_score=%s, pareto_front_size=%d",
        _pareto_logger.baseline_score, len(_pareto_logger.pareto_front),
    )
    return _pareto_logger
+
diff --git a/src/gepa_optimizer/utils/universal_judge_prompt.py b/src/gepa_optimizer/utils/universal_judge_prompt.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6ab8d9152f41ae6a087ebb66a3b40d38e9e285d
--- /dev/null
+++ b/src/gepa_optimizer/utils/universal_judge_prompt.py
@@ -0,0 +1,317 @@
+"""
+Universal LLM-as-Judge Prompt Builder for ANY prompt optimization use case.
+
+This module provides prompts for semantic comparison and feedback generation
+that work for text, JSON, structured outputs, and any other task type.
+
+NO UI-specific assumptions. NO element IDs. NO bounding boxes.
+Pure semantic and structural comparison for universal prompt optimization.
+"""
+
+from typing import Dict, Any, Optional
+
+
def build_universal_judge_prompt(
    task_input: str,
    predicted_output: str,
    expected_output: str,
    current_prompt: Optional[str] = None,
    evaluation_results: Optional[Dict[str, Any]] = None,
    image_base64: Optional[str] = None
) -> str:
    """
    Build a universal LLM-as-Judge prompt for ANY task type.

    Works for:
    - Text extraction (NER, summarization, translation)
    - JSON generation (structured data extraction)
    - Classification tasks (sentiment, category)
    - Question answering
    - Code generation
    - Multi-modal tasks (with images)

    Args:
        task_input: The input given to the LLM (task/question/text to process)
        predicted_output: What the LLM actually returned
        expected_output: The ground truth / desired output
        current_prompt: The system prompt being optimized
        evaluation_results: Optional evaluation scores; keys read here are
            'composite_score', 'semantic_similarity' and
            'structural_similarity', each defaulting to 0.0 when absent
        image_base64: Optional image for multi-modal tasks; only its
            presence is used here (the image bytes are not embedded in the
            returned text)

    Returns:
        Formatted judge prompt string
    """
    # Substitute explicit placeholders so the judge can tell "model returned
    # nothing" apart from "model returned real text".
    if not predicted_output or predicted_output.strip() == '':
        predicted_display = "[EMPTY - No output generated]"
    else:
        predicted_display = predicted_output

    if not expected_output or expected_output.strip() == '':
        expected_display = "[EMPTY - No expected output provided]"
    else:
        expected_display = expected_output

    # Build evaluation context if available (only when the caller supplied
    # evaluation_results).
    eval_context = ""
    if evaluation_results:
        score = evaluation_results.get('composite_score', 0.0)
        semantic = evaluation_results.get('semantic_similarity', 0.0)
        structural = evaluation_results.get('structural_similarity', 0.0)
        eval_context = f"""
EVALUATION SCORES:
- Composite Score: {score:.2%}
- Semantic Similarity: {semantic:.2%}
- Structural Similarity: {structural:.2%}
"""

    # Image context for multi-modal: a textual note only; the image itself
    # travels out-of-band.
    image_context = ""
    if image_base64:
        image_context = """
NOTE: An image was provided with this task. The LLM should have analyzed the image content.
Consider whether the predicted output accurately reflects the image content.
"""

    # Build the universal judge prompt - OPTIMIZED for complex enterprise use cases
    # Uses 3-Layer Forensic Analysis: Syntax -> Structure -> Semantics
    # NOTE(review): the runs of blank lines inside this template look like
    # section-delimiter tags that were lost when this file was extracted -
    # confirm the exact template layout against the original source.
    judge_prompt = f"""
You are a **Principal Forensic Prompt Auditor**. Your specialty is analyzing failures in Enterprise AI systems.
Your goal is to compare a [PREDICTED_OUTPUT] against an [EXPECTED_OUTPUT] to identify the *exact* root cause of failure in the [SYSTEM_PROMPT].




{task_input}



{current_prompt if current_prompt else "[No system prompt provided - Baseline Test]"}





{expected_display}



{predicted_display}

{eval_context}
{image_context}



You must evaluate the prediction using a 3-Layer Depth approach:

1. **SYNTAX LAYER (Format)**:
   - Is the output valid JSON/XML/Code?
   - Are data types correct? (e.g., string "100" vs number 100).
   - Are required headers or markdown tags present?

2. **STRUCTURAL LAYER (Schema)**:
   - For JSON: Do specific paths match? (e.g., check `orders[0].items[3].price`).
   - For Lists: Is the count correct? Are items in the correct order?
   - **CRITICAL**: Identify the *exact* nested key that failed.

3. **SEMANTIC LAYER (Meaning)**:
   - "Phone" vs "Mobile Device" (Acceptable Synonym).
   - "User is 25" vs "Age: 25" (Acceptable Logic).
   - Hallucinations: Did the model invent data not in the source?



Return a JSON object analyzing the failure. NO preamble.
{{
  "match_status": "FULL_MATCH" | "PARTIAL_MATCH" | "CRITICAL_FAILURE",
  "structural_analysis": {{
    "format_valid": true,
    "schema_compliance": true,
    "deep_diff": ["List specific paths that failed, e.g., 'data.users[0].id expected int, got string'"]
  }},
  "semantic_analysis": {{
    "meaning_preserved": true,
    "hallucinations": ["List specific invented facts"],
    "missed_constraints": ["List specific constraints from prompt that were ignored"]
  }},
  "root_cause_hypothesis": "Why did the prompt fail? (e.g., 'Ambiguity in field naming', 'Lack of negative constraint for X')",
  "surgical_fix": "The EXACT instruction to add/change. (e.g., 'Change: Extract entities -> To: Extract entities and return as JSON list of objects')"
}}
"""

    return judge_prompt
+
+
def get_universal_judge_system_prompt(has_image: bool = False) -> str:
    """Return the system prompt for the universal LLM-as-Judge.

    Args:
        has_image: Whether an image is involved in the task

    Returns:
        System prompt string for the judge
    """
    # Assemble from segments; the image note is appended only when needed.
    segments = ["""You are a **Principal Forensic Prompt Auditor** specializing in Enterprise AI system failures.

Your task is to:
1. Perform 3-Layer Analysis: SYNTAX (format) โ STRUCTURE (schema) โ SEMANTICS (meaning)
2. Identify the EXACT nested path that failed (e.g., `data.items[2].price`)
3. Provide a ROOT CAUSE hypothesis for why the prompt failed
4. Deliver a SURGICAL FIX - the exact instruction to add or change

Key principles:
- DEEP DIFF: Traverse nested JSON structures to find exact failure points
- SEMANTIC FLEXIBILITY: "Phone" == "Mobile Device" (synonyms OK)
- STRICT DATA: Wrong IDs, numbers, or hallucinated facts = CRITICAL_FAILURE
- ROOT CAUSE: Explain WHY the prompt failed (ambiguity? missing constraint?)

Return your analysis as valid JSON only. No preamble."""]

    if has_image:
        segments.append("""

Note: This task involved image analysis. Factor visual content accuracy into your
SEMANTIC LAYER analysis. Did the model correctly interpret the image?""")

    return "".join(segments)
+
+
def format_universal_judge_feedback(
    judge_output: str,
    task_input: str,
    predicted_output: str,
    expected_output: str,
    score: float = 0.0
) -> str:
    """
    Format the LLM-as-Judge output into readable feedback.

    Handles the forensic-analysis JSON schema (structural/semantic layers);
    when no parsable JSON is found, the raw judge output is echoed under a
    header instead.

    NOTE(review): the emoji literals below were reconstructed from a
    mojibake-damaged original whose string literals were broken across
    lines; the status semantics (ok/warn/fail) are unchanged.

    Args:
        judge_output: Raw output from the judge LLM
        task_input: The original task input
        predicted_output: The LLM's predicted output
        expected_output: The expected output
        score: Evaluation score (0..1, rendered as a percentage)

    Returns:
        Formatted feedback string
    """
    import json
    import re

    # Grab the outermost {...} span; judges often wrap JSON in prose.
    json_match = re.search(r'\{[\s\S]*\}', judge_output)

    if json_match:
        try:
            analysis = json.loads(json_match.group(0))

            # Status icon mirrors match_status severity.
            match_status = analysis.get('match_status', 'CRITICAL_FAILURE')
            status_icon = '✅' if match_status == 'FULL_MATCH' else '⚠️' if match_status == 'PARTIAL_MATCH' else '❌'

            # Structural layer: format/schema validity plus exact diff paths.
            structural = analysis.get('structural_analysis', {})
            deep_diff = structural.get('deep_diff', [])
            deep_diff_str = '\n  - '.join(deep_diff) if deep_diff else 'No structural issues'

            # Semantic layer: hallucinations and ignored constraints.
            semantic = analysis.get('semantic_analysis', {})
            hallucinations = semantic.get('hallucinations', [])
            hallucinations_str = '\n  - '.join(hallucinations) if hallucinations else 'None detected'
            missed_constraints = semantic.get('missed_constraints', [])
            missed_str = '\n  - '.join(missed_constraints) if missed_constraints else 'None'

            # Format as detailed, actionable feedback.
            feedback = f"""{status_icon} Forensic Analysis (Score: {score:.2%}) - {match_status}

📋 STRUCTURAL ANALYSIS (Schema Layer):
  Format Valid: {'✅' if structural.get('format_valid', True) else '❌'}
  Schema Compliance: {'✅' if structural.get('schema_compliance', True) else '❌'}
  Deep Diff Issues:
  - {deep_diff_str}

🧠 SEMANTIC ANALYSIS (Meaning Layer):
  Meaning Preserved: {'✅' if semantic.get('meaning_preserved', True) else '❌'}
  Hallucinations:
  - {hallucinations_str}
  Missed Constraints:
  - {missed_str}

🔬 ROOT CAUSE HYPOTHESIS:
{analysis.get('root_cause_hypothesis', 'Unable to determine root cause')}

🔧 SURGICAL FIX:
{analysis.get('surgical_fix', 'No specific fix suggested')}

🎭 CONTEXT:
- Task: "{task_input[:200]}{'...' if len(task_input) > 200 else ''}"
- Expected: {expected_output[:200]}{'...' if len(expected_output) > 200 else ''}
- Predicted: {predicted_output[:200] if predicted_output else '[EMPTY]'}{'...' if predicted_output and len(predicted_output) > 200 else ''}"""

            return feedback

        except json.JSONDecodeError:
            # Malformed JSON: fall through to the raw-echo path below.
            pass

    # Fallback: no parsable JSON - return the raw output with a header.
    return f"""Forensic Analysis (Score: {score:.2%}):

{judge_output}

🎭 CONTEXT:
- Task: "{task_input[:200]}{'...' if len(task_input) > 200 else ''}"
- Expected: {expected_output[:200]}{'...' if len(expected_output) > 200 else ''}
- Predicted: {predicted_output[:200] if predicted_output else '[EMPTY]'}"""
+
+
def build_empty_output_feedback(
    task_input: str,
    expected_output: str,
    current_prompt: Optional[str] = None
) -> str:
    """Generate feedback for the case where the LLM produced no output.

    Args:
        task_input: The task input
        expected_output: What was expected
        current_prompt: The current system prompt

    Returns:
        Feedback explaining the empty output issue
    """
    def _clip(text: str, limit: int) -> str:
        # Truncate long text and mark the cut with an ellipsis.
        return text[:limit] + ('...' if len(text) > limit else '')

    task_part = _clip(task_input, 500)
    expected_part = _clip(expected_output, 500)
    prompt_part = _clip(current_prompt, 300) if current_prompt else '[No prompt provided]'

    return f"""โ CRITICAL: Empty Output Generated

๐ PROBLEM:
The LLM produced NO OUTPUT for this task.

๐ TASK INPUT:
{task_part}

๐ EXPECTED OUTPUT:
{expected_part}

๐ CURRENT PROMPT:
{prompt_part}

๐ LIKELY CAUSES:
1. Prompt is too vague - doesn't clearly specify what output is expected
2. Prompt lacks output format instructions
3. Prompt might be confusing the LLM about what action to take
4. Task input might not align with prompt expectations

๐ก SUGGESTED FIX:
Add explicit output instructions to the prompt:
- "You MUST provide a response for every input"
- "Always output in the following format: ..."
- "Extract and return: [specific fields]"

๐ EXAMPLE IMPROVEMENT:
If extracting JSON, add: "Extract the following fields and return as JSON: [list expected fields from expected output]"
"""
+
diff --git a/src/gepa_optimizer/version.py b/src/gepa_optimizer/version.py
new file mode 100644
index 0000000000000000000000000000000000000000..4aa6349bd4b32a6d902f3d2b32367b0436f33142
--- /dev/null
+++ b/src/gepa_optimizer/version.py
@@ -0,0 +1,5 @@
"""
Version information for GEPA Optimizer
"""

# Single source of truth for the package version string.
__version__ = "0.1.0"