diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,35 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..b87cde4ef37e7285d3c0477b2b76c1909fb790b5
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,27 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+
+# Virtual environments
+venv/
+env/
+ENV/
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Build artifacts
+*.egg-info/
+dist/
+build/
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..02e8e6352061c1c5eff24631d8dc314c56e599c8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,44 @@
+
+
+---
+title: Universal Prompt Optimizer
+emoji: 🧬
+colorFrom: blue
+colorTo: cyan
+sdk: gradio
+sdk_version: 4.0.0
+app_file: app.py
+pinned: false
+license: mit
+---
+# Universal Prompt Optimizer
+
+A powerful genetic evolutionary prompt optimization tool built with GEPA (Genetic Evolutionary Prompt Agent). Optimize your prompts using genetic algorithms with optional LLEGO crossover for faster convergence.
+
+## Features
+
+- 🧬 **Genetic Algorithm Optimization**: Evolve prompts through multiple iterations
+- 🎯 **Multi-Model Support**: Works with OpenAI, Anthropic, Google, and custom models
+- 📊 **Real-time Metrics**: Track optimization progress and improvements
+- 🖼️ **Multi-modal Support**: Include images in your training examples
+- ⚡ **LLEGO Crossover**: Advanced genetic operations for faster convergence
+
+## How to Use
+
+1. **Select Model**: Choose your target LLM (GPT-4, Claude, Gemini, or custom)
+2. **Enter Seed Prompt**: Describe your task, constraints, and desired output format
+3. **Add Training Examples**: Provide input/output pairs (images optional)
+4. **Configure Optimization**: Set evolution rounds, batch size, and enable LLEGO
+5. **Start Optimization**: Watch as the genetic algorithm evolves your prompt
+
+## API Keys
+
+API keys are stored in-session only and never logged. You can provide them in the UI or set them as environment variables:
+
+- `OPENAI_API_KEY`
+- `ANTHROPIC_API_KEY`
+- `GOOGLE_API_KEY`
+
+## License
+
+MIT License
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..68f431f56a38e1a4767609a3d601b84667596c92
--- /dev/null
+++ b/app.py
@@ -0,0 +1,1563 @@
+"""
+🚀 Universal Prompt Optimizer - Enhanced Production UI v8.0
+Principal Engineer Edition: Linear/Vercel-style Dark Mode with Premium UX
+"""
+
+import sys
+import os
+# Add src directory to Python path for gepa_optimizer imports
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
+
+import gradio as gr
+import json
+import base64
+import io
+import os
+import logging
+import traceback
+import html
+import numpy as np
+from PIL import Image as PILImage
+from typing import List, Dict, Optional, Any, Tuple
+import threading
+from collections import deque
+
+# Optional import for URL image downloads
+try:
+ import requests
+ REQUESTS_AVAILABLE = True
+except ImportError:
+ REQUESTS_AVAILABLE = False
+
+# ==========================================
+# 0. LOGGING & BACKEND UTILS
+# ==========================================
+logging.basicConfig(
+ level=logging.INFO,
+ format="%(asctime)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+
+# Global Candidates Store (Thread-safe)
+_candidates_store = {
+ 'candidates': deque(maxlen=100),
+ 'lock': threading.Lock(),
+ 'iteration': 0
+}
+
+def add_candidate_to_store(candidate: Dict[str, Any]):
+ with _candidates_store['lock']:
+ _candidates_store['candidates'].append({
+ 'iteration': _candidates_store['iteration'],
+ 'source': candidate.get('source', 'unknown'),
+ 'prompt': candidate.get('prompt', ''),
+ 'timestamp': candidate.get('timestamp', ''),
+ 'index': len(_candidates_store['candidates']) + 1
+ })
+
+def get_candidates_from_store() -> List[Dict[str, Any]]:
+ with _candidates_store['lock']:
+ return list(_candidates_store['candidates'])
+
+def clear_candidates_store():
+ with _candidates_store['lock']:
+ _candidates_store['candidates'].clear()
+ _candidates_store['iteration'] = 0
+
+def increment_iteration():
+ with _candidates_store['lock']:
+ _candidates_store['iteration'] += 1
+
+# ==========================================
+# 1. MOCK BACKEND (Kept as provided)
+# ==========================================
+try:
+ from gepa_optimizer import quick_optimize_sync, OptimizedResult
+ BACKEND_AVAILABLE = True
+except ImportError:
+ BACKEND_AVAILABLE = False
+ from dataclasses import dataclass
+
+ @dataclass
+ class OptimizedResult:
+ optimized_prompt: str
+ improvement_metrics: dict
+ iteration_history: list
+
+ def quick_optimize_sync(seed_prompt, dataset, model, **kwargs):
+ import time
+ iterations = kwargs.get('max_iterations', 5)
+ batch_size = kwargs.get('batch_size', 4)
+ use_llego = kwargs.get('use_llego', True)
+
+ # Simulate processing time based on iterations
+ time.sleep(0.5 * iterations)
+
+ llego_note = "with LLEGO crossover" if use_llego else "standard mutation only"
+
+ return OptimizedResult(
+ optimized_prompt=f"""# OPTIMIZED PROMPT FOR {model}
+# ----------------------------------------
+# Optimization: {iterations} iterations, batch size {batch_size}, {llego_note}
+
+## Task Context
+{seed_prompt}
+
+## Refined Instructions
+1. Analyse the input constraints strictly.
+2. Verify output format against expected schema.
+3. Apply chain-of-thought reasoning before answering.
+4. Cross-reference with provided examples for consistency.
+
+## Safety & Edge Cases
+- If input is ambiguous, ask for clarification.
+- Maintain a professional, neutral tone.
+- Handle edge cases gracefully with informative responses.""",
+ improvement_metrics={
+ "baseline_score": 0.45,
+ "final_score": 0.92,
+ "improvement": "+104.4%",
+ "iterations_run": iterations,
+ "candidates_evaluated": iterations * batch_size,
+ },
+ iteration_history=[
+ f"Iter 1: Baseline evaluation - Score: 0.45",
+ f"Iter 2: Added Chain-of-Thought constraints - Score: 0.62",
+ f"Iter 3: Refined output formatting rules - Score: 0.78",
+ f"Iter 4: {'LLEGO crossover applied' if use_llego else 'Mutation applied'} - Score: 0.88",
+ f"Iter 5: Final refinement - Score: 0.92",
+ ][:iterations],
+ )
+
+# ==========================================
+# 2. HELPER FUNCTIONS
+# ==========================================
+def gradio_image_to_base64(image_input) -> Optional[str]:
+ """Convert Gradio image input to base64 string with comprehensive error handling."""
+ if image_input is None:
+ return None
+
+ try:
+ pil_image = None
+
+ if isinstance(image_input, np.ndarray):
+ try:
+ # Validate array shape and dtype
+ if image_input.size == 0:
+ logger.warning("Empty image array provided")
+ return None
+ pil_image = PILImage.fromarray(image_input)
+ except (ValueError, TypeError) as e:
+ logger.error(f"Failed to convert numpy array to PIL Image: {str(e)}")
+ return None
+ elif isinstance(image_input, PILImage.Image):
+ pil_image = image_input
+ elif isinstance(image_input, str):
+ if not os.path.exists(image_input):
+ logger.warning(f"Image file not found: {image_input}")
+ return None
+ try:
+ pil_image = PILImage.open(image_input)
+ except (IOError, OSError) as e:
+ logger.error(f"Failed to open image file: {str(e)}")
+ return None
+ else:
+ logger.warning(f"Unsupported image input type: {type(image_input)}")
+ return None
+
+ if pil_image is None:
+ return None
+
+ try:
+ # Validate image before encoding
+ pil_image.verify()
+ # Reopen after verify (verify closes the image)
+ pil_image = PILImage.open(io.BytesIO(pil_image.tobytes()))
+ except Exception:
+ # If verify fails, try to proceed anyway
+ pass
+
+ try:
+ buffered = io.BytesIO()
+ pil_image.save(buffered, format="PNG")
+ img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+ return f"data:image/png;base64,{img_str}"
+ except (IOError, OSError, ValueError) as e:
+ logger.error(f"Failed to encode image to base64: {str(e)}")
+ return None
+ except Exception as e:
+ logger.error(f"Unexpected error in image conversion: {str(e)}\n{traceback.format_exc()}")
+ return None
+
+def validate_dataset(dataset: List[Dict]) -> Tuple[bool, str]:
+ """Validate dataset structure and content with detailed error messages."""
+ if not isinstance(dataset, list):
+ return False, "Dataset must be a list of examples."
+
+ if len(dataset) == 0:
+ return False, "Dataset is empty. Add at least one example."
+
+ # Validate each item in the dataset
+ for i, item in enumerate(dataset):
+ if not isinstance(item, dict):
+ return False, f"Dataset item {i+1} must be a dictionary with 'input' and 'output' keys."
+
+ if "input" not in item or "output" not in item:
+ return False, f"Dataset item {i+1} is missing required 'input' or 'output' field."
+
+ if not isinstance(item.get("input"), str) or not isinstance(item.get("output"), str):
+ return False, f"Dataset item {i+1} has invalid 'input' or 'output' type (must be strings)."
+
+ if not item.get("input", "").strip() or not item.get("output", "").strip():
+ return False, f"Dataset item {i+1} has empty 'input' or 'output' field."
+
+ return True, ""
+
+def validate_model(model: str, custom_model: str) -> Tuple[bool, str]:
+ """Validate model selection and custom model format."""
+ if not model:
+ return False, "Please select a foundation model."
+
+ if model == "custom":
+ if not custom_model or not custom_model.strip():
+ return False, "Custom model selected but no model ID provided."
+
+ # Validate custom model format (provider/model_name)
+ parts = custom_model.strip().split("/")
+ if len(parts) != 2:
+ return False, "Custom model ID must be in format 'provider/model_name' (e.g., 'openai/gpt-4')."
+
+ if not parts[0].strip() or not parts[1].strip():
+ return False, "Custom model ID provider and model name cannot be empty."
+
+ return True, ""
+
+def validate_api_keys(model: str, api_keys: Dict[str, str]) -> Tuple[bool, str]:
+ """Validate that required API keys are provided for the selected model."""
+ if not api_keys:
+ return True, "" # Keys are optional if already set in environment
+
+ model_provider = model.split("/")[0] if "/" in model else model.lower()
+
+ # Check if model requires a specific provider key
+ required_providers = {
+ "openai": "openai",
+ "anthropic": "anthropic",
+ "google": "google"
+ }
+
+ if model_provider in required_providers:
+ provider = required_providers[model_provider]
+ key_value = api_keys.get(provider, "").strip() if api_keys.get(provider) else ""
+
+ # Check environment variable as fallback
+ env_vars = {
+ "openai": "OPENAI_API_KEY",
+ "anthropic": "ANTHROPIC_API_KEY",
+ "google": "GOOGLE_API_KEY"
+ }
+
+ if not key_value and not os.environ.get(env_vars.get(provider, "")):
+ return False, f"API key for {provider.capitalize()} is required for model '{model}' but not provided."
+
+ return True, ""
+
+def safe_optimize(seed_prompt, dataset, model, custom_model="", max_iterations=5, max_metric_calls=50, batch_size=4, use_llego=True, api_keys=None):
+ """Safely run optimization with comprehensive error handling."""
+ try:
+ # Validate seed prompt
+ if not seed_prompt or not isinstance(seed_prompt, str):
+ return False, "Seed prompt is required and must be a string.", None
+
+ if not seed_prompt.strip():
+ return False, "Seed prompt cannot be empty.", None
+
+ # Validate dataset
+ is_valid, msg = validate_dataset(dataset)
+ if not is_valid:
+ return False, msg, None
+
+ # Determine final model
+ final_model = custom_model.strip() if custom_model and custom_model.strip() else model
+
+ # Validate model
+ model_valid, model_msg = validate_model(model, custom_model)
+ if not model_valid:
+ return False, model_msg, None
+
+ # Validate API keys
+ api_valid, api_msg = validate_api_keys(final_model, api_keys or {})
+ if not api_valid:
+ return False, api_msg, None
+
+ # Validate optimization parameters
+ if not isinstance(max_iterations, int) or max_iterations < 1 or max_iterations > 50:
+ return False, "Max iterations must be between 1 and 50.", None
+
+ if not isinstance(max_metric_calls, int) or max_metric_calls < 10 or max_metric_calls > 500:
+ return False, "Max metric calls must be between 10 and 500.", None
+
+ if not isinstance(batch_size, int) or batch_size < 1 or batch_size > 20:
+ return False, "Batch size must be between 1 and 20.", None
+
+ # Check backend availability
+ if not BACKEND_AVAILABLE:
+ logger.warning("Backend not available, using mock optimizer")
+
+ # Set API keys from UI if provided
+ if api_keys:
+ try:
+ key_mapping = {
+ "openai": "OPENAI_API_KEY",
+ "google": "GOOGLE_API_KEY",
+ "anthropic": "ANTHROPIC_API_KEY",
+ }
+ for provider, env_var in key_mapping.items():
+ if api_keys.get(provider) and api_keys[provider].strip():
+ os.environ[env_var] = api_keys[provider].strip()
+ logger.info(f"Set {provider} API key from UI")
+ except Exception as e:
+ logger.error(f"Failed to set API keys: {str(e)}")
+ return False, f"Failed to configure API keys: {str(e)}", None
+
+ # Run optimization
+ try:
+ result = quick_optimize_sync(
+ seed_prompt=seed_prompt,
+ dataset=dataset,
+ model=final_model,
+ max_iterations=max_iterations,
+ max_metric_calls=max_metric_calls,
+ batch_size=batch_size,
+ use_llego=use_llego,
+ verbose=True,
+ )
+
+ # Validate result structure
+ if not result:
+ return False, "Optimization returned no result.", None
+
+ if not hasattr(result, 'optimized_prompt'):
+ return False, "Optimization result is missing required fields.", None
+
+ return True, "Success", result
+
+ except KeyboardInterrupt:
+ logger.warning("Optimization interrupted by user")
+ return False, "Optimization was interrupted.", None
+ except TimeoutError:
+ logger.error("Optimization timed out")
+ return False, "Optimization timed out. Try reducing max_iterations or max_metric_calls.", None
+ except ConnectionError as e:
+ logger.error(f"Connection error during optimization: {str(e)}")
+ return False, f"Connection error: {str(e)}. Check your internet connection and API keys.", None
+ except ValueError as e:
+ logger.error(f"Invalid parameter in optimization: {str(e)}")
+ return False, f"Invalid configuration: {str(e)}", None
+ except Exception as e:
+ error_msg = str(e)
+ logger.error(f"Optimization failed: {error_msg}\n{traceback.format_exc()}")
+ # Provide user-friendly error messages
+ if "api" in error_msg.lower() or "key" in error_msg.lower():
+ return False, f"API error: {error_msg}. Please check your API keys.", None
+ elif "rate limit" in error_msg.lower():
+ return False, "Rate limit exceeded. Please wait a moment and try again.", None
+ elif "quota" in error_msg.lower():
+ return False, "API quota exceeded. Please check your account limits.", None
+ else:
+ return False, f"Optimization failed: {error_msg}", None
+
+ except Exception as e:
+ logger.error(f"Unexpected error in safe_optimize: {str(e)}\n{traceback.format_exc()}")
+ return False, f"Unexpected error: {str(e)}", None
+
+# ==========================================
+# 3. UI LOGIC
+# ==========================================
+def add_example(input_text, output_text, image_input, current_dataset):
+ """Add an example to the dataset with comprehensive error handling."""
+ try:
+ # Validate inputs
+ if not input_text:
+ raise gr.Error("Input text is required.")
+
+ if not output_text:
+ raise gr.Error("Output text is required.")
+
+ if not isinstance(input_text, str) or not isinstance(output_text, str):
+ raise gr.Error("Input and Output must be text strings.")
+
+ input_text = input_text.strip()
+ output_text = output_text.strip()
+
+ if not input_text:
+ raise gr.Error("Input text cannot be empty.")
+
+ if not output_text:
+ raise gr.Error("Output text cannot be empty.")
+
+ # Validate dataset state
+ if not isinstance(current_dataset, list):
+ raise gr.Error("Dataset state is invalid. Please refresh the page.")
+
+ # Process image with error handling
+ img_b64 = None
+ try:
+ img_b64 = gradio_image_to_base64(image_input)
+ except Exception as e:
+ logger.warning(f"Image processing failed, continuing without image: {str(e)}")
+ # Continue without image - it's optional
+
+ # Create new item
+ try:
+ new_item = {
+ "input": input_text,
+ "output": output_text,
+ "image": img_b64,
+                "image_preview": "🖼️ Image" if img_b64 else "-"
+ }
+
+ # Validate item structure
+ if not isinstance(new_item["input"], str) or not isinstance(new_item["output"], str):
+ raise gr.Error("Failed to create dataset item: invalid data types.")
+
+ current_dataset.append(new_item)
+
+ return current_dataset, "", "", None
+
+ except Exception as e:
+ logger.error(f"Failed to add example to dataset: {str(e)}")
+ raise gr.Error(f"Failed to add example: {str(e)}")
+
+ except gr.Error:
+ # Re-raise Gradio errors as-is
+ raise
+ except Exception as e:
+ logger.error(f"Unexpected error in add_example: {str(e)}\n{traceback.format_exc()}")
+ raise gr.Error(f"Unexpected error: {str(e)}")
+
+def update_table(dataset):
+ """Update the dataset table display with error handling."""
+ try:
+ if not dataset:
+ return []
+
+ if not isinstance(dataset, list):
+ logger.error(f"Invalid dataset type: {type(dataset)}")
+ return []
+
+ table_data = []
+ for i, item in enumerate(dataset):
+ try:
+ if not isinstance(item, dict):
+ logger.warning(f"Skipping invalid dataset item {i+1}: not a dictionary")
+ continue
+
+ input_text = str(item.get("input", ""))[:50] if item.get("input") else ""
+ output_text = str(item.get("output", ""))[:50] if item.get("output") else ""
+ image_preview = str(item.get("image_preview", "-"))
+
+ table_data.append([i+1, input_text, output_text, image_preview])
+ except Exception as e:
+ logger.warning(f"Error processing dataset item {i+1}: {str(e)}")
+ continue
+
+ return table_data
+
+ except Exception as e:
+ logger.error(f"Error updating table: {str(e)}\n{traceback.format_exc()}")
+ return []
+
+def clear_dataset():
+ """Clear the dataset with error handling."""
+ try:
+ return [], []
+ except Exception as e:
+ logger.error(f"Error clearing dataset: {str(e)}")
+ return [], []
+
+def get_candidates_display():
+ """Generate HTML display for candidates with error handling."""
+ try:
+ candidates = get_candidates_from_store()
+
+ if not candidates:
+            return "🧬 Waiting for optimization to start..."
+
+ if not isinstance(candidates, list):
+ logger.error(f"Invalid candidates type: {type(candidates)}")
+            return "Error loading candidates."
+
+ html_output = ""
+
+ # Show last 10 candidates
+ candidates_to_show = list(candidates)[-10:]
+ for c in reversed(candidates_to_show):
+ try:
+ if not isinstance(c, dict):
+ continue
+
+ iteration = str(c.get('iteration', '?'))
+ source = str(c.get('source', 'unknown')).upper()
+ prompt = str(c.get('prompt', ''))[:200]
+
+ # Escape HTML to prevent XSS
+ iteration = html.escape(iteration)
+ source = html.escape(source)
+ prompt = html.escape(prompt)
+
+ html_output += f"""
+
+
+
+ ITERATION {iteration}
+ {source}
+
+
+                    {prompt}...
+
+ """
+ except Exception as e:
+ logger.warning(f"Error rendering candidate: {str(e)}")
+ continue
+
+        html_output += ""
+ return html_output
+
+ except Exception as e:
+ logger.error(f"Error generating candidates display: {str(e)}\n{traceback.format_exc()}")
+        return "Error loading candidates display."
+
+def run_optimization_flow(seed, dataset, model, custom_model, iter_count, call_count, batch, llego, k_openai, k_google, k_anthropic, progress=gr.Progress()):
+ """Run the optimization flow with comprehensive error handling."""
+ import time
+
+ try:
+ # Validate inputs
+ if not seed:
+ raise gr.Error("Seed prompt is required.")
+
+ if not dataset:
+ raise gr.Error("Dataset is required. Add at least one example.")
+
+ if not model:
+ raise gr.Error("Model selection is required.")
+
+ # Validate numeric parameters
+ try:
+ iter_count = int(iter_count) if iter_count else 5
+ call_count = int(call_count) if call_count else 50
+ batch = int(batch) if batch else 4
+ except (ValueError, TypeError) as e:
+ raise gr.Error(f"Invalid optimization parameters: {str(e)}")
+
+ # Determine final model
+ try:
+ final_model = custom_model.strip() if custom_model and custom_model.strip() else model
+ except Exception as e:
+ logger.warning(f"Error processing custom model: {str(e)}")
+ final_model = model
+
+ # Clear candidates store
+ try:
+ clear_candidates_store()
+ except Exception as e:
+ logger.warning(f"Error clearing candidates store: {str(e)}")
+
+ # Prepare API keys
+ api_keys = {}
+ try:
+ api_keys = {
+ "openai": k_openai if k_openai else "",
+ "google": k_google if k_google else "",
+ "anthropic": k_anthropic if k_anthropic else ""
+ }
+ except Exception as e:
+ logger.warning(f"Error processing API keys: {str(e)}")
+
+ # Initial state
+ try:
+ yield (
+ gr.update(visible=True),
+ gr.update(visible=False),
+ gr.update(visible=False),
+                "🚀 Initializing Genetic Algorithm...",
+ "", {}, "", ""
+ )
+ time.sleep(0.5) # Brief pause for UI update
+ except Exception as e:
+ logger.error(f"Error in initial UI update: {str(e)}")
+ raise gr.Error(f"Failed to initialize UI: {str(e)}")
+
+ # Evolution loop (visual progress - actual work happens in safe_optimize)
+ try:
+ for i in range(1, iter_count + 1):
+ try:
+ increment_iteration()
+ add_candidate_to_store({
+ "source": "evolution_step",
+ "prompt": f"Candidate {i}: Optimizing instruction clarity and task alignment...",
+ "timestamp": "now"
+ })
+
+ progress(i/iter_count, desc=f"Evolution Round {i}/{iter_count}")
+ yield (
+ gr.update(), gr.update(), gr.update(),
+                        f"🧬 **Evolution Round {i}/{iter_count}**\n\n• Generating {batch} prompt mutations\n• Evaluating fitness scores\n• Selecting top candidates",
+ "", {}, "", get_candidates_display()
+ )
+ time.sleep(0.3) # Pause to show progress
+ except Exception as e:
+ logger.warning(f"Error in evolution step {i}: {str(e)}")
+ # Continue with next iteration
+ continue
+ except Exception as e:
+ logger.error(f"Error in evolution loop: {str(e)}")
+ # Continue to optimization attempt
+
+ # Final optimization
+ try:
+ success, msg, result = safe_optimize(
+ seed_prompt=seed,
+ dataset=dataset,
+ model=model,
+ custom_model=custom_model,
+ max_iterations=iter_count,
+ max_metric_calls=call_count,
+ batch_size=batch,
+ use_llego=llego,
+ api_keys=api_keys
+ )
+
+ if not success:
+ # Show error state
+ yield (
+ gr.update(visible=True),
+ gr.update(visible=False),
+ gr.update(visible=False),
+                        f"❌ **Optimization Failed**\n\n{msg}",
+ "", {}, "", get_candidates_display()
+ )
+ raise gr.Error(msg)
+
+ # Validate result before displaying
+ if not result:
+ raise gr.Error("Optimization completed but returned no result.")
+
+ if not hasattr(result, 'optimized_prompt'):
+ raise gr.Error("Optimization result is missing required fields.")
+
+ # Show results
+ try:
+ optimized_prompt = result.optimized_prompt if result.optimized_prompt else ""
+ improvement_metrics = result.improvement_metrics if hasattr(result, 'improvement_metrics') else {}
+ iteration_history = result.iteration_history if hasattr(result, 'iteration_history') else []
+
+ history_text = "\n".join(iteration_history) if isinstance(iteration_history, list) else str(iteration_history)
+
+ yield (
+ gr.update(visible=False),
+ gr.update(visible=False),
+ gr.update(visible=True),
+                        "✅ Optimization Complete",
+ optimized_prompt,
+ improvement_metrics,
+ history_text,
+ get_candidates_display()
+ )
+ except Exception as e:
+ logger.error(f"Error displaying results: {str(e)}")
+ raise gr.Error(f"Failed to display results: {str(e)}")
+
+ except gr.Error:
+ # Re-raise Gradio errors
+ raise
+ except Exception as e:
+ logger.error(f"Error in optimization: {str(e)}\n{traceback.format_exc()}")
+ raise gr.Error(f"Optimization error: {str(e)}")
+
+ except gr.Error:
+ # Re-raise Gradio errors as-is
+ raise
+ except KeyboardInterrupt:
+ logger.warning("Optimization interrupted by user")
+ raise gr.Error("Optimization was interrupted.")
+ except Exception as e:
+ logger.error(f"Unexpected error in optimization flow: {str(e)}\n{traceback.format_exc()}")
+ raise gr.Error(f"Unexpected error: {str(e)}")
+
+# ==========================================
+# 4. ENHANCED CSS (Linear/Vercel-style)
+# ==========================================
+CUSTOM_CSS = """
+@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&family=JetBrains+Mono:wght@400;500;600&display=swap');
+
+:root {
+ --bg0: #070A0F;
+ --bg1: #0B1020;
+ --bg2: rgba(255,255,255,0.04);
+ --bg3: rgba(255,255,255,0.06);
+
+ --stroke0: rgba(148,163,184,0.14);
+ --stroke1: rgba(148,163,184,0.22);
+
+ --text0: #EAF0FF;
+ --text1: rgba(234,240,255,0.74);
+ --text2: rgba(234,240,255,0.56);
+
+ --teal: #06B6D4;
+ --blue: #3B82F6;
+
+ --ok: #10B981;
+ --okGlow: rgba(16,185,129,0.18);
+
+ --bad: #EF4444;
+
+ --shadow: 0 12px 40px rgba(0,0,0,0.45);
+ --shadowSoft: 0 10px 24px rgba(0,0,0,0.32);
+
+ --radius: 14px;
+ --radiusSm: 10px;
+}
+
+html, body {
+ background: radial-gradient(1200px 700px at 20% -10%, rgba(6,182,212,0.13), transparent 55%),
+ radial-gradient(1000px 650px at 90% 0%, rgba(59,130,246,0.10), transparent 60%),
+ linear-gradient(180deg, var(--bg0) 0%, var(--bg1) 100%);
+ color: var(--text0);
+ font-family: Inter, system-ui, -apple-system, Segoe UI, Roboto, sans-serif;
+}
+
+.gradio-container {
+ max-width: 1520px !important;
+ padding: 12px 18px !important;
+ margin: 0 auto !important;
+}
+
+/* --- App shell --- */
+.app-shell { min-height: auto !important; }
+.topbar {
+ padding: 12px 14px 12px 14px;
+ margin-bottom: 4px;
+ border: 1px solid var(--stroke0);
+ border-radius: var(--radius);
+ background: linear-gradient(180deg, rgba(255,255,255,0.04) 0%, rgba(255,255,255,0.02) 100%);
+ box-shadow: var(--shadowSoft);
+}
+.topbar-wrap { margin-bottom: 0 !important; }
+
+.brand-row { display: flex; align-items: center; justify-content: space-between; gap: 16px; }
+.brand-left { display: flex; align-items: center; gap: 14px; }
+.brand-mark {
+ width: 44px; height: 44px; border-radius: 12px;
+ background: linear-gradient(135deg, rgba(6,182,212,0.26), rgba(59,130,246,0.20));
+ border: 1px solid rgba(6,182,212,0.30);
+ box-shadow: 0 0 0 4px rgba(6,182,212,0.10);
+ display: flex; align-items: center; justify-content: center;
+ font-weight: 800;
+}
+.h1 {
+ font-size: 22px; font-weight: 800; letter-spacing: -0.02em;
+ margin: 0; line-height: 1.2;
+}
+.subtitle { margin-top: 4px; color: var(--text1); font-weight: 500; font-size: 13px; }
+
+.status-pill {
+ display: inline-flex; align-items: center; gap: 10px;
+ padding: 10px 12px; border-radius: 999px;
+ background: rgba(255,255,255,0.03);
+ border: 1px solid var(--stroke0);
+ color: var(--text1);
+ font-size: 12px; font-weight: 700; letter-spacing: 0.08em;
+ text-transform: uppercase;
+}
+.dot {
+ width: 10px; height: 10px; border-radius: 999px;
+ background: var(--ok);
+ box-shadow: 0 0 16px rgba(16,185,129,0.40);
+ animation: pulse 1.8s ease-in-out infinite;
+}
+@keyframes pulse { 0%, 100% { transform: scale(1); opacity: 0.95; } 50% { transform: scale(1.18); opacity: 0.70; } }
+
+/* --- Two-column layout helpers --- */
+.left-col, .right-col { min-width: 280px; }
+
+/* --- Cards / Sections --- */
+.card {
+ border-radius: var(--radius);
+ background: linear-gradient(180deg, rgba(255,255,255,0.045) 0%, rgba(255,255,255,0.022) 100%);
+ border: 1px solid var(--stroke0);
+ box-shadow: var(--shadowSoft);
+ padding: 16px;
+}
+.card + .card { margin-top: 14px; }
+
+.card-head {
+ display: flex; align-items: center; justify-content: space-between;
+ gap: 12px;
+ padding-bottom: 12px;
+ margin-bottom: 12px;
+ border-bottom: 1px solid var(--stroke0);
+}
+.card-title {
+ display: flex; align-items: center; gap: 10px;
+ font-size: 13px; font-weight: 800; letter-spacing: 0.12em;
+ text-transform: uppercase; color: var(--text1);
+}
+.step {
+ width: 30px; height: 30px; border-radius: 10px;
+ background: linear-gradient(135deg, rgba(6,182,212,0.95), rgba(59,130,246,0.95));
+ box-shadow: 0 10px 20px rgba(6,182,212,0.18);
+ display: flex; align-items: center; justify-content: center;
+ color: white; font-weight: 900; font-size: 13px;
+}
+.hint { color: var(--text2); font-size: 12px; line-height: 1.4; }
+
+.ds-count span {
+ display: inline-flex;
+ align-items: center;
+ padding: 7px 10px;
+ border-radius: 999px;
+ border: 1px solid var(--stroke0);
+ background: rgba(255,255,255,0.02);
+ color: var(--text1) !important;
+ font-weight: 700;
+ font-size: 12px;
+}
+
+/* --- Inputs --- */
+label { color: var(--text1) !important; font-weight: 650 !important; font-size: 12px !important; }
+
+textarea, input, select {
+ background: rgba(255,255,255,0.03) !important;
+ border: 1px solid var(--stroke0) !important;
+ border-radius: 12px !important;
+ color: var(--text0) !important;
+ transition: border-color 0.15s ease, box-shadow 0.15s ease, transform 0.15s ease;
+}
+
+textarea:focus, input:focus, select:focus {
+ outline: none !important;
+ border-color: rgba(6,182,212,0.55) !important;
+ box-shadow: 0 0 0 4px rgba(6,182,212,0.14) !important;
+}
+
+.keybox input { font-family: "JetBrains Mono", ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace !important; }
+
+.seed textarea { min-height: 160px !important; }
+.mono textarea { font-family: "JetBrains Mono", ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace !important; font-size: 12.5px !important; }
+
+/* --- Buttons --- */
+.cta button {
+ width: 100% !important;
+ border: 0 !important;
+ border-radius: 14px !important;
+ padding: 14px 16px !important;
+ font-size: 13px !important;
+ font-weight: 900 !important;
+ letter-spacing: 0.12em !important;
+ text-transform: uppercase !important;
+ color: white !important;
+ background: linear-gradient(135deg, rgba(6,182,212,1) 0%, rgba(59,130,246,1) 100%) !important;
+ box-shadow: 0 18px 48px rgba(6,182,212,0.22) !important;
+ position: relative !important;
+ overflow: hidden !important;
+}
+.cta button::after {
+ content: "";
+ position: absolute; inset: -120px;
+ background: radial-gradient(closest-side, rgba(255,255,255,0.18), transparent 60%);
+ transform: translateX(-40%);
+ transition: transform 0.45s ease;
+}
+.cta button:hover { transform: translateY(-1px); }
+.cta button:hover::after { transform: translateX(40%); }
+.cta button:active { transform: translateY(0px); }
+
+.btn-secondary button {
+ border-radius: 12px !important;
+ border: 1px solid var(--stroke1) !important;
+ background: rgba(255,255,255,0.03) !important;
+ color: var(--text0) !important;
+ font-weight: 800 !important;
+}
+.btn-secondary button:hover { border-color: rgba(6,182,212,0.55) !important; }
+
+.btn-danger button {
+ border-radius: 12px !important;
+ border: 1px solid rgba(239,68,68,0.55) !important;
+ background: rgba(239,68,68,0.06) !important;
+ color: rgba(255,170,170,1) !important;
+ font-weight: 900 !important;
+}
+
+/* --- Dataframe --- */
+.dataframe {
+ border-radius: 14px !important;
+ border: 1px solid var(--stroke0) !important;
+ background: rgba(255,255,255,0.02) !important;
+ overflow: hidden !important;
+}
+.dataframe thead th {
+ background: rgba(255,255,255,0.04) !important;
+ color: var(--text1) !important;
+ font-weight: 900 !important;
+ font-size: 11px !important;
+ letter-spacing: 0.10em !important;
+ text-transform: uppercase !important;
+ border-bottom: 1px solid var(--stroke0) !important;
+}
+.dataframe tbody td {
+ color: var(--text0) !important;
+ font-size: 12px !important;
+ border-bottom: 1px solid rgba(148,163,184,0.10) !important;
+}
+.dataframe tbody tr:hover { background: rgba(255,255,255,0.03) !important; }
+
+/* --- Status / Results --- */
+.panel {
+ border-radius: var(--radius);
+ border: 1px solid var(--stroke0);
+ background: linear-gradient(180deg, rgba(255,255,255,0.045), rgba(255,255,255,0.020));
+ box-shadow: var(--shadowSoft);
+ padding: 16px;
+}
+.panel-title {
+ display: flex; align-items: center; justify-content: space-between;
+ gap: 10px;
+ padding-bottom: 12px; margin-bottom: 12px;
+ border-bottom: 1px solid var(--stroke0);
+}
+.panel-title h3 { margin: 0; font-size: 13px; letter-spacing: 0.12em; text-transform: uppercase; color: var(--text1); }
+.running-pill {
+ display: inline-flex; align-items: center; gap: 10px;
+ padding: 8px 10px; border-radius: 999px;
+ border: 1px solid rgba(6,182,212,0.38);
+ background: rgba(6,182,212,0.08);
+ color: rgba(153,246,228,0.95);
+ font-weight: 900; font-size: 11px; letter-spacing: 0.10em; text-transform: uppercase;
+}
+.running-dot { width: 9px; height: 9px; border-radius: 99px; background: var(--teal); box-shadow: 0 0 18px rgba(6,182,212,0.45); animation: pulse 1.8s ease-in-out infinite; }
+
+.empty {
+ border-radius: var(--radius);
+ border: 1px dashed rgba(148,163,184,0.26);
+ background: rgba(255,255,255,0.02);
+ padding: 28px;
+ text-align: center;
+ color: var(--text2);
+}
+.empty .big { font-size: 40px; opacity: 0.22; margin-bottom: 10px; }
+.empty .t { color: var(--text1); font-weight: 800; margin-bottom: 6px; }
+.empty .s { font-size: 12px; }
+
+.results {
+ border-radius: var(--radius);
+ border: 1px solid rgba(16,185,129,0.55);
+ background: linear-gradient(180deg, rgba(16,185,129,0.12), rgba(255,255,255,0.02));
+ box-shadow: 0 0 0 4px rgba(16,185,129,0.10), 0 20px 60px rgba(0,0,0,0.42);
+ padding: 16px;
+}
+.results-banner {
+ display: flex; align-items: center; justify-content: space-between;
+ gap: 12px;
+ padding-bottom: 12px; margin-bottom: 12px;
+ border-bottom: 1px solid rgba(16,185,129,0.28);
+}
+.results-banner .k { display: flex; align-items: center; gap: 10px; }
+.results-banner .k .icon {
+ width: 36px; height: 36px; border-radius: 12px;
+ background: rgba(16,185,129,0.18);
+ border: 1px solid rgba(16,185,129,0.45);
+ display: flex; align-items: center; justify-content: center;
+}
+.results-banner .k .title { font-weight: 900; color: rgba(189,255,225,0.98); letter-spacing: 0.06em; text-transform: uppercase; font-size: 12px; }
+.results-banner .k .sub { margin-top: 2px; color: rgba(189,255,225,0.70); font-size: 12px; }
+
+.tabs { background: transparent !important; }
+.tab-nav button {
+ background: transparent !important;
+ border: 0 !important;
+ border-bottom: 2px solid transparent !important;
+ color: var(--text2) !important;
+ font-weight: 800 !important;
+ padding: 10px 12px !important;
+}
+.tab-nav button[aria-selected="true"] {
+ color: rgba(153,246,228,0.98) !important;
+ border-bottom-color: rgba(6,182,212,0.75) !important;
+}
+.tab-nav button:hover { color: var(--text0) !important; }
+
+.small-note { color: var(--text2); font-size: 12px; }
+
+/* --- Candidates stream --- */
+.cand-empty { padding: 28px; text-align: center; color: var(--text2); }
+.cand-empty-icon { font-size: 40px; opacity: 0.25; margin-bottom: 10px; }
+.cand-empty-title { color: var(--text1); font-weight: 900; margin-bottom: 4px; }
+.cand-empty-sub { font-size: 12px; }
+
+.cand-stream { display: flex; flex-direction: column; gap: 10px; }
+.cand-card {
+ border-radius: 14px;
+ border: 1px solid rgba(148,163,184,0.18);
+ background: linear-gradient(135deg, rgba(15,23,42,0.85), rgba(2,6,23,0.45));
+ overflow: hidden;
+}
+.cand-topbar { height: 2px; background: linear-gradient(90deg, var(--teal), var(--blue)); }
+.cand-header {
+ display: flex; align-items: center; justify-content: space-between;
+ gap: 10px;
+ padding: 10px 12px 0 12px;
+}
+.cand-iter { font-family: "JetBrains Mono", ui-monospace; font-size: 11px; color: rgba(153,246,228,0.92); font-weight: 800; letter-spacing: 0.08em; }
+.cand-pill {
+ font-size: 10px; font-weight: 900; letter-spacing: 0.10em;
+ padding: 5px 8px; border-radius: 999px;
+ border: 1px solid rgba(148,163,184,0.20);
+ background: rgba(255,255,255,0.03);
+ color: var(--text2);
+}
+.cand-body {
+ padding: 10px 12px 12px 12px;
+ font-family: "JetBrains Mono", ui-monospace;
+ font-size: 12px;
+ line-height: 1.6;
+ color: rgba(234,240,255,0.75);
+}
+
+/* --- Responsive --- */
+@media (max-width: 980px) {
+ .gradio-container { padding: 16px 12px !important; }
+ .brand-row { flex-direction: column; align-items: flex-start; }
+ .status-pill { align-self: stretch; justify-content: center; }
+}
+"""
+
+# JS snippet Gradio runs on page load: if the URL lacks ?__theme=dark it is
+# appended and the page reloads once, so the app always renders in dark mode.
+# Wrapped in try/catch so a restricted location API degrades to a no-op.
+FORCE_DARK_JS = """
+function forceDarkTheme() {
+  try {
+    const url = new URL(window.location.href);
+    if (url.searchParams.get("__theme") !== "dark") {
+      url.searchParams.set("__theme", "dark");
+      window.location.replace(url.toString());
+    }
+  } catch (e) {
+    // no-op
+  }
+}
+forceDarkTheme();
+"""
+
+# ==========================================
+# 5. UI CONSTRUCTION (Redesigned)
+# ==========================================
+APP_TITLE = "Universal Prompt Optimizer"
+APP_SUBTITLE = "Genetic Evolutionary Prompt Agent (GEPA)"
+STATUS_READY = "System Ready"
+
+with gr.Blocks(
+ title="Universal Prompt Optimizer",
+ theme=gr.themes.Base()
+) as app:
+ dataset_state = gr.State([])
+
+ # TOP BAR
+ gr.HTML(
+ f"""
+
+
+
+
GE
+
+
{APP_TITLE}
+
{APP_SUBTITLE}
+
+
+
{STATUS_READY}
+
+
+ """,
+ elem_classes=["topbar-wrap"]
+ )
+
+ # MAIN LAYOUT
+ with gr.Row():
+
+ # LEFT COLUMN: Configuration
+ with gr.Column(scale=5):
+
+ # Step 1
+ with gr.Group(elem_classes=["card"]):
+ gr.HTML(
+ """
+
+
+
Select a target model, then provide keys (stored in-session only).
+
+ """
+ )
+
+ with gr.Row():
+ model_select = gr.Dropdown(
+ label="Foundation Model",
+ choices=[
+ "openai/gpt-4o",
+ "openai/gpt-4-turbo",
+ "anthropic/claude-3-5-sonnet",
+ "google/gemini-1.5-pro",
+ "custom"
+ ],
+ value="openai/gpt-4o",
+ scale=2
+ )
+ custom_model_input = gr.Textbox(
+ label="Custom Model ID",
+ placeholder="provider/model_name",
+ scale=1
+ )
+
+ gr.HTML('API Access Keys
')
+ gr.Markdown("*Keys are stored in-session only and never logged*", elem_classes=["text-xs"])
+
+ with gr.Row():
+ key_openai = gr.Textbox(
+ label="OpenAI API Key",
+ type="password",
+ placeholder="sk-...",
+ scale=1
+ )
+ key_google = gr.Textbox(
+ label="Google API Key",
+ type="password",
+ placeholder="AIza...",
+ scale=1
+ )
+ key_anthropic = gr.Textbox(
+ label="Anthropic API Key",
+ type="password",
+ placeholder="sk-ant...",
+ scale=1
+ )
+
+ # Step 2
+ with gr.Group(elem_classes=["card"]):
+ gr.HTML(
+ """
+
+
+
Describe the task, constraints, output format, and tone.
+
+ """
+ )
+ seed_input = gr.Textbox(
+ label="Task Description",
+ placeholder="Example: You are a code reviewer that identifies security vulnerabilities in Python code. Return a JSON report with severity and fixes...",
+ lines=7,
+ max_lines=14,
+ elem_classes=["seed", "mono"]
+ )
+
+ # Step 3
+ with gr.Group(elem_classes=["card"]):
+ gr.HTML(
+ """
+
+
+
Add a few high-quality I/O pairs (images optional) to shape the optimizer.
+
+ """
+ )
+
+ with gr.Tabs():
+ with gr.Tab("Manual Entry"):
+ with gr.Row():
+ with gr.Column(scale=2):
+ d_in = gr.Textbox(
+ label="Input / User Prompt",
+ placeholder="Example user input...",
+ lines=3
+ )
+ d_out = gr.Textbox(
+ label="Ideal Output",
+ placeholder="Expected AI response...",
+ lines=3
+ )
+ with gr.Column(scale=1):
+ d_img = gr.Image(
+ label="Attach Image (Optional)",
+ type="numpy",
+ height=170
+ )
+
+ btn_add = gr.Button(
+ "Add Example",
+ elem_classes=["btn-secondary"]
+ )
+
+ with gr.Tab("Bulk Import (JSON)"):
+ gr.Markdown(
+ "Paste a JSON array like: `[{\"input\": \"...\", \"output\": \"...\"}]`",
+ elem_classes=["small-note"]
+ )
+ bulk_json = gr.Textbox(
+ show_label=False,
+ placeholder='[{"input": "...", "output": "..."}]',
+ lines=6
+ )
+ btn_import = gr.Button(
+ "Import JSON",
+ elem_classes=["btn-secondary"]
+ )
+
+ with gr.Row():
+ gr.HTML("Current dataset
")
+ ds_count = gr.HTML(
+ "0 examples loaded",
+ elem_classes=["ds-count"]
+ )
+
+ ds_table = gr.Dataframe(
+ headers=["ID", "Input", "Output", "Media"],
+ datatype=["number", "str", "str", "str"],
+ row_count=6,
+ column_count=(4, "fixed"),
+ interactive=False
+ )
+
+ with gr.Row():
+ btn_clear = gr.Button(
+ "Clear All",
+ elem_classes=["btn-danger"],
+ size="sm"
+ )
+
+ # Step 4 (Prominent, not buried)
+ with gr.Group(elem_classes=["card"]):
+ gr.HTML(
+ """
+
+
+
Tune evolution budget. Defaults are safe for quick runs.
+
+ """
+ )
+
+ with gr.Row():
+ slider_iter = gr.Slider(
+ minimum=1,
+ maximum=20,
+ value=5,
+ step=1,
+ label="Evolution Rounds",
+ info="Number of genetic iterations"
+ )
+ slider_calls = gr.Slider(
+ minimum=10,
+ maximum=200,
+ value=50,
+ step=10,
+ label="Max LLM Calls",
+ info="Total API call budget"
+ )
+
+ with gr.Row():
+ slider_batch = gr.Slider(
+ minimum=1,
+ maximum=10,
+ value=4,
+ step=1,
+ label="Batch Size",
+ info="Candidates per iteration"
+ )
+ check_llego = gr.Checkbox(
+ value=True,
+ label="Enable LLEGO Crossover",
+ info="Use advanced genetic operations"
+ )
+
+ btn_optimize = gr.Button(
+ "Start Optimization",
+ elem_classes=["cta", "mt-6"]
+ )
+
+ # RIGHT: STATUS + RESULTS
+ with gr.Column(scale=5, elem_classes=["right-col"]):
+ # STATUS PANEL (Hidden by default)
+ status_panel = gr.Group(visible=False, elem_classes=["panel"])
+ with status_panel:
+ gr.HTML(
+ """
+
+
Optimization status
+
Running
+
+ """
+ )
+ txt_status = gr.Markdown("Initializing genetic algorithm...")
+
+ # EMPTY STATE
+ empty_state = gr.HTML(
+ """
+
+
+ 🧬
+
Ready to optimize
+
+ Fill Steps 1–3, then click Start Optimization to begin prompt evolution.
+
+ """,
+ visible=True
+ )
+
+ # RESULTS PANEL (Hidden by default)
+ results_panel = gr.Group(visible=False, elem_classes=["results"])
+ with results_panel:
+ gr.HTML(
+ """
+
+
+
+ ✅
+
+
Optimization successful
+
Review the optimized prompt, metrics, and evolution traces.
+
+
+
+ """
+ )
+
+ with gr.Tabs():
+ with gr.Tab("Optimized Prompt"):
+ res_prompt = gr.Textbox(
+ label="Optimized Prompt",
+ lines=18,
+ max_lines=28,
+ interactive=False,
+ show_label=True,
+ elem_classes=["mono"]
+ )
+
+ with gr.Tab("Metrics & Log"):
+ res_metrics = gr.JSON(label="Performance Gains")
+ res_history = gr.TextArea(
+ label="Evolution Log",
+ interactive=False,
+ lines=10
+ )
+
+                    with gr.Tab("🧬 Live Candidates"):
+ gr.Markdown("Real-time stream of generated prompt candidates during optimization:")
+ live_candidates = gr.HTML()
+ btn_refresh_cand = gr.Button(
+                            "🔄 Refresh Stream",
+ elem_classes=["secondary-btn"],
+ size="sm"
+ )
+
+ # ==========================================
+ # 6. EVENT HANDLERS
+ # ==========================================
+
+ # Dataset Management
+    def update_dataset_count(dataset):
+        """Update dataset count display with error handling.
+
+        Returns a short status string ("N example(s) loaded") shown in the
+        ds_count HTML element; falls back to a safe default when the state
+        value is not a list (e.g. after a corrupted session).
+        """
+        try:
+            if not isinstance(dataset, list):
+                return "0 examples loaded"
+            count = len(dataset)
+            # Pluralize: "1 example loaded" vs "2 examples loaded".
+            return f"{count} example{'s' if count != 1 else ''} loaded"
+        except Exception as e:
+            logger.error(f"Error updating dataset count: {str(e)}")
+            return "Error"
+
+    # Wrap event handlers with error handling
+    def safe_add_example(*args):
+        """Wrapper for add_example with error handling.
+
+        gr.Error exceptions (already user-facing) propagate unchanged; any
+        other exception is logged and re-raised as gr.Error so Gradio shows
+        a friendly message instead of a raw traceback.
+        """
+        try:
+            return add_example(*args)
+        except gr.Error:
+            raise
+        except Exception as e:
+            logger.error(f"Unexpected error in add_example: {str(e)}")
+            raise gr.Error(f"Failed to add example: {str(e)}")
+
+    def safe_update_table(dataset):
+        """Wrapper for update_table with error handling.
+
+        Returns [] on failure so the Dataframe renders empty instead of
+        surfacing an error to the user.
+        """
+        try:
+            return update_table(dataset)
+        except Exception as e:
+            logger.error(f"Error updating table: {str(e)}")
+            return []
+
+    def safe_clear_dataset():
+        """Wrapper for clear_dataset with error handling.
+
+        Returns ([], []) on failure, matching the (dataset_state, ds_table)
+        outputs wired to btn_clear so both reset cleanly.
+        """
+        try:
+            return clear_dataset()
+        except Exception as e:
+            logger.error(f"Error clearing dataset: {str(e)}")
+            return [], []
+
+    # Chain: append the manual example (add_example also returns reset values
+    # for the entry widgets d_in/d_out/d_img), then refresh the preview table
+    # and the "N examples loaded" label.
+    btn_add.click(
+        safe_add_example,
+        inputs=[d_in, d_out, d_img, dataset_state],
+        outputs=[dataset_state, d_in, d_out, d_img]
+    ).then(
+        safe_update_table,
+        inputs=[dataset_state],
+        outputs=[ds_table]
+    ).then(
+        update_dataset_count,
+        inputs=[dataset_state],
+        outputs=[ds_count]
+    )
+
+    # Clear both the stored dataset and the table, then reset the count label.
+    btn_clear.click(
+        safe_clear_dataset,
+        outputs=[dataset_state, ds_table]
+    ).then(
+        lambda: "0 examples loaded",
+        outputs=[ds_count]
+    )
+
+ # Bulk Import
+ def import_bulk_json(json_text, current_dataset):
+ """Import examples from JSON with comprehensive error handling."""
+ try:
+ # Validate inputs
+ if not json_text or not json_text.strip():
+ raise gr.Error("JSON input is empty. Please provide a JSON array.")
+
+ if not isinstance(current_dataset, list):
+ raise gr.Error("Dataset state is invalid. Please refresh the page.")
+
+ # Parse JSON
+ try:
+ data = json.loads(json_text.strip())
+ except json.JSONDecodeError as e:
+ raise gr.Error(f"Invalid JSON format: {str(e)}. Please check your JSON syntax.")
+
+ # Validate structure
+ if not isinstance(data, list):
+ raise gr.Error("JSON must be an array of objects. Example: [{\"input\": \"...\", \"output\": \"...\"}]")
+
+ if len(data) == 0:
+ raise gr.Error("JSON array is empty. Add at least one example object.")
+
+ # Validate and import items
+ imported_count = 0
+ errors = []
+
+ for i, item in enumerate(data):
+ try:
+ if not isinstance(item, dict):
+ errors.append(f"Item {i+1}: not a dictionary")
+ continue
+
+ if "input" not in item or "output" not in item:
+ errors.append(f"Item {i+1}: missing 'input' or 'output' field")
+ continue
+
+ input_val = item["input"]
+ output_val = item["output"]
+
+ if not isinstance(input_val, str) or not isinstance(output_val, str):
+ errors.append(f"Item {i+1}: 'input' and 'output' must be strings")
+ continue
+
+ if not input_val.strip() or not output_val.strip():
+ errors.append(f"Item {i+1}: 'input' and 'output' cannot be empty")
+ continue
+
+ # Add valid item
+ current_dataset.append({
+ "input": input_val.strip(),
+ "output": output_val.strip(),
+ "image": item.get("image"), # Optional
+ "image_preview": "๐ผ๏ธ Image" if item.get("image") else "-"
+ })
+ imported_count += 1
+
+ except Exception as e:
+ errors.append(f"Item {i+1}: {str(e)}")
+ logger.warning(f"Error importing item {i+1}: {str(e)}")
+ continue
+
+ # Report results
+ if imported_count == 0:
+ error_msg = "No valid examples imported. "
+ if errors:
+ error_msg += "Errors: " + "; ".join(errors[:3])
+ if len(errors) > 3:
+ error_msg += f" (and {len(errors) - 3} more)"
+ raise gr.Error(error_msg)
+
+ if errors:
+ warning_msg = f"Imported {imported_count} example(s). "
+ if len(errors) <= 3:
+ warning_msg += f"Warnings: {'; '.join(errors)}"
+ else:
+ warning_msg += f"{len(errors)} items had errors."
+ logger.warning(warning_msg)
+
+ return current_dataset, ""
+
+ except gr.Error:
+ # Re-raise Gradio errors
+ raise
+ except Exception as e:
+ logger.error(f"Unexpected error in import_bulk_json: {str(e)}\n{traceback.format_exc()}")
+ raise gr.Error(f"Failed to import JSON: {str(e)}")
+
+    # Chain: parse/import the pasted JSON (returning "" clears the textbox),
+    # then refresh the preview table and the count label.
+    btn_import.click(
+        import_bulk_json,
+        inputs=[bulk_json, dataset_state],
+        outputs=[dataset_state, bulk_json]
+    ).then(
+        safe_update_table,
+        inputs=[dataset_state],
+        outputs=[ds_table]
+    ).then(
+        update_dataset_count,
+        inputs=[dataset_state],
+        outputs=[ds_count]
+    )
+
+    # Main Optimization Flow
+    # run_optimization_flow (defined earlier in the file, outside this hunk)
+    # receives the seed prompt, dataset, model/key settings and budget
+    # sliders, and drives the status/results panels plus the candidate stream.
+    btn_optimize.click(
+        run_optimization_flow,
+        inputs=[
+            seed_input, dataset_state, model_select, custom_model_input,
+            slider_iter, slider_calls, slider_batch, check_llego,
+            key_openai, key_google, key_anthropic
+        ],
+        outputs=[
+            status_panel, empty_state, results_panel,
+            txt_status, res_prompt, res_metrics, res_history, live_candidates
+        ]
+    )
+
+    # Refresh Candidates
+    def safe_get_candidates_display():
+        """Wrapper for get_candidates_display with error handling.
+
+        Returns fallback markup on failure so the stream area degrades
+        gracefully instead of raising.
+        """
+        try:
+            return get_candidates_display()
+        except Exception as e:
+            logger.error(f"Error refreshing candidates: {str(e)}")
+            # NOTE(review): the fallback literal below looks truncated in this
+            # patch (surrounding HTML tags appear stripped by extraction) —
+            # confirm against the original source file.
+            return "Error loading candidates.
+"
+
+    # Manual refresh of the live candidate stream HTML.
+    btn_refresh_cand.click(
+        safe_get_candidates_display,
+        outputs=[live_candidates]
+    )
+
+# ==========================================
+# 7. LAUNCH
+# ==========================================
+if __name__ == "__main__":
+ app.queue().launch(
+ server_name="0.0.0.0",
+ server_port=7860,
+ share=False, # Set to False for HF Spaces
+ show_error=True,
+ css=CUSTOM_CSS,
+ js=FORCE_DARK_JS
+ )
+
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b415c2f7705cd0945b80f6960401d2591fe23cf0
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,23 @@
+# Core dependencies - gepa from git
+git+https://github.com/gepa-ai/gepa.git
+numpy>=1.21.0
+pandas>=1.5.0
+pydantic>=2.0.0
+python-dotenv>=1.0.0
+
+# HTTP/API clients
+requests>=2.31.0
+aiohttp>=3.8.0
+asyncio-throttle>=1.0.0
+
+# LLM Provider SDKs
+openai>=1.0.0
+anthropic>=0.18.0
+google-generativeai>=0.3.0
+google-genai>=0.2.0
+
+# Image processing
+Pillow>=9.0.0
+
+# Gradio UI (version will be set by README.md sdk_version)
+gradio>=4.0.0
\ No newline at end of file
diff --git a/src/gepa_optimizer.egg-info/PKG-INFO b/src/gepa_optimizer.egg-info/PKG-INFO
new file mode 100644
index 0000000000000000000000000000000000000000..272d9e1fc41f406056f6ddb09898b32ddd8a6037
--- /dev/null
+++ b/src/gepa_optimizer.egg-info/PKG-INFO
@@ -0,0 +1,439 @@
+Metadata-Version: 2.4
+Name: gepa-optimizer
+Version: 0.1.0
+Summary: Universal prompt optimization framework based on GEPA
+Home-page: https://github.com/suhasb-dev/Prompt-Optimizer
+Author: Suhas
+Author-email: Suhas
+License: MIT
+Project-URL: Homepage, https://github.com/suhasb-dev/Prompt-Optimizer
+Project-URL: Repository, https://github.com/suhasb-dev/Prompt-Optimizer
+Project-URL: Documentation, https://suhasb-dev.gitbook.io/gepa-universal-prompt-optimizer/
+Project-URL: Bug Reports, https://github.com/suhasb-dev/Prompt-Optimizer/issues
+Keywords: prompt-optimization,llm,gepa,ai,machine-learning,ui-tree-extraction
+Classifier: Development Status :: 3 - Alpha
+Classifier: Intended Audience :: Developers
+Classifier: Intended Audience :: Science/Research
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: gepa>=0.0.12
+Requires-Dist: pandas>=1.5.0
+Requires-Dist: pydantic>=2.0.0
+Requires-Dist: python-dotenv>=1.0.0
+Requires-Dist: requests>=2.31.0
+Requires-Dist: aiohttp>=3.8.0
+Requires-Dist: asyncio-throttle>=1.0.0
+Requires-Dist: google-generativeai>=0.3.0
+Requires-Dist: Pillow>=9.0.0
+Provides-Extra: dev
+Requires-Dist: pytest>=7.0.0; extra == "dev"
+Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
+Requires-Dist: black>=23.0.0; extra == "dev"
+Requires-Dist: flake8>=6.0.0; extra == "dev"
+Requires-Dist: mypy>=1.0.0; extra == "dev"
+Provides-Extra: docs
+Requires-Dist: sphinx>=5.0.0; extra == "docs"
+Requires-Dist: sphinx-rtd-theme>=1.2.0; extra == "docs"
+Provides-Extra: all
+Requires-Dist: pytest>=7.0.0; extra == "all"
+Requires-Dist: pytest-asyncio>=0.21.0; extra == "all"
+Requires-Dist: black>=23.0.0; extra == "all"
+Requires-Dist: flake8>=6.0.0; extra == "all"
+Requires-Dist: mypy>=1.0.0; extra == "all"
+Requires-Dist: sphinx>=5.0.0; extra == "all"
+Requires-Dist: sphinx-rtd-theme>=1.2.0; extra == "all"
+Dynamic: author
+Dynamic: home-page
+Dynamic: license-file
+Dynamic: requires-python
+
+# GEPA Optimizer
+
+[](https://badge.fury.io/py/gepa-optimizer)
+[](https://www.python.org/downloads/)
+[](https://opensource.org/licenses/MIT)
+
+A universal prompt optimization framework built on [GEPA](https://arxiv.org/abs/2507.19457) with optional [LLEGO](https://arxiv.org/abs/2503.14217) genetic operators for accelerated convergence.
+
+## Overview
+
+GEPA Optimizer provides a modular architecture for optimizing prompts through reflective evolution. It requires custom evaluators and LLM clients, enabling domain-specific optimization for any use case.
+
+**Key capabilities:**
+- Multi-modal support (text + vision models)
+- Hybrid GEPA + LLEGO optimization modes
+- Configurable train/val/test data splitting
+- Batch API support for cost reduction
+- Async-first architecture
+
+## Installation
+
+```bash
+pip install gepa-optimizer
+```
+
+**From source:**
+```bash
+git clone https://github.com/suhasb-dev/Prompt-Optimizer.git
+cd Prompt-Optimizer
+pip install -e .
+```
+
+## Quick Start
+
+```python
+import asyncio
+from gepa_optimizer import (
+ GepaOptimizer,
+ OptimizationConfig,
+ BaseEvaluator,
+ BaseLLMClient
+)
+
+# Define custom evaluator
+class MyEvaluator(BaseEvaluator):
+ def evaluate(self, predicted: str, expected: str) -> dict:
+ score = 1.0 if predicted.strip() == expected.strip() else 0.0
+ return {"accuracy": score, "composite_score": score}
+
+# Define custom LLM client
+class MyLLMClient(BaseLLMClient):
+ def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> dict:
+ # Your LLM integration here
+ return {"content": "response"}
+
+async def main():
+ config = OptimizationConfig(
+ model="openai/gpt-4o",
+ reflection_model="openai/gpt-4o",
+ max_iterations=5,
+ max_metric_calls=50,
+ batch_size=8
+ )
+
+ optimizer = GepaOptimizer(
+ config=config,
+ llm_client=MyLLMClient("openai", "gpt-4o"),
+ evaluator=MyEvaluator()
+ )
+
+ result = await optimizer.train(
+ seed_prompt="Your initial prompt",
+ dataset=your_dataset
+ )
+
+ print(f"Optimized: {result.prompt}")
+ print(f"Score: {result.improvement_data}")
+
+asyncio.run(main())
+```
+
+## Project Structure
+
+```
+src/gepa_optimizer/
+├── core/                  # Core optimization logic
+│   ├── optimizer.py       # GepaOptimizer main class
+│   ├── base_adapter.py    # BaseGepaAdapter interface
+│   └── universal_adapter.py
+├── evaluation/            # Evaluator implementations
+│   ├── base_evaluator.py  # BaseEvaluator abstract class
+│   ├── scroll_evaluator.py
+│   ├── validation_evaluator.py
+│   └── index_caching_evaluator.py
+├── llms/                  # LLM client implementations
+│   ├── base_llm.py        # BaseLLMClient abstract class
+│   ├── vision_llm.py      # VisionLLMClient (OpenAI, Google, Anthropic)
+│   └── batch_llm.py       # BatchLLMClient (50% cost savings)
+├── operators/             # LLEGO genetic operators
+│   └── llego_operators.py # FitnessGuidedCrossover, DiversityGuidedMutation
+├── data/                  # Dataset loaders and converters
+├── models/                # Configuration and result models
+└── utils/                 # Utilities and helpers
+```
+
+## Configuration
+
+### Basic Configuration
+
+```python
+from gepa_optimizer import OptimizationConfig, ModelConfig
+
+config = OptimizationConfig(
+ # Required parameters
+ model="openai/gpt-4o", # or ModelConfig instance
+ reflection_model="openai/gpt-4o",
+ max_iterations=10,
+ max_metric_calls=100,
+ batch_size=8,
+
+ # Data splitting (train/val/test)
+ data_split=DataSplitConfig(
+ train_ratio=0.6,
+ val_ratio=0.2,
+ test_ratio=0.2
+ ),
+
+ # Optional settings
+ reflection_examples=3, # Examples per reflection (2-5 recommended)
+ evaluate_on_test=True, # Final evaluation on held-out test set
+ log_level="INFO" # DEBUG, INFO, WARNING, ERROR
+)
+```
+
+### LLEGO Genetic Operators
+
+Enable LLEGO for faster convergence through fitness-guided crossover and diversity-guided mutation:
+
+```python
+config = OptimizationConfig(
+ model="openai/gpt-4o",
+ reflection_model="openai/gpt-4o",
+ max_iterations=5,
+ max_metric_calls=50,
+ batch_size=8,
+
+ # Enable LLEGO
+ use_llego_operators=True,
+ alpha=0.15, # Fitness extrapolation factor
+ tau=10.0, # Diversity temperature
+ nu=4, # Parent arity
+ n_crossover=2, # Crossover offspring per iteration
+ n_mutation=3, # Mutation offspring per iteration
+ population_size=15
+)
+```
+
+### Hybrid Mode (GEPA + LLEGO)
+
+Combine GEPA's semantic reflection with LLEGO's structural diversity:
+
+```python
+config = OptimizationConfig(
+ model="openai/gpt-4o",
+ reflection_model="openai/gpt-4o",
+ max_iterations=6,
+ max_metric_calls=200,
+ batch_size=10,
+
+ # Hybrid mode
+ use_llego_operators=True,
+ enable_gepa_reflection_with_llego=True,
+ num_gepa_reflection_candidates=3,
+ n_crossover=3,
+ n_mutation=3
+ # Total: 9 candidates per iteration (3 GEPA + 3 crossover + 3 mutation)
+)
+```
+
+### Batch API (Cost Optimization)
+
+Use batch processing for 50% cost reduction:
+
+```python
+from gepa_optimizer.llms import BatchLLMClient
+
+llm_client = BatchLLMClient(
+ provider="google",
+ model_name="gemini-2.5-flash",
+ batch_size=20,
+ polling_interval=30
+)
+
+optimizer = GepaOptimizer(
+ config=config,
+ llm_client=llm_client,
+ evaluator=evaluator
+)
+```
+
+## Built-in Components
+
+### LLM Clients
+
+| Client | Description | Use Case |
+|--------|-------------|----------|
+| `VisionLLMClient` | Multi-modal client for OpenAI, Google, Anthropic | Real-time requests |
+| `BatchLLMClient` | Batch processing client | Cost-sensitive workloads |
+
+### Evaluators
+
+| Evaluator | Description |
+|-----------|-------------|
+| `ScrollElementEvaluator` | UI element detection scoring |
+| `ValidationEvaluator` | Screen validation tasks |
+| `IndexCachingEvaluator` | Index-based element selection |
+| `UITreeEvaluator` | UI tree extraction |
+
+### Dataset Loaders
+
+| Loader | Description |
+|--------|-------------|
+| `load_scroll_dataset()` | Load scroll detection datasets |
+| `load_validation_split()` | Load validation datasets with splits |
+| `load_index_caching_split()` | Load index caching datasets |
+
+## Creating Custom Components
+
+### Custom Evaluator
+
+```python
+from gepa_optimizer import BaseEvaluator
+
+class CustomEvaluator(BaseEvaluator):
+ def __init__(self):
+ super().__init__(metric_weights={
+ "accuracy": 0.5,
+ "completeness": 0.3,
+ "format": 0.2
+ })
+
+ def evaluate(self, predicted: str, expected: str) -> dict:
+ accuracy = self._compute_accuracy(predicted, expected)
+ completeness = self._compute_completeness(predicted, expected)
+ format_score = self._compute_format(predicted)
+
+ composite = (
+ accuracy * 0.5 +
+ completeness * 0.3 +
+ format_score * 0.2
+ )
+
+ return {
+ "accuracy": accuracy,
+ "completeness": completeness,
+ "format": format_score,
+ "composite_score": composite # Required key
+ }
+```
+
+### Custom LLM Client
+
+```python
+from gepa_optimizer import BaseLLMClient
+
+class CustomLLMClient(BaseLLMClient):
+ def __init__(self, api_key: str):
+ super().__init__(provider="custom", model_name="my-model")
+ self.api_key = api_key
+
+ def generate(
+ self,
+ system_prompt: str,
+ user_prompt: str,
+ image_base64: str = None,
+ **kwargs
+ ) -> dict:
+ # Your API call here
+ response = call_your_api(system_prompt, user_prompt, image_base64)
+ return {"content": response}
+```
+
+## Examples
+
+| File | Description |
+|------|-------------|
+| [`examples/basic_usage.py`](examples/basic_usage.py) | Basic optimization workflow |
+| [`examples/advanced_usage.py`](examples/advanced_usage.py) | Advanced configuration |
+| [`examples/batch_api_example.py`](examples/batch_api_example.py) | Batch API usage |
+| [`examples/gemini_usage.py`](examples/gemini_usage.py) | Google Gemini integration |
+
+**Run examples:**
+```bash
+python examples/basic_usage.py
+```
+
+## Testing
+
+```bash
+# Run all tests
+pytest tests/
+
+# Run unit tests only
+pytest tests/unit/
+
+# Run integration tests
+pytest tests/integration/
+```
+
+## API Reference
+
+### GepaOptimizer
+
+```python
+class GepaOptimizer:
+ def __init__(
+ self,
+ config: OptimizationConfig,
+ llm_client: BaseLLMClient,
+ evaluator: BaseEvaluator,
+ adapter_type: str = "universal"
+ )
+
+ async def train(
+ self,
+ seed_prompt: str,
+ dataset: Union[List, Dict],
+ **kwargs
+ ) -> OptimizedResult
+```
+
+### OptimizationConfig
+
+| Parameter | Type | Default | Description |
+|-----------|------|---------|-------------|
+| `model` | `str \| ModelConfig` | Required | Target model |
+| `reflection_model` | `str \| ModelConfig` | Required | Reflection model |
+| `max_iterations` | `int` | Required | Maximum optimization iterations |
+| `max_metric_calls` | `int` | Required | Maximum evaluation calls |
+| `batch_size` | `int` | Required | Samples per evaluation batch |
+| `use_llego_operators` | `bool` | `False` | Enable LLEGO genetic operators |
+| `enable_gepa_reflection_with_llego` | `bool` | `False` | Enable hybrid mode |
+| `use_llm_as_judge` | `bool` | `True` | Enable LLM-as-Judge feedback |
+| `log_level` | `str` | `"INFO"` | Logging verbosity |
+
+### OptimizedResult
+
+| Attribute | Type | Description |
+|-----------|------|-------------|
+| `prompt` | `str` | Optimized prompt |
+| `original_prompt` | `str` | Initial seed prompt |
+| `improvement_data` | `dict` | Score improvements |
+| `optimization_time` | `float` | Total time in seconds |
+| `is_successful` | `bool` | Optimization success status |
+
+## Environment Variables
+
+| Variable | Description |
+|----------|-------------|
+| `OPENAI_API_KEY` | OpenAI API key |
+| `ANTHROPIC_API_KEY` | Anthropic API key |
+| `GOOGLE_API_KEY` | Google AI API key |
+
+## References
+
+- **GEPA Paper:** [Reflective Prompt Evolution Can Outperform Reinforcement Learning](https://arxiv.org/abs/2507.19457)
+- **LLEGO Paper:** [Decision Tree Induction Through LLMs via Semantically-Aware Evolution](https://arxiv.org/abs/2503.14217)
+- **GEPA Library:** [github.com/gepa-ai/gepa](https://github.com/gepa-ai/gepa)
+
+## License
+
+MIT License - see [LICENSE](LICENSE) for details.
+
+## Contributing
+
+Contributions welcome. Please open an issue or submit a pull request.
+
+## Support
+
+- **Issues:** [GitHub Issues](https://github.com/suhasb-dev/Prompt-Optimizer/issues)
+- **Documentation:** [GitBook](https://suhasb-dev.gitbook.io/gepa-universal-prompt-optimizer/)
diff --git a/src/gepa_optimizer.egg-info/SOURCES.txt b/src/gepa_optimizer.egg-info/SOURCES.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f019258d7e1ba6587c93e9adafd7e606b206503b
--- /dev/null
+++ b/src/gepa_optimizer.egg-info/SOURCES.txt
@@ -0,0 +1,65 @@
+LICENSE
+README.md
+pyproject.toml
+setup.py
+src/gepa_optimizer/__init__.py
+src/gepa_optimizer/cli.py
+src/gepa_optimizer/types.py
+src/gepa_optimizer/version.py
+src/gepa_optimizer.egg-info/PKG-INFO
+src/gepa_optimizer.egg-info/SOURCES.txt
+src/gepa_optimizer.egg-info/dependency_links.txt
+src/gepa_optimizer.egg-info/entry_points.txt
+src/gepa_optimizer.egg-info/requires.txt
+src/gepa_optimizer.egg-info/top_level.txt
+src/gepa_optimizer/core/__init__.py
+src/gepa_optimizer/core/base_adapter.py
+src/gepa_optimizer/core/custom_adapter.py
+src/gepa_optimizer/core/optimizer.py
+src/gepa_optimizer/core/result.py
+src/gepa_optimizer/core/universal_adapter.py
+src/gepa_optimizer/data/__init__.py
+src/gepa_optimizer/data/converters.py
+src/gepa_optimizer/data/index_caching_loader.py
+src/gepa_optimizer/data/loaders.py
+src/gepa_optimizer/data/scroll_dataset_loader.py
+src/gepa_optimizer/data/validation_dataset_loader.py
+src/gepa_optimizer/data/validators.py
+src/gepa_optimizer/evaluation/__init__.py
+src/gepa_optimizer/evaluation/base_evaluator.py
+src/gepa_optimizer/evaluation/index_caching_evaluator.py
+src/gepa_optimizer/evaluation/scroll_evaluator.py
+src/gepa_optimizer/evaluation/ui_evaluator.py
+src/gepa_optimizer/evaluation/universal_evaluator.py
+src/gepa_optimizer/evaluation/validation_evaluator.py
+src/gepa_optimizer/infrastructure/__init__.py
+src/gepa_optimizer/infrastructure/logging/__init__.py
+src/gepa_optimizer/infrastructure/logging/context.py
+src/gepa_optimizer/infrastructure/logging/formatters.py
+src/gepa_optimizer/infrastructure/logging/logger.py
+src/gepa_optimizer/llms/__init__.py
+src/gepa_optimizer/llms/base_llm.py
+src/gepa_optimizer/llms/batch_llm.py
+src/gepa_optimizer/llms/llego_enhanced_llm.py
+src/gepa_optimizer/llms/vision_llm.py
+src/gepa_optimizer/models/__init__.py
+src/gepa_optimizer/models/config.py
+src/gepa_optimizer/models/dataset.py
+src/gepa_optimizer/models/result.py
+src/gepa_optimizer/operators/__init__.py
+src/gepa_optimizer/operators/base_operator.py
+src/gepa_optimizer/operators/crossover.py
+src/gepa_optimizer/operators/llego_operators.py
+src/gepa_optimizer/operators/models.py
+src/gepa_optimizer/operators/mutation.py
+src/gepa_optimizer/utils/__init__.py
+src/gepa_optimizer/utils/api_keys.py
+src/gepa_optimizer/utils/candidate_collector.py
+src/gepa_optimizer/utils/clean_logger.py
+src/gepa_optimizer/utils/exceptions.py
+src/gepa_optimizer/utils/helpers.py
+src/gepa_optimizer/utils/llm_judge_prompt.py
+src/gepa_optimizer/utils/log_parser.py
+src/gepa_optimizer/utils/logging.py
+src/gepa_optimizer/utils/metrics.py
+src/gepa_optimizer/utils/pareto_logger.py
\ No newline at end of file
diff --git a/src/gepa_optimizer.egg-info/dependency_links.txt b/src/gepa_optimizer.egg-info/dependency_links.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc
--- /dev/null
+++ b/src/gepa_optimizer.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/src/gepa_optimizer.egg-info/entry_points.txt b/src/gepa_optimizer.egg-info/entry_points.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c9b0dbe7680b3733ee391c2c83177f29594117eb
--- /dev/null
+++ b/src/gepa_optimizer.egg-info/entry_points.txt
@@ -0,0 +1,2 @@
+[console_scripts]
+gepa-optimize = gepa_optimizer.cli:main
diff --git a/src/gepa_optimizer.egg-info/requires.txt b/src/gepa_optimizer.egg-info/requires.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ecfbd2e6e4a4482e0036fa85ac2ca7d695b13be6
--- /dev/null
+++ b/src/gepa_optimizer.egg-info/requires.txt
@@ -0,0 +1,29 @@
+gepa>=0.0.12
+pandas>=1.5.0
+pydantic>=2.0.0
+python-dotenv>=1.0.0
+requests>=2.31.0
+aiohttp>=3.8.0
+asyncio-throttle>=1.0.0
+google-generativeai>=0.3.0
+Pillow>=9.0.0
+
+[all]
+pytest>=7.0.0
+pytest-asyncio>=0.21.0
+black>=23.0.0
+flake8>=6.0.0
+mypy>=1.0.0
+sphinx>=5.0.0
+sphinx-rtd-theme>=1.2.0
+
+[dev]
+pytest>=7.0.0
+pytest-asyncio>=0.21.0
+black>=23.0.0
+flake8>=6.0.0
+mypy>=1.0.0
+
+[docs]
+sphinx>=5.0.0
+sphinx-rtd-theme>=1.2.0
diff --git a/src/gepa_optimizer.egg-info/top_level.txt b/src/gepa_optimizer.egg-info/top_level.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a53df9f6ea55c2b670c462010432f5969311d777
--- /dev/null
+++ b/src/gepa_optimizer.egg-info/top_level.txt
@@ -0,0 +1 @@
+gepa_optimizer
diff --git a/src/gepa_optimizer/__init__.py b/src/gepa_optimizer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9a4dc05ed44b418078c9404690ba7af8d163d7f
--- /dev/null
+++ b/src/gepa_optimizer/__init__.py
@@ -0,0 +1,295 @@
+"""
+GEPA Universal Prompt Optimizer
+
+A modern, modular Python library for universal prompt optimization powered by GEPA.
+
+Quick Start (No custom evaluator needed!):
+
+ from gepa_optimizer import quick_optimize
+
+ result = await quick_optimize(
+ seed_prompt="Your initial prompt",
+ dataset=[
+ {"input": "task1", "output": "expected1"},
+ {"input": "task2", "output": "expected2"},
+ ],
+ model="openai/gpt-4o" # or any: "google/gemini-1.5-pro", "anthropic/claude-3-5-sonnet-20241022"
+ )
+ print(result.optimized_prompt)
+"""
+
+# Core functionality
+from .core import GepaOptimizer
+from .core.base_adapter import BaseGepaAdapter
+from .core.universal_adapter import UniversalGepaAdapter
+
+# Configuration and models
+from .models import OptimizationConfig, OptimizationResult, OptimizedResult, ModelConfig
+
+# Data processing
+from .data import UniversalConverter, DataLoader, DataValidator
+from .data.scroll_dataset_loader import ScrollDatasetLoader, load_scroll_dataset
+from .data.validation_dataset_loader import ValidationDatasetLoader, load_validation_dataset, load_validation_split
+from .data.index_caching_loader import IndexCachingDatasetLoader, load_index_caching_dataset, load_index_caching_split
+
+# LLM clients
+from .llms import VisionLLMClient
+from .llms.base_llm import BaseLLMClient
+from .llms.batch_llm import BatchLLMClient
+
+# Evaluators - including Universal Semantic Evaluator (works for ANY task!)
+from .evaluation import (
+ BaseEvaluator,
+ UniversalSemanticEvaluator,
+ create_universal_evaluator,
+ UITreeEvaluator,
+ ScrollElementEvaluator,
+ ValidationEvaluator,
+ IndexCachingEvaluator
+)
+
+# LLEGO Genetic Operators
+from .operators import (
+ # Base interfaces
+ BaseGeneticOperator,
+ BaseCrossoverOperator,
+ BaseMutationOperator,
+ # Concrete operators
+ FitnessGuidedCrossover,
+ DiversityGuidedMutation,
+ LLEGOIntegrationLayer,
+ # Data models
+ PromptCandidate,
+ PromptMetadata
+)
+
+# Utilities
+from .utils import setup_logging, calculate_metrics, sanitize_prompt, APIKeyManager
+from .utils.exceptions import GepaOptimizerError, GepaDependencyError, InvalidInputError, DatasetError
+
+# Logging infrastructure
+from .infrastructure.logging import get_logger, configure_logging, LogContext
+
+# Type definitions (for type hints in user code)
+from .types import (
+ DatasetItem,
+ EvaluationResult,
+ LLMResponse,
+ CandidateDict,
+ LLMClientProtocol,
+ EvaluatorProtocol,
+)
+
+__version__ = "0.1.0"
+
+
+# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+# CONVENIENCE FUNCTION: quick_optimize
+# No evaluator needed - uses Universal Semantic Evaluator automatically
+# โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+
+async def quick_optimize(
+    seed_prompt: str,
+    dataset: list,
+    model: str,
+    max_iterations: int = 5,
+    max_metric_calls: int = 50,
+    batch_size: int = 4,
+    use_llego: bool = True,
+    verbose: bool = True
+) -> OptimizedResult:
+    """
+    ๐ Quick prompt optimization - no custom evaluator needed!
+
+    Uses Universal Semantic Evaluator that works for ANY task.
+
+    Args:
+        seed_prompt: Your initial prompt to optimize
+        dataset: List of dicts with 'input' and 'output' (expected) keys
+                Can also include 'image' key for multi-modal tasks
+        model: LLM model to use in format "provider/model-name" (REQUIRED)
+               Examples:
+               - "google/gemini-1.5-pro"
+               - "google/gemini-2.5-flash-preview-05-20"
+               - "openai/gpt-4o"
+               - "openai/gpt-4-turbo"
+               - "anthropic/claude-3-5-sonnet-20241022"
+        max_iterations: Maximum optimization iterations (default: 5)
+        max_metric_calls: Maximum evaluation calls (default: 50)
+        batch_size: Samples per evaluation batch (default: 4)
+        use_llego: Enable LLEGO genetic operators (default: True)
+        verbose: Show progress logs (default: True)
+
+    Returns:
+        OptimizedResult with optimized prompt and improvement metrics
+
+    Example:
+        >>> result = await quick_optimize(
+        ...     seed_prompt="Count the objects in the image",
+        ...     dataset=[
+        ...         {"input": "image1.jpg", "output": "5 objects", "image": "base64..."},
+        ...         {"input": "image2.jpg", "output": "3 objects", "image": "base64..."},
+        ...     ],
+        ...     model="openai/gpt-4o",  # or "google/gemini-1.5-pro", etc.
+        ...     max_iterations=3
+        ... )
+        >>> print(result.optimized_prompt)
+    """
+    import logging
+
+    if verbose:
+        # NOTE: logging.basicConfig is a no-op if the root logger already
+        # has handlers configured by the host application.
+        logging.basicConfig(level=logging.INFO)
+
+    # Create LLM client
+    llm_client = VisionLLMClient.from_model_string(model)
+
+    # Create Universal Semantic Evaluator (uses same LLM for analysis)
+    evaluator = UniversalSemanticEvaluator(
+        llm_client=llm_client,
+        use_llm_analysis=True
+    )
+
+    # Create configuration
+    # Hybrid mode: GEPA reflection runs alongside LLEGO operators when
+    # use_llego is True (both flags are driven by the same parameter here).
+    config = OptimizationConfig(
+        model=model,
+        reflection_model=model,
+        max_iterations=max_iterations,
+        max_metric_calls=max_metric_calls,
+        batch_size=batch_size,
+        use_llego_operators=use_llego,
+        enable_gepa_reflection_with_llego=use_llego,
+        num_gepa_reflection_candidates=3,
+        n_crossover=2,
+        n_mutation=2,
+        verbose=verbose
+    )
+
+    # Create optimizer
+    optimizer = GepaOptimizer(
+        config=config,
+        llm_client=llm_client,
+        evaluator=evaluator
+    )
+
+    # Run optimization
+    result = await optimizer.train(
+        seed_prompt=seed_prompt,
+        dataset=dataset
+    )
+
+    # NOTE(review): the docstring example reads result.optimized_prompt, but
+    # the README documents the attribute as `prompt` -- confirm OptimizedResult
+    # exposes both names.
+    return result
+
+
+def quick_optimize_sync(
+    seed_prompt: str,
+    dataset: list,
+    model: str,
+    max_iterations: int = 5,
+    max_metric_calls: int = 50,
+    batch_size: int = 4,
+    use_llego: bool = True,
+    verbose: bool = True
+) -> OptimizedResult:
+    """
+    ๐ Synchronous version of quick_optimize.
+
+    Same as quick_optimize but runs synchronously (blocks until complete).
+
+    Args:
+        model: LLM model to use in format "provider/model-name" (REQUIRED)
+               Examples: "openai/gpt-4o", "google/gemini-1.5-pro", "anthropic/claude-3-5-sonnet-20241022"
+
+    See quick_optimize for full documentation.
+    """
+    import asyncio
+    # asyncio.run creates a fresh event loop; it raises RuntimeError when
+    # called from a thread that already runs a loop (e.g. inside Jupyter).
+    return asyncio.run(quick_optimize(
+        seed_prompt=seed_prompt,
+        dataset=dataset,
+        model=model,
+        max_iterations=max_iterations,
+        max_metric_calls=max_metric_calls,
+        batch_size=batch_size,
+        use_llego=use_llego,
+        verbose=verbose
+    ))
+
+
+# Explicit public API: names exported by `from gepa_optimizer import *`.
+# Keep this list in sync with the imports above.
+__all__ = [
+    # ๐ Quick Start (recommended for new users)
+    "quick_optimize",
+    "quick_optimize_sync",
+
+    # Core functionality
+    "GepaOptimizer",
+    "BaseGepaAdapter",
+    "UniversalGepaAdapter",
+
+    # Configuration
+    "OptimizationConfig",
+    "OptimizationResult",
+    "OptimizedResult",
+    "ModelConfig",
+
+    # Data processing
+    "UniversalConverter",
+    "DataLoader",
+    "DataValidator",
+
+    # Dataset loaders
+    "ScrollDatasetLoader",
+    "load_scroll_dataset",
+    "ValidationDatasetLoader",
+    "load_validation_dataset",
+    "load_validation_split",
+    "IndexCachingDatasetLoader",
+    "load_index_caching_dataset",
+    "load_index_caching_split",
+
+    # LLM clients
+    "VisionLLMClient",
+    "BaseLLMClient",
+    "BatchLLMClient",
+
+    # Evaluators (Universal recommended for general use)
+    "UniversalSemanticEvaluator",
+    "create_universal_evaluator",
+    "BaseEvaluator",
+    "UITreeEvaluator",
+    "ScrollElementEvaluator",
+    "ValidationEvaluator",
+    "IndexCachingEvaluator",
+
+    # LLEGO Genetic Operators - Base interfaces
+    "BaseGeneticOperator",
+    "BaseCrossoverOperator",
+    "BaseMutationOperator",
+    # LLEGO Genetic Operators - Concrete implementations
+    "FitnessGuidedCrossover",
+    "DiversityGuidedMutation",
+    "LLEGOIntegrationLayer",
+    "PromptCandidate",
+    "PromptMetadata",
+
+    # Utilities
+    "APIKeyManager",
+    "GepaOptimizerError",
+    "GepaDependencyError",
+    "InvalidInputError",
+    "DatasetError",
+    "setup_logging",
+    "calculate_metrics",
+    "sanitize_prompt",
+
+    # Logging infrastructure
+    "get_logger",
+    "configure_logging",
+    "LogContext",
+
+    # Type definitions
+    "DatasetItem",
+    "EvaluationResult",
+    "LLMResponse",
+    "CandidateDict",
+    "LLMClientProtocol",
+    "EvaluatorProtocol",
+]
diff --git a/src/gepa_optimizer/cli.py b/src/gepa_optimizer/cli.py
new file mode 100644
index 0000000000000000000000000000000000000000..14a1593308fcd2ae0cab1fa15fc734815d877252
--- /dev/null
+++ b/src/gepa_optimizer/cli.py
@@ -0,0 +1,239 @@
+"""
+Command Line Interface for GEPA Optimizer
+"""
+
+import argparse
+import sys
+import json
+import asyncio
+from pathlib import Path
+from typing import Optional
+
+from .core import GepaOptimizer
+from .models import OptimizationConfig, ModelConfig
+from .utils import setup_logging, APIKeyManager
+
+
+def main():
+ """Main CLI entry point"""
+ parser = argparse.ArgumentParser(
+ description="GEPA Universal Prompt Optimizer CLI",
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ epilog="""
+Examples:
+ gepa-optimize --model openai/gpt-4-turbo --prompt "Extract UI elements" --dataset data.json
+ gepa-optimize --config config.json --prompt "Analyze interface" --dataset images/
+ """
+ )
+
+ # Required arguments
+ parser.add_argument(
+ "--prompt",
+ required=True,
+ help="Initial seed prompt to optimize"
+ )
+ parser.add_argument(
+ "--dataset",
+ required=True,
+ help="Path to dataset file or directory"
+ )
+
+ # Model configuration
+ parser.add_argument(
+ "--model",
+ help="Model specification (e.g., 'openai/gpt-4-turbo')"
+ )
+ parser.add_argument(
+ "--reflection-model",
+ help="Reflection model specification"
+ )
+ parser.add_argument(
+ "--config",
+ help="Path to configuration JSON file"
+ )
+
+ # Optimization parameters
+ parser.add_argument(
+ "--max-iterations",
+ type=int,
+ default=10,
+ help="Maximum optimization iterations (default: 10)"
+ )
+ parser.add_argument(
+ "--max-metric-calls",
+ type=int,
+ default=100,
+ help="Maximum metric evaluation calls (default: 100)"
+ )
+ parser.add_argument(
+ "--batch-size",
+ type=int,
+ default=4,
+ help="Batch size for evaluation (default: 4)"
+ )
+
+ # GEPA-specific parameters
+ parser.add_argument(
+ "--candidate-selection-strategy",
+ type=str,
+ default="pareto",
+ choices=["pareto", "best"],
+ help="Strategy for selecting candidates (default: pareto)"
+ )
+ parser.add_argument(
+ "--skip-perfect-score",
+ action="store_true",
+ help="Skip updating candidates with perfect scores"
+ )
+ parser.add_argument(
+ "--reflection-minibatch-size",
+ type=int,
+ default=None,
+ help="Number of examples to use for reflection (default: use batch_size)"
+ )
+ parser.add_argument(
+ "--perfect-score",
+ type=float,
+ default=1.0,
+ help="Perfect score threshold (default: 1.0)"
+ )
+ parser.add_argument(
+ "--module-selector",
+ type=str,
+ default="round_robin",
+ choices=["round_robin", "all"],
+ help="Component selection strategy (default: round_robin)"
+ )
+
+ # Output options
+ parser.add_argument(
+ "--output",
+ help="Output file path for results (default: stdout)"
+ )
+ parser.add_argument(
+ "--verbose", "-v",
+ action="store_true",
+ help="Enable verbose logging"
+ )
+
+ args = parser.parse_args()
+
+ # Setup logging
+ setup_logging(level="DEBUG" if args.verbose else "INFO")
+
+ try:
+ # Load configuration
+ if args.config:
+ config = load_config_from_file(args.config)
+ else:
+ config = create_config_from_args(args)
+
+ # Validate API keys
+ validate_api_keys(config)
+
+ # Create optimizer
+ optimizer = GepaOptimizer(config=config)
+
+ # Run optimization (async)
+ print(f"๐ Starting optimization with model: {config.model.model_name}")
+ result = asyncio.run(optimizer.train(
+ seed_prompt=args.prompt,
+ dataset=args.dataset
+ ))
+
+ # Output results
+ output_results(result, args.output)
+
+ print("โ
Optimization completed successfully!")
+
+ except Exception as e:
+ print(f"โ Error: {str(e)}", file=sys.stderr)
+ sys.exit(1)
+
+
+def load_config_from_file(config_path: str) -> OptimizationConfig:
+    """Load configuration from JSON file.
+
+    Nested dicts under 'model' and 'reflection_model' are converted to
+    ModelConfig instances before constructing OptimizationConfig.
+
+    Args:
+        config_path: Path to the JSON configuration file.
+
+    Raises:
+        FileNotFoundError: If config_path does not exist.
+    """
+    path = Path(config_path)
+    if not path.exists():
+        raise FileNotFoundError(f"Configuration file not found: {config_path}")
+
+    with open(path, 'r') as f:
+        config_data = json.load(f)
+
+    # Convert model configs
+    if 'model' in config_data and isinstance(config_data['model'], dict):
+        config_data['model'] = ModelConfig(**config_data['model'])
+
+    if 'reflection_model' in config_data and isinstance(config_data['reflection_model'], dict):
+        config_data['reflection_model'] = ModelConfig(**config_data['reflection_model'])
+
+    return OptimizationConfig(**config_data)
+
+
+def create_config_from_args(args) -> OptimizationConfig:
+    """Create configuration from command line arguments.
+
+    Raises:
+        ValueError: If neither --model nor --config was supplied.
+    """
+    if not args.model:
+        raise ValueError("Either --model or --config must be specified")
+
+    # Parse model specification
+    model_config = ModelConfig.from_string(args.model)
+
+    reflection_model_config = None
+    if args.reflection_model:
+        reflection_model_config = ModelConfig.from_string(args.reflection_model)
+
+    # NOTE(review): only these four options are forwarded; the GEPA-specific
+    # flags defined in main() (--candidate-selection-strategy, --perfect-score,
+    # --module-selector, ...) are silently ignored here -- confirm intent.
+    return OptimizationConfig(
+        model=model_config,
+        reflection_model=reflection_model_config,
+        max_iterations=args.max_iterations,
+        max_metric_calls=args.max_metric_calls,
+        batch_size=args.batch_size
+    )
+
+
+def validate_api_keys(config: OptimizationConfig):
+    """Validate that required API keys are available.
+
+    Side effect: prints the missing *_API_KEY names and exits the process
+    with status 1 when any provider key is absent.
+    """
+    api_manager = APIKeyManager()
+
+    # Reflection model may use a different provider than the main model.
+    providers = [config.model.provider]
+    if config.reflection_model:
+        providers.append(config.reflection_model.provider)
+
+    missing_keys = api_manager.get_missing_keys(providers)
+
+    if missing_keys:
+        print("โ Missing API keys for the following providers:")
+        for provider in missing_keys:
+            print(f"   - {provider.upper()}_API_KEY")
+        print("\nPlease set the required environment variables or use a .env file")
+        sys.exit(1)
+
+def output_results(result, output_path: Optional[str]):
+    """Output optimization results.
+
+    Writes a JSON summary to output_path when given, otherwise prints a
+    human-readable report to stdout.
+
+    Args:
+        result: Optimization result exposing prompt, original_prompt,
+                improvement_data, optimization_time, status and session_id.
+        output_path: Destination file path, or None for stdout.
+    """
+    output_data = {
+        "optimized_prompt": result.prompt,
+        "original_prompt": result.original_prompt,
+        "improvement_metrics": result.improvement_data,
+        "optimization_time": result.optimization_time,
+        "status": result.status,
+        "session_id": result.session_id
+    }
+
+    if output_path:
+        with open(output_path, 'w') as f:
+            json.dump(output_data, f, indent=2)
+        print(f"๐ Results saved to: {output_path}")
+    else:
+        print("\n๐ Optimization Results:")
+        print(f"Session ID: {result.session_id}")
+        print(f"Status: {result.status}")
+        print(f"Time: {result.optimization_time:.2f}s")
+        print(f"\nOriginal Prompt:\n{result.original_prompt}")
+        print(f"\nOptimized Prompt:\n{result.prompt}")
+
+        # Improvement percentage is optional in improvement_data.
+        if 'improvement_percent' in result.improvement_data:
+            print(f"\nImprovement: {result.improvement_data['improvement_percent']:.2f}%")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/src/gepa_optimizer/core/__init__.py b/src/gepa_optimizer/core/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..41630b803c65248370baa2b0874fdf287ed0d052
--- /dev/null
+++ b/src/gepa_optimizer/core/__init__.py
@@ -0,0 +1,8 @@
+"""
+Core functionality for GEPA Universal Prompt Optimizer
+"""
+
+from .optimizer import GepaOptimizer
+from .result import ResultProcessor
+
+__all__ = ["GepaOptimizer", "ResultProcessor"]
diff --git a/src/gepa_optimizer/core/base_adapter.py b/src/gepa_optimizer/core/base_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f1ff8ea3eb47fbbd0ffc89b011fe233cafc68ce
--- /dev/null
+++ b/src/gepa_optimizer/core/base_adapter.py
@@ -0,0 +1,85 @@
+"""
+Base adapter class for all GEPA adapters.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Optional
+import logging
+from gepa.core.adapter import GEPAAdapter, EvaluationBatch
+
+from ..llms.base_llm import BaseLLMClient
+from ..evaluation.base_evaluator import BaseEvaluator
+
+logger = logging.getLogger(__name__)
+
+class BaseGepaAdapter(GEPAAdapter, ABC):
+    """
+    Abstract base class for GEPA adapters.
+
+    Provides the foundation for creating task-specific adapters while
+    maintaining compatibility with the GEPA framework. Subclasses must
+    implement evaluate() and make_reflective_dataset().
+    """
+
+    def __init__(self, llm_client: BaseLLMClient, evaluator: BaseEvaluator):
+        """
+        Initialize adapter with LLM client and evaluator.
+
+        Args:
+            llm_client: LLM client for generating responses
+            evaluator: Evaluator for scoring predictions
+
+        Raises:
+            TypeError: If either argument is not of the required base type.
+        """
+        if not isinstance(llm_client, BaseLLMClient):
+            raise TypeError("llm_client must be an instance of BaseLLMClient")
+        if not isinstance(evaluator, BaseEvaluator):
+            raise TypeError("evaluator must be an instance of BaseEvaluator")
+
+        self.llm_client = llm_client
+        self.evaluator = evaluator
+        # Per-subclass logger, e.g. "<module>.CustomGepaAdapter".
+        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
+
+        # Performance tracking -- updated by subclasses during evaluate().
+        self._evaluation_count = 0
+        self._best_score = 0.0
+        self._best_candidate = None
+
+    @abstractmethod
+    def evaluate(self, batch: List[Dict[str, Any]], candidate: Dict[str, str],
+                 capture_traces: bool = False) -> EvaluationBatch:
+        """
+        Evaluate candidate on a batch of data.
+
+        Args:
+            batch: List of data items to evaluate
+            candidate: Prompt candidate to evaluate
+            capture_traces: Whether to capture detailed traces
+
+        Returns:
+            EvaluationBatch with outputs, scores, and optional trajectories
+        """
+        pass
+
+    @abstractmethod
+    def make_reflective_dataset(self, candidate: Dict[str, str],
+                               eval_batch: EvaluationBatch,
+                               components_to_update: List[str]) -> Dict[str, List[Dict[str, Any]]]:
+        """
+        Create reflective dataset for GEPA's reflection process.
+
+        Args:
+            candidate: Current prompt candidate
+            eval_batch: Results from evaluation
+            components_to_update: List of components to update
+
+        Returns:
+            Dictionary mapping components to reflection data
+        """
+        pass
+
+    def get_performance_stats(self) -> Dict[str, Any]:
+        """Get performance statistics for monitoring"""
+        # NOTE(review): assumes llm_client implements get_model_info() --
+        # not visible here; confirm against BaseLLMClient.
+        return {
+            'evaluation_count': self._evaluation_count,
+            'best_score': self._best_score,
+            'model_info': self.llm_client.get_model_info(),
+            'evaluator_class': self.evaluator.__class__.__name__
+        }
diff --git a/src/gepa_optimizer/core/custom_adapter.py b/src/gepa_optimizer/core/custom_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f0d32ddc2a39b866d77fa5693be5e178cf20d09
--- /dev/null
+++ b/src/gepa_optimizer/core/custom_adapter.py
@@ -0,0 +1,389 @@
+"""
+Custom GEPA Adapter for the GEPA Universal Prompt Optimizer
+"""
+
+import json
+import logging
+import re
+from typing import Any, Dict, List, Optional
+
+# Import ModelConfig
+from ..models import ModelConfig
+
+from gepa.core.adapter import GEPAAdapter, EvaluationBatch
+from ..llms.vision_llm import VisionLLMClient
+from ..evaluation.ui_evaluator import UITreeEvaluator
+from .base_adapter import BaseGepaAdapter
+
+logger = logging.getLogger(__name__)
+
+class CustomGepaAdapter(BaseGepaAdapter):
+ """
+ Custom adapter for the GEPA Universal Prompt Optimizer.
+ """
+
+    def __init__(self, model_config: 'ModelConfig', metric_weights: Optional[Dict[str, float]] = None):
+        """Initialize the custom GEPA adapter with model configuration.
+
+        Args:
+            model_config: A ModelConfig instance; any other value is coerced
+                via str() into an OpenAI model name with no API key.
+            metric_weights: Optional per-metric weights for UITreeEvaluator.
+        """
+        # Convert string model to ModelConfig if needed.
+        # NOTE: the fallback hard-codes provider='openai' with api_key=None.
+        if not isinstance(model_config, ModelConfig):
+            model_config = ModelConfig(
+                provider='openai',
+                model_name=str(model_config),
+                api_key=None
+            )
+
+        # Initialize components
+        llm_client = VisionLLMClient(
+            provider=model_config.provider,
+            model_name=model_config.model_name,
+            api_key=model_config.api_key,
+            base_url=model_config.base_url,
+            temperature=model_config.temperature,
+            max_tokens=model_config.max_tokens,
+            top_p=model_config.top_p,
+            frequency_penalty=model_config.frequency_penalty,
+            presence_penalty=model_config.presence_penalty
+        )
+
+        evaluator = UITreeEvaluator(metric_weights=metric_weights)
+
+        # Initialize parent class
+        super().__init__(llm_client, evaluator)
+
+        # Track candidates for logging (re-initializes the parent's counter).
+        self._last_candidate = None
+        self._evaluation_count = 0
+
+        self.logger.info(f"๐ Initialized UI Tree adapter with {model_config.provider}/{model_config.model_name}")
+
+    def _parse_json_safely(self, json_str: str) -> Dict[str, Any]:
+        """Safely parse JSON string to dictionary with enhanced parsing and repair.
+
+        Tries, in order: direct json.loads, a ```json fenced block, the first
+        {...} span, then _repair_json. Returns {} (never raises) on failure
+        or when the input is empty / not a string.
+        """
+        if not json_str or not isinstance(json_str, str):
+            return {}
+
+        # Try direct parsing first
+        try:
+            return json.loads(json_str)
+        except json.JSONDecodeError:
+            pass
+
+        # Try to extract JSON from markdown code blocks
+        json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', json_str, re.DOTALL)
+        if json_match:
+            try:
+                return json.loads(json_match.group(1))
+            except json.JSONDecodeError:
+                pass
+
+        # Try to find JSON object in the string
+        json_match = re.search(r'\{.*\}', json_str, re.DOTALL)
+        if json_match:
+            try:
+                return json.loads(json_match.group(0))
+            except json.JSONDecodeError:
+                pass
+
+        # Try repair and parse
+        repaired_json = self._repair_json(json_str)
+        if repaired_json:
+            try:
+                return json.loads(repaired_json)
+            except json.JSONDecodeError:
+                pass
+
+        self.logger.warning(f"Failed to parse JSON: {json_str[:100]}...")
+        return {}
+
+    def _repair_json(self, json_str: str) -> str:
+        """Attempt to repair common JSON issues.
+
+        Strips markdown fences, trims to the outermost {...}, removes
+        trailing commas, and quotes bare keys. Returns "" if repair fails.
+
+        NOTE(review): the bare-key regex can also rewrite `word:` sequences
+        inside string values; acceptable here since the result is re-parsed,
+        but verify before reusing elsewhere.
+        """
+        try:
+            # Remove markdown formatting
+            json_str = re.sub(r'```(?:json)?\s*', '', json_str)
+            json_str = re.sub(r'```\s*$', '', json_str)
+
+            # Remove extra text before/after JSON
+            json_match = re.search(r'\{.*\}', json_str, re.DOTALL)
+            if json_match:
+                json_str = json_match.group(0)
+
+            # Fix common issues
+            json_str = re.sub(r',\s*}', '}', json_str)  # Remove trailing commas
+            json_str = re.sub(r',\s*]', ']', json_str)  # Remove trailing commas in arrays
+            json_str = re.sub(r'([{,]\s*)(\w+):', r'\1"\2":', json_str)  # Quote unquoted keys
+
+            return json_str
+        except Exception as e:
+            self.logger.warning(f"๐ง JSON repair failed: {e}")
+            return ""
+
+ def evaluate(
+ self,
+ batch: List[Dict[str, Any]],
+ candidate: Dict[str, str],
+ capture_traces: bool = False,
+ ) -> EvaluationBatch:
+ """Evaluate the candidate on a batch of data."""
+ outputs = []
+ scores = []
+ trajectories = [] if capture_traces else None
+
+ system_prompt = candidate.get('system_prompt', '')
+
+ # Check if this is a new candidate (different from last one)
+ if self._last_candidate != system_prompt:
+ self._evaluation_count += 1
+ self.log_proposed_candidate(candidate, self._evaluation_count)
+ self._last_candidate = system_prompt
+
+ self.logger.info(f"๐ Evaluating {len(batch)} samples with prompt: '{system_prompt[:50]}...'")
+
+ for i, item in enumerate(batch):
+ input_text = item.get('input', '')
+ image_base64 = item.get('image', '')
+ ground_truth_json = item.get('output', '')
+
+ # Call the LLM client
+ llm_response = self.llm_client.generate(system_prompt, input_text, image_base64=image_base64)
+
+ # Extract content from the response dictionary
+ if isinstance(llm_response, dict):
+ llm_output_json_str = llm_response.get("content", "")
+ if not llm_output_json_str:
+ llm_output_json_str = str(llm_response)
+ else:
+ llm_output_json_str = str(llm_response) if llm_response else ""
+
+ # ๐ DEBUG: Log essential info only (removed verbose JSON content)
+ self.logger.debug(f"๐ Sample {i+1} - LLM Response Type: {type(llm_response)}")
+ self.logger.debug(f"๐ Sample {i+1} - Response Length: {len(llm_output_json_str)} chars")
+
+ outputs.append(llm_output_json_str)
+
+ # Parse JSON strings to dictionaries for evaluation
+ llm_output_dict = self._parse_json_safely(llm_output_json_str)
+ ground_truth_dict = self._parse_json_safely(ground_truth_json)
+
+ # Initialize evaluation_results with default values
+ evaluation_results = {
+ "composite_score": 0.0,
+ "element_completeness": 0.0,
+ "element_type_accuracy": 0.0,
+ "text_content_accuracy": 0.0,
+ "hierarchy_accuracy": 0.0,
+ "style_accuracy": 0.0
+ }
+
+ # Calculate composite score and evaluation results
+ if not llm_output_dict and not ground_truth_dict:
+ composite_score = 0.1
+ evaluation_results = {k: 0.1 for k in evaluation_results.keys()}
+ self.logger.warning(f"โ ๏ธ Sample {i+1}: Empty results - using default score: {composite_score}")
+ elif not llm_output_dict or not ground_truth_dict:
+ composite_score = 0.05
+ evaluation_results = {k: 0.05 for k in evaluation_results.keys()}
+ self.logger.warning(f"โ ๏ธ Sample {i+1}: Incomplete results - using low score: {composite_score}")
+ else:
+ # Calculate score using evaluator with parsed dictionaries
+ evaluation_results = self.evaluator.evaluate(llm_output_dict, ground_truth_dict)
+ composite_score = evaluation_results["composite_score"]
+
+ # Clean, readable logging (removed verbose JSON dumps)
+ llm_children = len(llm_output_dict.get('children', []))
+ gt_children = len(ground_truth_dict.get('children', []))
+
+ if composite_score < 0.1:
+ self.logger.warning(f"โ ๏ธ Sample {i+1}: Low score {composite_score:.4f} - LLM: {llm_children} elements, GT: {gt_children} elements")
+ self.logger.debug(f" Score breakdown: {evaluation_results}")
+ else:
+ self.logger.info(f"โ
Sample {i+1}: Score {composite_score:.4f} - LLM: {llm_children} elements, GT: {gt_children} elements")
+
+ scores.append(composite_score)
+
+ if capture_traces:
+ trajectories.append({
+ 'input_text': input_text,
+ 'image_base64': image_base64,
+ 'ground_truth_json': ground_truth_json,
+ 'llm_output_json': llm_output_json_str,
+ 'evaluation_results': evaluation_results
+ })
+
+ avg_score = sum(scores) / len(scores) if scores else 0.0
+
+ # Update performance tracking (handled by parent class)
+ if avg_score > self._best_score:
+ self._best_score = avg_score
+ self._best_candidate = candidate.copy()
+ self.logger.info(f"๐ฏ New best candidate found with score: {avg_score:.4f}")
+
+ self.logger.info(f"๐ Batch evaluation complete - Average score: {avg_score:.4f}")
+
+ return EvaluationBatch(outputs=outputs, scores=scores, trajectories=trajectories)
+
+    def make_reflective_dataset(
+        self,
+        candidate: Dict[str, str],
+        eval_batch: EvaluationBatch,
+        components_to_update: List[str],
+    ) -> Dict[str, List[Dict[str, Any]]]:
+        """Create a reflective dataset from the evaluation results.
+
+        Every component in components_to_update receives the same per-sample
+        records (prompt, input, output, ground truth, score, feedback).
+
+        NOTE: eval_batch.trajectories is iterated directly, so evaluate()
+        must have been called with capture_traces=True; otherwise this
+        raises TypeError on the None trajectories.
+        """
+        reflective_dataset = {}
+        system_prompt = candidate.get('system_prompt', '')
+
+        # ๐ฏ NEW: Log the proposed new prompt being evaluated
+        self.logger.info(f"๐ Creating reflection dataset for prompt: '{system_prompt[:100]}...'")
+
+        # Pretty print reflection dataset creation
+        self._log_reflection_dataset_creation(candidate, eval_batch, components_to_update)
+
+        for component in components_to_update:
+            reflective_dataset[component] = []
+            for i, trace in enumerate(eval_batch.trajectories):
+                feedback = self._generate_feedback(trace['evaluation_results'])
+                reflective_dataset[component].append({
+                    "current_prompt": system_prompt,
+                    "input_text": trace['input_text'],
+                    "image_base64": trace['image_base64'],
+                    "generated_json": trace['llm_output_json'],
+                    "ground_truth_json": trace['ground_truth_json'],
+                    "score": trace['evaluation_results']["composite_score"],
+                    "feedback": feedback,
+                    "detailed_scores": trace['evaluation_results']
+                })
+
+        # ๐ฏ NEW: Log reflection dataset summary
+        total_samples = sum(len(data) for data in reflective_dataset.values())
+        avg_score = sum(trace['score'] for data in reflective_dataset.values() for trace in data) / total_samples if total_samples > 0 else 0.0
+        self.logger.info(f"๐ Reflection dataset created - {total_samples} samples, avg score: {avg_score:.4f}")
+
+        return reflective_dataset
+
+    def _generate_feedback(self, evaluation_results: Dict[str, float]) -> str:
+        """Generate textual feedback based on evaluation results.
+
+        Builds one overall-quality sentence keyed off composite_score
+        (>=0.8 good, >=0.5 moderate, else low) plus one sentence per metric
+        that falls below the 0.7 threshold. Missing metrics default to 0.0.
+        """
+        composite_score = evaluation_results.get("composite_score", 0.0)
+
+        feedback_parts = []
+
+        # Overall quality assessment
+        if composite_score >= 0.8:
+            feedback_parts.append("The overall quality is good.")
+        elif composite_score >= 0.5:
+            feedback_parts.append("The overall quality is moderate.")
+        else:
+            feedback_parts.append("The overall quality is low. Focus on fundamental accuracy.")
+
+        # Specific metric feedback
+        if evaluation_results.get("element_completeness", 0.0) < 0.7:
+            feedback_parts.append("Element completeness is low. Ensure all UI elements are captured.")
+
+        if evaluation_results.get("element_type_accuracy", 0.0) < 0.7:
+            feedback_parts.append("Element type accuracy is low. Verify correct UI element identification (Button, Text, Image, etc.).")
+
+        if evaluation_results.get("text_content_accuracy", 0.0) < 0.7:
+            feedback_parts.append("Text content accuracy is low. Improve text extraction fidelity.")
+
+        if evaluation_results.get("hierarchy_accuracy", 0.0) < 0.7:
+            feedback_parts.append("Hierarchy accuracy is low. Ensure correct parent-child relationships.")
+
+        if evaluation_results.get("style_accuracy", 0.0) < 0.7:
+            feedback_parts.append("Style accuracy is low. Capture more styling properties (colors, sizes, positioning).")
+
+        return " ".join(feedback_parts)
+
+    def get_best_candidate(self) -> Optional[Dict[str, str]]:
+        """Return the best candidate (component name -> prompt text) found so far.
+
+        May be None when no candidate has been recorded yet (per the Optional
+        return annotation).
+        """
+        return self._best_candidate
+
+    def get_best_score(self) -> float:
+        """Return the highest score observed so far (tracked in self._best_score)."""
+        return self._best_score
+
+ def log_proposed_candidate(self, candidate: Dict[str, str], iteration: int = 0):
+ """
+ Log the new proposed candidate prompt.
+
+ Args:
+ candidate: The new candidate prompt from GEPA
+ iteration: Current optimization iteration
+ """
+ system_prompt = candidate.get('system_prompt', '')
+
+ logger.info("="*80)
+ logger.info(f"NEW PROPOSED CANDIDATE (Iteration {iteration})")
+ logger.info("="*80)
+ logger.info(f"PROPOSED PROMPT:")
+ logger.info("-" * 40)
+ logger.debug(f'"{system_prompt}"')
+ logger.info("-" * 40)
+ logger.info(f"Prompt Length: {len(system_prompt)} characters")
+ logger.info(f"Word Count: {len(system_prompt.split())} words")
+ logger.info("="*80)
+
+ def _log_reflection_dataset_creation(self, candidate: Dict[str, str], eval_batch: EvaluationBatch,
+ components_to_update: List[str]):
+ """
+ Log the reflection dataset creation process.
+
+ Args:
+ candidate: Current candidate being evaluated
+ eval_batch: Evaluation results
+ components_to_update: Components being updated
+ """
+ system_prompt = candidate.get('system_prompt', '')
+
+ logger.info("="*80)
+ logger.info("REFLECTION DATASET CREATION")
+ logger.info("="*80)
+
+ logger.info(f"CURRENT PROMPT BEING ANALYZED:")
+ logger.info("-" * 40)
+ logger.debug(f'"{system_prompt}"')
+ logger.info("-" * 40)
+
+ logger.info(f"EVALUATION SUMMARY:")
+ logger.info("-" * 40)
+ if eval_batch.scores:
+ avg_score = sum(eval_batch.scores) / len(eval_batch.scores)
+ min_score = min(eval_batch.scores)
+ max_score = max(eval_batch.scores)
+ logger.info(f" Average Score: {avg_score:.4f}")
+ logger.info(f" Min Score: {min_score:.4f}")
+ logger.info(f" Max Score: {max_score:.4f}")
+ logger.info(f" Total Samples: {len(eval_batch.scores)}")
+
+ logger.info(f"COMPONENTS TO UPDATE:")
+ logger.info("-" * 40)
+ for i, component in enumerate(components_to_update, 1):
+ logger.info(f" {i}. {component}")
+
+ if eval_batch.trajectories:
+ logger.debug(f"DETAILED ANALYSIS:")
+ logger.debug("-" * 40)
+ for i, trace in enumerate(eval_batch.trajectories[:3], 1): # Show first 3 samples
+ evaluation_results = trace['evaluation_results']
+ composite_score = evaluation_results.get("composite_score", 0.0)
+
+ logger.debug(f" Sample {i} (Score: {composite_score:.4f}):")
+
+ # Show input data (truncated)
+ input_text = trace['input_text'][:100] + "..." if len(trace['input_text']) > 100 else trace['input_text']
+ logger.debug(f" Input: \"{input_text}\"")
+
+ # Show predicted output (truncated)
+ predicted_output = trace['llm_output_json'][:100] + "..." if len(trace['llm_output_json']) > 100 else trace['llm_output_json']
+ logger.debug(f" Output: \"{predicted_output}\"")
+
+ # Show detailed scores
+ logger.debug(f" Detailed Scores:")
+ for metric, score in evaluation_results.items():
+ if metric != "composite_score":
+ logger.debug(f" {metric.replace('_', ' ').title()}: {score:.4f}")
+
+ # Show generated feedback
+ feedback = self._generate_feedback(evaluation_results)
+ logger.debug(f" Feedback: \"{feedback}\"")
+
+ if len(eval_batch.trajectories) > 3:
+ logger.debug(f" ... and {len(eval_batch.trajectories) - 3} more samples")
+
+ logger.info("="*80)
diff --git a/src/gepa_optimizer/core/optimizer.py b/src/gepa_optimizer/core/optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..30271dce298b80e1dc6b4478bb69aa8e2766b65c
--- /dev/null
+++ b/src/gepa_optimizer/core/optimizer.py
@@ -0,0 +1,1279 @@
+"""
+Main GepaOptimizer class - the heart of the optimization system
+"""
+
+import time
+import logging
+from typing import Any, Dict, List, Optional, Union
+import asyncio
+import io
+import sys
+from contextlib import redirect_stdout, redirect_stderr
+
+import gepa
+from ..utils.api_keys import APIKeyManager
+from .result import ResultProcessor
+from ..data.converters import UniversalConverter
+from ..models.result import OptimizationResult, OptimizedResult
+from ..models.config import OptimizationConfig, ModelConfig
+from ..utils.helpers import sanitize_prompt
+from ..utils.exceptions import GepaDependencyError, InvalidInputError, DatasetError, GepaOptimizerError
+
+logger = logging.getLogger(__name__)
+
+class GepaOptimizer:
+ """
+ Main class for prompt optimization using GEPA
+
+ This is the primary interface that users interact with.
+ Provides both simple and advanced optimization capabilities.
+ """
+
+ def __init__(self, config: Optional[OptimizationConfig] = None,
+ adapter_type: str = "universal",
+ custom_adapter: Optional[Any] = None,
+ llm_model_name: Optional[str] = None,
+ metric_weights: Optional[Dict[str, float]] = None,
+ **kwargs):
+ """
+ Initialize the optimizer
+
+ Args:
+ config: Optimization configuration (required)
+ adapter_type: Type of adapter to use ("universal" only - fully configurable)
+ custom_adapter: Custom adapter instance (overrides adapter_type)
+ llm_model_name: [Deprecated] Use config.model instead. Will be removed in future versions.
+ metric_weights: [Deprecated] Not used - evaluator handles metrics. Will be removed in future versions.
+ **kwargs: Additional parameters for universal adapter (llm_client, evaluator, etc.)
+
+ Raises:
+ ValueError: If required configuration is missing
+ GepaDependencyError: If GEPA library is not available
+ """
+ if config is None:
+ raise ValueError("config parameter is required. Use OptimizationConfig to configure the optimizer.")
+
+ # Initialize logger first
+ self.logger = logging.getLogger(__name__)
+
+ self.config = config
+ self.converter = UniversalConverter(data_split_config=config.data_split)
+ self.api_manager = APIKeyManager()
+ self.result_processor = ResultProcessor()
+
+ # Initialize adapter based on configuration
+ if custom_adapter:
+ # User provided custom adapter
+ from .base_adapter import BaseGepaAdapter
+ if not isinstance(custom_adapter, BaseGepaAdapter):
+ raise TypeError("custom_adapter must be an instance of BaseGepaAdapter")
+ self.adapter = custom_adapter
+ self.logger.info("Using user-provided custom adapter")
+ elif adapter_type == "universal":
+ # Universal adapter requires user to provide components
+ llm_client = kwargs.get('llm_client')
+ evaluator = kwargs.get('evaluator')
+
+ if not llm_client or not evaluator:
+ raise ValueError(
+ "llm_client and evaluator are required for universal adapter. "
+ "Example: GepaOptimizer(config=config, adapter_type='universal', "
+ "llm_client=llm_client, evaluator=evaluator)"
+ )
+
+ from .universal_adapter import UniversalGepaAdapter
+ self.adapter = UniversalGepaAdapter(
+ llm_client=llm_client,
+ evaluator=evaluator,
+ data_converter=kwargs.get('data_converter')
+ )
+ self.logger.info("Using universal adapter")
+ else:
+ raise ValueError(
+ f"Unknown adapter_type: {adapter_type}. "
+ f"Only 'universal' is supported. "
+ f"Provide llm_client and evaluator when using universal adapter."
+ )
+
+ # Keep backward compatibility
+ self.custom_adapter = self.adapter
+
+ # Log model configuration
+ model_info = self.adapter.get_performance_stats()
+ self.logger.info(f"Initialized adapter: {model_info}")
+
+ # Set up logging
+ logging.basicConfig(
+ level=logging.INFO,
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+
+ # Validate GEPA availability
+ if gepa is None:
+ raise GepaDependencyError("GEPA library is not available. Please install it with: pip install gepa")
+
+ async def train(self,
+ seed_prompt: str,
+ dataset: Union[List[Any], str, Dict, Any],
+ **kwargs) -> OptimizedResult:
+ """
+ Main training method for prompt optimization
+
+ Args:
+ seed_prompt: Initial prompt to optimize
+ dataset: Training data in any format
+ **kwargs: Additional parameters that can override config
+
+ Returns:
+ OptimizedResult: Optimization result with improved prompt
+
+ Raises:
+ InvalidInputError: For invalid input parameters
+ DatasetError: For issues with dataset processing
+ GepaOptimizerError: For optimization failures
+ """
+ start_time = time.time()
+ session_id = f"opt_{int(start_time)}_{id(self)}"
+
+ try:
+ self.logger.info(f"Starting optimization session: {session_id}")
+ self.logger.info(f"Using model: {self.config.model.model_name} (provider: {self.config.model.provider})")
+
+ # #region agent log
+ import json as _json_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "E", "location": "optimizer.py:train_start", "message": "Optimization train() started", "data": {"session_id": session_id, "max_iterations": self.config.max_iterations}, "timestamp": int(time.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+
+ # ๐ฅ FIX E: Reset Pareto logger at start of each optimization run
+ from ..utils.pareto_logger import reset_pareto_logger
+ reset_pareto_logger()
+ self.logger.info("โ
Reset Pareto logger for new optimization run")
+
+ # Update config with any overrides from kwargs
+ self._update_config_from_kwargs(kwargs)
+
+ # Step 1: Validate inputs
+ self._validate_inputs(seed_prompt)
+
+ # Step 2: Convert dataset to GEPA format with 3-way split
+ # ๐ฅ FIX: Support pre-split datasets (user-provided train/val/test)
+ if isinstance(dataset, dict) and all(k in dataset for k in ['train', 'val', 'test']):
+ # User provided pre-split dataset - use it directly
+ self.logger.info("โ
Detected pre-split dataset - using user's split (no re-splitting)")
+ trainset_raw = dataset.get('train', [])
+ valset_raw = dataset.get('val', [])
+ testset_raw = dataset.get('test', [])
+
+ # Still need to standardize the format (convert to GEPA format)
+ trainset = self.converter._standardize(trainset_raw)
+ valset = self.converter._standardize(valset_raw)
+ testset = self.converter._standardize(testset_raw) if testset_raw else []
+
+ self.logger.info(
+ f"Using pre-split dataset: {len(trainset)} train (Dfeedback), "
+ f"{len(valset)} val (Dpareto), {len(testset)} test (held-out)"
+ )
+ else:
+ # Standard path: convert and split automatically
+ self.logger.info("Converting dataset to GEPA format with 3-way split...")
+ trainset, valset, testset = self.converter.convert(
+ dataset,
+ split_config=self.config.data_split
+ )
+
+ # Log split with adaptive strategy info
+ split_strategy = self.config.data_split.small_dataset_strategy
+ strategy_note = ""
+ if split_strategy == 'adaptive':
+ total_size = len(trainset) + len(valset) + len(testset)
+ train_ratio, val_ratio, test_ratio = self.config.data_split.get_adaptive_ratios(total_size)
+ strategy_note = f" (adaptive: {train_ratio*100:.0f}%/{val_ratio*100:.0f}%/{test_ratio*100:.0f}% ratios)"
+ self.logger.info(
+ f"Dataset split{strategy_note}: {len(trainset)} train (Dfeedback), "
+ f"{len(valset)} val (Dpareto), {len(testset)} test (held-out)"
+ )
+
+ if not trainset:
+ raise DatasetError("Dataset appears to be empty after conversion")
+
+ # Step 3: Create seed candidate
+ seed_candidate = self._create_seed_candidate(seed_prompt)
+
+ # ๐ฅ CRITICAL: Set valset info in adapter BEFORE baseline evaluation
+ # This ensures adapter correctly detects 'dpareto' dataset type
+ # Use direct assignment (don't rely on hasattr) to ensure attributes are set
+ try:
+ self.adapter._valset_size = len(valset) if valset else 0
+ self.logger.info(f"โ
Set valset_size in adapter: {len(valset) if valset else 0} for Dpareto detection")
+ except AttributeError:
+ self.logger.warning("โ ๏ธ Could not set _valset_size in adapter - attribute not supported")
+
+ try:
+ self.adapter._valset = valset
+ self.logger.info(f"โ
Stored valset in adapter ({len(valset) if valset else 0} samples)")
+ except AttributeError:
+ self.logger.warning("โ ๏ธ Could not set _valset in adapter - attribute not supported")
+
+ # Step 3.5: Calculate baseline score on VALIDATION set (not test set)
+ # This ensures fair comparison since optimization uses validation set for Pareto selection
+ baseline_val_score = None
+ if valset:
+ self.logger.info("๐ Evaluating seed prompt on validation set for baseline...")
+ # Set baseline flag so adapter knows this is baseline, not optimization
+ # Use direct assignment to ensure the flag is set
+ try:
+ self.adapter._is_baseline_evaluation = True
+ self.logger.info("โ
Set baseline evaluation flag in adapter")
+ except AttributeError:
+ self.logger.warning("โ ๏ธ Could not set _is_baseline_evaluation in adapter")
+
+ try:
+ # Evaluate on validation set (same as what GEPA will use for Pareto selection)
+ eval_result = self.adapter.evaluate(
+ batch=valset,
+ candidate=seed_candidate,
+ capture_traces=False
+ )
+ baseline_val_score = sum(eval_result.scores) / len(eval_result.scores) if eval_result.scores else 0.0
+ self.logger.info(f"๐ Baseline validation score: {baseline_val_score:.4f} (on {len(valset)} samples)")
+
+ # Store baseline in adapter for later use
+ if hasattr(self.adapter, '_baseline_score'):
+ self.adapter._baseline_score = baseline_val_score
+
+ # ๐ฅ CRITICAL FIX: Also set baseline in Pareto logger
+ # This ensures candidates can be properly evaluated against baseline
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+ pareto_log.set_baseline(baseline_val_score)
+ self.logger.info(f"โ
Baseline set in Pareto logger: {baseline_val_score:.4f}")
+
+ except Exception as e:
+ self.logger.warning(f"Baseline evaluation failed: {e}")
+ import traceback
+ self.logger.debug(f"Baseline evaluation error: {traceback.format_exc()}")
+ finally:
+ try:
+ self.adapter._is_baseline_evaluation = False
+ self.logger.debug("โ
Reset baseline evaluation flag - optimization can begin")
+ except AttributeError:
+ pass # Ignore if attribute not supported
+
+ # Step 4: Run GEPA optimization
+ self.logger.info("Starting GEPA optimization...")
+ gepa_result, actual_iterations = await self._run_gepa_optimization(
+ adapter=self.adapter,
+ seed_candidate=seed_candidate,
+ trainset=trainset,
+ valset=valset,
+ **kwargs
+ )
+
+ # Step 5: Extract best candidate
+ best_candidate = self._extract_best_candidate(gepa_result)
+
+ # ๐ฅ CRITICAL: Extract optimized prompt from best_candidate
+ # This is the actual optimized prompt that GEPA found
+ self.logger.info(f"\n{'โ'*80}")
+ self.logger.info(f"๐ EXTRACTING OPTIMIZED PROMPT FROM GEPA RESULT")
+ self.logger.info(f"{'โ'*80}")
+ self.logger.info(f"best_candidate keys: {list(best_candidate.keys()) if isinstance(best_candidate, dict) else 'N/A'}")
+
+ optimized_prompt = best_candidate.get('system_prompt', seed_prompt)
+ if not optimized_prompt or optimized_prompt.strip() == '':
+ # Fallback: try other keys or use seed prompt
+ optimized_prompt = best_candidate.get('prompt', best_candidate.get('text', seed_prompt))
+
+ # Get fitness score if available
+ best_fitness = best_candidate.get('fitness') or self.adapter.get_best_score() if hasattr(self.adapter, 'get_best_score') else None
+ candidate_source = best_candidate.get('source', 'unknown')
+
+ self.logger.info(f"\nโ
EXTRACTED OPTIMIZED PROMPT:")
+ self.logger.info(f" Source: {candidate_source}")
+ if best_fitness is not None:
+ self.logger.info(f" Fitness: f={best_fitness:.4f}")
+ self.logger.info(f" Length: {len(optimized_prompt)} characters")
+ self.logger.info(f" Words: {len(optimized_prompt.split())} words")
+ self.logger.info(f"\n๐ FULL OPTIMIZED PROMPT TEXT:")
+ self.logger.info(f"{'โ'*80}")
+ self.logger.info(optimized_prompt)
+ self.logger.info(f"{'โ'*80}")
+
+ if optimized_prompt != seed_prompt:
+ self.logger.info(f"\nโ
SUCCESS: Prompt WAS OPTIMIZED!")
+ self.logger.info(f" Seed length: {len(seed_prompt)} chars")
+ self.logger.info(f" Optimized length: {len(optimized_prompt)} chars")
+ self.logger.info(f" Difference: {len(optimized_prompt) - len(seed_prompt):+d} chars")
+ if best_fitness is not None:
+ baseline_fitness = 0.5 # Default baseline, could be improved
+ improvement = best_fitness - baseline_fitness
+ improvement_pct = (improvement / baseline_fitness * 100) if baseline_fitness > 0 else 0
+ self.logger.info(f" Fitness: f={best_fitness:.4f} (improvement: {improvement:+.4f} ({improvement_pct:+.1f}%))")
+ else:
+ self.logger.warning(f"\nโ ๏ธ WARNING: Optimized prompt is IDENTICAL to seed prompt")
+ self.logger.warning(f" This means GEPA didn't modify the prompt during optimization")
+ if best_fitness is not None:
+ self.logger.warning(f" Best fitness found: f={best_fitness:.4f}")
+ self.logger.warning(f" ๐ก Check if LLEGO best candidate is being properly extracted")
+
+ self.logger.info(f"{'โ'*80}\n")
+
+ # Step 5.5: Calculate improvement metrics (validation vs validation)
+ optimized_test_score = None
+ improvement_data = {}
+
+ # ๐ฅ FIX: Calculate improvement based on VALIDATION scores (fair comparison)
+ # Compare optimized VALIDATION score vs validation baseline (both on Dpareto)
+ # This ensures fair comparison - both evaluated on the same validation set
+ optimized_val_score = best_fitness # Best candidate's fitness is from validation set (Dpareto)
+
+ if baseline_val_score is not None and optimized_val_score is not None:
+ absolute_improvement = optimized_val_score - baseline_val_score
+ relative_improvement = (
+ (absolute_improvement / baseline_val_score * 100)
+ if baseline_val_score > 0 else 0
+ )
+
+ improvement_data = {
+ 'baseline_val_score': baseline_val_score,
+ 'optimized_val_score': optimized_val_score,
+ 'absolute_improvement': absolute_improvement,
+ 'relative_improvement_percent': relative_improvement
+ }
+
+ self.logger.info(
+ f"๐ Validation improvement: {relative_improvement:+.2f}% "
+ f"(baseline val: {baseline_val_score:.4f} โ optimized val: {optimized_val_score:.4f})"
+ )
+
+ # Step 5.6: Evaluate optimized prompt on test set (if available) for final reporting
+ if testset and self.config.evaluate_on_test:
+ self.logger.info("๐ Evaluating optimized prompt on test set...")
+
+ # ๐ฅ CRITICAL FIX: Clear LLEGO candidate queue before test evaluation
+ # This prevents the LLEGO wrapper from intercepting test evaluation calls
+ # and returning wrong candidates instead of actually running the optimized prompt
+ from ..llms.llego_enhanced_llm import LLEGOEnhancedLLMClient
+ if hasattr(self.adapter, 'llm_client') and isinstance(self.adapter.llm_client, LLEGOEnhancedLLMClient):
+ if hasattr(self.adapter.llm_client, '_adapter_generated_candidates'):
+ self.adapter.llm_client._adapter_generated_candidates = []
+ self.logger.info("โ
Cleared LLEGO candidate queue for clean test evaluation")
+ if hasattr(self.adapter.llm_client, '_candidate_queue'):
+ self.adapter.llm_client._candidate_queue = []
+ self.logger.info("โ
Cleared LLEGO hybrid candidate queue for clean test evaluation")
+
+ # Evaluate on test set for final reporting (but improvement is based on validation)
+ try:
+ optimized_test_score = self._evaluate_candidate_on_testset(
+ best_candidate,
+ testset
+ )
+ self.logger.info(f"๐ Optimized test score: {optimized_test_score:.4f}")
+
+ # Add test score to improvement_data for reference (but improvement is based on validation)
+ improvement_data['optimized_test_score'] = optimized_test_score
+
+ if baseline_val_score is not None:
+ test_vs_baseline = (
+ ((optimized_test_score - baseline_val_score) / baseline_val_score * 100)
+ if baseline_val_score > 0 else 0
+ )
+ self.logger.info(
+ f"๐ Test set vs validation baseline: {test_vs_baseline:+.2f}% "
+ f"(baseline val: {baseline_val_score:.4f} โ optimized test: {optimized_test_score:.4f})"
+ )
+ except Exception as e:
+ self.logger.warning(f"Test evaluation failed: {e}")
+
+ # Step 6: Process results
+ optimization_time = time.time() - start_time
+
+ processed_result = self.result_processor.process_full_result(
+ result=gepa_result,
+ original_prompt=seed_prompt,
+ optimization_time=optimization_time,
+ actual_iterations=actual_iterations,
+ test_metrics=improvement_data # Add test metrics
+ )
+
+ # Merge improvement data
+ final_improvement_data = {**processed_result.get('improvement_data', {}), **improvement_data}
+
+ # Step 7: Create result objects
+ # ๐ฅ CRITICAL: Use extracted optimized_prompt instead of processed_result
+ result = OptimizedResult(
+ original_prompt=seed_prompt,
+ optimized_prompt=optimized_prompt, # Use extracted prompt, not processed_result!
+ improvement_data=final_improvement_data,
+ optimization_time=optimization_time,
+ dataset_size=len(trainset) + len(valset) + len(testset),
+ total_iterations=processed_result.get('total_iterations', 0),
+ status=processed_result.get('status', 'completed'),
+ error_message=processed_result.get('error_message'),
+ detailed_result=OptimizationResult(
+ session_id=session_id,
+ original_prompt=seed_prompt,
+ optimized_prompt=optimized_prompt, # Use extracted prompt!
+ improvement_data=final_improvement_data,
+ optimization_time=optimization_time,
+ dataset_size=len(trainset) + len(valset) + len(testset),
+ total_iterations=processed_result.get('total_iterations', 0),
+ status=processed_result.get('status', 'completed'),
+ error_message=processed_result.get('error_message')
+ )
+ )
+
+ self.logger.info(f"โ
Optimization completed in {optimization_time:.2f}s")
+ return result
+
+ except Exception as e:
+ optimization_time = time.time() - start_time
+ error_msg = f"Optimization failed: {str(e)}"
+ self.logger.error(error_msg)
+
+ # Return failed result
+ return OptimizedResult(
+ original_prompt=seed_prompt,
+ optimized_prompt=seed_prompt, # Return original on failure
+ improvement_data={'error': error_msg},
+ optimization_time=optimization_time,
+ dataset_size=0,
+ total_iterations=0,
+ status='failed',
+ error_message=error_msg
+ )
+
+ def _update_config_from_kwargs(self, kwargs: Dict[str, Any]) -> None:
+ """Update configuration with runtime overrides from kwargs."""
+ updated_params = []
+
+ for key, value in kwargs.items():
+ if hasattr(self.config, key):
+ setattr(self.config, key, value)
+ updated_params.append(f"{key}={value}")
+ else:
+ self.logger.warning(f"Unknown parameter '{key}' ignored")
+
+ if updated_params:
+ self.logger.info(f"Updated config parameters: {', '.join(updated_params)}")
+
+ def _validate_inputs(self, seed_prompt: str) -> None:
+ """
+ Validate input parameters for optimization
+
+ Args:
+ seed_prompt: The seed prompt to validate
+
+ Raises:
+ InvalidInputError: If validation fails
+ """
+ if not seed_prompt or not isinstance(seed_prompt, str):
+ raise InvalidInputError("Seed prompt must be a non-empty string")
+
+ if len(seed_prompt.strip()) < 10:
+ raise InvalidInputError("Seed prompt is too short (minimum 10 characters)")
+
+ # Validate model configuration
+ model_config = self.config.model
+ if not hasattr(model_config, 'model_name') or not model_config.model_name:
+ raise InvalidInputError("Model name is required")
+
+ reflection_config = self.config.reflection_model
+ if not hasattr(reflection_config, 'model_name') or not reflection_config.model_name:
+ raise InvalidInputError("Reflection model name is required")
+
+ def _clean_reflection_prompt(self, prompt: str, max_length: int = 50000) -> str:
+ """
+ Clean reflection prompt by removing base64 images and truncating if too long.
+
+ ๐ฅ CRITICAL: GEPA's reflective dataset includes base64 images which create
+ massive prompts (7MB+) that exceed token limits. This function:
+ 1. Strips all base64 image data
+ 2. Removes excessive detailed_scores entries
+ 3. Truncates to reasonable size
+ 4. Preserves essential feedback information
+
+ Args:
+ prompt: Original prompt from GEPA (may contain base64)
+ max_length: Maximum length after cleaning (default: 50K chars)
+
+ Returns:
+ Cleaned prompt without base64, within size limits
+ """
+ import re
+
+ # Step 1: Remove base64 image strings (typically very long alphanumeric strings)
+ # Base64 images are usually 50K+ characters of A-Za-z0-9+/= pattern
+ # Look for very long base64-like sequences
+ base64_pattern = r'[A-Za-z0-9+/=]{5000,}' # Sequences of 5000+ base64 chars
+ cleaned = re.sub(base64_pattern, '[IMAGE_DATA_REMOVED]', prompt)
+
+ # Step 2: Remove detailed_scores sections that might contain base64 references
+ # These are usually in markdown format: "### detailed_scores\n...base64..."
+ detailed_scores_pattern = r'### detailed_scores[^\n]*\n[^#]*(?:image_base64|base64)[^\n]*(?:\n[^#]*)*'
+ cleaned = re.sub(detailed_scores_pattern, '### detailed_scores: [REMOVED_FOR_BREVITY]', cleaned, flags=re.IGNORECASE | re.MULTILINE)
+
+ # Step 3: Remove any remaining image_base64 references
+ cleaned = re.sub(r'image_base64[^\n]*', 'image_base64: [REMOVED]', cleaned, flags=re.IGNORECASE)
+ cleaned = re.sub(r'"[A-Za-z0-9+/=]{10000,}"', '[LARGE_DATA_STRING_REMOVED]', cleaned) # Very long strings likely base64
+
+ # Step 4: Truncate if still too long (keep the beginning which usually has the most important info)
+ if len(cleaned) > max_length:
+ # Keep first part (usually contains prompt and key feedback)
+ # Add truncation notice
+ truncated_size = len(cleaned) - max_length
+ cleaned = cleaned[:max_length] + f"\n\n[TRUNCATED {truncated_size} characters of detailed evaluation data]"
+ self.logger.warning(f"โ ๏ธ Prompt truncated: {len(prompt)} โ {len(cleaned)} chars")
+
+ return cleaned
+
+ def _validate_models(self, task_lm, reflection_lm):
+ """
+ Validate if specified models are supported.
+
+ Note: No hardcoded restrictions - the API provider will validate model existence.
+ This method is kept for potential future validation logic but doesn't restrict users.
+ """
+ # No hardcoded model restrictions - users can specify any model
+ # The API provider will handle validation and return errors if model doesn't exist
+ self.logger.debug(f"Using task model: {task_lm}, reflection model: {reflection_lm}")
+
+ def _create_seed_candidate(self, seed_prompt: str) -> Dict[str, str]:
+ """Create a seed candidate from the input prompt."""
+ sanitized_prompt = sanitize_prompt(seed_prompt)
+ return {'system_prompt': sanitized_prompt}
+
+ async def _run_gepa_optimization(self, adapter, seed_candidate: Any, trainset: List[Any], valset: List[Any], **kwargs) -> tuple: # Return tuple
+ """
+ Run GEPA optimization with the given adapter and data
+
+ Args:
+ adapter: Custom adapter for GEPA
+ seed_candidate: Initial prompt candidate
+ trainset: Training dataset
+ valset: Validation dataset
+ **kwargs: Additional optimization parameters that can override config
+
+ Returns:
+ Dict with optimization results
+
+ Raises:
+ GepaOptimizerError: If optimization fails
+
+ Note:
+ The following parameters are required in the config:
+ - max_metric_calls: Maximum number of metric evaluations
+ - batch_size: Batch size for evaluation
+ - max_iterations: Maximum number of optimization iterations
+ """
+ try:
+ # Get optimization parameters from config (these are required fields)
+ max_metric_calls = self.config.max_metric_calls
+ batch_size = self.config.batch_size
+ max_iterations = self.config.max_iterations
+
+ # Create reflection model client
+ from ..llms.vision_llm import VisionLLMClient
+ base_reflection_lm_client = VisionLLMClient(
+ provider=self.config.reflection_model.provider,
+ model_name=self.config.reflection_model.model_name,
+ api_key=self.config.reflection_model.api_key,
+ base_url=self.config.reflection_model.base_url,
+ temperature=self.config.reflection_model.temperature,
+ max_tokens=self.config.reflection_model.max_tokens,
+ top_p=self.config.reflection_model.top_p,
+ frequency_penalty=self.config.reflection_model.frequency_penalty,
+ presence_penalty=self.config.reflection_model.presence_penalty
+ )
+ # reflection_lm_client will be set below (may be wrapped with LLEGO)
+ reflection_lm_client = base_reflection_lm_client
+
+ # ๐ LLEGO Integration: Create enhanced reflection callable
+ if self.config.use_llego_operators:
+ self.logger.info("๐งฌ LLEGO genetic operators ENABLED")
+ self.logger.info(f" ฮฑ={self.config.alpha}, ฯ={self.config.tau}, ฮฝ={self.config.nu}")
+ self.logger.info(f" Crossover offspring: {self.config.n_crossover}, Mutation offspring: {self.config.n_mutation}")
+
+ # Import LLEGO operators
+ from ..operators.llego_operators import LLEGOIntegrationLayer, PromptCandidate
+
+ # Initialize LLEGO integration layer
+ llego = LLEGOIntegrationLayer(
+ alpha=self.config.alpha,
+ tau=self.config.tau,
+ nu=self.config.nu,
+ population_size=self.config.population_size,
+ n_crossover=self.config.n_crossover,
+ n_mutation=self.config.n_mutation
+ )
+
+ # Initialize with seed prompt
+ llego.initialize_population(
+ seed_prompt=seed_candidate.get('system_prompt', ''),
+ initial_fitness=0.5
+ )
+
+ # ๐ฅ HYBRID MODE FIX: Wrap reflection_lm_client with LLEGO for hybrid mode
+ # This ensures reflection calls go through LLEGO wrapper for candidate generation
+ if self.config.enable_gepa_reflection_with_llego:
+ self.logger.info("๐ฅ HYBRID MODE: Wrapping reflection_lm_client with LLEGO")
+ from ..llms.llego_enhanced_llm import LLEGOEnhancedLLMClient
+
+ # Wrap reflection_lm_client with LLEGO so hybrid generation is triggered
+ reflection_lm_client = LLEGOEnhancedLLMClient(
+ base_llm=base_reflection_lm_client,
+ llego_layer=llego,
+ config=self.config, # Pass config for hybrid mode!
+ verbose=True
+ )
+ self.logger.info("โ
reflection_lm_client wrapped with LLEGO (hybrid mode enabled)")
+
+ # ๐ฅ CRITICAL: Store reflection_lm_client reference in adapter so it can set context
+ # This allows make_reflective_dataset to set reflection context on BOTH clients
+ if hasattr(adapter, 'reflection_lm_client'):
+ adapter.reflection_lm_client = reflection_lm_client
+ self.logger.info("โ
Stored reflection_lm_client reference in adapter")
+ else:
+ # Add reflection_lm_client attribute to adapter
+ adapter.reflection_lm_client = reflection_lm_client
+ self.logger.info("โ
Added reflection_lm_client attribute to adapter")
+
+ # ๐ฅ NEW: Also store config and reflection_lm_client for adapter-level generation
+ if hasattr(adapter, '_config'):
+ adapter._config = self.config
+ self.logger.info("โ
Stored config in adapter for hybrid mode")
+ else:
+ adapter._config = self.config
+ self.logger.info("โ
Added _config attribute to adapter")
+
+ if hasattr(adapter, '_reflection_lm_client'):
+ adapter._reflection_lm_client = reflection_lm_client
+ self.logger.info("โ
Stored _reflection_lm_client in adapter for hybrid mode")
+ else:
+ adapter._reflection_lm_client = reflection_lm_client
+ self.logger.info("โ
Added _reflection_lm_client attribute to adapter")
+
+ # ๐ฅ CRITICAL FIX: Ensure LLEGO layer is stored in adapter
+ # Without this, adapter.llego will be None and population updates are skipped!
+ if hasattr(adapter, 'llego'):
+ if adapter.llego is None:
+ adapter.llego = llego
+ self.logger.info("โ
CRITICAL: Set LLEGO layer in adapter (was None)")
+ else:
+ self.logger.debug("โ
LLEGO layer already set in adapter")
+ else:
+ # Add llego attribute if it doesn't exist
+ adapter.llego = llego
+ self.logger.info("โ
CRITICAL: Added LLEGO layer to adapter")
+
+ # ๐ฅ CRITICAL: Always set _reflection_lm_client in adapter (even without hybrid mode)
+ # This is required for propose_new_texts() to work
+ if not hasattr(adapter, '_reflection_lm_client') or adapter._reflection_lm_client is None:
+ adapter._reflection_lm_client = reflection_lm_client
+ self.logger.info("โ
Set _reflection_lm_client in adapter (required for propose_new_texts)")
+
+ # ๐ฅ HYBRID MODE FIX: Inject config into LLEGO wrapper for hybrid mode
+ # The adapter already has LLEGO wrapper, we just need to update its config
+ if self.config.enable_gepa_reflection_with_llego:
+ # HYBRID MODE: Update the LLEGO wrapper's config
+ self.logger.info("๐ฅ HYBRID MODE: Enabling hybrid candidate generation in LLEGO wrapper")
+
+ # Get the LLM client (may already be wrapped)
+ llm_client = self.adapter.llm_client
+ from ..llms.llego_enhanced_llm import LLEGOEnhancedLLMClient
+
+ if isinstance(llm_client, LLEGOEnhancedLLMClient):
+ # Already wrapped, just update config
+ llm_client.config = self.config
+ self.logger.info("โ
Updated LLEGO wrapper with hybrid mode config")
+ else:
+ # Not wrapped yet, wrap it now with config
+ llego_wrapped_llm = LLEGOEnhancedLLMClient(
+ base_llm=llm_client,
+ llego_layer=llego,
+ config=self.config, # โ Pass config for hybrid mode!
+ verbose=True
+ )
+ # Update adapter's LLM client
+ self.adapter.llm_client = llego_wrapped_llm
+ self.logger.info("โ
Wrapped LLM client with LLEGO (hybrid mode enabled)")
+
+ adapter = self.adapter
+ else:
+ # LLEGO-ONLY MODE: Wrap adapter with LLEGO layer (no hybrid)
+ self.logger.info("๐งฌ LLEGO-ONLY MODE: Recreating adapter with LLEGO integration...")
+ if hasattr(self, 'adapter') and self.adapter:
+ from .universal_adapter import UniversalGepaAdapter
+
+ # Get original LLM client and evaluator from current adapter
+ original_llm = self.adapter.llm_client
+ # If it's already wrapped, unwrap it
+ if hasattr(original_llm, 'base_llm'):
+ original_llm = original_llm.base_llm
+
+ evaluator = self.adapter.evaluator
+ data_converter = self.adapter.data_converter
+
+ # Recreate adapter with LLEGO (no hybrid mode config)
+ from ..llms.llego_enhanced_llm import LLEGOEnhancedLLMClient
+ llego_wrapped_llm = LLEGOEnhancedLLMClient(
+ base_llm=original_llm,
+ llego_layer=llego,
+ config=None, # No hybrid mode
+ verbose=True
+ )
+
+ adapter = UniversalGepaAdapter(
+ llm_client=llego_wrapped_llm,
+ evaluator=evaluator,
+ data_converter=data_converter,
+ llego_layer=llego
+ )
+ self.logger.info("โ
Adapter recreated with LLEGO-enhanced LLM client")
+ else:
+ adapter = self.adapter
+
+ # Create LLEGO-enhanced reflection callable
+ # When hybrid mode is enabled, reflection_lm_client is wrapped with LLEGO
+ # The wrapper will automatically generate hybrid candidates when called
def reflection_lm_callable(prompt: str) -> str:
    """
    Reflection callable that delegates to the (possibly LLEGO-wrapped) client.

    In hybrid mode the wrapper generates candidates from both GEPA and LLEGO.
    The incoming prompt is cleaned first: base64 images are stripped and
    excessive data is truncated before it reaches the reflection model.

    Args:
        prompt: Raw reflection prompt produced by GEPA.

    Returns:
        The generated candidate text, or the original prompt on error.
    """
    # Remove base64 images and truncate excessive data before sending.
    cleaned_prompt = self._clean_reflection_prompt(prompt)

    self.logger.info(f"\n{'🔥' * 40}")
    self.logger.info("🔥 reflection_lm_callable CALLED (delegating to LLEGO wrapper)")
    self.logger.info(f"🔥 Original prompt length: {len(prompt)} chars")
    self.logger.info(f"🔥 Cleaned prompt length: {len(cleaned_prompt)} chars")
    self.logger.info(f"🔥 Truncation: {len(prompt) - len(cleaned_prompt)} chars removed")
    self.logger.info(f"🔥 First 200 chars (cleaned): {cleaned_prompt[:200]}...")
    self.logger.info(f"{'🔥' * 40}\n")

    try:
        # Signal reflection mode to the LLEGO wrapper BEFORE generating so
        # that generate() triggers hybrid candidate generation. The wrapper
        # handles queuing and returns candidates one by one.
        if isinstance(reflection_lm_client, LLEGOEnhancedLLMClient):
            reflection_lm_client.set_reflection_context(
                current_prompt=cleaned_prompt,  # use the cleaned prompt
                feedback=None,
                in_reflection=True,  # enable reflection mode
            )
            self.logger.info("✅ Reflection context set on reflection_lm_client")

        # The system prompt must instruct the LLM to emit an improved
        # prompt, not feedback or analysis.
        optimization_system_prompt = """You are an expert prompt engineer specializing in iterative prompt optimization.

Your task: Given the CURRENT PROMPT and its EVALUATION FEEDBACK, generate an IMPROVED version of the prompt that addresses all identified issues.

Core Requirements:
1. OUTPUT ONLY the improved prompt text (no explanations, no analysis, no meta-commentary)
2. START directly with the prompt (e.g., "You are a mobile GUI agent..." or similar task-appropriate opening)
3. PRESERVE the core task domain and output format requirements
4. INTEGRATE improvements from feedback naturally into the prompt structure
5. MAINTAIN clarity, specificity, and actionability

Quality Standards:
- Be specific and concrete (avoid vague instructions)
- Use clear, imperative language for task instructions
- Include edge case handling if feedback identifies confusion
- Ensure the prompt is self-contained and unambiguous

DO NOT include:
- Analysis of what went wrong
- Explanations of your changes
- Meta-text like "Here's an improved version..." or "Based on feedback..."
- Recommendations or suggestions (those are already in the feedback)

Output the improved prompt directly and only the prompt."""

        result = reflection_lm_client.generate(
            system_prompt=optimization_system_prompt,
            user_prompt=cleaned_prompt,  # cleaned: no base64, truncated
            image_base64=""
        )

        # Extract the candidate text; dict-shaped results also carry a
        # "source" tag identifying which engine produced the candidate.
        if isinstance(result, dict):
            candidate = result.get("content", str(result))
            source = result.get("source", "unknown")
            self.logger.info(f"✅ Candidate from {source} (FULL TEXT):")
            self.logger.info(f"   '{candidate}'")
            return candidate
        else:
            candidate = str(result)
            self.logger.info("✅ Candidate generated (FULL TEXT):")
            self.logger.info(f"   '{candidate}'")
            return candidate

    except Exception as e:
        self.logger.error(f"❌ Error in reflection_lm_callable: {e}")
        import traceback
        self.logger.error(traceback.format_exc())
        # Fallback: return the prompt unchanged so GEPA can continue.
        return prompt
+
+ # Set up reflection context for LLEGO wrapper
+ if self.config.enable_gepa_reflection_with_llego and isinstance(reflection_lm_client, LLEGOEnhancedLLMClient):
+ # Store current prompt in reflection context for LLEGO operators
+ reflection_lm_client.set_reflection_context(
+ current_prompt=seed_candidate.get('system_prompt', ''),
+ feedback=None,
+ in_reflection=True
+ )
+
+ else:
+ # Standard GEPA reflection (no LLEGO)
+ adapter = self.adapter # Use the original adapter
+
+ # ๐ฅ CRITICAL: Always set _reflection_lm_client in adapter (even without LLEGO)
+ # This is required for propose_new_texts() to work
+ if not hasattr(adapter, '_reflection_lm_client') or adapter._reflection_lm_client is None:
+ adapter._reflection_lm_client = reflection_lm_client
+ self.logger.info("โ
Set _reflection_lm_client in adapter (required for propose_new_texts)")
+
+ # Define standard reflection callable (no LLEGO enhancement)
def reflection_lm_callable(prompt: str) -> str:
    """Standard callable wrapper for the reflection model that GEPA expects."""
    # System prompt steering the LLM to emit an improved prompt rather
    # than feedback or analysis of the old one.
    optimization_system_prompt = """You are an expert prompt engineer specializing in iterative prompt optimization.

Your task: Given the CURRENT PROMPT and its EVALUATION FEEDBACK, generate an IMPROVED version of the prompt that addresses all identified issues.

Core Requirements:
1. OUTPUT ONLY the improved prompt text (no explanations, no analysis, no meta-commentary)
2. START directly with the prompt (e.g., "You are a mobile GUI agent..." or similar task-appropriate opening)
3. PRESERVE the core task domain and output format requirements
4. INTEGRATE improvements from feedback naturally into the prompt structure
5. MAINTAIN clarity, specificity, and actionability

Quality Standards:
- Be specific and concrete (avoid vague instructions)
- Use clear, imperative language for task instructions
- Include edge case handling if feedback identifies confusion
- Ensure the prompt is self-contained and unambiguous

DO NOT include:
- Analysis of what went wrong
- Explanations of your changes
- Meta-text like "Here's an improved version..." or "Based on feedback..."
- Recommendations or suggestions (those are already in the feedback)

Output the improved prompt directly and only the prompt."""

    try:
        # Reflection is text-only, so no image payload is attached.
        response = reflection_lm_client.generate(
            system_prompt=optimization_system_prompt,
            user_prompt=prompt,
            image_base64=""
        )
        # Dict responses carry the text under "content"; anything else is
        # stringified as-is.
        if isinstance(response, dict):
            return response.get("content", str(response))
        return str(response)
    except Exception as e:
        self.logger.error(f"Reflection model error: {e}")
        return prompt  # fall back to the unmodified prompt on failure
+ self.logger.info(
+ f"Starting GEPA optimization with {max_iterations} iterations, "
+ f"batch size {batch_size}, max metric calls: {max_metric_calls}"
+ )
+ self.logger.info(
+ f"GEPA parameters: candidate_selection_strategy=pareto, "
+ f"reflection_minibatch_size={batch_size}, "
+ f"skip_perfect_score=False, "
+ f"module_selector=round_robin"
+ )
+
+ # Prepare optimization parameters with ONLY valid GEPA parameters
+ # Note: 'adapter' variable is set above (either LLEGO-enhanced or standard)
+ # ๐ฅ REMOVED: Excessive diagnostic warnings - moved to DEBUG level
+ reflection_lm_passed = reflection_lm_callable if self.config.use_llego_operators else None
+ if reflection_lm_passed:
+ self.logger.debug(f"reflection_lm_callable passed to GEPA (may be ignored in adapter mode)")
+
+ # #region agent log
+ import json as _json_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "A", "location": "optimizer.py:gepa_params", "message": "GEPA params construction", "data": {"max_iterations_from_config": max_iterations, "max_metric_calls": max_metric_calls, "batch_size": batch_size}, "timestamp": int(time.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+
+ gepa_params = {
+ 'adapter': adapter, # Use the adapter created above (with or without LLEGO)
+ 'seed_candidate': seed_candidate,
+ 'trainset': trainset,
+ 'valset': valset,
+ 'max_metric_calls': max_metric_calls,
+ # NOTE: GEPA does NOT have num_iterations - it uses max_metric_calls to control iterations
+
+ # ๐ฅ CRITICAL: When using an adapter, GEPA expects:
+ # - adapter.make_reflective_dataset() to create feedback data
+ # - GEPA's internal proposer to generate candidates from that data
+ # - task_lm and reflection_lm must be None (GEPA will use model from adapter)
+ 'task_lm': None, # Don't pass - adapter handles this
+ 'reflection_lm': reflection_lm_passed, # Pass LLEGO-enhanced reflection (may be ignored!)
+
+ # Valid GEPA parameters based on actual library
+ 'candidate_selection_strategy': 'pareto', # Use Pareto selection
+ 'skip_perfect_score': False, # Don't skip perfect scores
+ 'reflection_minibatch_size': batch_size, # Use batch size for reflection
+ 'perfect_score': 1.0, # Perfect score threshold
+ 'module_selector': 'round_robin', # Cycle through components
+ 'display_progress_bar': self.config.verbose, # Show progress if verbose
+ 'raise_on_exception': True, # Raise exceptions for debugging
+ }
+
+ # ๐ฅ CRITICAL FIX: Filter kwargs to only include valid GEPA parameters
+ # GEPA does NOT accept num_iterations, max_iterations, or other non-GEPA params
+ VALID_GEPA_PARAMS = {
+ 'seed_candidate', 'trainset', 'valset', 'adapter', 'task_lm', 'reflection_lm',
+ 'candidate_selection_strategy', 'skip_perfect_score', 'batch_sampler',
+ 'reflection_minibatch_size', 'perfect_score', 'reflection_prompt_template',
+ 'module_selector', 'use_merge', 'max_merge_invocations', 'merge_val_overlap_floor',
+ 'max_metric_calls', 'stop_callbacks', 'logger', 'run_dir', 'use_wandb',
+ 'wandb_api_key', 'wandb_init_kwargs', 'use_mlflow', 'mlflow_tracking_uri',
+ 'mlflow_experiment_name', 'track_best_outputs', 'display_progress_bar',
+ 'use_cloudpickle', 'seed', 'raise_on_exception', 'val_evaluation_policy'
+ }
+
+ # Only add valid kwargs that aren't already in gepa_params
+ for key, value in kwargs.items():
+ if key in VALID_GEPA_PARAMS and key not in gepa_params:
+ gepa_params[key] = value
+ elif key not in VALID_GEPA_PARAMS:
+ self.logger.debug(f"โ ๏ธ Filtering out invalid GEPA parameter: {key}")
+
+ # #region agent log
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "A", "location": "optimizer.py:gepa_params_final", "message": "Final GEPA params keys", "data": {"params_keys": list(gepa_params.keys()), "max_metric_calls": gepa_params.get('max_metric_calls', 'NOT_PASSED')}, "timestamp": int(time.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+
+ # ๐ฏ NEW: Capture GEPA's internal logging for pareto front information
+ gepa_output = io.StringIO()
+
+ # Log iteration start
+ from ..utils.clean_logger import get_clean_logger
+ clean_log = get_clean_logger()
+ clean_log.log_iteration_start(1, seed_prompt=seed_candidate.get('system_prompt', ''))
+
+ # ๐ฅ CRITICAL: Pass valset size to adapter for better dataset type detection
+ if hasattr(adapter, '_valset_size'):
+ adapter._valset_size = len(valset)
+ self.logger.debug(f"โ
Set valset_size in adapter: {len(valset)} for Dpareto detection")
+
+ # ๐ฅ CRITICAL FIX: Store valset in adapter so we can evaluate generated candidates on it
+ # This ensures generated candidates are evaluated on Dpareto for Pareto selection
+ if hasattr(adapter, '_valset'):
+ adapter._valset = valset
+ self.logger.debug(f"โ
Stored valset in adapter ({len(valset)} samples) for Dpareto evaluation of generated candidates")
+ else:
+ # Add _valset attribute if it doesn't exist
+ adapter._valset = valset
+ self.logger.debug(f"โ
Added _valset attribute to adapter ({len(valset)} samples)")
+
+ # Run GEPA optimization (synchronous call wrapped in async)
+ result = await asyncio.get_event_loop().run_in_executor(
+ None,
+ lambda: self._run_gepa_with_logging(gepa_params, gepa_output)
+ )
+
+ # ๐ฏ NEW: Process and log pareto front information, extract iteration count
+ gepa_logs = gepa_output.getvalue()
+ actual_iterations = self._log_pareto_front_info(gepa_logs) # Get iteration count
+
+ return result, actual_iterations # Return both result and iteration count
+ except Exception as e:
+ # Try to extract partial results before failing
+ self.logger.warning(f"GEPA optimization failed: {e}")
+
+ # Check if we have any cached results from the adapter
+ best_candidate = adapter.get_best_candidate()
+ best_score = adapter.get_best_score()
+
+ if best_candidate and best_score > 0:
+ self.logger.info(f"๐ฏ Using cached best result with score: {best_score:.4f}")
+
+ # Create a mock GEPA result with the best candidate found
+ return {
+ 'best_candidate': best_candidate,
+ 'best_score': best_score,
+ 'partial_result': True,
+ 'error': f'GEPA failed but returning best result found: {str(e)}'
+ }
+ else:
+ # If no cached results, re-raise the error
+ raise GepaOptimizerError(f"GEPA optimization failed: {str(e)}")
+
def _run_gepa_with_logging(self, gepa_params: Dict[str, Any], output_buffer: io.StringIO) -> Any:
    """Run GEPA optimization while capturing its stdout/stderr output.

    GEPA reports progress via print statements; both streams are routed
    into the caller-supplied buffer so they can be parsed afterwards.
    """
    with redirect_stdout(output_buffer):
        with redirect_stderr(output_buffer):
            return gepa.optimize(**gepa_params)
+
+ def _log_pareto_front_info(self, gepa_logs: str) -> int: # Return int instead of None
+ """Extract and log pareto front information from GEPA logs. Returns max iteration count."""
+ lines = gepa_logs.split('\n')
+ current_iteration = 0
+ max_iteration = 0 # Track max iteration
+
+ for line in lines:
+ # Look for iteration information
+ if 'iteration' in line.lower():
+ # Try to extract iteration number
+ import re
+ iteration_match = re.search(r'iteration\s+(\d+)', line.lower())
+ if iteration_match:
+ current_iteration = int(iteration_match.group(1))
+ max_iteration = max(max_iteration, current_iteration) # Track max
+ # Log iteration change
+ from ..utils.clean_logger import get_clean_logger
+ clean_log = get_clean_logger()
+ if current_iteration > clean_log.current_iteration:
+ clean_log.current_iteration = current_iteration
+
+ # Look for pareto front information
+ if 'pareto front' in line.lower() or 'new program' in line.lower():
+ self.logger.info(f"GEPA Pareto Update: {line.strip()}")
+ elif 'iteration' in line.lower() and ('score' in line.lower() or 'program' in line.lower()):
+ self.logger.debug(f"{line.strip()}")
+ elif 'best' in line.lower() and 'score' in line.lower():
+ self.logger.info(f"{line.strip()}")
+
+ # Look for evaluation information
+ if 'evaluating' in line.lower() and 'candidate' in line.lower():
+ self.logger.debug(f"{line.strip()}")
+
+ self.logger.info(f"GEPA Optimization Complete: {max_iteration} iterations")
+
+ # #region agent log
+ import json as _json_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "F", "location": "optimizer.py:gepa_complete", "message": "GEPA optimization complete - iteration count", "data": {"max_iteration_from_logs": max_iteration, "expected_iterations": self.config.max_iterations, "off_by_one": max_iteration != self.config.max_iterations, "gepa_logs_length": len(gepa_logs)}, "timestamp": int(time.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+
+ return max_iteration # Return the max iteration count
+
+ def _extract_best_candidate(self, gepa_result: Any) -> Dict[str, str]:
+ """
+ Extract the best candidate from GEPA Pareto front (single source of truth).
+
+ GEPA Pareto front is the single source of truth because:
+ - All candidates (GEPA reflection, LLEGO crossover, LLEGO mutation) are evaluated on Dpareto
+ - All non-dominated candidates are added to GEPA Pareto front
+ - Therefore, the best candidate MUST be in GEPA Pareto front
+
+ Args:
+ gepa_result: Raw result from gepa.optimize() (used only as fallback edge case)
+
+ Returns:
+ Best candidate dictionary with prompt components from GEPA Pareto front
+ """
+ try:
+ self.logger.info(f"\n{'โ'*80}")
+ self.logger.info(f"๐ EXTRACTING BEST CANDIDATE FROM GEPA PARETO FRONT")
+ self.logger.info(f"{'โ'*80}")
+
+ # ========================================================================
+ # PRIMARY: Get best candidate from GEPA Pareto front (single source of truth)
+ # ========================================================================
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+
+ if pareto_log.pareto_front:
+ try:
+ # Get best candidate from GEPA Pareto front (highest score = best)
+ gepa_pareto_best = max(pareto_log.pareto_front, key=lambda x: x['score'])
+ gepa_pareto_fitness = gepa_pareto_best['score']
+ gepa_pareto_prompt = gepa_pareto_best['prompt']
+ gepa_pareto_type = gepa_pareto_best.get('type', 'unknown')
+ gepa_pareto_notation = gepa_pareto_best.get('notation', 'S')
+
+ best_candidate = {
+ 'system_prompt': gepa_pareto_prompt,
+ 'fitness': gepa_pareto_fitness,
+ 'source': 'gepa_pareto_front',
+ 'candidate_type': gepa_pareto_type,
+ 'notation': gepa_pareto_notation
+ }
+
+ self.logger.info(f"โ
SELECTED: Best candidate from GEPA Pareto front")
+ self.logger.info(f" Notation: {gepa_pareto_notation}")
+ self.logger.info(f" Fitness: f({gepa_pareto_notation})={gepa_pareto_fitness:.4f}")
+ self.logger.info(f" Type: {gepa_pareto_type}")
+ self.logger.info(f" Prompt length: {len(gepa_pareto_prompt)} chars")
+ self.logger.info(f" ๐ก GEPA Pareto front is single source of truth (all candidates evaluated on Dpareto)")
+
+ return best_candidate
+
+ except Exception as e:
+ self.logger.error(f"โ Failed to extract from GEPA Pareto front: {e}")
+ import traceback
+ self.logger.error(traceback.format_exc())
+
+ # ========================================================================
+ # EDGE CASE FALLBACK: Pareto front empty (shouldn't happen, but handle gracefully)
+ # ========================================================================
+ self.logger.warning(f"โ ๏ธ GEPA Pareto front is empty - using gepa_result as fallback")
+ self.logger.warning(f" This should not happen if all candidates are evaluated on Dpareto")
+
+ # Try to extract from gepa_result (last resort)
+ if hasattr(gepa_result, 'best_candidate'):
+ gepa_candidate = gepa_result.best_candidate
+ gepa_prompt = gepa_candidate.get('system_prompt') if isinstance(gepa_candidate, dict) else str(gepa_candidate)
+ gepa_fitness = getattr(gepa_result, 'best_score', None)
+
+ if gepa_prompt:
+ self.logger.info(f"โ
Using gepa_result.best_candidate as fallback")
+ return {
+ 'system_prompt': gepa_prompt,
+ 'fitness': float(gepa_fitness) if gepa_fitness is not None else None,
+ 'source': 'gepa_result_fallback',
+ 'candidate_type': 'unknown',
+ 'notation': 'S'
+ }
+
+ # Last resort: return empty prompt
+ self.logger.error(f"โ No candidates found anywhere - returning empty prompt")
+ return {'system_prompt': ''}
+
+ except Exception as e:
+ self.logger.error(f"โ Error extracting best candidate: {e}")
+ import traceback
+ self.logger.error(traceback.format_exc())
+ return {'system_prompt': ''}
+
def _evaluate_candidate_on_testset(
    self,
    candidate: Dict[str, str],
    testset: List[Dict]
) -> float:
    """
    Evaluate a candidate prompt on the held-out test set.

    Args:
        candidate: Prompt candidate to evaluate.
        testset: Test dataset (never used during optimization).

    Returns:
        Average composite score on the test set.

    Raises:
        TestSetEvaluationError: If evaluation fails or yields no scores.
    """
    from ..utils.exceptions import TestSetEvaluationError

    try:
        # Evaluate via the adapter (same path GEPA uses internally);
        # detailed traces are not needed for a plain test-set score.
        eval_result = self.adapter.evaluate(
            batch=testset,
            candidate=candidate,
            capture_traces=False
        )

        if not eval_result.scores:
            raise TestSetEvaluationError("No scores returned from test evaluation")

        # Average composite score across all test samples.
        avg_score = sum(eval_result.scores) / len(eval_result.scores)

        self.logger.debug(
            f"Test set evaluation: {len(eval_result.scores)} samples, "
            f"scores: {eval_result.scores}, avg: {avg_score:.4f}"
        )

        return avg_score

    except TestSetEvaluationError:
        # Already descriptive -- don't double-wrap the message
        # (previously re-wrapped as "Failed to evaluate...: No scores...").
        raise
    except Exception as e:
        # Chain the original exception for easier debugging.
        raise TestSetEvaluationError(f"Failed to evaluate on test set: {str(e)}") from e
+
def optimize_sync(self,
                  model: str,
                  seed_prompt: str,
                  dataset: Any,
                  reflection_lm: str,
                  max_metric_calls: int = 150,
                  **kwargs) -> "OptimizedResult":
    """
    Synchronous version of the optimization method.

    Args:
        model: Target model to optimize for
        seed_prompt: Initial prompt to optimize
        dataset: Training data in any format
        reflection_lm: Model for reflection
        max_metric_calls: Budget for optimization attempts
        **kwargs: Additional optimization parameters

    Returns:
        OptimizedResult: Optimization result

    Raises:
        RuntimeError: If called from a thread that already runs an event loop.
    """
    # asyncio.run creates, runs, and cleanly tears down a fresh event loop.
    # (The previous new_event_loop/set_event_loop/close sequence left a
    # CLOSED loop installed as the thread's current loop after returning,
    # breaking any later asyncio use on the same thread.)
    return asyncio.run(
        self.train(model, seed_prompt, dataset, reflection_lm, max_metric_calls, **kwargs)
    )
+
+
+# Convenience function for quick optimization
def optimize_prompt(
    model: Union[str, ModelConfig],
    seed_prompt: str,
    dataset: Any,
    reflection_model: Optional[Union[str, ModelConfig]] = None,
    **kwargs
) -> OptimizedResult:
    """
    Convenience function for quick prompt optimization without creating an
    optimizer instance.

    Args:
        model: Target model configuration
        seed_prompt: Initial prompt to optimize
        dataset: Training data
        reflection_model: Model for reflection (optional; defaults to *model*)
        **kwargs: Additional optimization parameters

    Returns:
        OptimizedResult: Optimization result
    """
    # Fall back to the target model for reflection when none was given.
    config = OptimizationConfig(
        model=model,
        reflection_model=model if reflection_model is None else reflection_model,
        max_iterations=kwargs.get('max_iterations', 10),
        max_metric_calls=kwargs.get('max_metric_calls', 50),
        batch_size=kwargs.get('batch_size', 4)
    )

    # NOTE(review): GepaOptimizer.optimize_sync calls train(model, seed_prompt,
    # dataset, reflection_lm, max_metric_calls, ...) while this call passes only
    # (seed_prompt, dataset, **kwargs) -- confirm train() accepts this shorter form.
    return asyncio.run(GepaOptimizer(config=config).train(seed_prompt, dataset, **kwargs))
+
+
+
+
+
+
diff --git a/src/gepa_optimizer/core/result.py b/src/gepa_optimizer/core/result.py
new file mode 100644
index 0000000000000000000000000000000000000000..d23bb98840b4e023873ef435df846afebe748187
--- /dev/null
+++ b/src/gepa_optimizer/core/result.py
@@ -0,0 +1,180 @@
+"""
+Result processing for GEPA Optimizer
+Handles extraction and processing of GEPA optimization results
+"""
+
+from typing import Any, Dict, Optional
+import logging
+
+logger = logging.getLogger(__name__)
+
class ResultProcessor:
    """
    Processes raw GEPA optimization results into clean, usable formats.
    """

    @staticmethod
    def extract_optimized_prompt(result: Any) -> str:
        """
        Extract the optimized prompt from a GEPA result object.

        Args:
            result: Raw GEPA optimization result

        Returns:
            str: The optimized prompt text
        """
        try:
            if not hasattr(result, 'best_candidate'):
                # Unknown result shape -- fall back to its string form.
                return str(result)

            candidate = result.best_candidate
            if not isinstance(candidate, dict):
                return str(candidate)

            # Probe the usual prompt keys in priority order.
            for prompt_key in ('system_prompt', 'prompt', 'text'):
                if prompt_key in candidate:
                    return str(candidate[prompt_key])

            # No standard key found -- stringify the whole candidate.
            return str(candidate)

        except Exception as e:
            logger.warning(f"Failed to extract optimized prompt: {e}")
            return "Optimization completed (prompt extraction failed)"

    @staticmethod
    def extract_metrics(result: Any) -> Dict[str, Any]:
        """
        Extract performance metrics from a GEPA result.

        Args:
            result: Raw GEPA optimization result

        Returns:
            Dict[str, Any]: Extracted metrics
        """
        metrics: Dict[str, Any] = {}

        try:
            # (attribute, converter) pairs for the common numeric metrics.
            numeric_attrs = (
                ('best_score', float),
                ('baseline_score', float),
                ('improvement', float),
                ('iterations', int),
            )
            for attr_name, caster in numeric_attrs:
                if hasattr(result, attr_name):
                    metrics[attr_name] = caster(getattr(result, attr_name))

            # Derive a percentage improvement when both scores are present
            # and the baseline is positive (avoids division by zero).
            if 'best_score' in metrics and 'baseline_score' in metrics:
                baseline = metrics['baseline_score']
                if baseline > 0:
                    pct = ((metrics['best_score'] - baseline) / baseline) * 100
                    metrics['improvement_percent'] = round(pct, 2)

            # Carry through any additional metadata untouched.
            if hasattr(result, 'metadata'):
                metrics['metadata'] = result.metadata

        except Exception as e:
            logger.warning(f"Failed to extract metrics: {e}")

        return metrics

    @staticmethod
    def extract_reflection_history(result: Any) -> list:
        """
        Extract the reflection/optimization history from a GEPA result.

        Args:
            result: Raw GEPA optimization result

        Returns:
            list: List of reflection iterations
        """
        history = []

        try:
            if hasattr(result, 'optimization_history'):
                for idx, step in enumerate(result.optimization_history):
                    history.append({
                        'iteration': idx,
                        'score': step.get('score', 0.0),
                        'candidate': step.get('candidate', {}),
                        'feedback': step.get('feedback', ''),
                        'improvement': step.get('improvement', 0.0),
                    })

        except Exception as e:
            logger.warning(f"Failed to extract reflection history: {e}")

        return history

    @staticmethod
    def process_full_result(
        result: Any,
        original_prompt: str,
        optimization_time: float,
        actual_iterations: Optional[int] = None,
        test_metrics: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """
        Process a complete GEPA result into a structured format.

        Args:
            result: Raw GEPA optimization result
            original_prompt: Original seed prompt
            optimization_time: Time taken for optimization
            actual_iterations: Actual iteration count from GEPA logs (optional)
            test_metrics: Metrics from test set evaluation (optional)

        Returns:
            Dict[str, Any]: Complete processed result
        """
        metrics = ResultProcessor.extract_metrics(result)

        # Resolve the iteration count from the most reliable source available:
        # explicit log-derived count first, then result attributes, then metrics.
        total_iterations = 0
        try:
            if actual_iterations is not None:
                total_iterations = actual_iterations
            elif hasattr(result, 'iterations'):
                total_iterations = int(result.iterations)
            elif hasattr(result, 'num_iterations'):
                total_iterations = int(result.num_iterations)
            elif hasattr(result, 'optimization_history'):
                total_iterations = len(result.optimization_history)
            elif 'iterations' in metrics:
                total_iterations = metrics['iterations']
        except Exception as e:
            logger.warning(f"Failed to extract iterations: {e}")

        # Test-set metrics (if any) become the improvement data.
        improvement_data: Dict[str, Any] = dict(test_metrics) if test_metrics else {}

        return {
            'original_prompt': original_prompt,
            'optimized_prompt': ResultProcessor.extract_optimized_prompt(result),
            'metrics': metrics,
            'improvement_data': improvement_data,
            'reflection_history': ResultProcessor.extract_reflection_history(result),
            'optimization_time': optimization_time,
            'total_iterations': total_iterations,
            'status': 'completed',
            'raw_result': result  # keep the raw result for advanced users
        }
diff --git a/src/gepa_optimizer/core/universal_adapter.py b/src/gepa_optimizer/core/universal_adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..539b7413219278c5e28085fb159456aa664c3600
--- /dev/null
+++ b/src/gepa_optimizer/core/universal_adapter.py
@@ -0,0 +1,2386 @@
+"""
+Universal GEPA adapter for user-defined metrics and LLM clients.
+"""
+
+from .base_adapter import BaseGepaAdapter
+from ..data.converters import UniversalConverter
+from typing import Any, Dict, List, Optional
+import logging
+import re
+from gepa.core.adapter import EvaluationBatch
+
+logger = logging.getLogger(__name__)
+
+class UniversalGepaAdapter(BaseGepaAdapter):
+ """
+ Universal GEPA adapter that works with any LLM client and evaluator.
+
+ This adapter uses the existing UniversalConverter for data processing
+ and delegates LLM generation and evaluation to user-provided components.
+
+ Features:
+ - Optimized multi-variation JSON generation (66% cost reduction)
+ - Robust parsing with multiple fallback strategies
+ - Automatic fallback to sequential generation if JSON parsing fails
+ """
+
    # Fallback system prompt for sequential generation, used when the optimized
    # multi-variation JSON generation path fails to parse. It instructs the
    # reflection LLM to emit ONLY the improved prompt text (no analysis or
    # meta-commentary), so the raw completion can be used verbatim as the next
    # candidate system prompt.
    _FALLBACK_SYSTEM_PROMPT = """You are an expert prompt engineer specializing in iterative prompt optimization.

Your task: Given the CURRENT PROMPT and its EVALUATION FEEDBACK, generate an IMPROVED version of the prompt that addresses all identified issues.

Core Requirements:
1. OUTPUT ONLY the improved prompt text (no explanations, no analysis, no meta-commentary)
2. START directly with the prompt (e.g., "You are a mobile GUI agent..." or similar task-appropriate opening)
3. PRESERVE the core task domain and output format requirements
4. INTEGRATE improvements from feedback naturally into the prompt structure
5. MAINTAIN clarity, specificity, and actionability

Quality Standards:
- Be specific and concrete (avoid vague instructions)
- Use clear, imperative language for task instructions
- Include edge case handling if feedback identifies confusion
- Ensure the prompt is self-contained and unambiguous

DO NOT include:
- Analysis of what went wrong
- Explanations of your changes
- Meta-text like "Here's an improved version..." or "Based on feedback..."
- Recommendations or suggestions (those are already in the feedback)

Output the improved prompt directly and only the prompt."""
+
+ def __init__(self, llm_client, evaluator, data_converter=None, llego_layer=None):
+ """
+ Initialize universal adapter.
+
+ Args:
+ llm_client: User-provided LLM client (must inherit from BaseLLMClient)
+ evaluator: User-provided evaluator (must inherit from BaseEvaluator)
+ data_converter: Optional custom data converter (uses UniversalConverter by default)
+ llego_layer: Optional LLEGO integration layer for genetic operations
+ """
+ # Store LLEGO layer first
+ self.llego = llego_layer
+
+ # If LLEGO is provided, wrap the LLM client
+ # Note: If config is passed separately, it will be handled by optimizer
+ if llego_layer is not None:
+ from ..llms.llego_enhanced_llm import LLEGOEnhancedLLMClient
+ # Only wrap if not already wrapped (optimizer may have wrapped it with config)
+ if not isinstance(llm_client, LLEGOEnhancedLLMClient):
+ # Wrap before calling super().__init__
+ # Config will be set later by optimizer if hybrid mode is enabled
+ llm_client = LLEGOEnhancedLLMClient(llm_client, llego_layer, config=None, verbose=True)
+ else:
+ # Already wrapped, but update config if available
+ if hasattr(llm_client, 'config') and llm_client.config is None:
+ # Config will be set by optimizer later
+ pass
+
+ # Initialize parent (this sets up self.logger)
+ super().__init__(llm_client, evaluator)
+
+ # Use existing UniversalConverter for data processing
+ self.data_converter = data_converter or UniversalConverter()
+
+ # ๐ฅ NEW: Initialize optimization state tracking
+ self._is_baseline_evaluation = False # Flag to distinguish baseline vs optimization
+ self._last_candidate = None # Track last candidate to detect changes
+ self._gepa_iteration = 0 # Track actual GEPA iteration (not evaluation count)
+
+ # Track candidates for logging
+ self._evaluation_count = 0
+
+ # Track current evaluation context
+ self._current_evaluation_type = None # 'seed', 'gepa_reflection', 'llego_crossover', 'llego_mutation'
+ self._current_dataset_type = None # 'dfeedback' or 'dpareto'
+ self._baseline_score = None # Store baseline score for comparison
+
+ # Track candidate sources by prompt text (in case GEPA doesn't pass source field)
+ self._candidate_sources = {} # Maps prompt_text -> source_type
+
+ # Track validation set size for better dataset type detection
+ self._valset_size = None # Will be set by optimizer
+ self._valset = None # Will be set by optimizer - stores actual valset for Dpareto evaluation
+
+ # ๐ฅ CRITICAL: Track which candidates have been evaluated on Dpareto to avoid double evaluation
+ # Key: normalized prompt text, Value: (fitness_score, candidate_type, timestamp)
+ self._dpareto_evaluated_candidates = {} # Maps prompt -> (score, type)
+
+ # ๐ฅ HYBRID MODE: Storage for generated candidates
+ self._generated_candidates = [] # Store hybrid mode candidates
+ self._candidate_generation_active = False # Track if we're generating candidates
+ self._config = None # Will be set by optimizer if hybrid mode enabled
+ self._reflection_lm_client = None # Will be set by optimizer
+
+ # ๐ฅ FORMAT AWARENESS: Store detected output format for better prompts
+ self._detected_format = None # Will be populated from expected outputs
+ self._format_detection_done = False # Only detect once
+
+ # Log initialization
+ model_info = llm_client.get_model_info()
+ if llego_layer is not None:
+ self.logger.info(f"๐ Initialized Universal adapter with {model_info}")
+ self.logger.info(f"๐งฌ LLEGO integration ENABLED - LLM client is wrapped for genetic operations")
+ else:
+ self.logger.info(f"๐ Initialized Universal adapter with {model_info}")
+
+ def _clean_llm_output(self, output: str) -> str:
+ """
+ ๐ฅ CRITICAL: Clean LLM output before evaluation.
+
+ LLMs often wrap JSON/structured output in markdown code blocks.
+ This causes evaluation to fail because the evaluator sees:
+ "```json\n{\"key\": \"value\"}\n```"
+ Instead of:
+ "{\"key\": \"value\"}"
+
+ This method extracts the clean content for fair comparison.
+ """
+ if not output or not isinstance(output, str):
+ return output
+
+ cleaned = output.strip()
+
+ # Remove markdown code blocks (```json ... ``` or ``` ... ```)
+ code_block_match = re.search(r'```(?:json|JSON)?\s*([\s\S]*?)\s*```', cleaned)
+ if code_block_match:
+ extracted = code_block_match.group(1).strip()
+ # Only use extracted if it looks like valid content
+ if extracted and (extracted.startswith('{') or extracted.startswith('[') or len(extracted) > 10):
+ self.logger.debug(f"๐ฆ Cleaned markdown code block from LLM output")
+ return extracted
+
+ # Remove leading/trailing markdown artifacts
+ # Handle cases like "Here is the JSON:\n```json\n...\n```"
+ if '```' in cleaned:
+ # Try to extract content between first ``` and last ```
+ parts = cleaned.split('```')
+ if len(parts) >= 3:
+ # Content is in the middle part(s)
+ middle_content = parts[1]
+ # Remove language tag if present (e.g., "json\n")
+ middle_content = re.sub(r'^(?:json|JSON|python|text)\s*\n?', '', middle_content).strip()
+ if middle_content:
+ return middle_content
+
+ return cleaned
+
+ def _detect_and_cache_format(self, batch: List[Dict[str, Any]]) -> None:
+ """
+ Detect output format from expected outputs and cache for future use.
+
+ This enables format-aware prompting and feedback generation.
+ """
+ try:
+ from ..utils.format_detection import detect_output_format
+
+ # Extract expected outputs from batch
+ expected_outputs = []
+ for item in batch:
+ # Try to extract output directly, or standardize if needed
+ output = None
+ if isinstance(item, dict):
+ # Try common output field names first
+ output = item.get('output') or item.get('expected_output') or item.get('result') or item.get('answer')
+ if not output:
+ # Standardize using converter's private method (same as _evaluate_batch_mode)
+ try:
+ standardized = self.data_converter._standardize([item])[0]
+ output = standardized.get('output')
+ except Exception:
+ pass
+
+ if output and isinstance(output, str) and output.strip():
+ expected_outputs.append(output)
+
+ if expected_outputs:
+ self._detected_format = detect_output_format(expected_outputs)
+ self.logger.info(f"๐ FORMAT DETECTED: {self._detected_format['format_type']}")
+ self.logger.info(f" Spec: {self._detected_format['format_spec'][:100]}...")
+ self.logger.info(f" Avg length: {self._detected_format['avg_length']} chars")
+ # #region agent log
+ import json as _json_debug
+ import time as _time_debug
+ import os as _os_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "FORMAT_DETECT", "location": "universal_adapter.py:format_detected", "message": "Format detection successful", "data": {"format_type": self._detected_format['format_type'], "num_outputs": len(expected_outputs), "avg_length": self._detected_format['avg_length'], "has_constraint": bool(self._detected_format.get('format_constraint'))}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+ else:
+ self.logger.warning("โ ๏ธ No expected outputs found for format detection")
+ self._detected_format = None
+ # #region agent log
+ import json as _json_debug
+ import time as _time_debug
+ import os as _os_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "FORMAT_DETECT", "location": "universal_adapter.py:format_detected", "message": "Format detection failed - no outputs", "data": {"batch_size": len(batch)}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+
+ except Exception as e:
+ self.logger.warning(f"โ ๏ธ Format detection failed: {e}")
+ self._detected_format = None
+
+ def evaluate(self, batch: List[Dict[str, Any]], candidate: Dict[str, str],
+ capture_traces: bool = False) -> EvaluationBatch:
+ """
+ Evaluate candidates using user-provided LLM client and evaluator.
+
+ This method automatically detects BatchLLMClient and uses batch processing
+ for cost savings, or falls back to standard individual processing.
+
+ This method works with any data type supported by UniversalConverter.
+
+ ๐ฅ IMPORTANT: We only optimize system_prompt, NOT user_prompt.
+ The user_prompt varies per tester and is not part of optimization.
+
+ ๐ฅ CACHING: Seed prompt is evaluated ONLY ONCE on Dpareto (validation set).
+ Subsequent evaluations return cached result to save API calls and ensure consistency.
+ """
+ system_prompt = candidate.get('system_prompt', '')
+
+ # ๐ฅ FORMAT DETECTION: Detect output format from expected outputs (once)
+ if not self._format_detection_done and batch:
+ self._detect_and_cache_format(batch)
+ self._format_detection_done = True
+
+ # Determine dataset type first (needed for cache check)
+ batch_size_threshold = self._config.batch_size if hasattr(self, '_config') and self._config else 8
+
+ # ๐ฅ CRITICAL FIX: If _is_baseline_evaluation is True, we KNOW this is the validation set
+ # This fixes the issue where valset_size might not be set yet when baseline detection happens
+ if hasattr(self, '_is_baseline_evaluation') and self._is_baseline_evaluation:
+ dataset_type = 'dpareto' # Baseline is ALWAYS evaluated on validation set
+ self.logger.debug(f"๐ฏ Forced dataset_type to 'dpareto' (baseline evaluation flag is True)")
+ elif hasattr(self, '_valset_size') and self._valset_size is not None and len(batch) >= self._valset_size:
+ dataset_type = 'dpareto' # Full validation set size = Dpareto
+ elif len(batch) > batch_size_threshold * 1.5:
+ dataset_type = 'dpareto' # Much larger than batch = likely full valset
+ else:
+ dataset_type = 'dfeedback' # Small batch = training minibatch for reflection
+
+ # ๐ฅ CRITICAL: Check cache to avoid re-evaluating same prompt on Dpareto
+ # This ensures seed prompt is evaluated ONLY ONCE
+ if dataset_type == 'dpareto':
+ normalized_prompt = system_prompt.strip().strip('"\'')
+ if normalized_prompt in self._dpareto_evaluated_candidates:
+ existing_score, existing_type, _ = self._dpareto_evaluated_candidates[normalized_prompt]
+ self.logger.info(
+ f"โป๏ธ CACHE HIT: Prompt already evaluated on Dpareto "
+ f"(score={existing_score:.4f}, type={existing_type}) - skipping re-evaluation"
+ )
+
+ # Return cached result - create EvaluationBatch with cached score
+ cached_outputs = [f"[CACHED: {existing_type}]"] * len(batch)
+ cached_scores = [existing_score] * len(batch)
+
+ # Still update baseline if this is seed and baseline not set
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+
+ if existing_type == 'seed' and self._baseline_score is None:
+ self._baseline_score = existing_score
+ pareto_log.set_baseline(existing_score)
+ self.logger.info(f"๐ Baseline score set from cache: {existing_score:.4f}")
+
+ # Log to Pareto logger (for tracking, but no re-evaluation)
+ pareto_log.log_candidate_evaluation(
+ prompt=system_prompt,
+ score=existing_score,
+ candidate_type=existing_type,
+ dataset_type='dpareto'
+ )
+
+ return EvaluationBatch(
+ outputs=cached_outputs,
+ scores=cached_scores,
+ trajectories=None # No traces for cached results
+ )
+
+ # Determine candidate type
+ # Priority order:
+ # 1. Check candidate dict for 'source' field (from LLM wrapper)
+ # 2. Check _candidate_sources mapping (from previous evaluations)
+ # 3. Check _current_evaluation_type (from log_proposed_candidate)
+ # 4. Infer from context (seed, repeat, etc.)
+
+ candidate_type = candidate.get('source') # First try candidate dict
+ if not candidate_type or candidate_type == 'unknown':
+ candidate_type = self._candidate_sources.get(system_prompt) # Check mapping
+ if not candidate_type or candidate_type == 'unknown':
+ candidate_type = self._current_evaluation_type # Use stored type
+ if not candidate_type or candidate_type == 'unknown':
+ # Try to infer from prompt or metadata
+ if system_prompt == self._last_candidate:
+ candidate_type = 'repeat' # Same prompt being re-evaluated
+ elif self._evaluation_count == 0 or 'seed' in str(candidate.get('source', '')).lower():
+ candidate_type = 'seed' # Explicitly mark as seed
+ self.logger.debug("๐ฑ Detected seed prompt (Sโ)")
+ else:
+ candidate_type = 'unknown' # Truly unknown
+
+ # #region agent log
+ import json as _json_debug
+ import time as _time_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "C", "location": "universal_adapter.py:candidate_type_detect", "message": "Candidate type detection", "data": {"candidate_type": candidate_type, "evaluation_count": self._evaluation_count, "from_candidate_dict": candidate.get('source'), "from_sources_mapping": self._candidate_sources.get(system_prompt), "from_current_type": self._current_evaluation_type}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+
+ # Store source for future lookups (always update if we found a valid type)
+ if candidate_type and candidate_type != 'unknown' and system_prompt not in self._candidate_sources:
+ self._candidate_sources[system_prompt] = candidate_type
+ self.logger.debug(f" ๐ Stored candidate type: {candidate_type} for prompt (length: {len(system_prompt)})")
+
+ # Dataset type already determined above for cache check - reuse it
+
+ # #region agent log
+ try:
+ import json as _json_debug
+ import time as _time_debug
+ import os as _os_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "H", "location": "universal_adapter.py:dataset_type_detect", "message": "Dataset type detection", "data": {"batch_size": len(batch), "valset_size": getattr(self, '_valset_size', None), "batch_size_threshold": batch_size_threshold, "detected_type": dataset_type, "evaluation_count": self._evaluation_count}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ except Exception:
+ pass
+ # #endregion
+
+ # Check if this is a new candidate (different from last one)
+ if self._last_candidate != system_prompt:
+ self._evaluation_count += 1
+ # ๐ฅ CRITICAL: If this is baseline evaluation, force candidate_type to 'seed'
+ if self._is_baseline_evaluation:
+ candidate_type = 'seed'
+ self.logger.debug(f"๐ฑ Baseline evaluation detected - setting candidate_type to 'seed'")
+ self._current_evaluation_type = candidate_type
+ self._current_dataset_type = dataset_type
+ self._last_candidate = system_prompt
+
+ # Minimal logging - just track what we're evaluating
+ if self._is_baseline_evaluation:
+ self.logger.debug(f"Evaluating baseline (Sโ) on {dataset_type}")
+ else:
+ self.logger.debug(f"Evaluating candidate #{self._evaluation_count} ({candidate_type}) on {dataset_type}")
+
+ # Detect and use batch mode if available
+ from ..llms.batch_llm import BatchLLMClient
+ is_batch_mode = isinstance(self.llm_client, BatchLLMClient)
+
+ if is_batch_mode:
+ outputs, scores, trajectories = self._evaluate_batch_mode(
+ batch, system_prompt, capture_traces
+ )
+ else:
+ outputs, scores, trajectories = self._evaluate_standard_mode(
+ batch, system_prompt, capture_traces
+ )
+
+ avg_score = sum(scores) / len(scores) if scores else 0.0
+
+ # #region agent log
+ import json as _json_debug
+ import time as _time_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "B,C", "location": "universal_adapter.py:baseline_check", "message": "Baseline check conditions", "data": {"baseline_score_is_none": self._baseline_score is None, "current_dataset_type": self._current_dataset_type, "current_evaluation_type": self._current_evaluation_type, "is_baseline_evaluation": self._is_baseline_evaluation, "batch_size": len(batch), "avg_score": avg_score}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+
+ # ๐ฅ CRITICAL FIX: Baseline MUST be set from seed's first Dpareto evaluation ONLY
+ # This ensures FAIR comparison: seed and candidates evaluated on SAME dataset (Dpareto) with SAME number of datapoints
+ #
+ # Fair evaluation requires:
+ # - Seed baseline: Dpareto (validation set) - first evaluation during optimization
+ # - Candidates: Dpareto (validation set) - same dataset, same size
+ # - Same conditions = fair comparison โ
+ #
+ # We IGNORE test set for baseline - baseline must come from Dpareto to ensure same dataset/size
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+
+ # ๐ฅ FIX: Check if this is baseline evaluation AND dpareto - set baseline with priority
+ is_baseline_eval = hasattr(self, '_is_baseline_evaluation') and self._is_baseline_evaluation
+
+ if self._baseline_score is None:
+ # ๐ฅ FIX B: Set baseline on FIRST Dpareto evaluation, regardless of candidate type
+ # Also set baseline if this is explicitly marked as baseline evaluation
+ if self._current_dataset_type == 'dpareto' or is_baseline_eval:
+ # โ
PRIMARY: Set baseline from FIRST Dpareto evaluation (seed or first candidate)
+ self._baseline_score = avg_score
+ pareto_log.set_baseline(avg_score)
+ self.logger.info(f"๐ Baseline score (Dpareto, {len(batch)} samples): {avg_score:.4f}")
+ self.logger.info(f" โ
Baseline set from {'baseline evaluation' if is_baseline_eval else 'first Dpareto'} (type: {self._current_evaluation_type})")
+ # #region agent log
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "B", "location": "universal_adapter.py:baseline_set", "message": "Baseline score SET", "data": {"baseline_score": avg_score, "candidate_type": self._current_evaluation_type, "dataset_type": self._current_dataset_type, "is_baseline_eval": is_baseline_eval}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+ # Note: Test set evaluations are ignored for baseline - baseline comes from Dpareto
+ else:
+ # ๐ฅ SAFETY CHECK: Ensure Pareto logger also has baseline if adapter has it
+ # This handles the case where optimizer set baseline in adapter but Pareto logger wasn't updated
+ if (self._current_dataset_type == 'dpareto' or is_baseline_eval) and pareto_log.baseline_score is None:
+ pareto_log.set_baseline(self._baseline_score)
+ self.logger.info(f"โ
Synchronized baseline in Pareto logger: {self._baseline_score:.4f}")
+
+ # Track Dpareto evaluations for Pareto front
+ if self._current_dataset_type == 'dpareto':
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+ pareto_log.log_candidate_evaluation(
+ prompt=system_prompt,
+ score=avg_score,
+ candidate_type=self._current_evaluation_type or 'unknown',
+ dataset_type=self._current_dataset_type
+ )
+
+ # Track evaluated candidates
+ normalized_prompt = system_prompt.strip().strip('"\'')
+ if normalized_prompt not in self._dpareto_evaluated_candidates:
+ self._dpareto_evaluated_candidates[normalized_prompt] = (
+ avg_score, self._current_evaluation_type or 'unknown', 'evaluated_by_gepa'
+ )
+
+ self.logger.debug(f"Evaluation complete: score={avg_score:.4f}")
+
+ # ๐ฅ CRITICAL: Update _best_candidate and _best_score with average fitness for Dpareto evaluations
+ # This ensures the adapter tracks the best average fitness, not just per-sample scores
+ # Only update if this score is better than current best
+ if self._current_dataset_type == 'dpareto':
+ if self._best_score is None or avg_score > self._best_score:
+ self._best_score = avg_score
+ self._best_candidate = {
+ 'system_prompt': system_prompt,
+ 'fitness': avg_score,
+ 'source': self._current_evaluation_type or 'unknown'
+ }
+ self.logger.info(f"โ
Updated best candidate from Dpareto evaluation: f={avg_score:.4f} (type: {self._current_evaluation_type})")
+
+ return EvaluationBatch(outputs=outputs, scores=scores, trajectories=trajectories)
+
+ def _evaluate_batch_mode(
+ self,
+ batch: List[Dict],
+ system_prompt: str,
+ capture_traces: bool
+ ) -> tuple:
+ """
+ Batch mode evaluation - process all samples in one API call.
+
+ This method prepares all requests, submits them as a batch job to Gemini,
+ waits for completion, then evaluates all results.
+ """
+ # Prepare all requests
+ requests = []
+ standardized_items = []
+
+ for item in batch:
+ standardized_item = self.data_converter._standardize([item])[0]
+ standardized_items.append(standardized_item)
+
+ request = {
+ 'system_prompt': system_prompt,
+ 'user_prompt': standardized_item['input']
+ }
+
+ if standardized_item.get('image'):
+ request['image_base64'] = standardized_item['image']
+
+ requests.append(request)
+
+ # Submit batch job and get all results at once
+ batch_results = self.llm_client.generate_batch(requests)
+
+ # Process results
+ outputs = []
+ scores = []
+ trajectories = [] if capture_traces else None
+
+ for i, (llm_response, standardized_item) in enumerate(zip(batch_results, standardized_items)):
+ # Extract content
+ raw_output = llm_response.get("content", "")
+
+ # ๐ฅ CRITICAL: Clean markdown wrappers before evaluation
+ predicted_output = self._clean_llm_output(raw_output)
+ outputs.append(predicted_output)
+
+ # Evaluate with cleaned output
+ evaluation_results = self.evaluator.evaluate(
+ predicted_output,
+ standardized_item['output']
+ )
+
+ composite_score = evaluation_results.get("composite_score", 0.0)
+ scores.append(composite_score)
+
+ # Update tracking
+ if composite_score > self._best_score:
+ self._best_score = composite_score
+ self._best_candidate = {'system_prompt': system_prompt}
+
+ # Capture traces
+ if capture_traces:
+ trajectories.append({
+ 'input_data': standardized_item,
+ 'predicted_output': predicted_output,
+ 'evaluation_results': evaluation_results
+ })
+
+ # Concise logging with element IDs and candidate notation
+ predicted_element = evaluation_results.get('predicted_element', '?')
+ expected_element = evaluation_results.get('expected_element', '?')
+ status = "โ
" if composite_score == 1.0 else "โ"
+
+ # Add notation for candidate type
+ notation_map = {'seed': 'Sโ', 'gepa_reflection': 'Sแตฃ', 'llego_crossover': 'Oโโ', 'llego_mutation': 'Oโแตคโ'}
+ notation = notation_map.get(self._current_evaluation_type, 'S')
+
+ self.logger.info(f" [{notation}] Sample {i+1}: Predicted={predicted_element}, Expected={expected_element}, Score={composite_score:.2f} {status}")
+
+ return outputs, scores, trajectories
+
+ def _evaluate_standard_mode(
+ self,
+ batch: List[Dict],
+ system_prompt: str,
+ capture_traces: bool
+ ) -> tuple:
+ """
+ Standard mode evaluation - process samples individually (existing logic).
+
+ This is the original implementation, preserved for backward compatibility
+ and for use with non-batch LLM clients.
+ """
+ outputs = []
+ scores = []
+ trajectories = [] if capture_traces else None
+
+ for i, item in enumerate(batch):
+ # Use existing data processing logic
+ standardized_item = self.data_converter._standardize([item])[0]
+
+ # Prepare generation parameters
+ generation_params = {
+ 'system_prompt': system_prompt,
+ 'user_prompt': standardized_item['input']
+ }
+
+ # Add image if present
+ if standardized_item.get('image'):
+ generation_params['image_base64'] = standardized_item['image']
+
+ # Generate response using user's LLM client
+ llm_response = self.llm_client.generate(**generation_params)
+
+ # Extract content
+ if isinstance(llm_response, dict):
+ raw_output = llm_response.get("content", "")
+ else:
+ raw_output = str(llm_response)
+
+ # ๐ฅ CRITICAL: Clean markdown wrappers before evaluation
+ predicted_output = self._clean_llm_output(raw_output)
+ outputs.append(predicted_output)
+
+ # Evaluate using user's evaluator with cleaned output
+ evaluation_results = self.evaluator.evaluate(
+ predicted_output,
+ standardized_item['output']
+ )
+
+ composite_score = evaluation_results.get("composite_score", 0.0)
+ scores.append(composite_score)
+
+ # #region agent log
+ try:
+ import json as _json_debug
+ import time as _time_debug
+ import os as _os_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "G", "location": "universal_adapter.py:evaluation_result", "message": "Individual evaluation result", "data": {"sample_idx": i, "composite_score": composite_score, "semantic_sim": evaluation_results.get("semantic_similarity", -1), "structural_sim": evaluation_results.get("structural_similarity", -1), "format_mismatch": evaluation_results.get("analysis", {}).get("format_mismatch", False), "predicted_len": len(predicted_output) if predicted_output else 0, "expected_len": len(standardized_item.get('output', ''))}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ except Exception:
+ pass
+ # #endregion
+
+ # Update performance tracking
+ self._evaluation_count += 1
+ if composite_score > self._best_score:
+ self._best_score = composite_score
+ self._best_candidate = {'system_prompt': system_prompt}
+
+ # Capture traces if requested
+ if capture_traces:
+ trajectories.append({
+ 'input_data': standardized_item,
+ 'predicted_output': predicted_output,
+ 'evaluation_results': evaluation_results
+ })
+
+ # Concise logging with element IDs and candidate notation
+ predicted_element = evaluation_results.get('predicted_element', '?')
+ expected_element = evaluation_results.get('expected_element', '?')
+ status = "โ
" if composite_score == 1.0 else "โ"
+
+ # Add notation for candidate type
+ notation_map = {'seed': 'Sโ', 'gepa_reflection': 'Sแตฃ', 'llego_crossover': 'Oโโ', 'llego_mutation': 'Oโแตคโ'}
+ notation = notation_map.get(self._current_evaluation_type, 'S')
+
+ self.logger.info(f" [{notation}] Sample {i+1}: Predicted={predicted_element}, Expected={expected_element}, Score={composite_score:.2f} {status}")
+
+ return outputs, scores, trajectories
+
+ def make_reflective_dataset(self, candidate: Dict[str, str], eval_batch: EvaluationBatch,
+ components_to_update: List[str]) -> Dict[str, List[Dict[str, Any]]]:
+ """
+ Create reflective dataset using user-provided evaluator.
+
+ This method generates feedback based on the evaluation results
+ from the user's custom evaluator.
+
+ ๐ฅ NEW: If hybrid mode is enabled, this method ALSO generates hybrid candidates
+ (GEPA Reflection + LLEGO Operators) and stores them for GEPA to use.
+ """
+ # ๐ฅ REMOVED: Excessive diagnostic logs - moved to DEBUG level if needed
+ self.logger.debug(f"make_reflective_dataset() called - generating feedback and hybrid candidates")
+
+ reflective_dataset = {}
+ system_prompt = candidate.get('system_prompt', '')
+
+ # ๐ฅ REMOVED: Verbose diagnostic checks - only log if hybrid mode is actually enabled
+ hybrid_mode_enabled = (self._config and
+ hasattr(self._config, 'enable_gepa_reflection_with_llego') and
+ self._config.enable_gepa_reflection_with_llego and
+ self._reflection_lm_client)
+
+ if hybrid_mode_enabled:
+ self.logger.debug(f"โ
Hybrid mode conditions met - will generate hybrid candidates")
+
+ # ========================================================================
+ # ๐ฅ CRITICAL FIX: Update LLEGO population with evaluated candidate
+ # ========================================================================
+ # This is the MISSING LINK! After a candidate is evaluated, we need to add it
+ # to the LLEGO population so it can be used for crossover/mutation.
+ # Without this, the population only contains the seed, so Pareto front stays at 1!
+ #
+ # This is called for EVERY candidate that GEPA evaluates:
+ # - Seed prompt (baseline) โ added to population
+ # - New candidate 1 (from reflection/crossover/mutation) โ added to population
+ # - New candidate 2 โ added to population
+ # - etc.
+ if self.llego:
+ # Calculate average fitness from evaluation scores
+ if eval_batch.scores and len(eval_batch.scores) > 0:
+ avg_fitness = sum(eval_batch.scores) / len(eval_batch.scores)
+ else:
+ # Fallback: extract from trajectories if scores not available
+ scores = [t.get('evaluation_results', {}).get('composite_score', 0.0)
+ for t in eval_batch.trajectories if 'evaluation_results' in t]
+ avg_fitness = sum(scores) / len(scores) if scores else 0.0
+
+ self.logger.debug(f"Updating LLEGO population: fitness={avg_fitness:.4f}")
+
+ # Create PromptCandidate from evaluated prompt
+ from ..operators.llego_operators import PromptCandidate
+
+ # Check if this candidate already exists in population (avoid duplicates)
+ # ๐ฅ FIX: Normalize prompts for comparison (strip whitespace, remove quotes)
+ normalized_new_prompt = system_prompt.strip().strip('"\'')
+ existing_prompts = {p.prompt.strip().strip('"\'') for p in self.llego.population}
+
+ # Also check normalized versions
+ if normalized_new_prompt not in existing_prompts:
+ prompt_candidate = PromptCandidate(
+ prompt=system_prompt, # Keep original prompt (not normalized)
+ fitness=avg_fitness,
+ metadata={
+ 'generation': self.llego.current_generation,
+ 'operator': 'evaluated',
+ 'prompt_length': len(system_prompt),
+ 'word_count': len(system_prompt.split()),
+ 'evaluation_samples': len(eval_batch.scores) if eval_batch.scores else 0,
+ 'candidate_type': self._current_evaluation_type or 'unknown', # Store type for notation
+ 'dataset_evaluated': self._current_dataset_type or 'unknown'
+ }
+ )
+
+ # Update population - this will add the candidate and keep top N by fitness
+ population_before = len(self.llego.population)
+ self.llego.update_population([prompt_candidate])
+ population_after = len(self.llego.population)
+
+ self.logger.debug(f"Added to LLEGO population: fitness={avg_fitness:.4f}, size={population_after}")
+ else:
+ # Update fitness if candidate already exists (seed prompt, etc.)
+ # ๐ฅ FIX: Also normalize for comparison
+ updated = False
+ for p in self.llego.population:
+ normalized_existing = p.prompt.strip().strip('"\'')
+ if normalized_existing == normalized_new_prompt:
+ old_fitness = p.fitness
+ if avg_fitness > p.fitness:
+ p.fitness = avg_fitness
+ updated = True
+ self.logger.debug(f"Updated fitness: {old_fitness:.4f} โ {avg_fitness:.4f}")
+ # Update candidate type if we have new information
+ if self._current_evaluation_type and p.metadata:
+ old_type = p.metadata.get('candidate_type', 'unknown')
+ if self._current_evaluation_type != old_type:
+ p.metadata['candidate_type'] = self._current_evaluation_type
+ else:
+ self.logger.debug(f"โน๏ธ Candidate already exists with better/equal fitness: {p.fitness:.4f} >= {avg_fitness:.4f}")
+ break
+
+ if not updated:
+ self.logger.debug(f"Candidate already in population with higher fitness")
+ else:
+ self.logger.debug("LLEGO not initialized - skipping population update")
+
+ # ========================================================================
+ # ๐ฅ HYBRID MODE: Generate candidates at adapter level
+ # ========================================================================
+ if (self._config and
+ hasattr(self._config, 'enable_gepa_reflection_with_llego') and
+ self._config.enable_gepa_reflection_with_llego and
+ self._reflection_lm_client):
+
+ self.logger.debug("Generating hybrid candidates")
+
+ # Generate hybrid candidates FIRST
+ generated_candidates = self._generate_hybrid_candidates_adapter_level(
+ current_prompt=system_prompt,
+ eval_batch=eval_batch,
+ candidate=candidate
+ )
+
+ # ๐ฅ CRITICAL: Store generated candidates so we can inject them
+ # _generate_hybrid_candidates_adapter_level now returns list of dicts with metadata
+ if generated_candidates:
+ candidate_dicts = []
+ for cand in generated_candidates:
+ if isinstance(cand, dict) and 'prompt' in cand:
+ # Already a dict with metadata (preferred format)
+ candidate_dicts.append(cand)
+ elif isinstance(cand, str):
+ # Just a string - determine source based on position (fallback)
+ # This shouldn't happen if _generate_hybrid_candidates_adapter_level is fixed
+ self.logger.warning(f"โ ๏ธ Received string candidate instead of dict - using fallback logic")
+ if len(candidate_dicts) < self._config.num_gepa_reflection_candidates:
+ source = 'gepa_reflection'
+ elif len(candidate_dicts) < self._config.num_gepa_reflection_candidates + self._config.n_crossover:
+ source = 'llego_crossover'
+ else:
+ source = 'llego_mutation'
+ candidate_dicts.append({
+ 'prompt': cand,
+ 'source': source,
+ 'index': len(candidate_dicts) + 1
+ })
+ else:
+ self.logger.warning(f"โ ๏ธ Unknown candidate format: {type(cand)}")
+
+ self._generated_candidates = candidate_dicts
+
+ # Store candidate sources for tracking
+ for cand_dict in candidate_dicts:
+ if 'prompt' in cand_dict and 'source' in cand_dict:
+ self._candidate_sources[cand_dict['prompt']] = cand_dict['source']
+
+ # ๐ฅ CRITICAL: Inject into LLM client wrapper so it can return them when GEPA calls
+ # This is the key mechanism: when GEPA calls adapter.llm_client.generate() for proposals,
+ # our wrapper will detect it and return our pre-generated candidates
+ if hasattr(self.llm_client, '_adapter_generated_candidates'):
+ self.llm_client._adapter_generated_candidates = candidate_dicts.copy()
+ self.logger.debug(f"Injected {len(candidate_dicts)} candidates")
+ else:
+ try:
+ self.llm_client._adapter_generated_candidates = candidate_dicts.copy()
+ except Exception as e:
+ self.logger.error(f"Failed to inject candidates: {e}")
+
+ # Evaluate generated candidates on Dpareto for fair comparison
+ if hasattr(self, '_evaluating_generated_candidates'):
+ pass # Skip to prevent recursion
+ elif self._valset and len(self._valset) > 0:
+ self._evaluating_generated_candidates = True
+ self.logger.debug(f"Evaluating {len(candidate_dicts)} candidates on Dpareto ({len(self._valset)} samples)")
+
+ # ๐ฅ NEW: Collect all candidates with scores for batch update
+ candidates_with_scores = []
+
+ for i, cand_dict in enumerate(candidate_dicts, 1):
+ cand_prompt = cand_dict.get('prompt', '')
+ cand_source = cand_dict.get('source', 'unknown')
+
+ if not cand_prompt:
+ continue
+
+ # Normalize prompt for duplicate detection
+ normalized_prompt = cand_prompt.strip().strip('"\'')
+
+ # Check if already evaluated on Dpareto (avoid double evaluation)
+ if normalized_prompt in self._dpareto_evaluated_candidates:
+ existing_score, existing_type, _ = self._dpareto_evaluated_candidates[normalized_prompt]
+
+ # Still add to batch for Pareto update (with existing score)
+ notation_map = {
+ 'seed': 'Sโ',
+ 'gepa_reflection': 'Sแตฃ',
+ 'llego_crossover': 'Oโโ',
+ 'llego_mutation': 'Oโแตคโ'
+ }
+ cand_notation = notation_map.get(cand_source, 'S')
+ candidates_with_scores.append({
+ 'prompt': cand_prompt,
+ 'score': existing_score,
+ 'type': cand_source,
+ 'notation': cand_notation
+ })
+ continue
+
+ # Evaluate this candidate on valset (Dpareto)
+ try:
+ # Set candidate type for proper logging
+ self._current_evaluation_type = cand_source
+
+ # ๐ฅ CRITICAL: Temporarily disable individual Pareto updates
+ # We'll do batch update after all evaluations
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+ original_log_method = pareto_log.log_candidate_evaluation
+
+ # Temporarily replace to prevent individual updates
+ def noop_log(*args, **kwargs):
+ pass # Skip individual logging - we'll batch update later
+
+ pareto_log.log_candidate_evaluation = noop_log
+
+ # Evaluate on valset - THIS IS THE FAIR EVALUATION ON SAME DATASET
+ valset_eval = self.evaluate(
+ batch=self._valset, # Same valset as seed!
+ candidate={'system_prompt': cand_prompt, 'source': cand_source},
+ capture_traces=True
+ )
+
+ # Restore original method
+ pareto_log.log_candidate_evaluation = original_log_method
+
+ avg_score = sum(valset_eval.scores) / len(valset_eval.scores) if valset_eval.scores else 0.0
+
+ # Store evaluation result to avoid double evaluation
+ self._dpareto_evaluated_candidates[normalized_prompt] = (
+ avg_score,
+ cand_source,
+ 'evaluated_in_make_reflective_dataset'
+ )
+
+ self.logger.debug(f"Candidate {i} evaluated: score={avg_score:.4f}")
+
+ # Generate notation
+ notation_map = {
+ 'seed': 'Sโ',
+ 'gepa_reflection': 'Sแตฃ',
+ 'llego_crossover': 'Oโโ',
+ 'llego_mutation': 'Oโแตคโ'
+ }
+ cand_notation = notation_map.get(cand_source, 'S')
+
+ # Add to batch for Pareto update
+ candidates_with_scores.append({
+ 'prompt': cand_prompt,
+ 'score': avg_score,
+ 'type': cand_source,
+ 'notation': cand_notation
+ })
+
+ # ๐ฅ CRITICAL: Explicitly add this candidate to LLEGO population with Dpareto fitness
+ if self.llego:
+ from ..operators.llego_operators import PromptCandidate
+
+ # Check if already in population
+ existing_in_pop = False
+ for p in self.llego.population:
+ if p.prompt.strip().strip('"\'') == normalized_prompt:
+ # Update fitness if this Dpareto score is better
+ if avg_score > p.fitness:
+ old_fitness = p.fitness
+ p.fitness = avg_score
+ if p.metadata:
+ p.metadata['candidate_type'] = cand_source
+ p.metadata['dataset_evaluated'] = 'dpareto'
+ self.logger.debug(f"Updated LLEGO fitness: {old_fitness:.4f} โ {avg_score:.4f}")
+ existing_in_pop = True
+ break
+
+ if not existing_in_pop:
+ # Add new candidate to population
+ prompt_candidate = PromptCandidate(
+ prompt=cand_prompt,
+ fitness=avg_score,
+ metadata={
+ 'generation': self.llego.current_generation,
+ 'operator': 'evaluated_on_dpareto',
+ 'prompt_length': len(cand_prompt),
+ 'word_count': len(cand_prompt.split()),
+ 'evaluation_samples': len(valset_eval.scores) if valset_eval.scores else 0,
+ 'candidate_type': cand_source,
+ 'dataset_evaluated': 'dpareto'
+ }
+ )
+ self.llego.update_population([prompt_candidate])
+
+ except Exception as e:
+ self.logger.error(f" โ Error evaluating candidate #{i} on Dpareto: {e}")
+ import traceback
+ self.logger.error(traceback.format_exc())
+
+ # Batch Pareto front update
+ if candidates_with_scores:
+
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+ added_candidates = pareto_log.batch_update_pareto_front(candidates_with_scores)
+
+ # ๐ฅ CRITICAL: Update queue with scores for best-candidate selection
+ # Create a mapping of prompt -> score for quick lookup
+ prompt_to_score = {c['prompt'].strip().strip('"\''): c['score'] for c in candidates_with_scores}
+
+ # Update candidates in queue with their scores
+ if hasattr(self.llm_client, '_adapter_generated_candidates'):
+ updated_queue = []
+ for cand in self.llm_client._adapter_generated_candidates:
+ if isinstance(cand, dict):
+ cand_prompt = cand.get('prompt', '')
+ normalized = cand_prompt.strip().strip('"\'')
+ if normalized in prompt_to_score:
+ # Update with score
+ cand['score'] = prompt_to_score[normalized]
+ updated_queue.append(cand)
+ else:
+ updated_queue.append(cand)
+ else:
+ updated_queue.append(cand)
+
+ self.llm_client._adapter_generated_candidates = updated_queue
+
+ self.logger.debug(f"Pareto update: {len(added_candidates)} added, front size={len(pareto_log.pareto_front)}")
+
+ # Clear flag after evaluation complete
+ self._evaluating_generated_candidates = False
+ elif not hasattr(self, '_evaluating_generated_candidates'):
+ self.logger.error("Valset not available - cannot evaluate generated candidates")
+
+ # Signal LLEGO-enhanced client for reflection mode
+ if self.llego and hasattr(self.llm_client, 'set_reflection_context'):
+ self.llm_client.set_reflection_context(
+ current_prompt=system_prompt,
+ feedback=eval_batch,
+ in_reflection=True
+ )
+
+ # ๐ฅ CRITICAL: Also set reflection context on reflection_lm_client if it exists
+ # This ensures hybrid mode candidate generation is triggered when GEPA calls reflection_lm_callable
+ if hasattr(self, 'reflection_lm_client') and self.reflection_lm_client:
+ if hasattr(self.reflection_lm_client, 'set_reflection_context'):
+ self.logger.info("๐ฅ CRITICAL: Setting reflection context on reflection_lm_client for hybrid mode")
+ self.reflection_lm_client.set_reflection_context(
+ current_prompt=system_prompt,
+ feedback=eval_batch,
+ in_reflection=True # This enables hybrid candidate generation!
+ )
+
+ self._log_reflection_dataset_creation(candidate, eval_batch, components_to_update)
+
+ # Inject generated candidates into reflective dataset
+ suggested_prompts = []
+ if hasattr(self, '_generated_candidates') and self._generated_candidates:
+ suggested_prompts = [c['prompt'] for c in self._generated_candidates if isinstance(c, dict) and 'prompt' in c]
+ self.logger.debug(f"Injecting {len(suggested_prompts)} suggested prompts")
+
+ for component in components_to_update:
+ reflective_dataset[component] = []
+ for trace in eval_batch.trajectories:
+ # Generate feedback based on evaluation results
+ # ๐ Phase 2: Pass trace and current_prompt for LLM-as-Judge
+ feedback = self._generate_feedback(
+ trace['evaluation_results'],
+ trace=trace,
+ current_prompt=system_prompt
+ )
+
+ # Base reflection data
+ # ๐ฅ FIX: Strip image_base64 from input_data to prevent massive base64 strings in logs
+ input_data_clean = trace['input_data'].copy() if isinstance(trace['input_data'], dict) else {}
+ if 'image_base64' in input_data_clean:
+ input_data_clean['image_base64'] = f"[IMAGE_DATA_{len(input_data_clean['image_base64'])}_chars]"
+
+ # ๐ฅ FIX: Clean detailed_scores to remove any base64 references or large data
+ detailed_scores_clean = {}
+ if isinstance(trace['evaluation_results'], dict):
+ for key, value in trace['evaluation_results'].items():
+ # Skip any values that look like base64 (very long strings)
+ if isinstance(value, str) and len(value) > 1000:
+ detailed_scores_clean[key] = f"[DATA_{len(value)}_chars]"
+ else:
+ detailed_scores_clean[key] = value
+ else:
+ detailed_scores_clean = trace['evaluation_results']
+
+ reflection_entry = {
+ "current_prompt": system_prompt,
+ "input_data": input_data_clean, # Use cleaned version without full base64
+ "predicted_output": trace['predicted_output'],
+ "score": trace['evaluation_results'].get("composite_score", 0.0),
+ "feedback": feedback,
+ "detailed_scores": detailed_scores_clean # Cleaned scores without large data
+ }
+
+ # ๐ฅ CRITICAL: Only optimize system_prompt, NOT user_prompt
+ # The user_prompt contains the task description (command) and should NOT be modified
+ if component == 'system_prompt' and suggested_prompts:
+ # Add suggested improved prompts to the reflection entry
+ # GEPA might use these if the structure supports it
+ reflection_entry["suggested_improved_prompts"] = suggested_prompts
+ reflection_entry["num_suggestions"] = len(suggested_prompts)
+ # Also add the best suggested prompt as a direct suggestion
+ if suggested_prompts:
+ reflection_entry["suggested_prompt"] = suggested_prompts[0] # First candidate as primary suggestion
+ reflection_entry["optimize_component"] = "system_prompt_only" # Mark that we only optimize system_prompt
+ elif component != 'system_prompt':
+ # For non-system_prompt components (like user_prompt), do NOT add suggestions
+ # We only want to optimize system_prompt
+ reflection_entry["optimize_component"] = "skip" # Mark to skip optimization
+ self.logger.info(f"โ ๏ธ Skipping optimization for component '{component}' - only optimizing system_prompt")
+
+ reflective_dataset[component].append(reflection_entry)
+
+ total_samples = sum(len(data) for data in reflective_dataset.values())
+ avg_score = sum(trace['score'] for data in reflective_dataset.values() for trace in data) / total_samples if total_samples > 0 else 0.0
+ self.logger.info(f"๐ Reflection dataset created - {total_samples} samples, avg score: {avg_score:.4f}")
+
+ return reflective_dataset
+
+ def _generate_feedback(
+ self,
+ evaluation_results: Dict[str, Any],
+ trace: Optional[Dict[str, Any]] = None,
+ current_prompt: Optional[str] = None
+ ) -> str:
+ """
+ Generate feedback using hybrid approach:
+ - LLM-as-Judge for low/medium scores (detailed, actionable)
+ - Simple feedback for high scores (efficient)
+
+ Args:
+ evaluation_results: Evaluation scores and extracted data
+ trace: Full trace with input_data, predicted_output, etc. (optional)
+ current_prompt: The current system prompt being optimized (optional)
+
+ Returns:
+ Feedback string focused on prompt improvement
+ """
+ composite_score = evaluation_results.get("composite_score", 0.0)
+
+ # Check if LLM-as-Judge is enabled
+ use_llm_judge = getattr(self._config, 'use_llm_as_judge', True)
+ threshold = getattr(self._config, 'llm_as_judge_threshold', 0.8)
+
+ # ๐ฅ FIX: Check both attribute names (inconsistency in codebase)
+ reflection_lm = getattr(self, '_reflection_lm_client', None) or getattr(self, 'reflection_lm_client', None)
+
+ # Debug logging - use INFO so we can see what's happening
+ self.logger.info(f"๐ Feedback generation: score={composite_score:.4f}, use_llm_judge={use_llm_judge}, threshold={threshold}, has_trace={trace is not None}, has_reflection_lm={reflection_lm is not None}")
+ if trace:
+ input_data = trace.get('input_data', {})
+ predicted = trace.get('predicted_output', '')[:100] if trace.get('predicted_output') else 'N/A'
+ expected = input_data.get('output', '')[:100] if input_data.get('output') else 'N/A'
+ self.logger.info(f" Predicted preview: {predicted}...")
+ self.logger.info(f" Expected preview: {expected}...")
+
+ # Use LLM-as-Judge for scores needing improvement
+ if use_llm_judge and composite_score < threshold and trace:
+ if not reflection_lm:
+ self.logger.warning("โ ๏ธ LLM-as-Judge requested but reflection_lm_client not available - using simple feedback")
+ self.logger.warning(f" Checked: _reflection_lm_client={getattr(self, '_reflection_lm_client', None) is not None}, reflection_lm_client={getattr(self, 'reflection_lm_client', None) is not None}")
+ else:
+ try:
+ self.logger.info(f"๐ค Calling LLM-as-Judge for detailed feedback (score: {composite_score:.4f} < threshold: {threshold})")
+ feedback = self._llm_as_judge_feedback(
+ evaluation_results,
+ trace,
+ current_prompt
+ )
+ self.logger.info(f"โ
LLM-as-Judge returned feedback (length: {len(feedback)} chars)")
+ return feedback
+ except Exception as e:
+ self.logger.error(f"โ LLM-as-Judge failed: {e}, falling back to simple feedback")
+ import traceback
+ self.logger.error(traceback.format_exc())
+ # Fall through to simple feedback
+
+ # Simple actionable feedback (for high scores or as fallback)
+ if composite_score >= threshold:
+ self.logger.debug(f"โ
Score {composite_score:.4f} >= threshold {threshold} - using simple feedback")
+ elif not trace:
+ self.logger.debug(f"โ ๏ธ No trace provided - using simple feedback")
+ elif not use_llm_judge:
+ self.logger.debug(f"โ ๏ธ LLM-as-Judge disabled - using simple feedback")
+
+ feedback = self._simple_actionable_feedback(
+ evaluation_results,
+ trace,
+ current_prompt
+ )
+
+ # ๐ฅ ADD FORMAT FEEDBACK: Append format-specific feedback if available
+ if self._detected_format and trace:
+ from ..utils.format_detection import generate_format_feedback
+ input_data = trace.get('input_data', {})
+ format_feedback = generate_format_feedback(
+ predicted_output=trace.get('predicted_output', ''),
+ expected_output=input_data.get('output', ''),
+ format_info=self._detected_format
+ )
+ if format_feedback:
+ feedback += format_feedback
+
+ return feedback
+
+ def _llm_as_judge_feedback(
+ self,
+ evaluation_results: Dict[str, Any],
+ trace: Dict[str, Any],
+ current_prompt: Optional[str] = None
+ ) -> str:
+ """
+ Generate detailed, actionable feedback using LLM-as-Judge.
+
+ ๐ฅ UNIVERSAL VERSION: Works for ANY task type (text, JSON, structured outputs).
+ No UI-specific assumptions. Pure semantic and structural comparison.
+
+ Args:
+ evaluation_results: Evaluation scores and extracted data
+ trace: Full trace with input_data, predicted_output, etc.
+ current_prompt: The current system prompt being optimized
+
+ Returns:
+ Detailed feedback string focused on prompt improvement
+ """
+ # Import universal judge prompt builder
+ from ..utils.universal_judge_prompt import (
+ build_universal_judge_prompt,
+ get_universal_judge_system_prompt,
+ format_universal_judge_feedback,
+ build_empty_output_feedback
+ )
+
+ # Extract data from trace
+ input_data = trace.get('input_data', {})
+ predicted_output = trace.get('predicted_output', '') or ''
+ expected_output = input_data.get('output', '') or ''
+ task_input = input_data.get('input', '') or ''
+
+ # Get image if available (for multi-modal tasks)
+ image_base64 = input_data.get('image', '') or input_data.get('image_base64', '')
+
+ # Log what we're working with
+ self.logger.info(f"๐ LLM-as-Judge input check:")
+ self.logger.info(f" predicted_output length: {len(predicted_output)} chars")
+ self.logger.info(f" expected_output length: {len(expected_output)} chars")
+ self.logger.info(f" image available: {bool(image_base64)} (length: {len(image_base64) if image_base64 else 0} chars)")
+ self.logger.info(f" predicted_output preview: {predicted_output[:200] if predicted_output else '[EMPTY]'}...")
+ self.logger.info(f" expected_output preview: {expected_output[:200] if expected_output else '[EMPTY]'}...")
+
+ # Handle empty predicted output specially
+ if not predicted_output or not predicted_output.strip():
+ self.logger.warning(f"โ ๏ธ Predicted output is empty - generating specialized feedback")
+ return build_empty_output_feedback(task_input, expected_output, current_prompt)
+
+ if not image_base64:
+ self.logger.debug(f"โน๏ธ No image provided - text-only analysis")
+
+ # Get the LLM for judging
+ judge_llm = getattr(self, '_reflection_lm_client', None) or getattr(self, 'reflection_lm_client', None)
+
+ if not judge_llm:
+ self.logger.error("โ CRITICAL: No reflection_lm_client available for LLM-as-Judge!")
+ raise ValueError("reflection_lm_client not available")
+
+ # Build the universal judge prompt
+ judge_prompt = build_universal_judge_prompt(
+ task_input=task_input,
+ predicted_output=predicted_output,
+ expected_output=expected_output,
+ current_prompt=current_prompt,
+ evaluation_results=evaluation_results,
+ image_base64=image_base64
+ )
+
+ # Get the universal system prompt
+ system_prompt = get_universal_judge_system_prompt(has_image=bool(image_base64))
+
+ # Call LLM-as-Judge
+ try:
+ self.logger.info(f"๐ค Calling Universal LLM-as-Judge for semantic analysis")
+ result = judge_llm.generate(
+ system_prompt=system_prompt,
+ user_prompt=judge_prompt,
+ image_base64=image_base64 if image_base64 else ""
+ )
+
+ if isinstance(result, dict):
+ judge_output = result.get('content', '')
+ else:
+ judge_output = str(result)
+
+ # Format the feedback using the universal formatter
+ score = evaluation_results.get('composite_score', 0.0)
+ feedback = format_universal_judge_feedback(
+ judge_output=judge_output,
+ task_input=task_input,
+ predicted_output=predicted_output,
+ expected_output=expected_output,
+ score=score
+ )
+
+ # ๐ฅ ADD FORMAT FEEDBACK: Append format-specific feedback
+ if self._detected_format:
+ from ..utils.format_detection import generate_format_feedback
+ format_feedback = generate_format_feedback(
+ predicted_output=predicted_output,
+ expected_output=expected_output,
+ format_info=self._detected_format
+ )
+ if format_feedback:
+ feedback += format_feedback
+
+ # Also add format constraint for next iteration
+ feedback += f"\n\n{self._detected_format['format_constraint']}"
+
+ self.logger.info(f"โ
Universal LLM-as-Judge generated feedback")
+ return feedback
+
+ except Exception as e:
+ self.logger.error(f"LLM-as-Judge failed: {e}")
+ import traceback
+ self.logger.error(traceback.format_exc())
+ # Fallback to simple feedback
+ return self._simple_actionable_feedback(evaluation_results, trace, current_prompt)
+
+ def _extract_reasoning_from_expected(self, expected_output: str) -> str:
+ """Extract reasoning section from expected output."""
+ if not expected_output:
+ return ""
+
+ # Look for "Reason:" or "Reasoning:" section
+ reason_patterns = [
+ r'Reason[:\s]+(.*?)(?:\n\n|\Z)',
+ r'Reasoning[:\s]+(.*?)(?:\n\n|\Z)',
+ ]
+
+ for pattern in reason_patterns:
+ match = re.search(pattern, expected_output, re.IGNORECASE | re.DOTALL)
+ if match:
+ return match.group(1).strip()[:500] # Truncate to 500 chars
+
+ return ""
+
+ def _extract_reasoning_from_predicted(self, predicted_output: str) -> str:
+ """Extract reasoning from predicted output if available."""
+ # Similar to _extract_reasoning_from_expected
+ # Or return first 200 chars if no clear reasoning section
+ if not predicted_output:
+ return ""
+
+ # Look for reasoning patterns
+ reason_patterns = [
+ r'Reason[:\s]+(.*?)(?:\n\n|\Z)',
+ r'Reasoning[:\s]+(.*?)(?:\n\n|\Z)',
+ ]
+
+ for pattern in reason_patterns:
+ match = re.search(pattern, predicted_output, re.IGNORECASE | re.DOTALL)
+ if match:
+ return match.group(1).strip()[:500]
+
+ # If no reasoning found, return first 200 chars
+ if len(predicted_output) > 200:
+ return predicted_output[:200] + "..."
+ return predicted_output
+
+ def _simple_actionable_feedback(
+ self,
+ evaluation_results: Dict[str, Any],
+ trace: Dict[str, Any] = None,
+ current_prompt: Optional[str] = None
+ ) -> str:
+ """
+ Simple feedback without LLM-as-Judge.
+
+ ๐ฅ UNIVERSAL VERSION: Works for any task type.
+ """
+ composite_score = evaluation_results.get("composite_score", 0.0)
+ semantic_sim = evaluation_results.get("semantic_similarity", 0.0)
+ structural_sim = evaluation_results.get("structural_similarity", 0.0)
+
+ feedback_parts = []
+
+ # Extract task context if available
+ if trace:
+ input_data = trace.get('input_data', {})
+ predicted = trace.get('predicted_output', '')
+ expected = input_data.get('output', '')
+
+ # Check for empty output
+ if not predicted or not predicted.strip():
+ feedback_parts.append(
+ "โ CRITICAL: No output generated. "
+ "Add explicit output instructions to the prompt."
+ )
+ # Check for format mismatch
+ elif structural_sim < 0.5:
+ feedback_parts.append(
+ f"โ ๏ธ Format mismatch (structural similarity: {structural_sim:.0%}). "
+ "Add output format instructions (e.g., 'Return as JSON with fields: ...')."
+ )
+ # Check for semantic mismatch
+ elif semantic_sim < 0.5:
+ feedback_parts.append(
+ f"โ ๏ธ Semantic mismatch (similarity: {semantic_sim:.0%}). "
+ "The output meaning differs from expected. Add clearer task instructions."
+ )
+
+ # Score-based feedback
+ if composite_score >= 0.9:
+ feedback_parts.append("โ
Excellent match - prompt is working well.")
+ elif composite_score >= 0.8:
+ feedback_parts.append("โ
Good match - minor refinements possible.")
+ elif composite_score >= 0.6:
+ feedback_parts.append(
+ f"โ ๏ธ Partial match (score: {composite_score:.0%}). "
+ "Consider adding examples or more specific field names to the prompt."
+ )
+ elif composite_score >= 0.3:
+ feedback_parts.append(
+ f"โ ๏ธ Low match (score: {composite_score:.0%}). "
+ "The prompt needs clearer instructions about expected output format and content."
+ )
+ else:
+ feedback_parts.append(
+ f"โ Poor match (score: {composite_score:.0%}). "
+ "Major revision required - add explicit output format, field names, and examples."
+ )
+
+ return "\n".join(feedback_parts) if feedback_parts else f"Score: {composite_score:.0%}"
+
+ def get_best_candidate(self) -> Optional[Dict[str, str]]:
+ """
+ Get the best candidate from GEPA Pareto front.
+
+ GEPA Pareto front is the single source of truth because:
+ - All candidates (GEPA reflection, LLEGO crossover, LLEGO mutation) are evaluated on Dpareto
+ - All non-dominated candidates are added to GEPA Pareto front
+ - Therefore, the best candidate MUST be in GEPA Pareto front
+
+ Returns:
+ Best candidate dictionary from GEPA Pareto front, or None if empty
+ """
+ # PRIMARY: Get best candidate from GEPA Pareto front (single source of truth)
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+
+ if pareto_log.pareto_front:
+ try:
+ # Get best candidate from GEPA Pareto front (highest score = best)
+ gepa_best = max(pareto_log.pareto_front, key=lambda x: x['score'])
+ gepa_fitness = gepa_best['score']
+ gepa_prompt = gepa_best['prompt']
+ gepa_type = gepa_best.get('type', 'unknown')
+ gepa_notation = gepa_best.get('notation', 'S')
+
+ self.logger.info(f"โ
Best candidate from GEPA Pareto front: {gepa_notation} with f({gepa_notation})={gepa_fitness:.4f}")
+ self.logger.info(f" Type: {gepa_type}, Prompt length: {len(gepa_prompt)} chars")
+ self.logger.info(f" ๐ก GEPA Pareto front is single source of truth (all candidates evaluated on Dpareto)")
+
+ return {
+ 'system_prompt': gepa_prompt,
+ 'fitness': gepa_fitness,
+ 'source': 'gepa_pareto_front',
+ 'candidate_type': gepa_type,
+ 'notation': gepa_notation
+ }
+ except Exception as e:
+ self.logger.error(f"โ Failed to get best from GEPA Pareto front: {e}")
+ import traceback
+ self.logger.error(traceback.format_exc())
+
+ # EDGE CASE: Pareto front empty (shouldn't happen, but handle gracefully)
+ self.logger.warning("โ ๏ธ GEPA Pareto front is empty - no best candidate available")
+ self.logger.warning(" This should not happen if all candidates are evaluated on Dpareto")
+ return None
+
+ def get_best_score(self) -> float:
+ """Get the best score from GEPA Pareto front (single source of truth)."""
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+
+ if pareto_log.pareto_front:
+ try:
+ gepa_best_fitness = max(p['score'] for p in pareto_log.pareto_front)
+ return gepa_best_fitness
+ except Exception as e:
+ self.logger.warning(f"โ ๏ธ Failed to get best fitness from GEPA Pareto front: {e}")
+
+ # Edge case: Pareto front empty - fallback to adapter's score
+ return self._best_score
+
+ def log_proposed_candidate(self, candidate: Dict[str, str], iteration: int = 0):
+ """
+ Pretty print the new proposed candidate prompt.
+
+ Args:
+ candidate: The new candidate prompt from GEPA
+ iteration: Current optimization iteration
+ """
+ system_prompt = candidate.get('system_prompt', '')
+ candidate_source = candidate.get('source', 'unknown')
+
+ # Store source in adapter state so evaluate() can access it
+ self._current_evaluation_type = candidate_source
+
+ # Also store in mapping by prompt text for lookup
+ if candidate_source != 'unknown' and system_prompt:
+ self._candidate_sources[system_prompt] = candidate_source
+
+ # Use clean logger for simpler output
+ from ..utils.clean_logger import get_clean_logger
+ clean_log = get_clean_logger()
+
+ # Update iteration if needed
+ if iteration > clean_log.current_iteration:
+ clean_log.log_iteration_start(iteration, seed_prompt=None)
+
+ # Don't log here - let evaluate() handle it with full context
+
+ def _log_reflection_dataset_creation(self, candidate: Dict[str, str], eval_batch: EvaluationBatch,
+ components_to_update: List[str]):
+ """
+ Pretty print the reflection dataset creation process.
+
+ Args:
+ candidate: Current candidate being evaluated
+ eval_batch: Evaluation results
+ components_to_update: Components being updated
+ """
+ system_prompt = candidate.get('system_prompt', '')
+
+ self.logger.info(f"๐ DEBUG: Inside _log_reflection_dataset_creation")
+ self.logger.info(f"๐ DEBUG: system_prompt length: {len(system_prompt)}")
+ self.logger.info(f"๐ DEBUG: eval_batch.scores: {eval_batch.scores}")
+ self.logger.info(f"๐ DEBUG: eval_batch.trajectories: {len(eval_batch.trajectories) if eval_batch.trajectories else 0}")
+
+ # Determine candidate notation
+ notation_map = {'seed': 'Sโ', 'gepa_reflection': 'Sแตฃ', 'llego_crossover': 'Oโโ', 'llego_mutation': 'Oโแตคโ'}
+ notation = notation_map.get(self._current_evaluation_type, 'S')
+ cand_num = self._evaluation_count if hasattr(self, '_evaluation_count') else '?'
+ cand_label = f"{notation}{cand_num}"
+
+ # Use logger for the main output too
+ self.logger.info("\n" + "="*80)
+ self.logger.info("๐ REFLECTION DATASET CREATION")
+ self.logger.info("="*80)
+
+ self.logger.info(f"\n๐ CURRENT PROMPT BEING ANALYZED: {cand_label}")
+ self.logger.info(f" Candidate Type: {self._current_evaluation_type or 'unknown'}")
+ self.logger.info("-" * 40)
+ self.logger.info(f'"{system_prompt}"')
+ self.logger.info("-" * 40)
+
+ self.logger.info(f"\n๐ EVALUATION SUMMARY:")
+ self.logger.info("-" * 40)
+ if eval_batch.scores:
+ avg_score = sum(eval_batch.scores) / len(eval_batch.scores)
+ min_score = min(eval_batch.scores)
+ max_score = max(eval_batch.scores)
+ self.logger.info(f" โข Average Score: {avg_score:.4f}")
+ self.logger.info(f" โข Min Score: {min_score:.4f}")
+ self.logger.info(f" โข Max Score: {max_score:.4f}")
+ self.logger.info(f" โข Total Samples: {len(eval_batch.scores)}")
+
+ self.logger.info(f"\n๐ฏ COMPONENTS TO UPDATE:")
+ self.logger.info("-" * 40)
+ for i, component in enumerate(components_to_update, 1):
+ self.logger.info(f" {i}. {component}")
+
+ if eval_batch.trajectories:
+ self.logger.info(f"\n๐ DETAILED ANALYSIS (FULL FEEDBACK - NO TRUNCATION):")
+ self.logger.info("-" * 80)
+ for i, trace in enumerate(eval_batch.trajectories[:5], 1): # Show first 5 samples with FULL details
+ evaluation_results = trace['evaluation_results']
+ composite_score = evaluation_results.get("composite_score", 0.0)
+
+ # Extract element IDs for concise logging
+ predicted_element = evaluation_results.get('predicted_element', 'Unknown')
+ expected_element = evaluation_results.get('expected_element', 'Unknown')
+
+ # Concise, direct logging with candidate notation
+ status_icon = "โ
" if composite_score == 1.0 else "โ"
+
+ # Add notation for candidate type
+ notation_map = {'seed': 'Sโ', 'gepa_reflection': 'Sแตฃ', 'llego_crossover': 'Oโโ', 'llego_mutation': 'Oโแตคโ'}
+ notation = notation_map.get(self._current_evaluation_type, 'S')
+
+ self.logger.info(f" [{notation}] Sample {i}: Predicted={predicted_element}, Expected={expected_element}, Score={composite_score:.2f} {status_icon}")
+
+ # ๐ฅ FIX: Pass trace and current_prompt to enable LLM-as-Judge!
+ feedback = self._generate_feedback(
+ evaluation_results,
+ trace=trace, # Pass the full trace!
+ current_prompt=system_prompt # Pass current prompt being analyzed!
+ )
+ self.logger.info(f" ๐ฌ FEEDBACK (FULL):")
+ self.logger.info(f" \"{feedback}\"")
+
+ if len(eval_batch.trajectories) > 5:
+ self.logger.info(f"\n ... and {len(eval_batch.trajectories) - 5} more samples (all logged similarly)")
+
+ self.logger.info("="*80)
+
+ def _extract_clean_prompt_from_reflection(self, reflection_output: str) -> str:
+ """
+ ๐ก๏ธ DEFENSIVE FALLBACK: Extract clean prompt if LLM adds analysis despite system prompt instructions.
+
+ NOTE: The system prompt now explicitly instructs the LLM to output ONLY the prompt text.
+ However, this extraction logic serves as a safety net in case the LLM still adds:
+ "Based on the performance analysis...
+ ### Recommendations...
+ ### Revised Prompt Example:
+ [THE ACTUAL PROMPT HERE]
+ ### Conclusion..."
+
+ This is now a defensive measure, not the primary mechanism.
+
+ Args:
+ reflection_output: Full reflection output (should be clean prompt, but may contain analysis)
+
+ Returns:
+ str: Clean, extracted prompt (or original if extraction fails or not needed)
+ """
+ if not reflection_output or not isinstance(reflection_output, str):
+ return reflection_output
+
+ # Pattern 1: Look for "Revised Prompt Example:" or "### Revised Prompt Example:"
+ patterns = [
+ r'(?:###\s*)?Revised\s+Prompt\s+(?:Example|:)?\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
+ r'(?:###\s*)?Revised\s+Prompt\s*:\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
+ r'(?:###\s*)?Optimized\s+Prompt\s*:\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
+ r'(?:###\s*)?New\s+Prompt\s*:\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
+ r'(?:Here\s+is|Here\'s)\s+a?\s*refined?\s+(?:version\s+of\s+)?(?:the\s+)?prompt\s*[:\n](.*?)(?:\n###|\n##|\n---|\Z)',
+ ]
+
+ for pattern in patterns:
+ match = re.search(pattern, reflection_output, re.IGNORECASE | re.DOTALL)
+ if match:
+ extracted = match.group(1).strip()
+ # Clean up common artifacts
+ extracted = re.sub(r'^```(?:plaintext|markdown|text)?\s*\n', '', extracted, flags=re.MULTILINE)
+ extracted = re.sub(r'\n```\s*$', '', extracted, flags=re.MULTILINE)
+ extracted = extracted.strip()
+
+ if len(extracted) > 50: # Reasonable minimum length for a prompt
+ self.logger.debug(f"โ
Extracted clean prompt using pattern: {pattern[:50]}...")
+ self.logger.debug(f" Original length: {len(reflection_output)} chars")
+ self.logger.debug(f" Extracted length: {len(extracted)} chars")
+ return extracted
+
+ # Pattern 2: If output starts with a quote or prompt-like structure
+ # Look for text that starts with "You are..." and is substantial
+ if 'You are' in reflection_output:
+ # Find the longest continuous block that starts with "You are"
+ prompt_match = re.search(r'(You are[^#]*?)(?:\n###|\n##|###|##|Conclusion|\Z)',
+ reflection_output, re.IGNORECASE | re.DOTALL)
+ if prompt_match:
+ extracted = prompt_match.group(1).strip()
+ if len(extracted) > 50:
+ self.logger.debug(f"โ
Extracted prompt starting with 'You are...'")
+ return extracted
+
+ # Pattern 3: If the reflection output is actually just a clean prompt (no analysis)
+ # Check if it's relatively short and doesn't contain analysis keywords
+ analysis_keywords = ['recommendation', 'suggestion', 'improvement', 'conclusion',
+ 'optimization', 'analysis', 'feedback']
+ if (len(reflection_output) < 2000 and
+ not any(keyword in reflection_output.lower() for keyword in analysis_keywords)):
+ # Likely a clean prompt, return as-is
+ self.logger.debug(f"โ
Reflection output appears to be a clean prompt (no analysis detected)")
+ return reflection_output.strip()
+
+ # Fallback: Return original (with warning)
+ self.logger.warning(f"โ ๏ธ Could not extract clean prompt from reflection output")
+ self.logger.warning(f" Output length: {len(reflection_output)} chars")
+ self.logger.warning(f" Output preview: {reflection_output[:200]}...")
+ self.logger.warning(f" Returning original output (may contain analysis text)")
+ return reflection_output.strip()
+
+ def _parse_json_variations(self, response_text: str, num_expected: int) -> List[str]:
+ """
+ ๐ฅ OPTIMIZED: Parse N prompt variations from JSON format response.
+
+ Uses robust JSON parsing with multiple fallback strategies:
+ 1. Extract JSON from markdown code blocks (```json ... ```)
+ 2. Find JSON object directly in text
+ 3. Attempt JSON repair for common issues
+ 4. Fallback to numbered section parsing if JSON fails
+
+ Args:
+ response_text: LLM response containing JSON with variations
+ num_expected: Expected number of variations
+
+ Returns:
+ List[str]: List of prompt variations (in order by index)
+
+ Raises:
+ ValueError: If parsing fails and no valid variations found
+ """
+ import json
+ import re
+
+ if not response_text or not isinstance(response_text, str):
+ raise ValueError("Empty or invalid response text")
+
+ # Strategy 1: Extract JSON from markdown code block
+ json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response_text, re.DOTALL)
+ if json_match:
+ json_str = json_match.group(1)
+ try:
+ data = json.loads(json_str)
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError as e:
+ self.logger.debug(f"JSON in code block invalid: {e}, trying repair...")
+
+ # Strategy 2: Find JSON object directly in text
+ json_match = re.search(r'\{[^{}]*"variations"[^{}]*\[.*?\]\s*[^{}]*\}', response_text, re.DOTALL)
+ if json_match:
+ json_str = json_match.group(0)
+ try:
+ data = json.loads(json_str)
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError:
+ # Try to find largest JSON object
+ json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
+ if json_match:
+ try:
+ data = json.loads(json_match.group(0))
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError:
+ pass
+
+ # Strategy 3: Attempt JSON repair (common issues: trailing commas, unescaped quotes)
+ json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
+ if json_match:
+ json_str = json_match.group(0)
+ # Try common repairs
+ repaired = re.sub(r',\s*}', '}', json_str) # Remove trailing commas before }
+ repaired = re.sub(r',\s*]', ']', repaired) # Remove trailing commas before ]
+ try:
+ data = json.loads(repaired)
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError:
+ pass
+
+ # Strategy 4: Fallback to numbered section parsing
+ self.logger.warning(f"JSON parsing failed, trying numbered section fallback...")
+ try:
+ return self._parse_numbered_section_variations(response_text, num_expected)
+ except ValueError:
+ pass
+
+ # All strategies failed
+ raise ValueError(f"Could not parse {num_expected} variations from response. Response preview: {response_text[:300]}...")
+
+ def _extract_variations_from_json(self, data: Dict[str, Any], num_expected: int) -> List[str]:
+ """Extract and validate variations from parsed JSON data."""
+ if not isinstance(data, dict):
+ raise ValueError("JSON data is not a dictionary")
+
+ variations_list = data.get('variations', [])
+ if not isinstance(variations_list, list):
+ raise ValueError("'variations' field is not a list")
+
+ if len(variations_list) < num_expected:
+ self.logger.warning(f"Expected {num_expected} variations, found {len(variations_list)} in JSON")
+
+ # Extract and sort by index
+ variations_with_index = []
+ for var in variations_list:
+ if not isinstance(var, dict):
+ continue
+ index = var.get('index', 0)
+ prompt = var.get('prompt', '')
+ if prompt and isinstance(prompt, str):
+ variations_with_index.append((index, prompt.strip()))
+
+ # Sort by index
+ variations_with_index.sort(key=lambda x: x[0])
+
+ # Extract just the prompts
+ variations = [v[1] for v in variations_with_index]
+
+ # Validate count
+ if len(variations) < num_expected:
+ self.logger.warning(f"Only {len(variations)} valid variations found, expected {num_expected}")
+ # Pad with duplicates if needed (not ideal but better than failing)
+ while len(variations) < num_expected:
+ variations.append(variations[-1] if variations else "")
+
+ # Take first N if we got more
+ variations = variations[:num_expected]
+
+ # Validate all variations are non-empty
+ if not all(v for v in variations):
+ raise ValueError(f"Some variations are empty after parsing")
+
+ return variations
+
+ def _parse_numbered_section_variations(self, response_text: str, num_expected: int) -> List[str]:
+ """
+ Fallback parser: Extract variations from numbered sections.
+
+ Format: --- VARIATION N --- or Variation N: or similar
+ """
+ variations = []
+
+ # Pattern 1: --- VARIATION N ---
+ pattern1 = r'---\s*VARIATION\s+(\d+)\s*---\s*\n(.*?)(?=\n---\s*VARIATION|\Z)'
+ matches1 = re.findall(pattern1, response_text, re.DOTALL | re.IGNORECASE)
+
+ # Pattern 2: Variation N:
+ pattern2 = r'Variation\s+(\d+)\s*:?\s*\n(.*?)(?=\nVariation\s+\d+|$)'
+ matches2 = re.findall(pattern2, response_text, re.DOTALL | re.IGNORECASE)
+
+ # Pattern 3: Numbered list (1. 2. 3.)
+ pattern3 = r'(\d+)\.\s*\n(.*?)(?=\n\d+\.|$)'
+ matches3 = re.findall(pattern3, response_text, re.DOTALL)
+
+ # Use the pattern with most matches
+ matches = matches1 if len(matches1) >= num_expected else (matches2 if len(matches2) >= num_expected else matches3)
+
+ if len(matches) >= num_expected:
+ # Sort by index
+ matches.sort(key=lambda x: int(x[0]))
+ # Extract prompts
+ variations = [match[1].strip() for match in matches[:num_expected]]
+
+ if len(variations) != num_expected:
+ raise ValueError(f"Numbered section parsing found {len(variations)} variations, expected {num_expected}")
+
+ return variations
+
+ def _generate_hybrid_candidates_adapter_level(
+ self,
+ current_prompt: str,
+ eval_batch: EvaluationBatch,
+ candidate: Dict[str, str]
+ ) -> List[str]:
+ """
+ ๐ฅ ADAPTER-LEVEL HYBRID CANDIDATE GENERATION
+
+ Generate candidates from BOTH GEPA reflection AND LLEGO operators
+ when GEPA's adapter mode ignores the reflection_lm parameter.
+
+ This method:
+ 1. Builds comprehensive feedback from evaluation results
+ 2. Generates GEPA reflection candidates
+ 3. Generates LLEGO crossover/mutation candidates
+ 4. Logs ALL candidates with FULL prompts (no truncation)
+ 5. Stores candidates for potential use
+
+ Args:
+ current_prompt: The current prompt being optimized
+ eval_batch: Evaluation results with trajectories
+ candidate: Current candidate dict
+
+ Returns:
+ List of generated candidate prompts
+ """
+ try:
+ from ..llms.llego_enhanced_llm import LLEGOEnhancedLLMClient
+
+ all_candidates = []
+ gepa_count = 0
+
+ # ๐ฅ CRITICAL: Pass format info to LLM client before generating candidates
+ if self._detected_format and self._reflection_lm_client:
+ if isinstance(self._reflection_lm_client, LLEGOEnhancedLLMClient):
+ self._reflection_lm_client._detected_format = self._detected_format
+ self.logger.info(f"๐ Passed format info to reflection LLM: {self._detected_format['format_type']}")
+
+ self.logger.info(f"๐ฅ STEP 1: Building comprehensive feedback from evaluation")
+
+ # ๐ฅ REMOVED: Excessive diagnostic logs - moved to DEBUG level
+ # Build comprehensive feedback text from trajectories
+ if not hasattr(eval_batch, 'trajectories'):
+ self.logger.error(f"โ eval_batch has no 'trajectories' attribute! Type: {type(eval_batch)}")
+ return []
+
+ trajectories = eval_batch.trajectories
+ if not trajectories:
+ self.logger.warning(f"โ ๏ธ eval_batch.trajectories is empty - no feedback to generate candidates from")
+ return []
+
+ self.logger.debug(f"Processing {len(trajectories)} trajectories for feedback generation")
+
+ feedback_lines = []
+ feedback_lines.append(f"Current prompt performance analysis:\n")
+ feedback_lines.append(f"Current prompt:\n{current_prompt}\n")
+ feedback_lines.append(f"\nEvaluation results:\n")
+
+ for i, trace in enumerate(trajectories[:8], 1): # Use up to 8 samples for feedback
+ try:
+ eval_results = trace.get('evaluation_results', {})
+ score = eval_results.get("composite_score", 0.0) if isinstance(eval_results, dict) else 0.0
+ input_data = trace.get('input_data', {})
+ predicted = trace.get('predicted_output', '')
+ expected = input_data.get('output', '') if isinstance(input_data, dict) else ''
+
+ # ๐ฅ FIX: Clean input_data to remove base64 images before logging
+ input_data_clean = input_data.copy() if isinstance(input_data, dict) else {}
+ if 'image_base64' in input_data_clean:
+ input_data_clean['image_base64'] = f"[IMAGE_DATA_{len(input_data_clean['image_base64'])}_chars]"
+
+ feedback_lines.append(f" Sample {i}:")
+ feedback_lines.append(f" Input: {input_data_clean.get('input', '') if isinstance(input_data_clean, dict) else ''}")
+ feedback_lines.append(f" Expected: {expected}")
+ feedback_lines.append(f" Predicted: {predicted}")
+ feedback_lines.append(f" Score: {score:.4f}")
+
+ if isinstance(eval_results, dict):
+ # ๐ฅ FIX: Pass trace and current_prompt to enable LLM-as-Judge!
+ feedback = self._generate_feedback(
+ eval_results,
+ trace=trace, # Pass the full trace!
+ current_prompt=current_prompt # Pass current prompt!
+ )
+ feedback_lines.append(f" Feedback: {feedback}")
+ else:
+ feedback_lines.append(f" Feedback: Evaluation results not in expected format")
+ feedback_lines.append("")
+ except Exception as e:
+ self.logger.error(f"โ Error processing trace {i}: {e}")
+ import traceback
+ self.logger.error(traceback.format_exc())
+ continue
+
+ feedback_text = "\n".join(feedback_lines)
+
+ self.logger.info(f"\n๐ FULL FEEDBACK TEXT (NO TRUNCATION):")
+ self.logger.info(feedback_text)
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # PART 1: GEPA REFLECTION CANDIDATES
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ self.logger.info(f"๐ PART 2: GEPA REFLECTION - Semantic Understanding")
+
+ num_gepa = self._config.num_gepa_reflection_candidates if hasattr(self._config, 'num_gepa_reflection_candidates') else 3
+
+ self.logger.info(f"\n๐ Generating {num_gepa} GEPA Reflection candidates in single optimized call...")
+
+ # Set reflection context
+ if isinstance(self._reflection_lm_client, LLEGOEnhancedLLMClient):
+ self._reflection_lm_client.set_reflection_context(
+ current_prompt=current_prompt,
+ feedback=eval_batch,
+ in_reflection=True
+ )
+
+ # ๐ฅ OPTIMIZED: Single call with JSON format for multiple variations
+ try:
+ # Precision-engineered system prompt requesting JSON format
+ optimization_system_prompt = f"""You are an expert prompt engineer specializing in iterative prompt optimization.
+
+Your task: Given the CURRENT PROMPT and its EVALUATION FEEDBACK, generate {num_gepa} DISTINCT variations of improved prompts that address the identified issues through DIFFERENT improvement strategies.
+
+CRITICAL OUTPUT FORMAT - MUST BE VALID JSON:
+{{
+ "variations": [
+ {{
+ "index": 1,
+ "prompt": "[First improved prompt text - complete and self-contained]"
+ }},
+ {{
+ "index": 2,
+ "prompt": "[Second improved prompt text - complete and self-contained]"
+ }},
+ {{
+ "index": 3,
+ "prompt": "[Third improved prompt text - complete and self-contained]"
+ }}
+ ]
+}}
+
+DIVERSITY REQUIREMENTS:
+- Variation 1: Focus on clarity, specificity, and explicit instructions
+- Variation 2: Focus on edge case handling, robustness, and error prevention
+- Variation 3: Focus on structural organization, examples, and step-by-step guidance
+- Each variation must be MEANINGFULLY DIFFERENT (not just rewordings)
+- Each variation must address ALL feedback issues but through different approaches
+
+QUALITY STANDARDS (apply to all variations):
+- Be specific and concrete (avoid vague instructions)
+- Use clear, imperative language for task instructions
+- Include edge case handling if feedback identifies confusion
+- Ensure each prompt is self-contained and unambiguous
+- Preserve the core task domain and output format requirements
+
+OUTPUT FORMAT:
+- Output MUST be valid JSON (can be wrapped in ```json ... ``` markdown code block)
+- Generate EXACTLY {num_gepa} variations
+- Index must be 1, 2, 3, ... (sequential, starting at 1)
+- Each "prompt" field must contain the complete, self-contained prompt text
+- NO explanations, NO analysis, NO meta-commentary - just the JSON structure
+
+DO NOT include:
+- Analysis of what went wrong
+- Explanations of your changes
+- Meta-text like "Here's an improved version..." or "Based on feedback..."
+- Recommendations or suggestions (those are already in the feedback)
+- Any text outside the JSON structure
+
+Output ONLY the JSON object with the variations."""
+
+ # Construct user prompt with clear structure
+ optimization_user_prompt = f"""CURRENT PROMPT (to be improved):
+{current_prompt}
+
+{feedback_text}
+
+TASK: Generate {num_gepa} DISTINCT variations of improved prompts. Each variation should:
+- Address ALL feedback issues identified above
+- Use a DIFFERENT improvement strategy (clarity, robustness, structure)
+- Be meaningfully different from the others (not just rewordings)
+- Be complete and self-contained
+
+Remember: Output ONLY the JSON object with {num_gepa} variations. No explanations."""
+
+ result = self._reflection_lm_client.generate(
+ system_prompt=optimization_system_prompt,
+ user_prompt=optimization_user_prompt,
+ image_base64=""
+ )
+
+ if isinstance(result, dict):
+ response_text = result.get("content", str(result))
+ else:
+ response_text = str(result)
+
+ # Parse JSON variations
+ gepa_variations = self._parse_json_variations(response_text, num_gepa)
+
+ # Add all variations to candidates
+ for idx, variation_prompt in enumerate(gepa_variations, 1):
+ # ๐ก๏ธ DEFENSIVE FALLBACK: Extract clean prompt if LLM adds analysis despite instructions
+ gepa_candidate = self._extract_clean_prompt_from_reflection(variation_prompt)
+
+ if gepa_candidate != variation_prompt:
+ self.logger.debug(f" Variation {idx}: Extracted clean prompt (removed {len(variation_prompt) - len(gepa_candidate)} chars)")
+
+ all_candidates.append({
+ 'prompt': gepa_candidate,
+ 'source': 'gepa_reflection',
+ 'index': idx
+ })
+
+ # ๐ฅ CAPTURE CANDIDATE FOR LIVE UI DISPLAY
+ try:
+ import sys
+ if 'app' in sys.modules:
+ app_module = sys.modules['app']
+ if hasattr(app_module, 'add_candidate_to_store'):
+ app_module.add_candidate_to_store({
+ 'prompt': gepa_candidate,
+ 'source': 'gepa_reflection',
+ 'timestamp': f"Candidate #{idx}"
+ })
+ except Exception:
+ pass # Silent fail - UI capture is optional
+
+ self.logger.info(f"\nโ
GEPA REFLECTION CANDIDATE #{idx}/{num_gepa} (FULL PROMPT - NO TRUNCATION):")
+ self.logger.info(f"{'โ'*80}")
+ self.logger.info(f"{gepa_candidate}")
+ self.logger.info(f"{'โ'*80}")
+ self.logger.info(f" Length: {len(gepa_candidate)} chars, Words: {len(gepa_candidate.split())}")
+
+ gepa_count = len(all_candidates)
+ self.logger.info(f"\nโ
GEPA Reflection: {gepa_count} candidates generated in single optimized call")
+
+ except Exception as e:
+ self.logger.error(f"โ Error generating GEPA reflection candidates: {e}")
+ self.logger.warning(f" Falling back to sequential generation...")
+ import traceback
+ self.logger.debug(traceback.format_exc())
+
+ # Fallback: Sequential generation (when JSON parsing fails)
+ for i in range(num_gepa):
+ self.logger.info(f"\n๐ Generating GEPA Reflection candidate #{i+1}/{num_gepa} (fallback mode)...")
+ try:
+ fallback_user_prompt = f"""CURRENT PROMPT (to be improved):
+{current_prompt}
+
+{feedback_text}
+
+TASK: Generate an improved version of the CURRENT PROMPT that addresses all issues identified in the evaluation feedback above.
+
+Remember: Output ONLY the improved prompt text. No explanations."""
+
+ result = self._reflection_lm_client.generate(
+ system_prompt=self._FALLBACK_SYSTEM_PROMPT,
+ user_prompt=fallback_user_prompt,
+ image_base64=""
+ )
+
+ if isinstance(result, dict):
+ gepa_candidate_raw = result.get("content", str(result))
+ else:
+ gepa_candidate_raw = str(result)
+
+ gepa_candidate = self._extract_clean_prompt_from_reflection(gepa_candidate_raw)
+
+ all_candidates.append({
+ 'prompt': gepa_candidate,
+ 'source': 'gepa_reflection',
+ 'index': i + 1
+ })
+
+ # ๐ฅ CAPTURE CANDIDATE FOR LIVE UI DISPLAY
+ try:
+ import sys
+ if 'app' in sys.modules:
+ app_module = sys.modules['app']
+ if hasattr(app_module, 'add_candidate_to_store'):
+ app_module.add_candidate_to_store({
+ 'prompt': gepa_candidate,
+ 'source': 'gepa_reflection',
+ 'timestamp': f"Fallback #{i+1}"
+ })
+ except Exception:
+ pass # Silent fail - UI capture is optional
+ except Exception as fallback_error:
+ self.logger.error(f"โ Error in fallback generation #{i+1}: {fallback_error}")
+
+ gepa_count = len(all_candidates)
+ if gepa_count > 0:
+ self.logger.info(f"\nโ
GEPA Reflection: {gepa_count} candidates generated")
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # PART 2: LLEGO GENETIC OPERATORS
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ self.logger.info(f"๐งฌ PART 3: LLEGO GENETIC OPERATORS - Structural Diversity")
+
+ if self.llego:
+ # ๐ฅ FIX 2: Get Pareto front from GEPA (not LLEGO population)
+ # This ensures LLEGO operators use true non-dominated solutions
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+ gepa_pareto_front = pareto_log.pareto_front
+
+ # Convert GEPA Pareto front to PromptCandidate format
+ pareto_candidates = self.llego._convert_gepa_pareto_to_candidates(gepa_pareto_front)
+ pareto_front = pareto_candidates
+
+ self.logger.info(f" Using GEPA Pareto front (size: {len(gepa_pareto_front)})")
+ self.logger.info(f" Converted to {len(pareto_front)} PromptCandidate objects")
+ for idx, p in enumerate(pareto_front, 1):
+ cand_type = p.metadata.get('candidate_type', 'unknown') if p.metadata else 'unknown'
+ notation = p.metadata.get('notation', 'S') if p.metadata else 'S'
+ self.logger.info(f" {notation}: [fitness={p.fitness:.3f}, type={cand_type}, length={len(p.prompt)} chars]")
+
+ # Create LLM callable for LLEGO
+ def llm_callable(genetic_prompt: str) -> str:
+ # ๐ฅ LLEGO genetic prompt already contains full instructions
+ # Use minimal system prompt to avoid instruction conflict
+ result = self._reflection_lm_client.generate(
+ system_prompt="You are an expert prompt engineer. Follow the instructions provided in the user message to generate an improved prompt. Output only the prompt text, no explanations.",
+ user_prompt=genetic_prompt,
+ image_base64=""
+ )
+ if isinstance(result, dict):
+ return result.get('content', str(result))
+ return str(result)
+
+ # Generate LLEGO offspring
+ try:
+ llego_prompts = self.llego.evolve_generation(
+ llm=llm_callable,
+ pareto_front=pareto_front
+ )
+
+ n_crossover = self._config.n_crossover if hasattr(self._config, 'n_crossover') else 2
+ crossover_count = min(n_crossover, len(llego_prompts))
+
+ for i, prompt in enumerate(llego_prompts):
+ if i < crossover_count:
+ source = 'llego_crossover'
+ else:
+ source = 'llego_mutation'
+
+ all_candidates.append({
+ 'prompt': prompt,
+ 'source': source,
+ 'index': i + 1
+ })
+
+ # ๐ฅ CAPTURE CANDIDATE FOR LIVE UI DISPLAY
+ try:
+ import sys
+ if 'app' in sys.modules:
+ app_module = sys.modules['app']
+ if hasattr(app_module, 'add_candidate_to_store'):
+ app_module.add_candidate_to_store({
+ 'prompt': prompt,
+ 'source': source,
+ 'timestamp': f"Candidate #{i+1}"
+ })
+ except Exception:
+ pass # Silent fail - UI capture is optional
+
+ border_char = "โ" if source == 'llego_crossover' else "โ"
+ self.logger.info(f"\n{border_char*80}")
+ self.logger.info(f"{border_char} {'๐ LLEGO CROSSOVER' if source == 'llego_crossover' else '๐ฒ LLEGO MUTATION'} candidate #{i+1}")
+ self.logger.info(f"{border_char*80}")
+ self.logger.info(f"{prompt}")
+ self.logger.info(f"{border_char*80}")
+ self.logger.info(f" Length: {len(prompt)} chars, Words: {len(prompt.split())}")
+
+ self.logger.info(f"โ
LLEGO Genetic Operators: {len(llego_prompts)} candidates generated")
+
+ except Exception as e:
+ self.logger.error(f"โ Error generating LLEGO candidates: {e}")
+ import traceback
+ self.logger.error(traceback.format_exc())
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # SUMMARY
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ self.logger.info(f"\n{'='*80}")
+ self.logger.info(f"๐ ADAPTER-LEVEL HYBRID GENERATION SUMMARY")
+ self.logger.info(f"{'='*80}")
+ self.logger.info(f" ๐ GEPA Reflection: {gepa_count} candidates")
+ self.logger.info(f" ๐ LLEGO Crossover: {len([c for c in all_candidates if c['source'] == 'llego_crossover'])} candidates")
+ self.logger.info(f" ๐ฒ LLEGO Mutation: {len([c for c in all_candidates if c['source'] == 'llego_mutation'])} candidates")
+ self.logger.info(f" ๐ฆ TOTAL: {len(all_candidates)} diverse candidates")
+ self.logger.info(f"{'='*80}\n")
+
+ # Store candidates (GEPA might access them through some mechanism)
+ self._generated_candidates = all_candidates
+
+ # Log each candidate with FULL text
+ self.logger.info(f"\n{'='*80}")
+ self.logger.info(f"๐ ALL GENERATED CANDIDATES (FULL PROMPTS - NO TRUNCATION)")
+ self.logger.info(f"{'='*80}")
+ for i, cand in enumerate(all_candidates, 1):
+ source_emoji = "๐" if cand['source'] == 'gepa_reflection' else "๐" if cand['source'] == 'llego_crossover' else "๐ฒ"
+ self.logger.info(f"\n{source_emoji} CANDIDATE #{i} - {cand['source'].upper().replace('_', ' ')}")
+ self.logger.info(f"{cand['prompt']}")
+ self.logger.info(f" Length: {len(cand['prompt'])} characters")
+ self.logger.info(f" Words: {len(cand['prompt'].split())} words")
+ self.logger.info(f"{'='*80}\n")
+
+ # Return candidates as list of dicts with metadata (not just strings)
+ # This ensures source information is preserved
+ return all_candidates # Return full dicts with source info
+
+ except Exception as e:
+ self.logger.error(f"\n{'โ'*80}")
+ self.logger.error(f"โ CRITICAL ERROR in _generate_hybrid_candidates_adapter_level!")
+ self.logger.error(f"โ Error: {str(e)}")
+ self.logger.error(f"{'โ'*80}\n")
+ import traceback
+ self.logger.error(traceback.format_exc())
+ return []
+
+ def propose_new_texts(
+ self,
+ candidate: Dict[str, str],
+ reflective_dataset: Dict[str, List[Dict[str, Any]]],
+ components_to_update: List[str]
+ ) -> Dict[str, str]:
+ """
+ ๐ฅ CRITICAL: This method is called by GEPA to propose new component texts.
+
+ This is the KEY integration point - GEPA checks if adapter.propose_new_texts exists,
+ and if it does, uses it instead of the default InstructionProposalSignature.
+
+ This method:
+ 1. Uses reflective_dataset to generate improved prompts
+ 2. Optionally uses LLEGO for additional diversity
+ 3. Returns dict mapping component_name -> new component text
+
+ Args:
+ candidate: Current candidate dict (component_name -> component_text)
+ reflective_dataset: Feedback data per component (from make_reflective_dataset)
+ components_to_update: List of component names to update
+
+ Returns:
+ Dict mapping component_name -> new component text
+ """
+ self.logger.info(f"\n{'='*80}")
+ self.logger.info(f"๐ฏ PROPOSE_NEW_TEXTS CALLED BY GEPA")
+ self.logger.info(f"{'='*80}")
+ self.logger.info(f" Components to update: {components_to_update}")
+ self.logger.info(f" Reflective dataset keys: {list(reflective_dataset.keys())}")
+
+ # ๐ฅ FIX: Check if we already generated candidates in hybrid mode
+ # If yes, return one of them instead of generating a new one (avoids duplicate work and context overflow)
+ if hasattr(self, '_generated_candidates') and self._generated_candidates:
+ self.logger.info(f"\nโ
HYBRID MODE: Using pre-generated candidates from make_reflective_dataset")
+ self.logger.info(f" Available candidates: {len(self._generated_candidates)}")
+ self.logger.info(f" Returning first candidate (GEPA will evaluate all of them)")
+
+ # Return the first candidate (GEPA will get others via queue)
+ first_candidate = self._generated_candidates[0]
+ new_texts = {}
+ for component in components_to_update:
+ if isinstance(first_candidate, dict) and 'prompt' in first_candidate:
+ new_texts[component] = first_candidate['prompt']
+ source = first_candidate.get('source', 'unknown')
+ self.logger.info(f" Returning {source} candidate (length: {len(first_candidate['prompt'])} chars)")
+ else:
+ new_texts[component] = str(first_candidate)
+
+ self.logger.info(f"{'='*80}\n")
+ return new_texts
+
+ new_texts = {}
+
+ # Check if we have reflection_lm_client (required for proposal)
+ if not self._reflection_lm_client:
+ self.logger.error("โ reflection_lm_client not available - cannot generate proposals")
+ # Fallback: return current candidate (no change)
+ for component in components_to_update:
+ new_texts[component] = candidate.get(component, '')
+ return new_texts
+
+ # For each component to update
+ for component_name in components_to_update:
+ self.logger.info(f"๐ Proposing new text for component: {component_name}")
+
+ current_text = candidate.get(component_name, '')
+ dataset = reflective_dataset.get(component_name, [])
+
+ if not dataset:
+ self.logger.warning(f"โ ๏ธ No feedback data for {component_name}, keeping current text")
+ new_texts[component_name] = current_text
+ continue
+
+ self.logger.info(f" Current text length: {len(current_text)} chars")
+ self.logger.info(f" Feedback examples: {len(dataset)}")
+
+ # Generate improved prompt using reflection LM
+ try:
+ # ๐ฅ FIX: Clean dataset to remove base64 images (prevents context overflow)
+ cleaned_dataset = []
+ for item in dataset:
+ cleaned_item = item.copy()
+ # Remove or truncate base64 image data
+ if 'image_base64' in cleaned_item:
+ img_len = len(cleaned_item['image_base64'])
+ cleaned_item['image_base64'] = f'[IMAGE_DATA_REMOVED_{img_len}_chars]'
+ if 'image' in cleaned_item and isinstance(cleaned_item['image'], str) and len(cleaned_item['image']) > 1000:
+ img_len = len(cleaned_item['image'])
+ cleaned_item['image'] = f'[IMAGE_DATA_REMOVED_{img_len}_chars]'
+ # Also clean any nested detailed_scores
+ if 'detailed_scores' in cleaned_item and isinstance(cleaned_item['detailed_scores'], dict):
+ for key in list(cleaned_item['detailed_scores'].keys()):
+ val = cleaned_item['detailed_scores'][key]
+ if isinstance(val, str) and len(val) > 5000:
+ cleaned_item['detailed_scores'][key] = f'[LARGE_DATA_REMOVED_{len(val)}_chars]'
+ cleaned_dataset.append(cleaned_item)
+
+ self.logger.info(f" ๐ Cleaned dataset: removed base64 images to prevent context overflow")
+
+ # Use GEPA's default instruction proposal format
+ from gepa.strategies.instruction_proposal import InstructionProposalSignature
+
+ # Build input dict for GEPA's instruction proposal
+ input_dict = {
+ "current_instruction_doc": current_text,
+ "dataset_with_feedback": cleaned_dataset # Use cleaned dataset!
+ }
+
+ # Generate prompt using GEPA's signature
+ prompt = InstructionProposalSignature.prompt_renderer(input_dict)
+
+ # Call reflection LM to generate new instruction
+ self.logger.info(f" Generating improved prompt via reflection LM...")
+
+ result = self._reflection_lm_client.generate(
+ system_prompt="You are an expert prompt engineer. Follow the instructions in the user message to generate an improved prompt.",
+ user_prompt=prompt,
+ image_base64=""
+ )
+
+ # Extract response
+ if isinstance(result, dict):
+ response_text = result.get("content", str(result))
+ else:
+ response_text = str(result)
+
+ # Extract instruction using GEPA's extractor
+ extracted = InstructionProposalSignature.output_extractor(response_text)
+ new_instruction = extracted.get("new_instruction", response_text.strip())
+
+ # Clean up the instruction (remove markdown, quotes, etc.)
+ new_instruction = self._clean_extracted_prompt(new_instruction)
+
+ self.logger.info(f" โ
Generated new text (length: {len(new_instruction)} chars)")
+ self.logger.info(f" Preview: {new_instruction[:150]}...")
+
+ new_texts[component_name] = new_instruction
+
+ except Exception as e:
+ self.logger.error(f"โ Error generating proposal for {component_name}: {e}")
+ import traceback
+ self.logger.error(traceback.format_exc())
+ # Fallback: return current text
+ new_texts[component_name] = current_text
+
+ self.logger.info(f"\n{'='*80}")
+ self.logger.info(f"โ
PROPOSE_NEW_TEXTS COMPLETE")
+ self.logger.info(f" Generated {len(new_texts)} new component texts")
+ self.logger.info(f"{'='*80}\n")
+
+ return new_texts
+
+ def _clean_extracted_prompt(self, prompt: str) -> str:
+ """
+ Clean extracted prompt by removing markdown, quotes, and extra whitespace.
+
+ Args:
+ prompt: Raw extracted prompt text
+
+ Returns:
+ Cleaned prompt text
+ """
+ if not prompt:
+ return prompt
+
+ # Remove markdown code blocks
+ prompt = re.sub(r'```[\w]*\n?', '', prompt)
+ prompt = re.sub(r'```', '', prompt)
+
+ # Remove quotes if entire prompt is quoted
+ prompt = prompt.strip()
+ if (prompt.startswith('"') and prompt.endswith('"')) or \
+ (prompt.startswith("'") and prompt.endswith("'")):
+ prompt = prompt[1:-1]
+
+ # Remove leading/trailing whitespace
+ prompt = prompt.strip()
+
+ return prompt
\ No newline at end of file
diff --git a/src/gepa_optimizer/data/__init__.py b/src/gepa_optimizer/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8fbd2af54c03c26e5d54c79d12e6d16140d76190
--- /dev/null
+++ b/src/gepa_optimizer/data/__init__.py
@@ -0,0 +1,27 @@
+"""
+Data module for GEPA Optimizer
+"""
+
+from .converters import UniversalConverter
+from .loaders import DataLoader
+from .validators import DataValidator
+from .scroll_dataset_loader import ScrollDatasetLoader, load_scroll_dataset
+from .validation_dataset_loader import ValidationDatasetLoader, load_validation_dataset, load_validation_split
+from .index_caching_loader import IndexCachingDatasetLoader, load_index_caching_dataset, load_index_caching_split
+
+__all__ = [
+ "UniversalConverter",
+ "DataLoader",
+ "DataValidator",
+ # Scroll dataset
+ "ScrollDatasetLoader",
+ "load_scroll_dataset",
+ # Validation dataset
+ "ValidationDatasetLoader",
+ "load_validation_dataset",
+ "load_validation_split",
+ # Index caching dataset
+ "IndexCachingDatasetLoader",
+ "load_index_caching_dataset",
+ "load_index_caching_split",
+]
diff --git a/src/gepa_optimizer/data/converters.py b/src/gepa_optimizer/data/converters.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc46ec186d799e47b92c756e594a8f796b8e8364
--- /dev/null
+++ b/src/gepa_optimizer/data/converters.py
@@ -0,0 +1,265 @@
+"""
+Universal converter for dataset to GEPA format with 3-way split (train/val/test)
+"""
+
+import os
+import json
+from typing import Any, List, Tuple, Union, Dict, Optional
+from pathlib import Path
+import pandas as pd
+import logging
+
+from .loaders import DataLoader
+from ..utils.exceptions import DatasetError
+from ..models.config import DataSplitConfig
+
logger = logging.getLogger(__name__)

class UniversalConverter:
    """
    Universal converter for datasets to GEPA format.

    Handles 3-way splitting (train/val/test) with configurable ratios and
    graceful handling of small datasets. All conversion failures are wrapped
    in DatasetError with the original exception chained (``from e``) so the
    root cause is preserved in tracebacks.
    """

    def __init__(self, data_split_config: Optional[DataSplitConfig] = None):
        """
        Initialize converter with optional split configuration.

        Args:
            data_split_config: Configuration for train/val/test splits.
                If None, uses default 60/20/20 split.
        """
        self.supported_extensions = [
            '.csv', '.json', '.jsonl', '.txt', '.md',
            '.png', '.jpg', '.jpeg'
        ]
        self.loader = DataLoader()
        self.data_split_config = data_split_config or DataSplitConfig()

    def convert(
        self,
        dataset: Union[List[Any], str, Any, Dict[str, Any]],
        split_config: Optional[DataSplitConfig] = None
    ) -> Tuple[List[dict], List[dict], List[dict]]:
        """
        Convert any dataset to GEPA format with 3-way split (train/val/test).

        Args:
            dataset: Input dataset in any supported format (file path, list,
                pandas DataFrame, single record, or a UI-tree descriptor dict).
            split_config: Optional split configuration (overrides instance config)

        Returns:
            Tuple of (trainset, valset, testset) where:
            - trainset: Used for reflection/feedback (Dfeedback in GEPA paper)
            - valset: Used for Pareto selection (Dpareto in GEPA paper)
            - testset: Held-out for final evaluation (not passed to GEPA)

        Raises:
            DatasetError: If dataset cannot be converted or is too small
        """
        try:
            # Use provided split config or instance default.
            config = split_config or self.data_split_config

            # Handle UI tree dataset format (directories of JSON + screenshots).
            if isinstance(dataset, dict) and dataset.get('type') == 'ui_tree_dataset':
                return self.convert_ui_tree_dataset(
                    dataset.get('json_dir', 'json_tree'),
                    dataset.get('screenshots_dir', 'screenshots'),
                    split_config=config
                )
            elif isinstance(dataset, str):
                data = self._load_from_path(dataset)
            elif hasattr(dataset, 'to_dict'):  # pandas DataFrame (duck-typed)
                data = dataset.to_dict(orient='records')
            elif isinstance(dataset, list):
                data = dataset
            else:
                # Single record: wrap so the rest of the pipeline sees a list.
                data = [dataset]

            logger.info(f"Normalized data length: {len(data)}")
            standardized = self._standardize(data)
            train, val, test = self._split_three_way(standardized, config)
            return train, val, test
        except (FileNotFoundError, ValueError, TypeError) as e:
            # Chain the original exception so the root cause is preserved.
            raise DatasetError(f"Failed to convert dataset: {str(e)}") from e

    def _load_from_path(self, path: str) -> List[Any]:
        """Load data from a file path; raises on missing/unsupported files."""
        p = Path(path)
        if not p.exists():
            raise FileNotFoundError(f"File not found: {path}")

        ext = p.suffix.lower()
        if ext in self.supported_extensions:
            return [self.loader.load(p)]
        else:
            raise DatasetError(f"Unsupported file extension: {ext}")

    def _standardize(self, data: List[Any]) -> List[dict]:
        """Standardize data to input/output format.

        Handles both UI tree JSON format and simple text inputs.
        UI tree format should have: {'screenshot': str, 'ui_tree': dict, 'expected_output': str}
        Simple format can be: {'input': str, 'output': str} or {'question': str, 'answer': str} etc.
        """
        out = []
        for item in data:
            if not isinstance(item, dict):
                item = {'input': str(item)}

            # Handle UI tree JSON format.
            if 'ui_tree' in item and 'screenshot' in item:
                ui_tree = item['ui_tree']
                input_text = ui_tree.get('text', '')
                output_text = item.get('expected_output', '')
                image = item.get('screenshot', '')
                out.append({'input': input_text, 'output': output_text, 'image': image})
            # Handle simple text format: probe common key aliases.
            else:
                inp = self._extract(item, ['input', 'question', 'text', 'prompt']) or ''
                outp = self._extract(item, ['output', 'result', 'response', 'answer', 'expected_output']) or ''
                image = self._extract(item, ['image', 'image_base64', 'screenshot']) or ''
                out.append({'input': inp, 'output': outp, 'image': image})

        return out

    def _extract(self, d: dict, keys: List[str]) -> Union[str, None]:
        """Return the value of the first key present in *d*, or None."""
        for k in keys:
            if k in d:
                return d[k]
        return None

    def _split_three_way(
        self,
        data: List[dict],
        config: DataSplitConfig
    ) -> Tuple[List[dict], List[dict], List[dict]]:
        """
        Split data into train, validation, and test sets.

        Args:
            data: Standardized dataset
            config: Split configuration with ratios and strategies

        Returns:
            Tuple of (train, val, test) datasets

        Raises:
            DatasetError: If dataset is too small for configured splits
        """
        dataset_size = len(data)

        # Log the adaptive strategy when it is in use.
        if config.small_dataset_strategy == 'adaptive':
            train_ratio, val_ratio, test_ratio = config.get_adaptive_ratios(dataset_size)
            logger.info(
                f"Adaptive dataset splitting (strategy: adaptive, size: {dataset_size}): "
                f"ratios = {train_ratio*100:.0f}%/{val_ratio*100:.0f}%/{test_ratio*100:.0f}% "
                f"(prioritizes validation for reliable candidate ranking)"
            )

        # Get split indices from config.
        try:
            train_end, val_end, test_end, _ = config.get_split_indices(dataset_size)
        except ValueError as e:
            logger.error(f"Dataset split error: {e}")
            raise DatasetError(str(e)) from e

        # Perform the split.
        train = data[:train_end]
        val = data[train_end:val_end]
        test = data[val_end:test_end]

        # Log split information with strategy.
        strategy_note = ""
        if config.small_dataset_strategy == 'adaptive':
            strategy_note = " (adaptive)"
        logger.info(
            f"Dataset split{strategy_note}: {len(train)} train ({len(train)/dataset_size*100:.1f}%), "
            f"{len(val)} val ({len(val)/dataset_size*100:.1f}%), "
            f"{len(test)} test ({len(test)/dataset_size*100:.1f}%)"
        )

        # Validate splits are not empty.
        if len(train) == 0:
            raise DatasetError("Training set is empty after split")
        if len(val) == 0:
            logger.warning("Validation set is empty - this may cause issues with Pareto selection")
            val = [train[-1]]  # Use last training sample as fallback
        if len(test) == 0:
            logger.warning("Test set is empty - final evaluation will not be performed")

        return train, val, test

    def _split(self, data: List[dict], ratio: float = 0.8) -> Tuple[List[dict], List[dict]]:
        """
        DEPRECATED: Legacy 2-way split for backwards compatibility.

        Use _split_three_way() instead for production code.

        Args:
            data: Standardized dataset
            ratio: Train ratio (0.0-1.0)

        Returns:
            Tuple of (train, val) datasets
        """
        import warnings
        warnings.warn(
            "_split() is deprecated. Use _split_three_way() for 3-way splitting.",
            DeprecationWarning,
            stacklevel=2
        )

        split = max(1, int(len(data) * ratio))
        train = data[:split]
        val = data[split:] or data[-1:]  # Ensure val is not empty
        return train, val

    def convert_ui_tree_dataset(
        self,
        json_dir: str,
        screenshots_dir: str,
        split_config: Optional[DataSplitConfig] = None
    ) -> Tuple[List[dict], List[dict], List[dict]]:
        """
        Convert UI tree dataset (JSON + screenshots) to GEPA format with 3-way split.

        Args:
            json_dir: Directory containing JSON files
            screenshots_dir: Directory containing screenshot images
            split_config: Optional split configuration (overrides instance config)

        Returns:
            Tuple of (train_data, val_data, test_data) in GEPA format

        Raises:
            DatasetError: If dataset cannot be loaded or is invalid
        """
        try:
            # Load paired dataset.
            dataset = self.loader.load_ui_tree_dataset(json_dir, screenshots_dir)

            if not dataset:
                raise DatasetError("No valid image-JSON pairs found")

            logger.info(f"Loaded {len(dataset)} UI tree samples")

            # Use provided config or instance default.
            config = split_config or self.data_split_config

            # Split into train/val/test.
            train, val, test = self._split_three_way(dataset, config)

            logger.info(
                f"Split UI tree dataset: {len(train)} train, "
                f"{len(val)} validation, {len(test)} test"
            )
            return train, val, test

        except DatasetError:
            # Re-raise converter errors untouched so messages are not double-wrapped.
            raise
        except Exception as e:
            raise DatasetError(f"Failed to convert UI tree dataset: {str(e)}") from e
diff --git a/src/gepa_optimizer/data/index_caching_loader.py b/src/gepa_optimizer/data/index_caching_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..a5e38edd78fbd009bca23e28505da62c89841c99
--- /dev/null
+++ b/src/gepa_optimizer/data/index_caching_loader.py
@@ -0,0 +1,278 @@
+"""
+Index Caching Dataset Loader
+
+Loads index caching dataset from JSON file (note2_debug.json format) and converts to GEPA-compatible format.
+"""
+
+import os
+import json
+import base64
+import logging
+from typing import List, Dict, Any, Optional
+from pathlib import Path
+
logger = logging.getLogger(__name__)


class IndexCachingDatasetLoader:
    """
    Loads the index-caching dataset from a JSON file (note2_debug.json format)
    and converts every entry to a GEPA-compatible item.

    Each JSON entry supplies a command, an element screenshot path, an XML
    dump path, and the expected structured answer. The loader resolves and
    validates the file paths, base64-encodes the image, inlines the XML into
    the user prompt, and keeps every original field inside item metadata.

    GEPA item layout:
        input             command + fenced XML (used for evaluation)
        reflection_input  command only (used for reflection)
        output            expected answer serialized as a JSON string
        image_base64      base64 image at TOP LEVEL for UniversalConverter
        metadata          all original fields plus resolved paths
    """

    def __init__(self, json_path: Optional[str] = None, base_dir: Optional[str] = None):
        """
        Initialize the loader.

        Args:
            json_path: Path to the dataset JSON file. Defaults to the
                INDEX_CACHING_DATASET_PATH env var or "./note2_debug.json".
            base_dir: Base directory used to resolve relative image/XML
                paths. Defaults to the directory containing the JSON file.

        Raises:
            FileNotFoundError: If the JSON file does not exist.
        """
        candidate = json_path if json_path is not None else os.getenv(
            "INDEX_CACHING_DATASET_PATH", "./note2_debug.json"
        )
        self.json_path = Path(candidate).resolve()

        if not self.json_path.exists():
            raise FileNotFoundError(
                f"Dataset file not found: {self.json_path}\n"
                f"Make sure note2_debug.json exists in the project root."
            )

        # Relative paths inside the JSON are resolved against this directory.
        root = base_dir if base_dir is not None else self.json_path.parent
        self.base_dir = Path(root).resolve()

    def load_dataset(self) -> List[Dict[str, Any]]:
        """
        Load the JSON file and convert every entry to GEPA format.

        Returns:
            List of GEPA items (see class docstring for the item layout).

        Raises:
            FileNotFoundError: If a referenced image or XML file is missing.
            json.JSONDecodeError: If the JSON file is invalid.
        """
        with open(self.json_path, "r", encoding="utf-8") as handle:
            raw_entries = json.load(handle)

        items: List[Dict[str, Any]] = []

        for position, record in enumerate(raw_entries):
            command = record.get("command", "")
            rel_image = record.get("image", "")
            rel_xml = record.get("xml", "")
            expected = record.get("expected", {})

            image_file = (self.base_dir / rel_image).resolve()
            xml_file = (self.base_dir / rel_xml).resolve()

            # Fail fast with a message naming the offending entry.
            if not image_file.exists():
                raise FileNotFoundError(
                    f"Image file not found: {image_file}\n"
                    f"Entry {position + 1}: {command}"
                )
            if not xml_file.exists():
                raise FileNotFoundError(
                    f"XML file not found: {xml_file}\n"
                    f"Entry {position + 1}: {command}"
                )

            image_base64 = base64.b64encode(image_file.read_bytes()).decode("utf-8")
            xml_content = xml_file.read_text(encoding="utf-8")

            # The user prompt mirrors what the agent sends: command plus the
            # full XML dump in a fenced block. Reflection only needs the bare
            # command, since it improves the prompt from evaluation feedback
            # rather than from any specific XML structure.
            items.append({
                "input": f"{command}\n\nXML Content:\n\n```xml\n{xml_content}\n```",
                "reflection_input": command,
                "output": json.dumps(expected, ensure_ascii=False),
                "image_base64": image_base64,  # TOP LEVEL for UniversalConverter
                "metadata": {
                    "command": command,
                    "image_path": str(rel_image),
                    "xml_path": str(rel_xml),
                    "abs_image_path": str(image_file),
                    "abs_xml_path": str(xml_file),
                    "xml_content": xml_content,  # XML kept separately in metadata
                    "expected": expected,
                    "dataset_index": position,
                },
            })

        return items

    def load_split(
        self,
        train_ratio: float = 0.6,
        val_ratio: float = 0.4
    ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
        """
        Load the dataset and split it into train/val sets (no test set).

        Args:
            train_ratio: Fraction of items for training (default 0.6).
            val_ratio: Fraction of items for validation (default 0.4).

        Returns:
            Tuple of (train_set, val_set).

        Raises:
            ValueError: If the ratios do not sum to 1.0 (within 0.01).
        """
        if abs(train_ratio + val_ratio - 1.0) > 0.01:
            raise ValueError(
                f"Split ratios must sum to 1.0, got {train_ratio + val_ratio:.3f}"
            )

        everything = self.load_dataset()
        cut = int(len(everything) * train_ratio)
        return everything[:cut], everything[cut:]
+
+
def load_index_caching_dataset(
    json_path: Optional[str] = None,
    base_dir: Optional[str] = None
) -> List[Dict[str, Any]]:
    """
    Convenience wrapper: build an IndexCachingDatasetLoader and load everything.

    Args:
        json_path: Path to the dataset JSON file.
        base_dir: Base directory for resolving relative paths.

    Returns:
        List of dataset items in GEPA format.
    """
    return IndexCachingDatasetLoader(json_path=json_path, base_dir=base_dir).load_dataset()
+
+
def load_index_caching_split(
    json_path: Optional[str] = None,
    base_dir: Optional[str] = None,
    train_ratio: float = 0.6,
    val_ratio: float = 0.4
) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Convenience wrapper: load the index caching dataset and split it.

    Args:
        json_path: Path to the dataset JSON file.
        base_dir: Base directory for resolving relative paths.
        train_ratio: Fraction of items for training.
        val_ratio: Fraction of items for validation.

    Returns:
        Tuple of (train_set, val_set) - no test set.
    """
    return IndexCachingDatasetLoader(
        json_path=json_path, base_dir=base_dir
    ).load_split(train_ratio=train_ratio, val_ratio=val_ratio)
+
+
# Example usage / smoke test (run this module directly).
if __name__ == "__main__":
    # NOTE: the original decorative emoji were mojibake (mis-decoded UTF-8),
    # one of which even split an f-string literal across two lines; plain
    # ASCII markers are used instead.
    print("Testing Index Caching Dataset Loader...")

    try:
        loader = IndexCachingDatasetLoader(json_path="./note2_debug.json")
        dataset = loader.load_dataset()

        print(f"\n[OK] Loaded {len(dataset)} items")

        # Show one sample so the item layout can be eyeballed.
        if dataset:
            sample = dataset[0]
            print("\nSample Item:")
            print(f"  Command: {sample['input']}")
            print(f"  Image path: {sample['metadata']['image_path']}")
            print(f"  XML path: {sample['metadata']['xml_path']}")
            print(f"  Expected: {sample['output'][:100]}...")
            print(f"  Image base64 length: {len(sample['image_base64'])}")
            print(f"  XML content length: {len(sample['metadata'].get('xml_content', ''))}")

        # Exercise the train/val split as well.
        train, val = loader.load_split()
        print("\nDataset Split:")
        print(f"  Training: {len(train)} samples")
        print(f"  Validation: {len(val)} samples")
        print("  Test: Not used (no test set)")

    except Exception as e:
        print(f"[ERROR] {e}")
diff --git a/src/gepa_optimizer/data/loaders.py b/src/gepa_optimizer/data/loaders.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f70b857e9972f5dac3a267ec6f3db9d073ca0b0
--- /dev/null
+++ b/src/gepa_optimizer/data/loaders.py
@@ -0,0 +1,237 @@
+"""
+Data loading utilities for various file formats
+"""
+
+import json
+import base64
+import pandas as pd
+from typing import Any, Optional, Union, List , Dict
+from pathlib import Path
+import logging
+
logger = logging.getLogger(__name__)

class DataLoader:
    """
    Utility class for loading data from various file formats.

    Every loader method returns the parsed data on success and None on
    failure, logging the error instead of raising, so callers can treat a
    missing or corrupt file as "no data".
    """

    def __init__(self):
        # Extensions that load() knows how to dispatch on.
        self.supported_formats = [
            '.csv', '.json', '.jsonl', '.txt', '.md', '.xlsx',
            '.png', '.jpg', '.jpeg'
        ]

    def load(self, source: Union[str, Path], format_hint: Optional[str] = None) -> Optional[Any]:
        """
        Load data from any supported source.

        Args:
            source: File path or data source
            format_hint: Optional format hint (e.g. '.json') to override
                auto-detection from the file extension.

        Returns:
            Loaded data or None if loading failed or the format is unsupported.
        """
        try:
            path = Path(source)

            if not path.exists():
                logger.error(f"File not found: {source}")
                return None

            # Use format hint or detect from extension.
            file_format = format_hint or path.suffix.lower()

            # Dispatch table keeps the format -> loader mapping in one place
            # instead of a long if/elif chain.
            dispatch = {
                '.csv': self.load_csv,
                '.json': self.load_json,
                '.jsonl': self.load_jsonl,
                '.txt': self.load_text,
                '.md': self.load_text,
                '.xlsx': self.load_excel,
                '.png': self.load_image_base64,
                '.jpg': self.load_image_base64,
                '.jpeg': self.load_image_base64,
            }
            loader = dispatch.get(file_format)
            if loader is None:
                logger.warning(f"Unsupported format: {file_format}")
                return None
            return loader(path)

        except Exception as e:
            # Boundary handler: any loader failure is logged and mapped to None.
            logger.error(f"Failed to load data from {source}: {str(e)}")
            return None

    def load_csv(self, path: Union[str, Path]) -> Optional[pd.DataFrame]:
        """Load a CSV file as a pandas DataFrame, or None on failure."""
        try:
            df = pd.read_csv(path)
            logger.info(f"Loaded CSV with {len(df)} rows and {len(df.columns)} columns")
            return df
        except Exception as e:
            logger.error(f"Failed to load CSV {path}: {str(e)}")
            return None

    def load_json(self, path: Union[str, Path]) -> Optional[Any]:
        """Load a JSON file (object or array), or None on failure."""
        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            if isinstance(data, list):
                logger.info(f"Loaded JSON with {len(data)} items")
            else:
                logger.info("Loaded JSON object")

            return data
        except Exception as e:
            logger.error(f"Failed to load JSON {path}: {str(e)}")
            return None

    def load_jsonl(self, path: Union[str, Path]) -> Optional[List[Dict]]:
        """Load a JSONL (JSON Lines) file; invalid lines are skipped with a warning."""
        try:
            data = []
            with open(path, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if line:
                        try:
                            data.append(json.loads(line))
                        except json.JSONDecodeError as e:
                            # Tolerate bad lines; report which one was dropped.
                            logger.warning(f"Invalid JSON on line {line_num}: {str(e)}")

            logger.info(f"Loaded JSONL with {len(data)} items")
            return data
        except Exception as e:
            logger.error(f"Failed to load JSONL {path}: {str(e)}")
            return None

    def load_text(self, path: Union[str, Path]) -> Optional[str]:
        """Load a plain text file as a single string, or None on failure."""
        try:
            with open(path, 'r', encoding='utf-8') as f:
                content = f.read()

            logger.info(f"Loaded text file with {len(content)} characters")
            return content
        except Exception as e:
            logger.error(f"Failed to load text {path}: {str(e)}")
            return None

    def load_excel(self, path: Union[str, Path]) -> Optional[pd.DataFrame]:
        """Load an Excel file as a pandas DataFrame, or None on failure."""
        try:
            df = pd.read_excel(path)
            logger.info(f"Loaded Excel with {len(df)} rows and {len(df.columns)} columns")
            return df
        except Exception as e:
            logger.error(f"Failed to load Excel {path}: {str(e)}")
            return None

    def load_image_base64(self, path: Union[str, Path]) -> Optional[str]:
        """Load an image file and return it as a Base64-encoded string, or None."""
        try:
            with open(path, 'rb') as f:
                encoded_string = base64.b64encode(f.read()).decode('utf-8')
            logger.info(f"Loaded image {path} and encoded to Base64")
            return encoded_string
        except Exception as e:
            logger.error(f"Failed to load image {path}: {str(e)}")
            return None

    def is_supported_format(self, file_path: Union[str, Path]) -> bool:
        """Return True if the file's extension is one this loader handles."""
        path = Path(file_path)
        return path.suffix.lower() in self.supported_formats

    def get_file_info(self, file_path: Union[str, Path]) -> Dict[str, Any]:
        """Return basic metadata about a file ({'exists': False} if missing)."""
        path = Path(file_path)

        if not path.exists():
            return {'exists': False}

        return {
            'exists': True,
            'size': path.stat().st_size,
            'format': path.suffix.lower(),
            'supported': self.is_supported_format(path),
            'name': path.name,
            'stem': path.stem,
            'parent': str(path.parent)
        }

    def load_ui_tree_dataset(self, json_dir: str, screenshots_dir: str) -> List[Dict[str, Any]]:
        """
        Load UI tree dataset by pairing JSON files with corresponding screenshots.

        Args:
            json_dir: Directory containing JSON files (e.g., "json_tree")
            screenshots_dir: Directory containing screenshot images (e.g., "screenshots")

        Returns:
            List of dictionaries with 'input', 'output', and 'image' keys.

        Raises:
            FileNotFoundError: If either directory does not exist.
        """
        json_path = Path(json_dir)
        screenshots_path = Path(screenshots_dir)

        if not json_path.exists():
            raise FileNotFoundError(f"JSON directory not found: {json_dir}")
        if not screenshots_path.exists():
            raise FileNotFoundError(f"Screenshots directory not found: {screenshots_dir}")

        dataset = []

        # Get all JSON files.
        json_files = list(json_path.glob("*.json"))
        logger.info(f"Found {len(json_files)} JSON files in {json_dir}")

        for json_file in json_files:
            # Pair by shared stem, e.g. "2.json" with "2.png"/"2.jpg".
            file_stem = json_file.stem

            image_extensions = ['.jpg', '.jpeg', '.png']
            image_file = None

            for ext in image_extensions:
                potential_image = screenshots_path / f"{file_stem}{ext}"
                if potential_image.exists():
                    image_file = potential_image
                    break

            if not image_file:
                logger.warning(f"No corresponding image found for {json_file.name}")
                continue

            try:
                # Load JSON content.
                json_data = self.load_json(json_file)
                if not json_data:
                    logger.warning(f"Failed to load JSON: {json_file}")
                    continue

                # Load image as base64.
                image_base64 = self.load_image_base64(image_file)
                if not image_base64:
                    logger.warning(f"Failed to load image: {image_file}")
                    continue

                # Create dataset entry.
                dataset_entry = {
                    'input': 'Extract UI elements from this screenshot and provide the complete UI tree structure',
                    'output': json.dumps(json_data, indent=2),  # Convert JSON to string
                    'image': image_base64
                }

                dataset.append(dataset_entry)
                logger.debug(f"Loaded pair: {json_file.name} + {image_file.name}")

            except Exception as e:
                # Skip only the failing pair; keep loading the rest.
                logger.error(f"Error loading {json_file.name}: {str(e)}")
                continue

        logger.info(f"Successfully loaded {len(dataset)} image-JSON pairs")
        return dataset
diff --git a/src/gepa_optimizer/data/scroll_dataset_loader.py b/src/gepa_optimizer/data/scroll_dataset_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0c29e9e6119b2aca10de309e709db45374fb95c
--- /dev/null
+++ b/src/gepa_optimizer/data/scroll_dataset_loader.py
@@ -0,0 +1,334 @@
+"""
+Scroll Element Dataset Loader for Drizz Mobile App Testing
+
+Loads screenshots with bounding boxes and commands to identify scroll elements.
+Converts to GEPA-compatible format for prompt optimization.
+"""
+
+import base64
+import random
+import logging
+from typing import List, Dict, Any, Tuple, Optional
+from pathlib import Path
+
logger = logging.getLogger(__name__)


class ScrollDatasetLoader:
    """
    GENERIC dataset loader for image-based tasks.

    This is a LIBRARY class - NO hardcoded assumptions about:
    - What the task is (OCR, element detection, classification, etc.)
    - Input format (questions, commands, descriptions, etc.)
    - Output format (IDs, text, JSON, etc.)

    Users define their dataset in the test script and pass it here.

    Dataset format per item: (image_filename, input_text, expected_output)

    Example usage (ANY task):
        my_dataset = [
            ("img1.png", "What is the main color?", "blue"),
            ("img2.png", "Count the objects", "5"),
        ]
        loader = ScrollDatasetLoader(images_dir="images", dataset_config=my_dataset)
        data = loader.load_dataset()
    """

    def __init__(
        self,
        images_dir: str = "images",
        dataset_config: Optional[List[Tuple[str, str, str]]] = None
    ):
        """
        Initialize dataset loader.

        Args:
            images_dir: Directory containing images
            dataset_config: List of (image_filename, input_text, expected_output)
                tuples. REQUIRED - no hardcoded defaults, to keep the library
                generic.

        Raises:
            FileNotFoundError: If images_dir doesn't exist
            ValueError: If dataset_config is None
        """
        self.images_dir = Path(images_dir)

        if not self.images_dir.exists():
            raise FileNotFoundError(f"Images directory not found: {images_dir}")

        if dataset_config is None:
            raise ValueError(
                "dataset_config is required. This is a library class - define your "
                "dataset in the test script:\n"
                "  dataset = [('img1.png', 'your input', 'expected output'), ...]\n"
                "  loader = ScrollDatasetLoader(images_dir='...', dataset_config=dataset)"
            )

        self.dataset_config = dataset_config

    def load_dataset(self) -> List[Dict[str, Any]]:
        """
        Load the complete dataset with images.

        Items whose image is missing or unreadable are skipped with a warning.
        An element_id is extracted from the expected output for robust
        evaluation (None when extraction fails).

        Returns:
            List of dataset items in GEPA format:
            {"input": ..., "output": ..., "image_base64": ...,
             "metadata": {image_path, input_text, expected_output,
                          image_filename, element_id}}

        Raises:
            ValueError: If no configured image could be loaded at all.
        """
        dataset = []

        # Generic variable names - no assumptions about data type.
        for image_filename, input_text, expected_output in self.dataset_config:
            image_path = self.images_dir / image_filename

            # Validate image exists.
            if not image_path.exists():
                logger.warning(f"Image not found: {image_path}")
                continue

            # Read and encode image.
            try:
                image_base64 = self._encode_image(image_path)
            except Exception as e:
                logger.warning(f"Error encoding {image_filename}: {e}")
                continue

            # Extract element_id from expected_output for robust evaluation.
            element_id = self._extract_element_id(expected_output)
            if element_id is None:
                logger.warning(f"Could not extract element_id from '{expected_output}' in {image_filename}")

            # Create dataset item - COMPLETELY GENERIC: image + input text +
            # expected output text. The library doesn't know the task.
            # IMPORTANT: image_base64 sits at TOP LEVEL so UniversalConverter
            # can find it.
            dataset_item = {
                "input": input_text,
                "output": expected_output,
                "image_base64": image_base64,
                "metadata": {
                    "image_path": str(image_path),
                    "input_text": input_text,
                    "expected_output": expected_output,
                    "image_filename": image_filename,
                    "element_id": element_id  # int or None
                }
            }

            dataset.append(dataset_item)

        if not dataset:
            raise ValueError("No valid images found in dataset")

        logger.info(f"Loaded {len(dataset)} scroll element detection samples")
        return dataset

    def _extract_element_id(self, expected_output: str) -> Optional[int]:
        """
        Extract an element ID from the expected output string.

        Handles multiple formats:
        - "Element: 4" / "Element 4"
        - "4" (standalone)
        - "Element: 4, Description: ..." (full reasoning)

        Args:
            expected_output: Full expected output string with reasoning

        Returns:
            Element ID as integer (range 1-100), or None if not found
        """
        import re

        if not expected_output:
            return None

        # Pattern 1: "Element: X" or "Element X" (case insensitive).
        patterns = [
            r'element[:\s]+(\d+)',   # "Element: 4" or "Element 4"
            r'\belement\s+(\d+)\b',  # "element 4" (word boundary)
        ]

        for pattern in patterns:
            match = re.search(pattern, expected_output, re.IGNORECASE)
            if match:
                try:
                    element_id = int(match.group(1))
                    # Validate range (reasonable UI element IDs).
                    if 1 <= element_id <= 100:
                        return element_id
                except (ValueError, IndexError):
                    continue

        # Pattern 2: first standalone number, only if it is a reasonable
        # element ID (1-100).
        number_match = re.search(r'\b(\d{1,3})\b', expected_output)
        if number_match:
            try:
                element_id = int(number_match.group(1))
                if 1 <= element_id <= 100:
                    return element_id
            except ValueError:
                pass

        return None

    def _encode_image(self, image_path: Path) -> str:
        """
        Encode an image file to a base64 string.

        Args:
            image_path: Path to image file

        Returns:
            Base64 encoded image string
        """
        with open(image_path, "rb") as image_file:
            return base64.b64encode(image_file.read()).decode('utf-8')

    def split_dataset(
        self,
        dataset: List[Dict[str, Any]],
        train_size: int = 4,
        val_size: int = 1,
        test_size: int = 1,
        shuffle: bool = True,
        seed: Optional[int] = None
    ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
        """
        Split dataset into train, validation, and test sets.

        Shuffling ensures a different image distribution across splits,
        preventing hard images from always landing in the validation set.

        Args:
            dataset: Complete dataset
            train_size: Number of samples for training (default: 4)
            val_size: Number of samples for validation (default: 1)
            test_size: Number of samples for test (default: 1)
            shuffle: Whether to shuffle dataset before splitting (default: True)
            seed: Random seed for reproducible shuffling (default: None = random)

        Returns:
            Tuple of (train_set, val_set, test_set)
        """
        n = len(dataset)

        # Validate split sizes; shrink proportionally if they exceed the data.
        total_size = train_size + val_size + test_size
        if total_size > n:
            logger.warning(f"Requested split ({total_size}) exceeds dataset size ({n}). Adjusting split proportionally...")
            ratio = n / total_size
            train_size = int(train_size * ratio)
            val_size = int(val_size * ratio)
            test_size = n - train_size - val_size

        # Shuffle a copy so the caller's list is untouched. A dedicated
        # Random instance is used when a seed is given so we do not perturb
        # the global random module state (random.Random(seed) produces the
        # same shuffle as random.seed(seed) would).
        dataset_copy = dataset.copy()
        if shuffle:
            if seed is not None:
                rng = random.Random(seed)
                logger.debug(f"Shuffling dataset with seed={seed} for reproducible splits")
            else:
                rng = random
                logger.debug("Shuffling dataset randomly (no seed)")
            rng.shuffle(dataset_copy)
        else:
            logger.warning("Not shuffling dataset - using original order")

        # Split shuffled dataset.
        train_set = dataset_copy[:train_size]
        val_set = dataset_copy[train_size:train_size + val_size]
        test_set = dataset_copy[train_size + val_size:train_size + val_size + test_size]

        logger.info(f"Dataset split: {len(train_set)} train, {len(val_set)} val, {len(test_set)} test")

        # Log which images landed in each split for debugging (via the module
        # logger rather than print, consistent with the rest of the module).
        if shuffle:
            train_images = [item['metadata'].get('image_filename', 'N/A') for item in train_set]
            val_images = [item['metadata'].get('image_filename', 'N/A') for item in val_set]
            test_images = [item['metadata'].get('image_filename', 'N/A') for item in test_set]
            logger.info(f"  Train images: {train_images[:5]}{'...' if len(train_images) > 5 else ''}")
            logger.info(f"  Val images: {val_images}")
            logger.info(f"  Test images: {test_images[:5]}{'...' if len(test_images) > 5 else ''}")

        return train_set, val_set, test_set
+
+
def load_scroll_dataset(
    images_dir: str = "images",
    dataset_config: List[Tuple[str, str, str]] = None,
    split: bool = True
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Convenience function to load a GENERIC image-based dataset.

    Args:
        images_dir: Directory containing the images.
        dataset_config: List of (image_filename, input_text, expected_output)
            tuples describing the task; required by the underlying loader.
        split: Whether to split the data into train/val/test.

    Returns:
        If split=True: (train_set, val_set, test_set)
        If split=False: (full_dataset, [], [])

    Example (works for ANY task):
        dataset_config = [
            ("img1.png", "What color is the sky?", "blue"),
            ("img2.png", "Count the dogs", "2"),
        ]
        train, val, test = load_scroll_dataset(
            images_dir="images",
            dataset_config=dataset_config
        )
    """
    loader = ScrollDatasetLoader(images_dir, dataset_config=dataset_config)
    full = loader.load_dataset()
    if not split:
        return full, [], []
    return loader.split_dataset(full)
+
+
# Example usage (for testing the library loader itself).
if __name__ == "__main__":
    # NOTE: the decorative characters in the original banner were mojibake
    # (mis-decoded UTF-8 emoji); plain ASCII is used instead.
    print("Testing Scroll Dataset Loader...")
    print("NOTE: This is a library class. Define your dataset in your test script.")
    print("\nExample:")
    print("  dataset_config = [")
    print("      ('image1.png', 'Scroll down by 50%', '3'),")
    print("      ('image2.png', 'Swipe left', '4'),")
    print("  ]")
    print("  train, val, test = load_scroll_dataset(")
    print("      images_dir='images',")
    print("      dataset_config=dataset_config")
    print("  )")
diff --git a/src/gepa_optimizer/data/validation_dataset_loader.py b/src/gepa_optimizer/data/validation_dataset_loader.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c500db6127ae4c136eaf7c17ce0af0b88eca955
--- /dev/null
+++ b/src/gepa_optimizer/data/validation_dataset_loader.py
@@ -0,0 +1,376 @@
+"""
+Validation Dataset Loader for UI Validation Use Case
+
+Loads validation datapoints from SQLite database and converts to GEPA-compatible format.
+Supports filtering by data_type (trainset/valset/testset) and confirmed status.
+"""
+
+import os
+import sqlite3
+import base64
+import logging
+from typing import List, Dict, Any, Optional, Literal
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+
class ValidationDatasetLoader:
    """
    Loads validation dataset from SQLite database.

    Database schema:
    - validation_data: id, image_id, command, result (0/1), reasoning, data_type, confirmed, created_at
    - images: image_id, mime, bytes (BLOB), created_at

    Converts to GEPA format:
    - input: command text (seed prompt will be provided in test script)
    - output: "true" or "false" (converted from 0/1)
    - image_base64: base64 encoded image (TOP LEVEL for UniversalConverter)
    - metadata: All original fields plus converted values

    Note: The seed prompt is NOT stored in database - it will be provided in the test script.
    The input field contains just the command, and the image is at top level.
    """

    def __init__(
        self,
        db_path: Optional[str] = None,
        confirmed_only: bool = True
    ):
        """
        Initialize validation dataset loader.

        Args:
            db_path: Path to SQLite database file.
                     Default: "./validation_data.db" or from VD_DB_PATH env var
            confirmed_only: If True, only load datapoints where confirmed=1.
                            Default: True (only manually reviewed data)

        Raises:
            FileNotFoundError: If database file doesn't exist
            sqlite3.Error: If database connection fails
        """
        # Get database path from env or use default
        if db_path is None:
            db_path = os.getenv("VD_DB_PATH", "./validation_data.db")

        # Resolve to an absolute path so error messages point at a concrete file.
        self.db_path = Path(db_path).resolve()

        # Fail fast at construction time rather than on first query.
        if not self.db_path.exists():
            raise FileNotFoundError(
                f"Database file not found: {self.db_path}\n"
                f"Make sure validation_data_ui_server_async.py has been run at least once to create the database."
            )

        # Default confirmed filter; per-call override is possible in load_dataset().
        self.confirmed_only = confirmed_only

    def load_dataset(
        self,
        data_type: Optional[Literal["trainset", "valset", "testset"]] = None,
        confirmed_only: Optional[bool] = None
    ) -> List[Dict[str, Any]]:
        """
        Load dataset from database and convert to GEPA format.

        Args:
            data_type: Filter by data_type. If None, loads all types.
                       Options: "trainset", "valset", "testset"
            confirmed_only: Override instance default. If True, only load confirmed datapoints.
                            If None, uses instance default (self.confirmed_only)

        Returns:
            List of dataset items in GEPA format:
            [
                {
                    "input": "Validate Submit button is visible",  # Command only (seed prompt in test script)
                    "output": "true",  # or "false" (converted from 0/1)
                    "image_base64": "",  # TOP LEVEL (image + command together)
                    "metadata": {
                        "id": 1,
                        "image_id": "abc123...",
                        "command": "Validate Submit button is visible",
                        "result": True,  # Boolean
                        "result_int": 1,  # Original 0/1
                        "reasoning": "Detailed explanation...",
                        "data_type": "trainset",
                        "confirmed": True,
                        "created_at": "2024-01-01 12:00:00"
                    }
                },
                ...
            ]

        Note: Seed prompt is provided separately in test script, not in database.

        Raises:
            sqlite3.Error: If database query fails
            ValueError: If no datapoints found matching criteria
        """
        # Use provided confirmed_only or instance default
        use_confirmed = confirmed_only if confirmed_only is not None else self.confirmed_only

        # One short-lived connection per call; closed in the finally block below.
        conn = sqlite3.connect(str(self.db_path))
        conn.row_factory = sqlite3.Row  # Access columns by name
        dataset: List[Dict[str, Any]] = []

        try:
            # Build query with filters
            # "WHERE 1=1" lets the optional filters below be appended uniformly.
            query = """
                SELECT
                    v.id,
                    v.image_id,
                    v.command,
                    v.result,
                    v.reasoning,
                    v.data_type,
                    v.confirmed,
                    v.created_at,
                    i.mime,
                    i.bytes
                FROM validation_data v
                INNER JOIN images i ON v.image_id = i.image_id
                WHERE 1=1
            """
            params = []

            # Add filters
            if use_confirmed:
                query += " AND v.confirmed = 1"

            if data_type:
                query += " AND v.data_type = ?"
                params.append(data_type)

            query += " ORDER BY v.id ASC"

            # Execute query
            cursor = conn.execute(query, params)
            rows = cursor.fetchall()

            if not rows:
                # Describe exactly which filters produced the empty result.
                filter_msg = []
                if use_confirmed:
                    filter_msg.append("confirmed=1")
                if data_type:
                    filter_msg.append(f"data_type='{data_type}'")

                filter_str = " with filters: " + ", ".join(filter_msg) if filter_msg else ""
                raise ValueError(
                    f"No datapoints found{filter_str} in database: {self.db_path}\n"
                    f"Make sure you have generated and saved datapoints using the validation UI."
                )

            # Convert rows to GEPA format
            for row in rows:
                # Convert 0/1 to "true"/"false" string for GEPA
                result_str = "true" if row["result"] == 1 else "false"

                # Encode image bytes to base64
                # NOTE(review): assumes images.bytes is a non-NULL BLOB — a NULL here
                # would raise TypeError in b64encode; confirm against the writer side.
                image_base64 = base64.b64encode(row["bytes"]).decode("utf-8")

                # Create GEPA format item
                # Input: command (seed prompt will be provided in test script)
                # Image: separate at top level (image_base64)
                # Output: "true" or "false" (converted from 0/1)
                dataset_item = {
                    "input": row["command"],  # Just the command - seed prompt will be in test script
                    "output": result_str,  # "true" or "false" (string)
                    "image_base64": image_base64,  # TOP LEVEL for UniversalConverter (image + command together)
                    "metadata": {
                        "id": row["id"],
                        "image_id": row["image_id"],
                        "command": row["command"],  # Keep original for reference
                        "result": bool(row["result"]),  # Boolean for reference
                        "result_int": row["result"],  # Original 0/1 for reference
                        "reasoning": row["reasoning"],
                        "data_type": row["data_type"],
                        "confirmed": bool(row["confirmed"]),
                        "created_at": row["created_at"],
                        "mime": row["mime"],
                    }
                }

                dataset.append(dataset_item)

            # Log summary
            data_type_str = f" ({data_type})" if data_type else ""
            confirmed_str = " (confirmed only)" if use_confirmed else " (all)"
            logger.info(f"Loaded {len(dataset)} validation datapoints{data_type_str}{confirmed_str}")

            return dataset

        finally:
            conn.close()

    def load_split_dataset(
        self,
        confirmed_only: Optional[bool] = None
    ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
        """
        Load dataset split by data_type (trainset/valset/testset).

        Convenience method that loads all three splits at once.

        Args:
            confirmed_only: Override instance default. If True, only load confirmed datapoints.

        Returns:
            Tuple of (train_set, val_set, test_set) in GEPA format

        Raises:
            ValueError: Propagated from load_dataset() when any split is empty.

        Example:
            loader = ValidationDatasetLoader(db_path="./validation_data.db")
            train, val, test = loader.load_split_dataset()
        """
        # Three separate queries — each raises ValueError if its split is empty.
        train_set = self.load_dataset(data_type="trainset", confirmed_only=confirmed_only)
        val_set = self.load_dataset(data_type="valset", confirmed_only=confirmed_only)
        test_set = self.load_dataset(data_type="testset", confirmed_only=confirmed_only)

        logger.info(f"Dataset Split Summary: Training={len(train_set)}, Validation={len(val_set)}, Test={len(test_set)}, Total={len(train_set) + len(val_set) + len(test_set)}")

        return train_set, val_set, test_set

    def get_dataset_stats(self) -> Dict[str, Any]:
        """
        Get statistics about the dataset in the database.

        Returns:
            Dictionary with dataset statistics:
            {
                "total": 100,
                "confirmed": 95,
                "unconfirmed": 5,
                "by_data_type": {
                    "trainset": 70,
                    "valset": 15,
                    "testset": 15
                },
                "by_result": {
                    "true": 50,
                    "false": 50
                }
            }
        """
        conn = sqlite3.connect(str(self.db_path))
        conn.row_factory = sqlite3.Row

        try:
            stats: Dict[str, Any] = {}

            # Total counts
            total = conn.execute("SELECT COUNT(*) FROM validation_data").fetchone()[0]
            confirmed = conn.execute("SELECT COUNT(*) FROM validation_data WHERE confirmed = 1").fetchone()[0]
            stats["total"] = total
            stats["confirmed"] = confirmed
            stats["unconfirmed"] = total - confirmed

            # By data_type
            data_type_rows = conn.execute("""
                SELECT data_type, COUNT(*) as count
                FROM validation_data
                GROUP BY data_type
            """).fetchall()
            stats["by_data_type"] = {row["data_type"]: row["count"] for row in data_type_rows}

            # By result (true/false)
            # NOTE(review): rows whose result is neither 0 nor 1 are silently
            # excluded from both buckets — confirm result is constrained to 0/1.
            result_rows = conn.execute("""
                SELECT result, COUNT(*) as count
                FROM validation_data
                GROUP BY result
            """).fetchall()
            stats["by_result"] = {
                "true": sum(row["count"] for row in result_rows if row["result"] == 1),
                "false": sum(row["count"] for row in result_rows if row["result"] == 0)
            }

            return stats

        finally:
            conn.close()
+
+
def load_validation_dataset(
    db_path: Optional[str] = None,
    data_type: Optional[Literal["trainset", "valset", "testset"]] = None,
    confirmed_only: bool = True
) -> List[Dict[str, Any]]:
    """
    Load validation datapoints from the SQLite store in GEPA format.

    Thin one-shot wrapper around ValidationDatasetLoader.

    Args:
        db_path: Path to SQLite database file. Default: "./validation_data.db"
        data_type: Restrict to one split ("trainset"/"valset"/"testset"); None loads all.
        confirmed_only: When True, only manually confirmed datapoints are returned.

    Returns:
        List of dataset items in GEPA format.

    Example:
        # Load all confirmed training data
        train_data = load_validation_dataset(data_type="trainset", confirmed_only=True)

        # Load all confirmed data
        all_data = load_validation_dataset(confirmed_only=True)
    """
    return ValidationDatasetLoader(
        db_path=db_path, confirmed_only=confirmed_only
    ).load_dataset(data_type=data_type, confirmed_only=confirmed_only)
+
+
def load_validation_split(
    db_path: Optional[str] = None,
    confirmed_only: bool = True
) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Load the train/val/test validation splits from the SQLite store in one call.

    Args:
        db_path: Path to SQLite database file. Default: "./validation_data.db"
        confirmed_only: When True, only manually confirmed datapoints are returned.

    Returns:
        Tuple of (train_set, val_set, test_set) in GEPA format.

    Example:
        train, val, test = load_validation_split(confirmed_only=True)
    """
    split_loader = ValidationDatasetLoader(db_path=db_path, confirmed_only=confirmed_only)
    return split_loader.load_split_dataset(confirmed_only=confirmed_only)
+
+
# Example usage and testing
if __name__ == "__main__":
    print("๐ Testing Validation Dataset Loader...")

    try:
        dataset_loader = ValidationDatasetLoader()

        # Print database-level counts first.
        print("\n๐ Dataset Statistics:")
        db_stats = dataset_loader.get_dataset_stats()
        print(f" Total: {db_stats['total']}")
        print(f" Confirmed: {db_stats['confirmed']}")
        print(f" Unconfirmed: {db_stats['unconfirmed']}")
        print(f" By data_type: {db_stats['by_data_type']}")
        print(f" By result: {db_stats['by_result']}")

        # Then pull all three splits.
        print("\n๐ฆ Loading split dataset...")
        train_set, val_set, test_set = dataset_loader.load_split_dataset()

        # Preview one converted item, if any exist.
        if train_set:
            first_item = train_set[0]
            print(f"\n๐ Sample Training Item:")
            print(f" Input: {first_item['input']}")
            print(f" Output: {first_item['output']}")
            print(f" Image ID: {first_item['metadata']['image_id'][:8]}...")
            print(f" Data Type: {first_item['metadata']['data_type']}")
            print(f" Result: {first_item['metadata']['result']} (int: {first_item['metadata']['result_int']})")

    except FileNotFoundError as err:
        print(f"โ {err}")
        print("\n๐ก Make sure validation_data_ui_server_async.py has been run to create the database.")
    except ValueError as err:
        print(f"โ {err}")
        print("\n๐ก Generate and save some datapoints using the validation UI first.")
+
diff --git a/src/gepa_optimizer/data/validators.py b/src/gepa_optimizer/data/validators.py
new file mode 100644
index 0000000000000000000000000000000000000000..28fbc6048f91a25c1a4d54befc7f62be4498f898
--- /dev/null
+++ b/src/gepa_optimizer/data/validators.py
@@ -0,0 +1,207 @@
+"""
+Data validation utilities for GEPA optimizer
+"""
+
+from typing import List, Dict, Any, Optional, Tuple
+import logging
+
+logger = logging.getLogger(__name__)
+
class DataValidator:
    """
    Checks datasets for structural completeness and GEPA compatibility.

    All public methods return their findings instead of raising, so callers
    can aggregate and report every problem at once.
    """

    def __init__(self):
        # Field-name conventions this validator enforces / recognizes.
        self.required_fields = ['input', 'output']
        self.optional_fields = ['metadata', 'id', 'tags']

    def validate_dataset(self, dataset: List[Dict[str, Any]]) -> Tuple[bool, List[str]]:
        """
        Validate a whole dataset, collecting every error found.

        Args:
            dataset: Candidate list of data items.

        Returns:
            Tuple[bool, List[str]]: (is_valid, list_of_errors)
        """
        # Empty check comes first so an empty non-list reads as "empty".
        if not dataset:
            return False, ["Dataset is empty"]
        if not isinstance(dataset, list):
            return False, ["Dataset must be a list"]

        errors: List[str] = []
        for position, entry in enumerate(dataset):
            errors += self.validate_item(entry, position)

        # Need at least two items to split into train and validation.
        if len(dataset) < 2:
            errors.append("Dataset should have at least 2 items for proper train/val split")

        if errors:
            logger.warning(f"Dataset validation failed with {len(errors)} errors")
        else:
            logger.info(f"Dataset validation passed for {len(dataset)} items")

        return not errors, errors

    def validate_item(self, item: Dict[str, Any], index: Optional[int] = None) -> List[str]:
        """
        Validate one dataset entry.

        Args:
            item: Single data item to check.
            index: Optional position, used only to label error messages.

        Returns:
            List[str]: Error messages for this item (empty when valid).
        """
        item_ref = "item" if index is None else f"item {index}"

        # Non-dict entries can't be inspected further.
        if not isinstance(item, dict):
            return [f"{item_ref}: Must be a dictionary"]

        problems: List[str] = []

        # 'input' is mandatory: present, a string, and non-blank.
        if 'input' not in item:
            problems.append(f"{item_ref}: Missing required 'input' field")
        else:
            text = item['input']
            if not isinstance(text, str):
                problems.append(f"{item_ref}: 'input' field must be a string")
            elif not text.strip():
                problems.append(f"{item_ref}: 'input' field cannot be empty")

        # 'output' may be absent, but when present it must be a string.
        if 'output' in item and not isinstance(item['output'], str):
            problems.append(f"{item_ref}: 'output' field must be a string")

        # 'metadata' is optional but must be a dict when supplied.
        if not isinstance(item.get('metadata', {}), dict):
            problems.append(f"{item_ref}: 'metadata' field must be a dictionary")

        return problems

    def validate_gepa_format(self, gepa_data: List[Dict[str, Any]]) -> Tuple[bool, List[str]]:
        """
        Validate data already converted to GEPA format.

        Args:
            gepa_data: Items expected to carry input/expected_output/metadata.

        Returns:
            Tuple[bool, List[str]]: (is_valid, list_of_errors)
        """
        if not gepa_data:
            return False, ["GEPA dataset is empty"]

        errors: List[str] = []
        for idx, item in enumerate(gepa_data):
            for required in ('input', 'expected_output'):
                if required not in item:
                    errors.append(f"GEPA item {idx}: Missing '{required}' field")

            if 'metadata' not in item:
                errors.append(f"GEPA item {idx}: Missing 'metadata' field")
            elif not isinstance(item['metadata'], dict):
                errors.append(f"GEPA item {idx}: 'metadata' must be a dictionary")

        return not errors, errors

    def validate_split(self, trainset: List[Dict], valset: List[Dict]) -> Tuple[bool, List[str]]:
        """
        Validate a train/validation split for emptiness and proportion.

        Args:
            trainset: Training data.
            valset: Validation data.

        Returns:
            Tuple[bool, List[str]]: (is_valid, list_of_errors)
        """
        errors: List[str] = []
        if not trainset:
            errors.append("Training set is empty")
        if not valset:
            errors.append("Validation set is empty")

        # Proportion check: train share should sit between 50% and 95%.
        total = len(trainset) + len(valset)
        if total:
            train_ratio = len(trainset) / total
            if train_ratio < 0.5:
                errors.append(f"Training set too small: {train_ratio:.2%} of total data")
            elif train_ratio > 0.95:
                errors.append(f"Validation set too small: {1-train_ratio:.2%} of total data")

        return not errors, errors

    def get_dataset_stats(self, dataset: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Compute descriptive statistics for a dataset.

        Args:
            dataset: Dataset to analyze.

        Returns:
            Dict[str, Any]: Counts, average input/output lengths, and a
            heuristic 'valid' flag (fewer than 50% empty inputs).
        """
        if not dataset:
            return {'total_items': 0, 'valid': False}

        stats: Dict[str, Any] = {
            'total_items': len(dataset),
            'has_output': sum(1 for entry in dataset if entry.get('output')),
            'avg_input_length': 0,
            'avg_output_length': 0,
            'empty_inputs': 0,
            'empty_outputs': 0,
        }

        input_lengths: List[int] = []
        output_lengths: List[int] = []

        for entry in dataset:
            if not isinstance(entry, dict):
                continue
            text_in = entry.get('input', '')
            text_out = entry.get('output', '')

            if isinstance(text_in, str):
                input_lengths.append(len(text_in))
                if not text_in.strip():
                    stats['empty_inputs'] += 1

            if isinstance(text_out, str):
                output_lengths.append(len(text_out))
                if not text_out.strip():
                    stats['empty_outputs'] += 1

        if input_lengths:
            stats['avg_input_length'] = sum(input_lengths) / len(input_lengths)
        if output_lengths:
            stats['avg_output_length'] = sum(output_lengths) / len(output_lengths)

        # Heuristic validity: non-empty and mostly non-blank inputs.
        stats['valid'] = (
            stats['total_items'] > 0
            and stats['empty_inputs'] < stats['total_items'] * 0.5
        )

        return stats
diff --git a/src/gepa_optimizer/evaluation/__init__.py b/src/gepa_optimizer/evaluation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e464dcf245f5c1bf1f11eba3eb30d64fe60499be
--- /dev/null
+++ b/src/gepa_optimizer/evaluation/__init__.py
@@ -0,0 +1,28 @@
+"""
+Evaluation module for GEPA Optimizer
+
+Includes:
+- UniversalSemanticEvaluator: Works for ANY task (recommended for general use)
+- BaseEvaluator: Abstract base class for custom evaluators
+- Task-specific evaluators for specialized use cases
+"""
+
+from .base_evaluator import BaseEvaluator
+from .universal_evaluator import UniversalSemanticEvaluator, create_universal_evaluator
+from .ui_evaluator import UITreeEvaluator
+from .scroll_evaluator import ScrollElementEvaluator
+from .validation_evaluator import ValidationEvaluator
+from .index_caching_evaluator import IndexCachingEvaluator
+
+__all__ = [
+ # Universal (recommended)
+ "UniversalSemanticEvaluator",
+ "create_universal_evaluator",
+ # Base class
+ "BaseEvaluator",
+ # Task-specific
+ "UITreeEvaluator",
+ "ScrollElementEvaluator",
+ "ValidationEvaluator",
+ "IndexCachingEvaluator",
+]
diff --git a/src/gepa_optimizer/evaluation/base_evaluator.py b/src/gepa_optimizer/evaluation/base_evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..c63f322935ceffa0d9fb38a7d1b2049c078bda6c
--- /dev/null
+++ b/src/gepa_optimizer/evaluation/base_evaluator.py
@@ -0,0 +1,51 @@
+"""
+Base evaluator class for all evaluation strategies.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional
+import logging
+
+logger = logging.getLogger(__name__)
+
class BaseEvaluator(ABC):
    """
    Common contract for every evaluation strategy.

    Subclasses implement evaluate(); this base handles metric-weight storage
    and a per-class logger, keeping the interface uniform while allowing the
    scoring logic itself to be fully customized per use case.
    """

    def __init__(self, metric_weights: Optional[Dict[str, float]] = None):
        """
        Store metric weights and create a class-specific logger.

        Args:
            metric_weights: Optional mapping of metric name -> weight.
                Falls back to an empty dict; subclasses typically supply
                their own defaults.
        """
        self.metric_weights = metric_weights or {}
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")

    @abstractmethod
    def evaluate(self, predicted: Any, expected: Any) -> Dict[str, float]:
        """
        Score *predicted* against the ground-truth *expected*.

        Args:
            predicted: The model's predicted output.
            expected: The ground truth expected output.

        Returns:
            Mapping of metric name -> score; must include a
            'composite_score' key for GEPA integration.
        """

    def validate_weights(self) -> bool:
        """Return True when the configured weights sum to ~1.0 (or are unset)."""
        if not self.metric_weights:
            return True
        # Tolerate tiny floating-point drift around 1.0.
        return abs(sum(self.metric_weights.values()) - 1.0) < 0.01
diff --git a/src/gepa_optimizer/evaluation/index_caching_evaluator.py b/src/gepa_optimizer/evaluation/index_caching_evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d4ca53c9277645fa8037e27c65aae633fe47ca1
--- /dev/null
+++ b/src/gepa_optimizer/evaluation/index_caching_evaluator.py
@@ -0,0 +1,357 @@
+"""
+Index Caching Evaluator for Index-Based Element Selection Use Case
+
+Evaluates predicted index caching results against expected results.
+Compares all 5 fields with equal weight:
+- is_index_based
+- index_value
+- parent_element_id
+- element_id_of_nth_child_of_parent
+- selected_element_is_correct
+"""
+
+from typing import Dict, Any, Optional
+import json
+import re
+import logging
+
+from .base_evaluator import BaseEvaluator
+
+
class IndexCachingEvaluator(BaseEvaluator):
    """
    Evaluator for index caching use case.

    Features:
    - Compares all 5 fields with equal weight (20% each)
    - Parses JSON from LLM response
    - Handles null values correctly
    - Returns detailed field-by-field comparison
    """

    def __init__(self, metric_weights: Optional[Dict[str, float]] = None):
        """
        Initialize index caching evaluator.

        Args:
            metric_weights: Weights for evaluation metrics
                            Default: Equal weight for all 5 fields (0.2 each)
        """
        # Each field gets 20% weight (5 fields * 0.2 = 1.0)
        default_weights = {
            "is_index_based_match": 0.2,
            "index_value_match": 0.2,
            "parent_element_id_match": 0.2,
            "element_id_of_nth_child_match": 0.2,
            "selected_element_correct_match": 0.2,
        }

        weights = metric_weights or default_weights
        super().__init__(metric_weights=weights)

    def evaluate(self, predicted: str, expected: str) -> Dict[str, float]:
        """
        Evaluate predicted index caching result against expected result.

        Args:
            predicted: LLM's output (JSON string with all 5 fields)
            expected: Expected output (JSON string or dict with all 5 fields)

        Returns:
            Dictionary with evaluation metrics:
            {
                "is_index_based_match": 1.0 or 0.0,
                "index_value_match": 1.0 or 0.0,
                "parent_element_id_match": 1.0 or 0.0,
                "element_id_of_nth_child_match": 1.0 or 0.0,
                "selected_element_correct_match": 1.0 or 0.0,
                "composite_score": 0.0 to 1.0,
                "predicted_output": str,
                "expected_output": str,
                "field_scores": {...},
                "evaluation_reason": str
            }
        """
        # Degenerate inputs short-circuit to an all-zero score.
        if not predicted or not expected:
            return {
                "is_index_based_match": 0.0,
                "index_value_match": 0.0,
                "parent_element_id_match": 0.0,
                "element_id_of_nth_child_match": 0.0,
                "selected_element_correct_match": 0.0,
                "composite_score": 0.0,
                "predicted_output": str(predicted).strip() if predicted else "",
                "expected_output": str(expected).strip() if expected else "",
                "field_scores": {},
                "evaluation_reason": "โ Empty or missing input/output"
            }

        # Parse expected (could be JSON string or dict)
        try:
            if isinstance(expected, str):
                expected_dict = json.loads(expected)
            else:
                expected_dict = expected
        except (json.JSONDecodeError, TypeError):
            # If expected is already a dict from dataset
            expected_dict = expected if isinstance(expected, dict) else {}

        # Parse predicted (must be JSON string)
        try:
            predicted_dict = self._parse_json_response(predicted)
        except Exception as e:
            # Log the actual response for debugging
            response_preview = predicted[:200] if predicted else "(empty)"
            self.logger.warning(f"Failed to parse predicted JSON: {e}")
            self.logger.warning(f"Response preview: {response_preview}...")
            predicted_dict = {}

        # NOTE: "notes" field is present in the output but is NOT used for scoring or reflection
        # It's kept for reference but ignored in evaluation

        # Compare each field (only the 5 core fields, ignoring "notes")
        field_scores = {}
        field_reasons = []

        # 1. is_index_based (boolean)
        # A missing value on either side counts as a mismatch (no credit for None==None here).
        pred_is_index = predicted_dict.get("is_index_based")
        exp_is_index = expected_dict.get("is_index_based")
        is_index_match = (pred_is_index == exp_is_index) if (pred_is_index is not None and exp_is_index is not None) else False
        field_scores["is_index_based"] = 1.0 if is_index_match else 0.0
        field_reasons.append(f"is_index_based: {pred_is_index} vs {exp_is_index} โ {'โ' if is_index_match else 'โ'}")

        # 2. index_value (int or null)
        pred_index_val = predicted_dict.get("index_value")
        exp_index_val = expected_dict.get("index_value")
        # Handle null/None comparison
        index_val_match = (pred_index_val == exp_index_val) or (pred_index_val is None and exp_index_val is None)
        field_scores["index_value"] = 1.0 if index_val_match else 0.0
        field_reasons.append(f"index_value: {pred_index_val} vs {exp_index_val} โ {'โ' if index_val_match else 'โ'}")

        # 3. parent_element_id (string or null)
        pred_parent = predicted_dict.get("parent_element_id")
        exp_parent = expected_dict.get("parent_element_id")
        # Handle null/None comparison
        parent_match = (pred_parent == exp_parent) or (pred_parent is None and exp_parent is None)
        field_scores["parent_element_id"] = 1.0 if parent_match else 0.0
        field_reasons.append(f"parent_element_id: {pred_parent} vs {exp_parent} โ {'โ' if parent_match else 'โ'}")

        # 4. element_id_of_nth_child_of_parent (string or null)
        pred_element = predicted_dict.get("element_id_of_nth_child_of_parent")
        exp_element = expected_dict.get("element_id_of_nth_child_of_parent")
        # Handle null/None comparison
        element_match = (pred_element == exp_element) or (pred_element is None and exp_element is None)
        field_scores["element_id_of_nth_child_of_parent"] = 1.0 if element_match else 0.0
        field_reasons.append(f"element_id_of_nth_child: {pred_element} vs {exp_element} โ {'โ' if element_match else 'โ'}")

        # 5. selected_element_is_correct (boolean)
        pred_selected = predicted_dict.get("selected_element_is_correct")
        exp_selected = expected_dict.get("selected_element_is_correct")
        selected_match = (pred_selected == exp_selected) if (pred_selected is not None and exp_selected is not None) else False
        field_scores["selected_element_is_correct"] = 1.0 if selected_match else 0.0
        field_reasons.append(f"selected_element_is_correct: {pred_selected} vs {exp_selected} โ {'โ' if selected_match else 'โ'}")

        # Calculate composite score (weighted average)
        # NOTE(review): weights are hardcoded at 0.2 here; self.metric_weights
        # (whose keys are "*_match", not the field_scores keys) is never
        # consulted — confirm the constructor's metric_weights is intentional.
        composite_score = (
            field_scores["is_index_based"] * 0.2 +
            field_scores["index_value"] * 0.2 +
            field_scores["parent_element_id"] * 0.2 +
            field_scores["element_id_of_nth_child_of_parent"] * 0.2 +
            field_scores["selected_element_is_correct"] * 0.2
        )

        # Build evaluation reason
        all_match = composite_score == 1.0
        reason = "โAll fields match!" if all_match else f"โ Partial match ({composite_score:.1%})"
        reason += "\n" + "\n".join(f" {r}" for r in field_reasons)

        # Log evaluation details
        self.logger.info(f"\n{'โ'*70}")
        self.logger.info(f"๐ INDEX CACHING EVALUATION")
        self.logger.info(f"{'โ'*70}")
        self.logger.info(f" ๐ฏ COMPOSITE SCORE: {composite_score:.2f} ({composite_score:.1%})")
        for field, score in field_scores.items():
            status = "โ" if score == 1.0 else "โ"
            self.logger.info(f" {status} {field}: {score:.0f}")
        self.logger.info(f"{'โ'*70}\n")

        return {
            "is_index_based_match": field_scores["is_index_based"],
            "index_value_match": field_scores["index_value"],
            "parent_element_id_match": field_scores["parent_element_id"],
            "element_id_of_nth_child_match": field_scores["element_id_of_nth_child_of_parent"],
            "selected_element_correct_match": field_scores["selected_element_is_correct"],
            "composite_score": composite_score,
            "predicted_output": predicted,
            "expected_output": json.dumps(expected_dict) if isinstance(expected_dict, dict) else str(expected),
            "predicted_dict": predicted_dict,
            "expected_dict": expected_dict,
            "field_scores": field_scores,
            "evaluation_reason": reason
        }

    def _parse_json_response(self, response: str) -> Dict[str, Any]:
        """
        Parse JSON from LLM response, handling markdown code blocks and various formats.

        Args:
            response: LLM response string (may contain markdown)

        Returns:
            Parsed JSON dictionary (empty dict if parsing fails)
        """
        if not response or not isinstance(response, str):
            return {}

        response = response.strip()

        # If response is empty, return empty dict
        if not response:
            return {}

        # Strategy 1: Try to extract JSON from markdown code block
        json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response, re.DOTALL)
        if json_match:
            try:
                json_str = json_match.group(1).strip()
                return json.loads(json_str)
            except json.JSONDecodeError:
                pass

        # Strategy 2: Find JSON object in response (handle nested braces)
        json_start = response.find('{')
        if json_start != -1:
            # Find matching closing brace
            brace_count = 0
            json_end = json_start
            for i in range(json_start, len(response)):
                if response[i] == '{':
                    brace_count += 1
                elif response[i] == '}':
                    brace_count -= 1
                    if brace_count == 0:
                        json_end = i + 1
                        break

            if brace_count == 0:
                json_str = response[json_start:json_end]
                try:
                    return json.loads(json_str)
                except json.JSONDecodeError:
                    pass

        # Strategy 3: Try to find any JSON-like structure (more lenient)
        # Look for patterns like {"key": "value"} even if not perfectly formatted
        json_pattern = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', response, re.DOTALL)
        if json_pattern:
            try:
                return json.loads(json_pattern.group(0))
            except json.JSONDecodeError:
                pass

        # Strategy 4: Try parsing entire response as JSON
        try:
            return json.loads(response)
        except json.JSONDecodeError:
            pass

        # If all strategies fail, return empty dict
        self.logger.debug(f"Could not parse JSON from response: {response[:100]}...")
        return {}

    def get_evaluation_summary(self, results: list) -> Dict[str, Any]:
        """
        Get summary statistics for a batch of evaluations.

        Args:
            results: List of evaluation result dictionaries

        Returns:
            Summary statistics including accuracy per field and overall
        """
        if not results:
            return {
                "total_samples": 0,
                "overall_accuracy": 0.0,
                "field_accuracies": {},
                "perfect_matches": 0
            }

        total = len(results)
        # "Overall accuracy" here means the fraction of PERFECT (all-field) matches.
        perfect_matches = sum(1 for r in results if r.get("composite_score", 0.0) == 1.0)
        overall_accuracy = perfect_matches / total if total > 0 else 0.0

        # Calculate accuracy per field
        field_accuracies = {
            "is_index_based": sum(1 for r in results if r.get("is_index_based_match", 0.0) == 1.0) / total,
            "index_value": sum(1 for r in results if r.get("index_value_match", 0.0) == 1.0) / total,
            "parent_element_id": sum(1 for r in results if r.get("parent_element_id_match", 0.0) == 1.0) / total,
            "element_id_of_nth_child": sum(1 for r in results if r.get("element_id_of_nth_child_match", 0.0) == 1.0) / total,
            "selected_element_is_correct": sum(1 for r in results if r.get("selected_element_correct_match", 0.0) == 1.0) / total,
        }

        return {
            "total_samples": total,
            "overall_accuracy": overall_accuracy,
            "field_accuracies": field_accuracies,
            "perfect_matches": perfect_matches,
            "partial_matches": total - perfect_matches
        }
+
+
# Example usage and testing
if __name__ == "__main__":
    # Self-test: exercises the evaluator on hand-built cases; no external deps.
    print("๐ Testing Index Caching Evaluator...")

    evaluator = IndexCachingEvaluator()

    # Test cases
    test_cases = [
        # (predicted, expected, should_be_perfect)
        (
            '{"is_index_based": true, "index_value": 1, "parent_element_id": "aaaabf", "element_id_of_nth_child_of_parent": "aaaabg", "selected_element_is_correct": true}',
            {"is_index_based": True, "index_value": 1, "parent_element_id": "aaaabf", "element_id_of_nth_child_of_parent": "aaaabg", "selected_element_is_correct": True},
            True
        ),
        (
            '{"is_index_based": false, "index_value": null, "parent_element_id": null, "element_id_of_nth_child_of_parent": null, "selected_element_is_correct": true}',
            {"is_index_based": False, "index_value": None, "parent_element_id": None, "element_id_of_nth_child_of_parent": None, "selected_element_is_correct": True},
            True
        ),
        (
            '{"is_index_based": true, "index_value": 3, "parent_element_id": null, "element_id_of_nth_child_of_parent": "aaaaaw", "selected_element_is_correct": true}',
            {"is_index_based": True, "index_value": 3, "parent_element_id": None, "element_id_of_nth_child_of_parent": "aaaaaw", "selected_element_is_correct": True},
            True
        ),
        (
            '{"is_index_based": true, "index_value": 2, "parent_element_id": "aaaabf", "element_id_of_nth_child_of_parent": "aaaabg", "selected_element_is_correct": true}',
            {"is_index_based": True, "index_value": 1, "parent_element_id": "aaaabf", "element_id_of_nth_child_of_parent": "aaaabg", "selected_element_is_correct": True},
            False  # index_value mismatch
        ),
    ]

    print("\n๐ Running test cases:")
    print("-" * 80)

    results = []
    for predicted, expected, should_be_perfect in test_cases:
        result = evaluator.evaluate(predicted, expected)
        is_perfect = result["composite_score"] == 1.0

        # Each case passes when its perfect/imperfect outcome matches expectations.
        status = "โ" if is_perfect == should_be_perfect else "โ"
        print(f"{status} Test: Perfect match = {is_perfect} (expected {should_be_perfect})")
        print(f" Score: {result['composite_score']:.2f}")
        print()

        results.append(result)

    # Summary
    print("\n๐ Summary:")
    summary = evaluator.get_evaluation_summary(results)
    print(f" Total: {summary['total_samples']}")
    print(f" Perfect matches: {summary['perfect_matches']}")
    print(f" Overall accuracy: {summary['overall_accuracy']:.1%}")
    print(f" Field accuracies:")
    for field, acc in summary['field_accuracies'].items():
        print(f" {field}: {acc:.1%}")
diff --git a/src/gepa_optimizer/evaluation/scroll_evaluator.py b/src/gepa_optimizer/evaluation/scroll_evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..64171b2b9ae384339eec3d846e7d34c90fb36070
--- /dev/null
+++ b/src/gepa_optimizer/evaluation/scroll_evaluator.py
@@ -0,0 +1,251 @@
+"""
+GENERIC String Match Evaluator
+
+Compares predicted output against expected output (simple string comparison).
+NO assumptions about what the output represents (IDs, text, JSON, etc.).
+
+Let GEPA discover the correct output format through evolution and feedback!
+"""
+
+from typing import Dict, Any
+
+try:
+ from .base_evaluator import BaseEvaluator
+except ImportError:
+ # For standalone testing
+ import sys
+ from pathlib import Path
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+ from gepa_optimizer.evaluation.base_evaluator import BaseEvaluator
+
+
class ScrollElementEvaluator(BaseEvaluator):
    """
    Binary evaluator for scroll-element selection outputs.

    Extracts an element number from both the predicted and the expected
    text using a cascade of regex strategies, then compares the two numbers
    as integers. Scoring is strictly binary: 1.0 on a match, 0.0 otherwise.

    NOTE(review): the module docstring describes a "generic string match"
    evaluator, but the implementation is element-ID aware; this docstring
    documents the actual behaviour.
    """

    def __init__(self, metric_weights: Dict[str, float] = None):
        """
        Initialize evaluator.

        Args:
            metric_weights: Weights for evaluation metrics.
                Default: {"output_match": 1.0}
        """
        default_weights = {
            "output_match": 1.0  # Single binary metric
        }

        weights = metric_weights or default_weights
        super().__init__(metric_weights=weights)

    def evaluate(self, predicted: str, expected: str) -> Dict[str, Any]:
        """
        Binary evaluation with element ID extraction.

        Strategy:
            1. Extract an element number from both outputs (several formats
               supported, falling back to the first bare number).
            2. Compare using integer arithmetic for robustness (prevents
               string-comparison bugs such as "4" vs "04").
            3. Return 1.0 if they match, 0.0 otherwise (no partial credit).

        Args:
            predicted: LLM's output (may include verbose explanation).
            expected: Expected output (may include verbose explanation).

        Returns:
            Dictionary with evaluation metrics and the extracted element IDs.
            (Annotation fixed: values include strings, not only floats.)
        """
        import re
        import logging

        # One logger for the whole method (was created three times before).
        logger = logging.getLogger(__name__)

        if not predicted or not expected:
            return {
                "content_match": 0.0,
                "output_match": 0.0,
                "composite_score": 0.0,
                "predicted_output": str(predicted).strip() if predicted else "",
                "expected_output": str(expected).strip() if expected else "",
                "predicted_element": "None",
                "expected_element": "None",
                "evaluation_reason": "❌ Empty or missing input/output"
            }

        predicted_str = str(predicted).strip()
        expected_str = str(expected).strip()

        # 1. Extraction strategies, tried in order of specificity.
        # Strategy A: "Element: X" / "Element X" (explicit format)
        element_pattern_a = r'element[:\s]+(\d+)'
        # Strategy B: "element X" anywhere in text (word-bounded)
        element_pattern_b = r'\belement\s+(\d+)\b'
        # Strategy C: any bare number (last resort)
        number_pattern = r'\b(\d+)\b'

        def _extract(text: str):
            """Apply strategies A -> B -> C; return the first match or None."""
            match = re.search(element_pattern_a, text, re.IGNORECASE)
            if not match:
                match = re.search(element_pattern_b, text, re.IGNORECASE)
            if not match:
                match = re.search(number_pattern, text)
            return match

        pred_match = _extract(predicted_str)
        exp_match = _extract(expected_str)

        # 2. Compare the extracted numbers.
        if not exp_match:
            # Expected has no element pattern - fall back to exact match.
            content_score = 1.0 if predicted_str.lower() == expected_str.lower() else 0.0
        elif not pred_match:
            # Predicted has no element number - wrong.
            content_score = 0.0
        else:
            pred_element = pred_match.group(1)
            exp_element = exp_match.group(1)

            # Integer comparison is more robust than string comparison.
            try:
                pred_num = int(pred_element)
                exp_num = int(exp_element)
                content_score = 1.0 if pred_num == exp_num else 0.0
                if pred_num != exp_num:
                    logger.debug(f"Element mismatch: predicted={pred_num}, expected={exp_num}")
            except (ValueError, TypeError) as e:
                # Fallback to string comparison if conversion fails.
                logger.warning(f"Could not convert elements to integers: {e}, using string comparison")
                content_score = 1.0 if pred_element == exp_element else 0.0

        # 3. Binary score and a human-readable reason.
        # (Mojibake in the original reason strings repaired to real emoji.)
        if content_score == 1.0:
            composite_score = 1.0
            reason = "✅ Correct! Element number matches"
        else:
            composite_score = 0.0
            if pred_match and exp_match:
                reason = "❌ Wrong element number (predicted different element)"
            else:
                reason = "❌ Missing or invalid element number"

        pred_element = pred_match.group(1) if pred_match else "None"
        exp_element = exp_match.group(1) if exp_match else "None"

        # Detailed logging for transparency.
        logger.info(f"\n{'─' * 70}")
        logger.info("🔍 EVALUATION DETAILS")
        logger.info(f"{'─' * 70}")
        logger.info(f"  Expected:  '{expected_str}' (Element: {exp_element})")
        logger.info(f"  Predicted: '{predicted_str}' (Element: {pred_element})")
        logger.info(f"  {'─' * 66}")
        logger.info(f"  🎯 SCORE: {composite_score:.2f} - {reason}")
        logger.info(f"{'─' * 70}\n")

        return {
            "content_match": content_score,
            "output_match": composite_score,  # This is what GEPA uses
            "composite_score": composite_score,
            "predicted_output": predicted_str,
            "expected_output": expected_str,
            "predicted_element": pred_element,
            "expected_element": exp_element,
            "evaluation_reason": reason
        }

    def get_evaluation_summary(self, results: list) -> Dict[str, Any]:
        """
        Get summary statistics for a batch of evaluations.

        Args:
            results: List of evaluation result dictionaries.

        Returns:
            Summary statistics: totals, accuracy, correct/incorrect counts.
        """
        if not results:
            return {
                "total_samples": 0,
                "accuracy": 0.0,
                "correct_predictions": 0
            }

        total = len(results)
        # A prediction counts as correct only on an exact binary match.
        correct = sum(1 for r in results if r.get("output_match", 0.0) == 1.0)
        accuracy = correct / total if total > 0 else 0.0

        return {
            "total_samples": total,
            "accuracy": accuracy,
            "correct_predictions": correct,
            "incorrect_predictions": total - correct
        }
+
+
# Example usage and smoke test (run this file directly).
if __name__ == "__main__":
    print("Testing Scroll Element Evaluator...")

    evaluator = ScrollElementEvaluator()

    # (predicted, expected, should_match) triples covering supported formats.
    test_cases = [
        ("4", "4", True),
        ("Element: 4", "4", True),
        ("Element 4", "4", True),
        ("The element to interact with is 4", "4", True),
        ("Element ID: 4", "4", True),
        ("Click on element 4 to scroll", "4", True),
        ("5", "4", False),
        ("Element: 5", "4", False),
        ("No element found", "4", False),
        ("", "4", False),
    ]

    print("\nRunning test cases:")
    print("-" * 80)

    results = []
    for predicted, expected, should_match in test_cases:
        result = evaluator.evaluate(predicted, expected)
        match = result["composite_score"] == 1.0

        # Repaired mojibake: status markers were garbled emoji split across
        # two lines in the original (a syntax error as rendered).
        status = "✅" if match == should_match else "❌"
        print(f"{status} Predicted: '{predicted}' | Expected: '{expected}' | Match: {match}")

        results.append(result)

    # Aggregate statistics.
    print("\nSummary:")
    summary = evaluator.get_evaluation_summary(results)
    print(f"   Total: {summary['total_samples']}")
    print(f"   Correct: {summary['correct_predictions']}")
    print(f"   Accuracy: {summary['accuracy']:.1%}")
diff --git a/src/gepa_optimizer/evaluation/ui_evaluator.py b/src/gepa_optimizer/evaluation/ui_evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4aadf122212ad2d07ae1c7c4f23342eb5be88a3
--- /dev/null
+++ b/src/gepa_optimizer/evaluation/ui_evaluator.py
@@ -0,0 +1,297 @@
+"""
+UI Tree Evaluator for GEPA Optimizer
+"""
+
+import json
+import logging
+import difflib
+from typing import Any, Dict, List, Optional
+
+from .base_evaluator import BaseEvaluator
+
+logger = logging.getLogger(__name__)
+
class UITreeEvaluator(BaseEvaluator):
    """
    Comprehensive evaluator for UI tree extraction quality.

    Scores a predicted UI tree (nested dicts with "type", "text", "style"
    and "children" keys) against a ground-truth tree along five weighted
    dimensions and combines them into a composite score.
    """

    def __init__(self, metric_weights: Optional[Dict[str, float]] = None):
        """
        Initializes the UITreeEvaluator with configurable metric weights.

        Args:
            metric_weights: A dictionary of weights for different metrics.
                If None, default weights will be used.
        """
        # Default weights for UI tree evaluation (normalized to 1.0 below).
        default_weights = {
            "element_completeness": 0.3,    # How many elements are captured
            "element_type_accuracy": 0.25,  # Correct element types (Button, Text, etc.)
            "text_content_accuracy": 0.2,   # Text content matches
            "hierarchy_accuracy": 0.15,     # Parent-child relationships
            "style_accuracy": 0.1,          # Style properties captured
        }

        # Use provided weights or defaults.
        weights = metric_weights or default_weights

        # Initialize parent class.
        super().__init__(metric_weights=weights)

        # Normalize weights so they always sum to 1.0.
        self._normalize_weights()

    def _normalize_weights(self):
        """Normalize self.metric_weights in place so the weights sum to 1.0."""
        total_weight = sum(self.metric_weights.values())
        if total_weight > 0:
            self.metric_weights = {k: v / total_weight for k, v in self.metric_weights.items()}
        else:
            # Consistency fix: use the module-level logger like every other
            # method here; `self.logger` is not established anywhere in view.
            logger.warning("Total metric weight is zero. Scores will be zero.")

    def evaluate(self, predicted_json: Dict[str, Any], expected_json: Dict[str, Any]) -> Dict[str, float]:
        """
        Generates a weighted composite score from individual metrics.

        Args:
            predicted_json: The JSON generated by the LLM.
            expected_json: The ground truth JSON.

        Returns:
            A dictionary of individual metric scores and the composite score.
        """
        scores = {
            "element_completeness": self.calculate_element_completeness(predicted_json, expected_json),
            "element_type_accuracy": self.calculate_element_type_accuracy(predicted_json, expected_json),
            "text_content_accuracy": self.calculate_text_content_accuracy(predicted_json, expected_json),
            "hierarchy_accuracy": self.calculate_hierarchy_accuracy(predicted_json, expected_json),
            "style_accuracy": self.calculate_style_accuracy(predicted_json, expected_json),
        }

        composite_score = sum(scores[metric] * self.metric_weights.get(metric, 0) for metric in scores)

        # Small acceptance bonus so GEPA recognises even tiny improvements.
        # BUG FIX: previously the bonus was added to the local variable AFTER
        # scores["composite_score"] had already been stored, so callers never
        # saw it. Apply the bonus before publishing the score.
        if composite_score > 0.05:  # If we have any meaningful content
            composite_score = min(composite_score + 0.001, 1.0)

        scores["composite_score"] = composite_score

        # Detailed logging for debugging.
        logger.debug(f"Evaluation scores: {scores}")
        logger.debug(f"Composite score: {composite_score:.4f}")

        return scores

    def calculate_element_completeness(self, predicted: Dict, expected: Dict) -> float:
        """
        Calculates how many UI elements are captured in the predicted JSON.
        This is the most important metric for UI tree extraction.
        """
        def _count_elements(node):
            """Count total elements in the tree (current node + descendants)."""
            if not isinstance(node, dict):
                return 0
            count = 1  # Count current node
            for child in node.get("children", []):
                count += _count_elements(child)
            return count

        try:
            predicted_count = _count_elements(predicted)
            expected_count = _count_elements(expected)

            if expected_count == 0:
                return 1.0 if predicted_count == 0 else 0.0

            # Score based on how many elements are captured.
            completeness_ratio = predicted_count / expected_count

            # Full credit at or above parity; progressively heavier penalties
            # as coverage drops below 80% and 50%.
            if completeness_ratio >= 1.0:
                return 1.0  # Perfect or better
            elif completeness_ratio >= 0.8:
                return completeness_ratio  # Good coverage
            elif completeness_ratio >= 0.5:
                return completeness_ratio * 0.8  # Moderate coverage with penalty
            else:
                return completeness_ratio * 0.5  # Poor coverage with heavy penalty

        except Exception as e:
            logger.warning(f"Error calculating element completeness: {e}")
            return 0.0

    def calculate_element_type_accuracy(self, predicted: Dict, expected: Dict) -> float:
        """
        Calculates element type accuracy by comparing the 'type' attribute of
        corresponding nodes. Focuses on common UI element types like Button,
        Text, Image, etc.
        """
        def _get_all_types(node):
            # Collect the "type" of every node in the tree, skipping None.
            if not isinstance(node, dict):
                return []
            types = [node.get("type")]
            for child in node.get("children", []):
                types.extend(_get_all_types(child))
            return [t for t in types if t is not None]

        try:
            predicted_types = _get_all_types(predicted)
            expected_types = _get_all_types(expected)

            if not expected_types:
                return 1.0 if not predicted_types else 0.5

            if not predicted_types:
                return 0.0

            # Count matching types with frequency consideration (multiset
            # intersection): each expected occurrence can be matched at most
            # once by a predicted occurrence of the same type.
            expected_type_counts = {}
            for t in expected_types:
                expected_type_counts[t] = expected_type_counts.get(t, 0) + 1

            predicted_type_counts = {}
            for t in predicted_types:
                predicted_type_counts[t] = predicted_type_counts.get(t, 0) + 1

            total_matches = 0
            for type_name, expected_count in expected_type_counts.items():
                predicted_count = predicted_type_counts.get(type_name, 0)
                # Count matches up to the expected count.
                total_matches += min(predicted_count, expected_count)

            return total_matches / len(expected_types) if expected_types else 0.0

        except Exception as e:
            logger.warning(f"Error calculating element type accuracy: {e}")
            return 0.0

    def calculate_hierarchy_accuracy(self, predicted: Dict, expected: Dict) -> float:
        """
        Calculates hierarchy accuracy by comparing parent-child relationships.
        """
        def _get_hierarchy_structure(node, parent_type="ROOT"):
            """Extract hierarchy structure as (parent_type, child_type) pairs."""
            if not isinstance(node, dict):
                return []

            current_type = node.get("type", "unknown")
            hierarchy = [(parent_type, current_type)]

            for child in node.get("children", []):
                hierarchy.extend(_get_hierarchy_structure(child, current_type))

            return hierarchy

        try:
            predicted_hierarchy = _get_hierarchy_structure(predicted)
            expected_hierarchy = _get_hierarchy_structure(expected)

            if not expected_hierarchy:
                return 1.0 if not predicted_hierarchy else 0.5

            if not predicted_hierarchy:
                return 0.0

            # Set-based overlap of distinct (parent, child) relationships;
            # duplicates are deliberately collapsed.
            expected_hierarchy_set = set(expected_hierarchy)
            predicted_hierarchy_set = set(predicted_hierarchy)

            matches = len(expected_hierarchy_set.intersection(predicted_hierarchy_set))
            total_expected = len(expected_hierarchy_set)

            return matches / total_expected if total_expected > 0 else 0.0

        except Exception as e:
            logger.warning(f"Error calculating hierarchy accuracy: {e}")
            return 0.0

    def calculate_text_content_accuracy(self, predicted: Dict, expected: Dict) -> float:
        """
        Calculates text content accuracy by comparing the 'text' attribute of
        corresponding nodes (best fuzzy match per predicted text).
        """
        def _get_all_texts(node):
            # Collect non-empty "text" values from every node in the tree.
            if not isinstance(node, dict):
                return []
            texts = [node.get("text")]
            for child in node.get("children", []):
                texts.extend(_get_all_texts(child))
            return [t for t in texts if t is not None and str(t).strip()]

        try:
            predicted_texts = _get_all_texts(predicted)
            expected_texts = _get_all_texts(expected)

            if not expected_texts:
                return 1.0 if not predicted_texts else 0.5  # Partial credit if predicted has texts but expected doesn't

            if not predicted_texts:
                return 0.0  # No predicted texts, so no match

            # For each predicted text, take the best fuzzy similarity against
            # any expected text, then average.
            total_similarity = 0.0
            for p_text in predicted_texts:
                best_similarity = 0.0
                for e_text in expected_texts:
                    similarity = difflib.SequenceMatcher(None, str(p_text).strip(), str(e_text).strip()).ratio()
                    best_similarity = max(best_similarity, similarity)
                total_similarity += best_similarity

            # predicted_texts is guaranteed non-empty here (early return
            # above), so averaging is safe. (Removed unreachable branches
            # that re-checked emptiness after the guards.)
            return total_similarity / len(predicted_texts)
        except Exception as e:
            logger.warning(f"Error calculating text content accuracy: {e}")
            return 0.0

    def calculate_style_accuracy(self, predicted: Dict, expected: Dict) -> float:
        """
        Calculates style accuracy by comparing style properties.
        """
        def _get_all_styles(node):
            """Extract all style dicts from the tree."""
            if not isinstance(node, dict):
                return []

            styles = []
            if "style" in node and isinstance(node["style"], dict):
                styles.append(node["style"])

            for child in node.get("children", []):
                styles.extend(_get_all_styles(child))

            return styles

        try:
            predicted_styles = _get_all_styles(predicted)
            expected_styles = _get_all_styles(expected)

            if not expected_styles:
                return 1.0 if not predicted_styles else 0.5

            if not predicted_styles:
                return 0.0

            # Fraction of expected (property, value) pairs that appear
            # verbatim in at least one predicted style dict.
            total_style_properties = 0
            matching_properties = 0

            for exp_style in expected_styles:
                for prop_name, prop_value in exp_style.items():
                    total_style_properties += 1

                    # Find a matching property in any predicted style.
                    for pred_style in predicted_styles:
                        if prop_name in pred_style and pred_style[prop_name] == prop_value:
                            matching_properties += 1
                            break

            return matching_properties / total_style_properties if total_style_properties > 0 else 0.0

        except Exception as e:
            logger.warning(f"Error calculating style accuracy: {e}")
            return 0.0
diff --git a/src/gepa_optimizer/evaluation/universal_evaluator.py b/src/gepa_optimizer/evaluation/universal_evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..16714de4ecad31a7a44a0551c3a7b45918a34b2a
--- /dev/null
+++ b/src/gepa_optimizer/evaluation/universal_evaluator.py
@@ -0,0 +1,911 @@
+"""
+Universal Semantic Evaluator for ANY prompt optimization use case.
+
+This evaluator uses LLM-powered semantic analysis to compare predicted vs expected outputs,
+enabling prompt optimization for ANY task without requiring custom evaluator code.
+
+Key Features:
+- Semantic understanding (not just string matching)
+- Works with text, JSON, numbers, structured outputs
+- Provides rich feedback for GEPA reflection
+- No task-specific assumptions
+"""
+
+import json
+import re
+import logging
+from typing import Dict, Any, Optional, List
+from difflib import SequenceMatcher
+
+from .base_evaluator import BaseEvaluator
+
+logger = logging.getLogger(__name__)
+
+
+class UniversalSemanticEvaluator(BaseEvaluator):
+ """
+ Universal evaluator using LLM for semantic comparison.
+
+ Works for ANY task without hardcoded assumptions:
+ - Text outputs: "The answer is 42" vs "42"
+ - JSON outputs: {"count": 23} vs {"count": 22}
+ - Structured data: Lists, nested objects
+ - Multi-modal: Image descriptions, analysis results
+
+ Evaluation Strategy:
+ 1. Quick checks (exact match, empty handling)
+ 2. Structural comparison (for JSON/structured data)
+ 3. LLM semantic analysis (for meaning understanding)
+ 4. Combine into composite score with rich feedback
+ """
+
+ def __init__(
+ self,
+ llm_client=None,
+ use_llm_analysis: bool = True,
+ semantic_weight: float = 0.6,
+ structural_weight: float = 0.25,
+ exact_match_bonus: float = 0.15,
+ metric_weights: Optional[Dict[str, float]] = None
+ ):
+ """
+ Initialize Universal Semantic Evaluator.
+
+ Args:
+ llm_client: LLM client for semantic analysis (optional, falls back to heuristics)
+ use_llm_analysis: Whether to use LLM for semantic comparison
+ semantic_weight: Weight for semantic similarity (0.0-1.0)
+ structural_weight: Weight for structural similarity (0.0-1.0)
+ exact_match_bonus: Bonus weight for exact matches (0.0-1.0)
+ metric_weights: Optional custom weights (overrides above)
+ """
+ default_weights = metric_weights or {
+ "semantic_similarity": semantic_weight,
+ "structural_similarity": structural_weight,
+ "exact_match": exact_match_bonus
+ }
+ super().__init__(metric_weights=default_weights)
+
+ self.llm_client = llm_client
+ self.use_llm_analysis = use_llm_analysis and llm_client is not None
+
+ # Cache for LLM analysis to reduce API calls
+ self._analysis_cache: Dict[str, Dict] = {}
+
+ logger.info(f"๐ฏ Universal Semantic Evaluator initialized")
+ logger.info(f" LLM analysis: {'enabled' if self.use_llm_analysis else 'disabled (using heuristics)'}")
+ logger.info(f" Weights: semantic={semantic_weight}, structural={structural_weight}, exact={exact_match_bonus}")
+
+ def evaluate(self, predicted: Any, expected: Any) -> Dict[str, float]:
+ """
+ Evaluate predicted output against expected output using semantic understanding.
+
+ Args:
+ predicted: The model's predicted output (string, dict, or any serializable type)
+ expected: The ground truth expected output
+
+ Returns:
+ Dictionary with metrics including 'composite_score' (required for GEPA)
+ """
+ # Convert to strings for comparison
+ predicted_str = self._to_string(predicted)
+ expected_str = self._to_string(expected)
+
+ # Initialize result
+ result = {
+ "composite_score": 0.0,
+ "exact_match": 0.0,
+ "semantic_similarity": 0.0,
+ "structural_similarity": 0.0,
+ "predicted_output": predicted_str[:500], # Truncate for logging
+ "expected_output": expected_str[:500],
+ "analysis": {},
+ "improvement_feedback": ""
+ }
+
+ # Handle empty/missing outputs
+ if not predicted_str or not predicted_str.strip():
+ result["improvement_feedback"] = "โ Output is EMPTY. The prompt must instruct the model to produce output."
+ result["analysis"] = {"status": "empty_predicted"}
+ return result
+
+ if not expected_str or not expected_str.strip():
+ result["improvement_feedback"] = "โ ๏ธ Expected output is empty - cannot evaluate."
+ result["analysis"] = {"status": "empty_expected"}
+ result["composite_score"] = 0.5 # Neutral score
+ return result
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # STEP 1: Exact Match Check (Fast Path)
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ normalized_pred = self._normalize(predicted_str)
+ normalized_exp = self._normalize(expected_str)
+
+ if normalized_pred == normalized_exp:
+ result["exact_match"] = 1.0
+ result["semantic_similarity"] = 1.0
+ result["structural_similarity"] = 1.0
+ result["composite_score"] = 1.0
+ result["improvement_feedback"] = "โ
Perfect match! Output exactly matches expected."
+ result["analysis"] = {"status": "exact_match"}
+ return result
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # STEP 1.5: FORMAT MISMATCH DETECTION (CRITICAL FIX)
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # ๐ฅ CRITICAL: Detect when expected is JSON but predicted is narrative text
+ # This causes catastrophically low scores and needs explicit handling
+ expected_is_json = self._try_parse_json(expected_str) is not None
+ predicted_is_json = self._try_parse_json(predicted_str) is not None
+
+ format_mismatch = expected_is_json and not predicted_is_json
+ if format_mismatch:
+ # Expected JSON but got narrative - this is a CRITICAL format error
+ # Give partial credit for semantic content but penalize heavily for format
+ result["analysis"]["format_mismatch"] = True
+ result["improvement_feedback"] = (
+ "โ FORMAT ERROR: Expected JSON output but received narrative text. "
+ "The prompt MUST enforce JSON output format. "
+ "Add explicit instructions like: 'Output ONLY valid JSON, no explanations.' "
+ "Consider adding: 'Do NOT write prose or explanations.'"
+ )
+ # Still evaluate semantic content but cap the score
+ # This gives feedback for improving the prompt
+ logger.warning(f"โ ๏ธ Format mismatch: expected JSON ({len(expected_str)} chars), got narrative ({len(predicted_str)} chars)")
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # STEP 2: Structural Comparison (for JSON/structured data)
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ structural_result = self._compare_structure(predicted_str, expected_str)
+ result["structural_similarity"] = structural_result["score"]
+ result["analysis"]["structural"] = structural_result.get("details", {})
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # STEP 3: Semantic Analysis
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ if self.use_llm_analysis:
+ semantic_result = self._llm_semantic_analysis(predicted_str, expected_str)
+ else:
+ semantic_result = self._heuristic_semantic_analysis(predicted_str, expected_str)
+
+ result["semantic_similarity"] = semantic_result["score"]
+ result["analysis"]["semantic"] = semantic_result.get("details", {})
+ result["improvement_feedback"] = semantic_result.get("feedback", "")
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # STEP 4: Compute Composite Score
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ weights = self.metric_weights
+ composite = (
+ result["semantic_similarity"] * weights.get("semantic_similarity", 0.6) +
+ result["structural_similarity"] * weights.get("structural_similarity", 0.25) +
+ result["exact_match"] * weights.get("exact_match", 0.15)
+ )
+
+ # ๐ฅ CRITICAL FIX: Apply format mismatch penalty
+ # If expected JSON but got narrative, cap the score to encourage format compliance
+ if result.get("analysis", {}).get("format_mismatch"):
+ # Cap at 0.3 to indicate "partial semantic match but wrong format"
+ # This ensures format-correct outputs always score higher
+ composite = min(composite, 0.30)
+ logger.debug(f"๐ Format mismatch penalty applied: score capped at {composite:.3f}")
+
+ result["composite_score"] = min(max(composite, 0.0), 1.0)
+
+ # Add score breakdown to feedback
+ if not result["improvement_feedback"]:
+ result["improvement_feedback"] = self._generate_default_feedback(result)
+
+ # Log evaluation
+ logger.debug(f"๐ Evaluation: composite={result['composite_score']:.3f}, "
+ f"semantic={result['semantic_similarity']:.3f}, "
+ f"structural={result['structural_similarity']:.3f}")
+
+ # #region agent log
+ try:
+ import json as _json_debug
+ import time as _time_debug
+ import os as _os_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "G", "location": "universal_evaluator.py:final_score", "message": "Final evaluation score breakdown", "data": {"composite": result["composite_score"], "semantic": result["semantic_similarity"], "structural": result["structural_similarity"], "exact_match": result["exact_match"], "format_mismatch": result.get("analysis", {}).get("format_mismatch", False), "predicted_preview": predicted_str[:150] if predicted_str else "EMPTY", "expected_preview": expected_str[:150] if expected_str else "EMPTY"}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ except Exception as _e:
+ pass # Silent fail for instrumentation
+ # #endregion
+
+ return result
+
+ def _to_string(self, value: Any) -> str:
+ """Convert any value to string for comparison."""
+ if value is None:
+ return ""
+ if isinstance(value, str):
+ return value.strip()
+ if isinstance(value, dict):
+ try:
+ return json.dumps(value, sort_keys=True, indent=2)
+ except (TypeError, ValueError):
+ return str(value)
+ if isinstance(value, (list, tuple)):
+ try:
+ return json.dumps(list(value), sort_keys=True)
+ except (TypeError, ValueError):
+ return str(value)
+ return str(value).strip()
+
+ def _normalize(self, text: str) -> str:
+ """Normalize text for comparison (lowercase, whitespace)."""
+ # Lowercase and normalize whitespace
+ normalized = ' '.join(text.lower().split())
+ # Remove common punctuation that doesn't affect meaning
+ normalized = re.sub(r'[.,;:!?\'"]+$', '', normalized)
+ return normalized
+
+ def _compare_structure(self, predicted: str, expected: str) -> Dict[str, Any]:
+ """
+ Compare structural similarity (especially for JSON/structured outputs).
+
+ Returns:
+ Dict with 'score' (0.0-1.0) and 'details'
+ """
+ result = {"score": 0.0, "details": {}}
+
+ # Try to parse as JSON
+ pred_json = self._try_parse_json(predicted)
+ exp_json = self._try_parse_json(expected)
+
+ if pred_json is not None and exp_json is not None:
+ # Both are valid JSON - do structural comparison
+ return self._compare_json_structures(pred_json, exp_json)
+
+ # Fallback: Compare as text structure
+ return self._compare_text_structure(predicted, expected)
+
+ def _try_parse_json(self, text: str) -> Optional[Any]:
+ """
+ Try to parse text as JSON with robust extraction.
+
+ ๐ฅ FIX: LLMs often wrap JSON in markdown code blocks or add extra text.
+ This method now handles multiple formats:
+ - Direct JSON
+ - ```json ... ``` blocks
+ - ``` ... ``` blocks (no language tag)
+ - JSON embedded in prose
+ - Escaped newlines and quotes
+ """
+ if not text or not isinstance(text, str):
+ return None
+
+ # ๐ฅ PREPROCESSING: Clean common LLM output issues
+ cleaned = text.strip()
+
+ # Remove BOM and other invisible characters
+ cleaned = cleaned.lstrip('\ufeff\u200b\u200c\u200d')
+
+ # Strategy 1: Try direct parse (cleanest case)
+ try:
+ return json.loads(cleaned)
+ except json.JSONDecodeError:
+ pass
+
+ # Strategy 2: Extract JSON from markdown code block (```json ... ```)
+ # More permissive regex that handles optional language tags
+ json_match = re.search(r'```(?:json|JSON)?\s*([\{|\[].*?[\}|\]])\s*```', cleaned, re.DOTALL)
+ if json_match:
+ try:
+ return json.loads(json_match.group(1))
+ except json.JSONDecodeError:
+ pass
+
+ # Strategy 3: Find JSON using balanced brace matching (handles nested objects)
+ def extract_balanced_json(s: str, start_char: str, end_char: str) -> Optional[str]:
+ """Extract JSON with balanced braces/brackets."""
+ count = 0
+ start_idx = -1
+ for i, char in enumerate(s):
+ if char == start_char:
+ if count == 0:
+ start_idx = i
+ count += 1
+ elif char == end_char:
+ count -= 1
+ if count == 0 and start_idx >= 0:
+ return s[start_idx:i+1]
+ return None
+
+ # Try to find JSON object
+ json_obj = extract_balanced_json(cleaned, '{', '}')
+ if json_obj:
+ try:
+ return json.loads(json_obj)
+ except json.JSONDecodeError:
+ # Try to repair common issues
+ repaired = self._repair_json(json_obj)
+ try:
+ return json.loads(repaired)
+ except json.JSONDecodeError:
+ pass
+
+ # Try to find JSON array
+ json_arr = extract_balanced_json(cleaned, '[', ']')
+ if json_arr:
+ try:
+ return json.loads(json_arr)
+ except json.JSONDecodeError:
+ repaired = self._repair_json(json_arr)
+ try:
+ return json.loads(repaired)
+ except json.JSONDecodeError:
+ pass
+
+ return None
+
+ def _repair_json(self, json_str: str) -> str:
+ """
+ Attempt to repair common JSON issues from LLM output.
+
+ Fixes:
+ - Trailing commas before } or ]
+ - Single quotes instead of double quotes
+ - Unquoted keys
+ - Comments (// and /* */)
+ """
+ repaired = json_str
+
+ # Remove trailing commas
+ repaired = re.sub(r',\s*}', '}', repaired)
+ repaired = re.sub(r',\s*]', ']', repaired)
+
+ # Remove single-line comments
+ repaired = re.sub(r'//[^\n]*', '', repaired)
+
+ # Remove multi-line comments
+ repaired = re.sub(r'/\*.*?\*/', '', repaired, flags=re.DOTALL)
+
+ # Replace single quotes with double quotes (but be careful with apostrophes)
+ # Only replace when it looks like a JSON delimiter
+ def replace_single_quotes(match):
+ content = match.group(0)
+ # Skip if it looks like an apostrophe in a word
+ if re.match(r"'\w+'\s*:", content) or re.match(r":\s*'[^']*'", content):
+ return content.replace("'", '"')
+ return content
+
+ # Basic single quote replacement for keys
+ repaired = re.sub(r"'([^']+)'\s*:", r'"\1":', repaired)
+
+ return repaired
+
+ def _compare_json_structures(self, pred: Any, exp: Any) -> Dict[str, Any]:
+ """Compare two JSON structures."""
+ result = {"score": 0.0, "details": {"type": "json", "matches": [], "mismatches": []}}
+
+ if type(pred) != type(exp):
+ result["details"]["mismatches"].append(f"Type mismatch: predicted={type(pred).__name__}, expected={type(exp).__name__}")
+ result["score"] = 0.2 # Some credit for being JSON
+ return result
+
+ if isinstance(pred, dict) and isinstance(exp, dict):
+ return self._compare_dicts(pred, exp)
+ elif isinstance(pred, list) and isinstance(exp, list):
+ return self._compare_lists(pred, exp)
+ else:
+ # Primitive types
+ if pred == exp:
+ result["score"] = 1.0
+ result["details"]["matches"].append(f"Values match: {pred}")
+ else:
+ result["score"] = self._value_similarity(pred, exp)
+ result["details"]["mismatches"].append(f"Value mismatch: predicted={pred}, expected={exp}")
+ return result
+
+ def _compare_dicts(self, pred: dict, exp: dict) -> Dict[str, Any]:
+ """
+ Compare two dictionaries with CASE-INSENSITIVE key matching.
+
+ ๐ฅ FIX: LLMs often produce keys like 'Category' when expected is 'category'.
+ This method now normalizes keys before comparison for fair scoring.
+ """
+ result = {"score": 0.0, "details": {"type": "dict", "matches": [], "mismatches": [], "missing_keys": [], "extra_keys": []}}
+
+ # ๐ฅ NORMALIZE: Convert all keys to lowercase for comparison
+ # Also handle common variations like underscores vs camelCase
+ def normalize_key(key: str) -> str:
+ """Normalize key: lowercase, underscores to nothing, strip spaces."""
+ return re.sub(r'[_\s-]', '', str(key).lower())
+
+ # Build normalized key mappings
+ pred_normalized = {normalize_key(k): (k, v) for k, v in pred.items()}
+ exp_normalized = {normalize_key(k): (k, v) for k, v in exp.items()}
+
+ pred_norm_keys = set(pred_normalized.keys())
+ exp_norm_keys = set(exp_normalized.keys())
+
+ # Check for missing/extra keys (using normalized comparison)
+ missing_norm = exp_norm_keys - pred_norm_keys
+ extra_norm = pred_norm_keys - exp_norm_keys
+ common_norm = pred_norm_keys & exp_norm_keys
+
+ # Convert back to original key names for reporting
+ missing = [exp_normalized[k][0] for k in missing_norm]
+ extra = [pred_normalized[k][0] for k in extra_norm]
+
+ result["details"]["missing_keys"] = missing
+ result["details"]["extra_keys"] = extra
+
+ if not exp_norm_keys:
+ result["score"] = 1.0 if not pred_norm_keys else 0.5
+ return result
+
+ # Score based on key overlap (normalized)
+ key_score = len(common_norm) / len(exp_norm_keys) if exp_norm_keys else 1.0
+
+ # Score based on value matches
+ value_scores = []
+ for norm_key in common_norm:
+ pred_orig_key, pred_val = pred_normalized[norm_key]
+ exp_orig_key, exp_val = exp_normalized[norm_key]
+
+ if pred_val == exp_val:
+ value_scores.append(1.0)
+ result["details"]["matches"].append(f"{exp_orig_key}: {exp_val}")
+ else:
+ sim = self._value_similarity(pred_val, exp_val)
+ value_scores.append(sim)
+ if sim < 0.8:
+ result["details"]["mismatches"].append(f"{exp_orig_key}: predicted={pred_val}, expected={exp_val}")
+
+ value_score = sum(value_scores) / len(value_scores) if value_scores else 0.0
+
+ # Combine scores
+ result["score"] = 0.3 * key_score + 0.7 * value_score
+
+ # Penalty for missing keys (reduced from 0.1 to 0.05 per key)
+ if missing:
+ result["score"] *= (1 - 0.05 * len(missing))
+
+ result["score"] = max(0.0, min(1.0, result["score"]))
+ return result
+
+ def _compare_lists(self, pred: list, exp: list) -> Dict[str, Any]:
+ """Compare two lists."""
+ result = {"score": 0.0, "details": {"type": "list", "length_match": False, "item_matches": 0}}
+
+ if not exp:
+ result["score"] = 1.0 if not pred else 0.5
+ return result
+
+ result["details"]["length_match"] = len(pred) == len(exp)
+
+ # Compare items (order-sensitive)
+ matches = 0
+ for i, exp_item in enumerate(exp):
+ if i < len(pred):
+ if pred[i] == exp_item:
+ matches += 1
+ else:
+ # Check if item exists elsewhere
+ if exp_item in pred:
+ matches += 0.5 # Partial credit for wrong position
+
+ result["details"]["item_matches"] = matches
+ result["score"] = matches / len(exp)
+
+ # Penalty for length mismatch
+ if len(pred) != len(exp):
+ len_ratio = min(len(pred), len(exp)) / max(len(pred), len(exp))
+ result["score"] *= (0.7 + 0.3 * len_ratio)
+
+ return result
+
+ def _value_similarity(self, pred: Any, exp: Any) -> float:
+ """
+ Calculate similarity between two values.
+
+ ๐ฅ ENHANCED: Now handles:
+ - Case-insensitive string comparison
+ - Semantic similarity for common variations
+ - Underscore/space/dash normalization
+ - Numeric comparison with tolerance
+ """
+ # Same value (exact match)
+ if pred == exp:
+ return 1.0
+
+ # Numeric comparison
+ try:
+ pred_num = float(pred)
+ exp_num = float(exp)
+ if exp_num == 0:
+ return 1.0 if pred_num == 0 else 0.0
+ # Relative error with tolerance
+ error = abs(pred_num - exp_num) / abs(exp_num)
+ return max(0.0, 1.0 - error)
+ except (ValueError, TypeError):
+ pass
+
+ # String comparison with normalization
+ pred_str = str(pred).strip()
+ exp_str = str(exp).strip()
+
+ # Case-insensitive exact match
+ if pred_str.lower() == exp_str.lower():
+ return 0.98 # Slight penalty for case mismatch
+
+ # Normalize strings (remove underscores, spaces, dashes for comparison)
+ def normalize_str(s: str) -> str:
+ return re.sub(r'[_\s\-]+', '', s.lower())
+
+ pred_norm = normalize_str(pred_str)
+ exp_norm = normalize_str(exp_str)
+
+ if pred_norm == exp_norm:
+ return 0.95 # Good match despite formatting differences
+
+ # Check if one contains the other (partial match)
+ if pred_norm in exp_norm or exp_norm in pred_norm:
+ ratio = min(len(pred_norm), len(exp_norm)) / max(len(pred_norm), len(exp_norm))
+ return 0.7 + (0.2 * ratio) # 0.7-0.9 for partial matches
+
+ # ๐ฅ SEMANTIC SIMILARITY: Check for common equivalent terms
+ semantic_equivalents = {
+ # Priority levels
+ 'low': ['low', 'minor', 'trivial', 'p3', 'p4'],
+ 'medium': ['medium', 'normal', 'moderate', 'p2'],
+ 'high': ['high', 'important', 'major', 'p1', 'critical', 'urgent'],
+ # Boolean variations
+ 'true': ['true', 'yes', '1', 'on', 'enabled'],
+ 'false': ['false', 'no', '0', 'off', 'disabled'],
+ # Status variations
+ 'success': ['success', 'succeeded', 'completed', 'done', 'passed'],
+ 'failure': ['failure', 'failed', 'error', 'crashed'],
+ 'pending': ['pending', 'waiting', 'queued', 'in_progress', 'processing'],
+ }
+
+ for canonical, equivalents in semantic_equivalents.items():
+ pred_match = any(eq in pred_norm for eq in equivalents)
+ exp_match = any(eq in exp_norm for eq in equivalents)
+ if pred_match and exp_match:
+ return 0.85 # Semantic match
+
+ # Sequence matching (character-level similarity)
+ ratio = SequenceMatcher(None, pred_str.lower(), exp_str.lower()).ratio()
+
+ # ๐ฅ WORD-LEVEL SIMILARITY: Check word overlap
+ pred_words = set(re.findall(r'\w+', pred_str.lower()))
+ exp_words = set(re.findall(r'\w+', exp_str.lower()))
+
+ if pred_words and exp_words:
+ word_overlap = len(pred_words & exp_words) / max(len(pred_words), len(exp_words))
+ # Combine character and word similarity
+ return max(ratio, word_overlap * 0.9)
+
+ def _compare_text_structure(self, predicted: str, expected: str) -> Dict[str, Any]:
+ """Compare text structure when not JSON."""
+ result = {"score": 0.0, "details": {"type": "text"}}
+
+ # Word overlap
+ pred_words = set(predicted.lower().split())
+ exp_words = set(expected.lower().split())
+
+ if not exp_words:
+ result["score"] = 1.0 if not pred_words else 0.5
+ return result
+
+ overlap = len(pred_words & exp_words)
+ result["details"]["word_overlap"] = overlap
+ result["details"]["expected_words"] = len(exp_words)
+
+ # Jaccard similarity
+ union = len(pred_words | exp_words)
+ result["score"] = overlap / union if union > 0 else 0.0
+
+ return result
+
    def _llm_semantic_analysis(self, predicted: str, expected: str) -> Dict[str, Any]:
        """
        Use LLM for semantic analysis of predicted vs expected.

        Uses XML-delimited prompt structure to prevent context bleeding
        and Multi-Dimensional Scoring (Semantics vs. Syntax).

        Falls back to :meth:`_heuristic_semantic_analysis` when the LLM
        response cannot be parsed as JSON or when any exception occurs.

        Returns:
            Dict with 'score' (0.0-1.0), 'details', and 'feedback'
        """
        # Check cache.
        # NOTE(review): hash() is salted per process (PYTHONHASHSEED), so this
        # cache key is only stable within a single run; a key collision between
        # different (predicted, expected) pairs is unlikely but possible.
        cache_key = f"{hash(predicted)}:{hash(expected)}"
        if cache_key in self._analysis_cache:
            return self._analysis_cache[cache_key]

        result = {"score": 0.0, "details": {}, "feedback": ""}

        try:
            # Truncate for token limits but preserve enough context.
            expected_truncated = expected[:10000]
            predicted_truncated = predicted[:10000]

            # Penalty-based scoring with self-verification: the judge starts at
            # 1.0 and deducts for failures — more consistent than subjective scoring.
            analysis_prompt = f"""
You are a **Semantic Logic Engine** tasked with grading AI performance.
You must compare a [PREDICTED] output against a [EXPECTED] truth.




{expected_truncated}



{predicted_truncated}




Calculate the score based on these STRICT rules. Start with 1.0 and deduct penalties.

1. **Information Completeness (Max -0.5)**:
   - If key facts/fields are missing, deduct proportional to importance.
   - If a nested JSON field is missing, deduct 0.1 per field.

2. **Accuracy & Hallucination (Max -1.0)**:
   - If factual numbers/IDs are wrong: Score = 0 immediately.
   - If the model invents information NOT in the input: Deduct 0.3.

3. **Format Compliance (Max -0.3)**:
   - If JSON is requested but Markdown is returned: Deduct 0.3.
   - If keys are lowercase instead of snake_case: Deduct 0.1.

4. **Semantic Equivalence (No Penalty)**:
   - Synonyms are ACCEPTED (e.g., "Purchase" == "Buy").
   - Formatting differences (whitespace) are IGNORED.



Before finalizing the score, ask: "If I used the predicted output in code expecting the original output, would the code crash?"
- If YES (Crash) -> Score must be < 0.5.
- If NO (Safe) -> Score can be high.



Return JSON ONLY:
{{
  "semantic_similarity": 0.0-1.0,
  "structural_similarity": 0.0-1.0,
  "verdict": "PERFECT" | "ACCEPTABLE" | "FORMAT_ERROR" | "DATA_CORRUPTION",
  "critical_failures": ["List specific failures that caused score < 1.0"],
  "penalty_breakdown": {{"completeness": -0.0, "accuracy": -0.0, "format": -0.0}},
  "fix_directive": "Imperative command to fix the prompt"
}}

"""

            response = self.llm_client.generate(
                system_prompt="You are a Semantic Logic Engine. Calculate scores using penalty-based deduction from 1.0. Respond only with valid JSON.",
                user_prompt=analysis_prompt,
                image_base64=""
            )

            # The client may return a dict with a "content" field or a bare string.
            content = response.get("content", str(response)) if isinstance(response, dict) else str(response)

            # Parse JSON response.
            analysis = self._extract_json_from_response(content)

            if analysis:
                # Extract semantic similarity (primary score); structural
                # similarity defaults to the semantic value when absent.
                semantic_sim = float(analysis.get("semantic_similarity", 0.5))
                structural_sim = float(analysis.get("structural_similarity", semantic_sim))

                # Compute weighted score based on verdict (updated for new schema).
                verdict = analysis.get("verdict", "ACCEPTABLE")
                verdict_multiplier = {
                    "PERFECT": 1.0,
                    "ACCEPTABLE": 0.85,
                    "FORMAT_ERROR": 0.6,  # New: was WRONG_FORMAT
                    "DATA_CORRUPTION": 0.1,  # New: replaces WRONG_CONTENT + HALLUCINATION
                    # Legacy support
                    "WRONG_FORMAT": 0.6,
                    "WRONG_CONTENT": 0.3,
                    "HALLUCINATION": 0.1
                }.get(verdict, 0.5)

                # Final score: weighted combination (semantic 60%, structural 30%,
                # verdict multiplier 10%), capped at 1.0.
                result["score"] = min(1.0, semantic_sim * 0.6 + structural_sim * 0.3 + verdict_multiplier * 0.1)

                # Extract penalty breakdown if available.
                penalty_breakdown = analysis.get("penalty_breakdown", {})
                critical_failures = analysis.get("critical_failures", [])

                result["details"] = {
                    "verdict": verdict,
                    "semantic_similarity": semantic_sim,
                    "structural_similarity": structural_sim,
                    "critical_failures": critical_failures,
                    "penalty_breakdown": penalty_breakdown,
                    # Legacy field support
                    "key_matches": analysis.get("key_matches", []),
                    "key_differences": analysis.get("key_differences", critical_failures),
                    "value_errors": analysis.get("value_errors", {}),
                    "reasoning": analysis.get("reasoning", "")
                }
                result["feedback"] = analysis.get("fix_directive", "")
            else:
                # Fallback if JSON parsing fails.
                result = self._heuristic_semantic_analysis(predicted, expected)

            # Cache result (heuristic fallbacks inside the try are cached too).
            self._analysis_cache[cache_key] = result

        except Exception as e:
            # Best-effort: never let judge errors break evaluation.
            logger.warning(f"LLM semantic analysis failed: {e}, falling back to heuristics")
            result = self._heuristic_semantic_analysis(predicted, expected)

        return result
+
+ def _extract_json_from_response(self, content: str) -> Optional[Dict]:
+ """Extract JSON from LLM response."""
+ # Try to find JSON in response
+ json_match = re.search(r'\{[\s\S]*\}', content)
+ if json_match:
+ try:
+ return json.loads(json_match.group(0))
+ except json.JSONDecodeError:
+ pass
+ return None
+
+ def _heuristic_semantic_analysis(self, predicted: str, expected: str) -> Dict[str, Any]:
+ """
+ Heuristic-based semantic analysis when LLM is not available.
+
+ Uses multiple signals:
+ - Word overlap (Jaccard)
+ - Sequence matching (SequenceMatcher)
+ - Number extraction and comparison
+ - Key phrase matching
+ """
+ result = {"score": 0.0, "details": {}, "feedback": ""}
+
+ pred_lower = predicted.lower()
+ exp_lower = expected.lower()
+
+ # 1. Sequence similarity
+ seq_sim = SequenceMatcher(None, pred_lower, exp_lower).ratio()
+
+ # 2. Word overlap (Jaccard)
+ pred_words = set(pred_lower.split())
+ exp_words = set(exp_lower.split())
+ jaccard = len(pred_words & exp_words) / len(pred_words | exp_words) if (pred_words | exp_words) else 0.0
+
+ # 3. Number comparison
+ pred_nums = re.findall(r'-?\d+\.?\d*', predicted)
+ exp_nums = re.findall(r'-?\d+\.?\d*', expected)
+
+ num_score = 1.0
+ num_errors = []
+ if exp_nums:
+ matches = 0
+ for exp_num in exp_nums:
+ if exp_num in pred_nums:
+ matches += 1
+ else:
+ # Check for close matches
+ try:
+ exp_val = float(exp_num)
+ for pred_num in pred_nums:
+ pred_val = float(pred_num)
+ if abs(pred_val - exp_val) <= 1: # Off by 1
+ matches += 0.9
+ num_errors.append(f"Number close: expected {exp_num}, got {pred_num}")
+ break
+ else:
+ num_errors.append(f"Number missing: expected {exp_num}")
+ except ValueError:
+ pass
+ num_score = matches / len(exp_nums) if exp_nums else 1.0
+
+ # 4. Key entity extraction (simple approach)
+ # Look for capitalized words, quoted strings, etc.
+ pred_entities = set(re.findall(r'\b[A-Z][a-z]+\b', predicted))
+ exp_entities = set(re.findall(r'\b[A-Z][a-z]+\b', expected))
+ entity_overlap = len(pred_entities & exp_entities) / len(exp_entities) if exp_entities else 1.0
+
+ # Combine scores
+ result["score"] = (
+ 0.3 * seq_sim +
+ 0.25 * jaccard +
+ 0.25 * num_score +
+ 0.2 * entity_overlap
+ )
+
+ result["details"] = {
+ "sequence_similarity": seq_sim,
+ "word_overlap": jaccard,
+ "number_accuracy": num_score,
+ "entity_overlap": entity_overlap,
+ "number_errors": num_errors
+ }
+
+ # Generate feedback
+ feedback_parts = []
+ if jaccard < 0.5:
+ feedback_parts.append("Low word overlap - output may be missing key terms.")
+ if num_errors:
+ feedback_parts.append(f"Number issues: {'; '.join(num_errors[:3])}")
+ if entity_overlap < 0.5 and exp_entities:
+ missing = exp_entities - pred_entities
+ feedback_parts.append(f"Missing entities: {', '.join(list(missing)[:3])}")
+
+ if feedback_parts:
+ result["feedback"] = " | ".join(feedback_parts)
+ else:
+ result["feedback"] = "Output is semantically similar but not exact match."
+
+ return result
+
    def _generate_default_feedback(self, result: Dict) -> str:
        """Generate default feedback based on scores.

        Thresholds on the composite score: >=0.9 excellent, >=0.7 good,
        >=0.5 partial, otherwise poor.

        NOTE(review): the leading markers in the returned strings are
        mojibake of status emoji (check mark / warning / cross) from a bad
        encoding round-trip — one multi-byte character even splits the first
        literal across lines. Confirm and restore the intended characters;
        left untouched here because they are runtime strings.
        """
        score = result["composite_score"]
        semantic = result["semantic_similarity"]
        structural = result["structural_similarity"]

        if score >= 0.9:
            return "โ
 Excellent match! Minor differences only."
        elif score >= 0.7:
            return f"โ ๏ธ Good match (semantic={semantic:.0%}, structural={structural:.0%}). Some differences to address."
        elif score >= 0.5:
            return f"โ ๏ธ Partial match (semantic={semantic:.0%}, structural={structural:.0%}). Significant differences found."
        else:
            return f"โ Poor match (semantic={semantic:.0%}, structural={structural:.0%}). Major issues to fix."
+
+ def get_evaluation_summary(self, results: List[Dict]) -> Dict[str, Any]:
+ """
+ Get summary statistics for a batch of evaluations.
+
+ Args:
+ results: List of evaluation result dictionaries
+
+ Returns:
+ Summary statistics
+ """
+ if not results:
+ return {
+ "total_samples": 0,
+ "accuracy": 0.0,
+ "avg_semantic_similarity": 0.0,
+ "avg_structural_similarity": 0.0
+ }
+
+ total = len(results)
+ scores = [r.get("composite_score", 0.0) for r in results]
+ semantic_scores = [r.get("semantic_similarity", 0.0) for r in results]
+ structural_scores = [r.get("structural_similarity", 0.0) for r in results]
+
+ return {
+ "total_samples": total,
+ "accuracy": sum(1 for s in scores if s >= 0.8) / total,
+ "avg_composite_score": sum(scores) / total,
+ "avg_semantic_similarity": sum(semantic_scores) / total,
+ "avg_structural_similarity": sum(structural_scores) / total,
+ "min_score": min(scores),
+ "max_score": max(scores)
+ }
+
+
+# Convenience function to create evaluator
def create_universal_evaluator(llm_client=None) -> UniversalSemanticEvaluator:
    """
    Build a Universal Semantic Evaluator.

    LLM-backed analysis is enabled exactly when a client is supplied;
    otherwise the evaluator falls back to heuristic-based analysis.

    Args:
        llm_client: Optional LLM client for semantic analysis.

    Returns:
        Configured UniversalSemanticEvaluator instance.
    """
    use_llm = llm_client is not None
    return UniversalSemanticEvaluator(llm_client=llm_client, use_llm_analysis=use_llm)
+
diff --git a/src/gepa_optimizer/evaluation/validation_evaluator.py b/src/gepa_optimizer/evaluation/validation_evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d7f95765f060e9f1cc181550a28e9d48f9b368b
--- /dev/null
+++ b/src/gepa_optimizer/evaluation/validation_evaluator.py
@@ -0,0 +1,495 @@
+"""
+Validation Evaluator for UI Validation Use Case
+
+Evaluates predicted validation results (true/false) against expected results.
+Extracts reasoning from both predicted and expected outputs for LLM-as-judge feedback.
+"""
+
+from typing import Dict, Any, Optional
+import re
+import logging
+
+try:
+ from .base_evaluator import BaseEvaluator
+except ImportError:
+ # For standalone testing
+ import sys
+ from pathlib import Path
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+ from gepa_optimizer.evaluation.base_evaluator import BaseEvaluator
+
+
+class ValidationEvaluator(BaseEvaluator):
+ """
+ Evaluator for validation use case (true/false results).
+
+ Features:
+ - Normalizes boolean formats ("true"/"True"/"1" โ True, "false"/"False"/"0" โ False)
+ - Extracts reasoning from both predicted and expected outputs (REQUIRED for LLM-as-judge)
+ - Binary scoring: correct boolean = 1.0, wrong = 0.0
+ - Returns reasoning in evaluation results for LLM-as-judge feedback
+ """
+
+ def __init__(self, metric_weights: Optional[Dict[str, float]] = None):
+ """
+ Initialize validation evaluator.
+
+ Args:
+ metric_weights: Weights for evaluation metrics
+ Default: {"output_match": 1.0}
+ """
+ default_weights = {
+ "output_match": 1.0 # Binary boolean comparison
+ }
+
+ weights = metric_weights or default_weights
+ super().__init__(metric_weights=weights)
+
    def evaluate(self, predicted: str, expected: str) -> Dict[str, float]:
        """
        Evaluate predicted validation result against expected result.

        Scoring Strategy:
        1. Normalize both predicted and expected to boolean
        2. Compare booleans (exact match required)
        3. Extract reasoning from both (for LLM-as-judge)
        4. Return 1.0 if match, 0.0 otherwise (binary scoring)

        Args:
            predicted: LLM's output (may include "true"/"false" + reasoning)
            expected: Expected output (should be "true" or "false", may include reasoning)

        Returns:
            Dictionary with evaluation metrics, extracted booleans, and reasoning:
            {
                "output_match": 1.0 or 0.0,
                "composite_score": 1.0 or 0.0,
                "predicted_output": str,
                "expected_output": str,
                "predicted_boolean": True/False,
                "expected_boolean": True/False,
                "predicted_reasoning": str,  # REQUIRED for LLM-as-judge
                "expected_reasoning": str,  # REQUIRED for LLM-as-judge
                "evaluation_reason": str
            }

        NOTE(review): several runtime strings below carry mojibake of emoji
        (check marks, box-drawing separators) from an encoding round-trip —
        one even splits a literal across lines; left byte-identical here.
        """
        # Guard clause: an empty side cannot be scored — report 0.0 with a reason.
        if not predicted or not expected:
            return {
                "output_match": 0.0,
                "composite_score": 0.0,
                "predicted_output": str(predicted).strip() if predicted else "",
                "expected_output": str(expected).strip() if expected else "",
                "predicted_boolean": None,
                "expected_boolean": None,
                "predicted_reasoning": "",
                "expected_reasoning": "",
                "evaluation_reason": "โ Empty or missing input/output"
            }

        predicted_str = str(predicted).strip()
        expected_str = str(expected).strip()

        # 1. Extract boolean from predicted output
        pred_bool = self._normalize_to_bool(predicted_str)
        pred_reasoning = self._extract_reasoning(predicted_str)

        # 2. Extract boolean from expected output
        exp_bool = self._normalize_to_bool(expected_str)
        exp_reasoning = self._extract_reasoning(expected_str)

        # Detect output structure for both expected and predicted
        expected_structure = self._detect_output_structure(expected_str)
        predicted_structure = self._detect_output_structure(predicted_str)

        # Compare structures (format labels, e.g. boolean_only vs boolean_with_reasoning)
        structure_match = (expected_structure['format'] == predicted_structure['format'])

        # 3. Compare booleans (binary scoring)
        if pred_bool is None or exp_bool is None:
            # Could not extract boolean from one or both
            score = 0.0
            reason = "โ Could not extract boolean value"
            if pred_bool is None:
                reason += " from predicted output"
            if exp_bool is None:
                reason += " from expected output"
        else:
            # Both booleans extracted successfully - compare
            score = 1.0 if pred_bool == exp_bool else 0.0
            if score == 1.0:
                reason = f"โ
 Correct! Result matches (both are {exp_bool})"
                # Add note if structure doesn't match
                if not structure_match:
                    reason += f" (but format differs: expected {expected_structure['format']}, got {predicted_structure['format']})"
            else:
                reason = f"โ Wrong result (predicted: {pred_bool}, expected: {exp_bool})"

        # 4. Log evaluation details (banner-style block for readability)
        self.logger.info(f"\n{'โ'*70}")
        self.logger.info(f"๐ VALIDATION EVALUATION")
        self.logger.info(f"{'โ'*70}")
        self.logger.info(f" Expected: '{expected_str[:100]}...' โ {exp_bool}")
        self.logger.info(f" Predicted: '{predicted_str[:100]}...' โ {pred_bool}")
        self.logger.info(f" {'โ'*66}")
        self.logger.info(f" ๐ฏ SCORE: {score:.2f} - {reason}")
        if pred_reasoning:
            self.logger.info(f" ๐ Predicted Reasoning: {pred_reasoning[:150]}...")
        if exp_reasoning:
            self.logger.info(f" ๐ Expected Reasoning: {exp_reasoning[:150]}...")
        # Log structure comparison
        self.logger.info(f" ๐ Expected Format: {expected_structure['format']} (reasoning: {expected_structure['reasoning_quality']})")
        self.logger.info(f" ๐ Predicted Format: {predicted_structure['format']} (reasoning: {predicted_structure['reasoning_quality']})")
        if not structure_match:
            self.logger.warning(f" โ ๏ธ OUTPUT STRUCTURE MISMATCH!")
        self.logger.info(f"{'โ'*70}\n")

        return {
            "output_match": score,
            "composite_score": score,  # This is what GEPA uses
            "predicted_output": predicted_str,
            "expected_output": expected_str,
            "predicted_boolean": pred_bool,
            "expected_boolean": exp_bool,
            "predicted_reasoning": pred_reasoning,  # REQUIRED for LLM-as-judge
            "expected_reasoning": exp_reasoning,  # REQUIRED for LLM-as-judge
            "evaluation_reason": reason,
            # Structure metadata for LLM-as-judge
            "expected_structure": expected_structure,
            "predicted_structure": predicted_structure,
            "output_structure_match": structure_match,
            "expected_has_reasoning": expected_structure['has_reasoning'],
            "predicted_has_reasoning": predicted_structure['has_reasoning'],
            "reasoning_quality_gap": expected_structure['reasoning_quality'] + " โ " + predicted_structure['reasoning_quality']
        }
+
+ def _normalize_to_bool(self, value: str) -> Optional[bool]:
+ """
+ Normalize various formats to boolean.
+
+ Handles:
+ - "true", "True", "TRUE" โ True
+ - "false", "False", "FALSE" โ False
+ - "1", "0" โ True, False
+ - "yes", "no" โ True, False
+ - "correct", "incorrect" โ True, False
+ - JSON: {"result": true} โ True
+ - Text with boolean: "The result is true because..." โ True
+
+ Args:
+ value: String that may contain a boolean value
+
+ Returns:
+ Boolean value or None if cannot be determined
+ """
+ if not value:
+ return None
+
+ value_lower = value.lower().strip()
+
+ # Direct boolean strings
+ if value_lower in ("true", "1", "yes", "correct", "valid", "pass"):
+ return True
+ if value_lower in ("false", "0", "no", "incorrect", "invalid", "fail"):
+ return False
+
+ # JSON format: {"action": "TRUE"} or {"action": "FALSE"} or {"action": "LOADING"}
+ # This handles the production prompt's JSON output format
+ # Match both quoted and unquoted values, case-insensitive
+ action_match = re.search(r'["\']?action["\']?\s*:\s*["\']?(true|false|loading)["\']?', value_lower)
+ if action_match:
+ action_value = action_match.group(1).lower()
+ if action_value == "true":
+ return True
+ elif action_value == "false":
+ return False
+ elif action_value == "loading":
+ # Treat LOADING as False for validation purposes (screen not ready)
+ return False
+
+ # Also try to parse full JSON structure if present (more robust)
+ try:
+ import json
+ # Try to find and parse JSON object
+ json_start = value.find('{')
+ if json_start != -1:
+ # Try to extract JSON from the response
+ for end_idx in range(len(value), json_start, -1):
+ try:
+ json_str = value[json_start:end_idx]
+ data = json.loads(json_str)
+ # Check for "action" field (production prompt format)
+ if "action" in data:
+ action_val = str(data["action"]).upper()
+ if action_val == "TRUE":
+ return True
+ elif action_val == "FALSE":
+ return False
+ elif action_val == "LOADING":
+ return False # Treat as False
+ # Check for "result" field (alternative format)
+ if "result" in data:
+ result_val = data["result"]
+ if isinstance(result_val, bool):
+ return result_val
+ elif isinstance(result_val, str):
+ return result_val.lower() in ("true", "1", "yes")
+ except (json.JSONDecodeError, KeyError, ValueError):
+ continue
+ except Exception:
+ pass # Fall through to other extraction methods
+
+ # JSON format: {"result": true} or {"result": false}
+ json_match = re.search(r'["\']?result["\']?\s*:\s*(true|false)', value_lower)
+ if json_match:
+ return json_match.group(1) == "true"
+
+ # Pattern: "result is true" or "result: true"
+ pattern_match = re.search(r'result[:\s]+(true|false)', value_lower)
+ if pattern_match:
+ return pattern_match.group(1) == "true"
+
+ # Pattern: "is true" or "is false" (standalone)
+ is_match = re.search(r'\b(is|are)\s+(true|false)\b', value_lower)
+ if is_match:
+ return is_match.group(2) == "true"
+
+ # Pattern: "true" or "false" as standalone word (not in other words)
+ standalone_match = re.search(r'\b(true|false)\b', value_lower)
+ if standalone_match:
+ return standalone_match.group(1) == "true"
+
+ # Last resort: check if "true" appears before "false" in text
+ true_pos = value_lower.find("true")
+ false_pos = value_lower.find("false")
+
+ if true_pos != -1 and false_pos != -1:
+ # Both found - use the one that appears first
+ return true_pos < false_pos
+ elif true_pos != -1:
+ return True
+ elif false_pos != -1:
+ return False
+
+ # Cannot determine
+ return None
+
+ def _detect_output_structure(self, output: str) -> Dict[str, Any]:
+ """
+ Dynamically detect the structure/components of the output.
+
+ This detects:
+ - Boolean result presence
+ - Reasoning/explanation presence and quality
+ - Output format (boolean only, boolean+reasoning, etc.)
+
+ Args:
+ output: Output string to analyze
+
+ Returns:
+ Dictionary with structure information:
+ {
+ "has_boolean": bool,
+ "has_reasoning": bool,
+ "reasoning_length": int,
+ "reasoning_quality": str, # "missing", "minimal", "adequate", "detailed"
+ "format": str # "boolean_only", "boolean_with_reasoning", "unknown"
+ }
+ """
+ if not output:
+ return {
+ "has_boolean": False,
+ "has_reasoning": False,
+ "reasoning_length": 0,
+ "reasoning_quality": "missing",
+ "format": "empty"
+ }
+
+ output_clean = output.strip()
+
+ # Detect boolean
+ has_boolean = self._normalize_to_bool(output_clean) is not None
+
+ # Extract reasoning
+ reasoning = self._extract_reasoning(output_clean)
+ has_reasoning = len(reasoning) > 15 # Minimum 15 chars to count as reasoning
+ reasoning_length = len(reasoning)
+
+ # Classify reasoning quality
+ if reasoning_length == 0:
+ reasoning_quality = "missing"
+ elif reasoning_length < 30:
+ reasoning_quality = "minimal" # Just a few words
+ elif reasoning_length < 100:
+ reasoning_quality = "adequate" # Brief explanation
+ else:
+ reasoning_quality = "detailed" # Full explanation
+
+ # Determine format
+ if has_boolean and has_reasoning:
+ output_format = "boolean_with_reasoning"
+ elif has_boolean and not has_reasoning:
+ output_format = "boolean_only"
+ elif not has_boolean and has_reasoning:
+ output_format = "reasoning_only"
+ else:
+ output_format = "unknown"
+
+ return {
+ "has_boolean": has_boolean,
+ "has_reasoning": has_reasoning,
+ "reasoning_length": reasoning_length,
+ "reasoning_quality": reasoning_quality,
+ "format": output_format
+ }
+
+ def _extract_reasoning(self, output: str) -> str:
+ """
+ Extract reasoning/explanation from output string.
+
+ This is REQUIRED for LLM-as-judge feedback. The reasoning helps
+ the judge understand why the result was true/false and compare
+ predicted vs expected reasoning.
+
+ Args:
+ output: Full output string that may contain reasoning
+
+ Returns:
+ Extracted reasoning text, or empty string if not found
+ """
+ if not output:
+ return ""
+
+ # Patterns to find reasoning sections
+ reasoning_patterns = [
+ r'[Rr]eason[:\s]+(.*?)(?:\n\n|\Z)', # "Reason: ..."
+ r'[Ee]xplanation[:\s]+(.*?)(?:\n\n|\Z)', # "Explanation: ..."
+ r'[Bb]ecause[:\s]+(.*?)(?:\n\n|\Z)', # "Because: ..."
+ r'[Ww]hy[:\s]+(.*?)(?:\n\n|\Z)', # "Why: ..."
+ r'[Dd]etails[:\s]+(.*?)(?:\n\n|\Z)', # "Details: ..."
+ ]
+
+ # Try each pattern
+ for pattern in reasoning_patterns:
+ match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
+ if match:
+ reasoning = match.group(1).strip()
+ if len(reasoning) > 20: # Only return if substantial
+ return reasoning
+
+ # If no explicit reasoning section, check if output has substantial text
+ # after boolean (likely contains reasoning)
+ bool_match = re.search(r'\b(true|false)\b', output.lower())
+ if bool_match:
+ # Get text after the boolean
+ bool_pos = bool_match.end()
+ remaining = output[bool_pos:].strip()
+
+ # If remaining text is substantial (more than just punctuation), use it
+ if len(remaining) > 30:
+ # Clean up common prefixes
+ remaining = re.sub(r'^[:\s.,;!?-]+', '', remaining)
+ if remaining:
+ return remaining
+
+ # If output is long and doesn't start with boolean, might be all reasoning
+ if len(output) > 100 and not re.match(r'^\s*(true|false)\s*$', output, re.IGNORECASE):
+ # Return first 500 chars as reasoning
+ return output[:500].strip()
+
+ # No reasoning found
+ return ""
+
+ def get_evaluation_summary(self, results: list) -> Dict[str, Any]:
+ """
+ Get summary statistics for a batch of evaluations.
+
+ Args:
+ results: List of evaluation result dictionaries
+
+ Returns:
+ Summary statistics including accuracy, true/false distribution
+ """
+ if not results:
+ return {
+ "total_samples": 0,
+ "accuracy": 0.0,
+ "correct_predictions": 0,
+ "incorrect_predictions": 0,
+ "true_predictions": 0,
+ "false_predictions": 0
+ }
+
+ total = len(results)
+ correct = sum(1 for r in results if r.get("output_match", 0.0) == 1.0)
+ accuracy = correct / total if total > 0 else 0.0
+
+ # Count true/false predictions
+ true_preds = sum(1 for r in results if r.get("predicted_boolean") is True)
+ false_preds = sum(1 for r in results if r.get("predicted_boolean") is False)
+
+ return {
+ "total_samples": total,
+ "accuracy": accuracy,
+ "correct_predictions": correct,
+ "incorrect_predictions": total - correct,
+ "true_predictions": true_preds,
+ "false_predictions": false_preds
+ }
+
+
# Example usage and testing
# Manual smoke test: runs the evaluator over mixed-format samples and prints
# a per-case verdict plus summary statistics. Not executed on import.
# NOTE(review): the print strings contain mojibake emoji from an encoding
# round-trip (one splits a literal across lines); left byte-identical.
if __name__ == "__main__":
    print("๐ Testing Validation Evaluator...")

    evaluator = ValidationEvaluator()

    # Test cases: (predicted, expected, should_match)
    test_cases = [
        # (predicted, expected, should_match)
        ("true", "true", True),
        ("false", "false", True),
        ("True", "true", True),
        ("FALSE", "false", True),
        ("1", "true", True),
        ("0", "false", True),
        ("true", "false", False),
        ("false", "true", False),
        ("The result is true because the button is visible", "true", True),
        ("The result is false because the element is not found", "false", True),
        ('{"result": true, "reasoning": "Button is visible"}', "true", True),
        ("Result: true\n\nReasoning: The submit button is clearly visible at the bottom of the screen.", "true", True),
        ("", "true", False),
        ("invalid", "true", False),
    ]

    print("\n๐ Running test cases:")
    print("-" * 80)

    results = []
    for predicted, expected, should_match in test_cases:
        result = evaluator.evaluate(predicted, expected)
        match = result["composite_score"] == 1.0

        # Status marker: pass when the observed match equals the expectation.
        status = "โ
" if match == should_match else "โ"
        pred_bool = result.get("predicted_boolean", "?")
        exp_bool = result.get("expected_boolean", "?")
        pred_reason = result.get("predicted_reasoning", "")[:50]

        print(f"{status} Predicted: '{predicted[:40]}...' โ {pred_bool}")
        print(f" Expected: '{expected}' โ {exp_bool}")
        print(f" Match: {match} (should be {should_match})")
        if pred_reason:
            print(f" Reasoning: {pred_reason}...")
        print()

        results.append(result)

    # Summary
    print("\n๐ Summary:")
    summary = evaluator.get_evaluation_summary(results)
    print(f" Total: {summary['total_samples']}")
    print(f" Correct: {summary['correct_predictions']}")
    print(f" Accuracy: {summary['accuracy']:.1%}")
    print(f" True predictions: {summary['true_predictions']}")
    print(f" False predictions: {summary['false_predictions']}")
diff --git a/src/gepa_optimizer/infrastructure/__init__.py b/src/gepa_optimizer/infrastructure/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b3d27aeb3cf77dcd0d6f4cd1b4e5565580b3df9
--- /dev/null
+++ b/src/gepa_optimizer/infrastructure/__init__.py
@@ -0,0 +1,15 @@
+"""
+Infrastructure module for cross-cutting concerns.
+
+This module contains infrastructure components that are used across
+the entire application, including logging, metrics, and configuration.
+"""
+
+from .logging import get_logger, configure_logging, LogContext
+
+__all__ = [
+ "get_logger",
+ "configure_logging",
+ "LogContext",
+]
+
diff --git a/src/gepa_optimizer/infrastructure/logging/__init__.py b/src/gepa_optimizer/infrastructure/logging/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6086e4bb54f61fc14cb384dc87db2d639f9dca85
--- /dev/null
+++ b/src/gepa_optimizer/infrastructure/logging/__init__.py
@@ -0,0 +1,43 @@
+"""
+Centralized Logging Infrastructure for GEPA Optimizer.
+
+This module provides a unified logging system with:
+- Structured logging with context
+- Consistent formatting across all modules
+- Log level configuration
+- Operation tracking with timing
+- Contextual logging for debugging
+
+Usage:
+ from gepa_optimizer.infrastructure.logging import get_logger, LogContext
+
+ logger = get_logger(__name__)
+ logger.info("Starting optimization", extra={"iteration": 1})
+
+ with LogContext(logger, "evaluation", sample_id=123):
+ logger.info("Evaluating sample")
+"""
+
+from .logger import (
+ get_logger,
+ configure_logging,
+ LogLevel,
+ GEPA_LOGGER_NAME,
+)
+from .context import LogContext, log_operation
+from .formatters import GepaFormatter, JsonFormatter
+
+__all__ = [
+ # Core logging
+ "get_logger",
+ "configure_logging",
+ "LogLevel",
+ "GEPA_LOGGER_NAME",
+ # Context management
+ "LogContext",
+ "log_operation",
+ # Formatters
+ "GepaFormatter",
+ "JsonFormatter",
+]
+
diff --git a/src/gepa_optimizer/infrastructure/logging/context.py b/src/gepa_optimizer/infrastructure/logging/context.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2a305e5ac78fd50311331a62713ceff09f4315d
--- /dev/null
+++ b/src/gepa_optimizer/infrastructure/logging/context.py
@@ -0,0 +1,257 @@
+"""
+Logging Context Management.
+
+Provides context managers and decorators for:
+- Operation tracking with timing
+- Contextual logging with nested contexts
+- Automatic exception logging
+"""
+
+import logging
+import time
+import functools
+from contextlib import contextmanager
+from typing import Any, Callable, Dict, Optional, TypeVar, ParamSpec
+
+P = ParamSpec('P')
+R = TypeVar('R')
+
+
class LogContext:
    """
    Context manager that brackets an operation with start/end log records.

    Responsibilities:
    - optionally log a "Starting <operation>" record on entry
    - measure wall-clock duration with time.perf_counter()
    - on normal exit, optionally log "Completed <operation>" with duration_ms
    - on exception, log "Failed <operation>: ..." with traceback and re-raise
    - merge ``context_fields`` into every record emitted through it

    Example:
        logger = get_logger(__name__)

        with LogContext(logger, "optimization", iteration=5):
            # ... optimization code ...
            logger.info("Processing sample")  # Inherits context

        # Output:
        # INFO | Starting optimization | iteration=5
        # INFO | Processing sample | iteration=5
        # INFO | Completed optimization | iteration=5 duration_ms=1234
    """

    def __init__(
        self,
        logger: logging.Logger,
        operation: str,
        log_start: bool = True,
        log_end: bool = True,
        log_level: int = logging.INFO,
        **context_fields: Any
    ):
        """
        Initialize log context.

        Args:
            logger: Logger instance to use
            operation: Name of the operation being performed
            log_start: Whether to log when entering context
            log_end: Whether to log when exiting context
            log_level: Log level for start/end messages
            **context_fields: Additional fields to include in all logs
        """
        self.logger = logger
        self.operation = operation
        self.log_start = log_start
        self.log_end = log_end
        self.log_level = log_level
        self.context_fields = context_fields
        # Populated by __enter__ / __exit__ respectively.
        self.start_time: Optional[float] = None
        self.exception: Optional[Exception] = None

    def __enter__(self) -> "LogContext":
        """Record the start time and optionally emit the start record."""
        self.start_time = time.perf_counter()
        if self.log_start:
            self.logger.log(
                self.log_level,
                f"Starting {self.operation}",
                extra=self.context_fields
            )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> bool:
        """Log completion (or failure) with elapsed time; never suppress."""
        elapsed_ms = (time.perf_counter() - self.start_time) * 1000
        merged = dict(self.context_fields)
        merged["duration_ms"] = round(elapsed_ms, 2)

        if exc_type is not None:
            # Keep the exception around for post-mortem inspection by callers.
            self.exception = exc_val
            self.logger.error(
                f"Failed {self.operation}: {exc_type.__name__}: {exc_val}",
                extra=merged,
                exc_info=True
            )
            # Returning False propagates the exception to the caller.
            return False

        if self.log_end:
            self.logger.log(
                self.log_level,
                f"Completed {self.operation}",
                extra=merged
            )
        return False

    def log(self, level: int, message: str, **extra_fields: Any) -> None:
        """Log a message within this context, inheriting context fields."""
        combined = dict(self.context_fields)
        combined.update(extra_fields)
        self.logger.log(level, message, extra=combined)

    def info(self, message: str, **extra_fields: Any) -> None:
        """Log info message within context."""
        self.log(logging.INFO, message, **extra_fields)

    def debug(self, message: str, **extra_fields: Any) -> None:
        """Log debug message within context."""
        self.log(logging.DEBUG, message, **extra_fields)

    def warning(self, message: str, **extra_fields: Any) -> None:
        """Log warning message within context."""
        self.log(logging.WARNING, message, **extra_fields)

    def error(self, message: str, **extra_fields: Any) -> None:
        """Log error message within context."""
        self.log(logging.ERROR, message, **extra_fields)
+
+
+def log_operation(
+ logger: Optional[logging.Logger] = None,
+ operation: Optional[str] = None,
+ log_args: bool = False,
+ log_result: bool = False,
+ log_level: int = logging.INFO,
+) -> Callable[[Callable[P, R]], Callable[P, R]]:
+ """
+ Decorator for logging function execution.
+
+ Automatically logs:
+ - Function entry (with arguments if configured)
+ - Function exit (with result if configured)
+ - Execution duration
+ - Exceptions
+
+ Args:
+ logger: Logger to use (defaults to logger named after module)
+ operation: Operation name (defaults to function name)
+ log_args: Whether to log function arguments
+ log_result: Whether to log function result
+ log_level: Log level for messages
+
+ Example:
+ @log_operation(log_args=True)
+ def process_batch(batch_id: int, items: List[str]) -> int:
+ return len(items)
+
+ # Output:
+ # INFO | Starting process_batch | batch_id=123 items=['a', 'b']
+ # INFO | Completed process_batch | duration_ms=45.2 result=2
+ """
+ def decorator(func: Callable[P, R]) -> Callable[P, R]:
+ nonlocal logger, operation
+
+ if logger is None:
+ logger = logging.getLogger(func.__module__)
+ if operation is None:
+ operation = func.__name__
+
+ @functools.wraps(func)
+ def wrapper(*args: P.args, **kwargs: P.kwargs) -> R:
+ start_time = time.perf_counter()
+
+ # Build context fields
+ extra: Dict[str, Any] = {}
+ if log_args:
+ # Include positional args (skip self for methods)
+ arg_names = func.__code__.co_varnames[:func.__code__.co_argcount]
+ for i, (name, value) in enumerate(zip(arg_names, args)):
+ if name != 'self':
+ extra[name] = _safe_repr(value)
+ # Include keyword args
+ for key, value in kwargs.items():
+ extra[key] = _safe_repr(value)
+
+ logger.log(log_level, f"Starting {operation}", extra=extra)
+
+ try:
+ result = func(*args, **kwargs)
+
+ duration_ms = (time.perf_counter() - start_time) * 1000
+ result_extra: Dict[str, Any] = {"duration_ms": round(duration_ms, 2)}
+
+ if log_result:
+ result_extra["result"] = _safe_repr(result)
+
+ logger.log(log_level, f"Completed {operation}", extra=result_extra)
+
+ return result
+
+ except Exception as e:
+ duration_ms = (time.perf_counter() - start_time) * 1000
+ logger.error(
+ f"Failed {operation}: {type(e).__name__}: {e}",
+ extra={"duration_ms": round(duration_ms, 2)},
+ exc_info=True
+ )
+ raise
+
+ return wrapper
+
+ return decorator
+
+
@contextmanager
def timed_block(logger: logging.Logger, description: str, log_level: int = logging.DEBUG):
    """
    Lightweight context manager that times a block of code.

    Emits exactly one log line when the block finishes — even if it raises —
    and is less verbose than LogContext for quick measurements.

    Example:
        with timed_block(logger, "data processing"):
            process_data()
        # Output: DEBUG | data processing completed in 123.45ms
    """
    started = time.perf_counter()
    try:
        yield
    finally:
        # finally guarantees the timing line is emitted on both success and error
        elapsed_ms = (time.perf_counter() - started) * 1000
        logger.log(log_level, f"{description} completed in {elapsed_ms:.2f}ms")
+
+
+def _safe_repr(value: Any, max_length: int = 100) -> str:
+ """
+ Create a safe string representation of a value for logging.
+
+ Truncates long strings and handles non-serializable objects.
+ """
+ try:
+ repr_str = repr(value)
+ if len(repr_str) > max_length:
+ return repr_str[:max_length] + "..."
+ return repr_str
+ except Exception:
+ return f"<{type(value).__name__}>"
+
diff --git a/src/gepa_optimizer/infrastructure/logging/formatters.py b/src/gepa_optimizer/infrastructure/logging/formatters.py
new file mode 100644
index 0000000000000000000000000000000000000000..2387fe8deac3f641e363aab02067a55ed29a4474
--- /dev/null
+++ b/src/gepa_optimizer/infrastructure/logging/formatters.py
@@ -0,0 +1,259 @@
+"""
+Custom Log Formatters for GEPA Optimizer.
+
+Provides formatters for:
+- Console output with colors and emoji
+- JSON structured logging for production
+- Plain text for file logging
+"""
+
+import json
+import logging
+from datetime import datetime
+from typing import Any, Dict, Optional
+
+
+# ANSI color codes for terminal output
class Colors:
    """ANSI color codes for terminal coloring.

    NOTE(review): assumes an ANSI-capable terminal; plain Windows consoles
    may need VT mode enabled for these escapes to render — confirm.
    """
    RESET = "\033[0m"   # clears all attributes
    BOLD = "\033[1m"
    DIM = "\033[2m"

    # Log level colors
    DEBUG = "\033[36m"     # Cyan
    INFO = "\033[32m"      # Green
    WARNING = "\033[33m"   # Yellow
    ERROR = "\033[31m"     # Red
    CRITICAL = "\033[35m"  # Magenta

    # Semantic colors
    TIMESTAMP = "\033[90m"  # Gray
    MODULE = "\033[34m"     # Blue
    MESSAGE = "\033[0m"     # Default
+
+
# Emoji prefixes for visual log scanning (keyed by numeric level).
# NOTE(review): the literals below appear mojibake-garbled in this copy of
# the file — verify they are the intended emoji in the canonical source.
LEVEL_EMOJI = {
    logging.DEBUG: "๐",
    logging.INFO: "โน๏ธ ",
    logging.WARNING: "โ ๏ธ ",
    logging.ERROR: "โ",
    logging.CRITICAL: "๐จ",
}

# Map each numeric level to its ANSI color from Colors above.
LEVEL_COLORS = {
    logging.DEBUG: Colors.DEBUG,
    logging.INFO: Colors.INFO,
    logging.WARNING: Colors.WARNING,
    logging.ERROR: Colors.ERROR,
    logging.CRITICAL: Colors.CRITICAL,
}
+
+
class GepaFormatter(logging.Formatter):
    """
    Custom formatter for GEPA Optimizer logs.

    Features:
    - Optional color output for console
    - Optional emoji prefixes for visual scanning
    - Structured extra fields support
    - Clean, readable format

    Example output:
        2024-01-15 10:30:45 | INFO | gepa_optimizer.core.optimizer | Starting optimization iteration=5
    """

    def __init__(
        self,
        fmt: Optional[str] = None,
        datefmt: Optional[str] = None,
        use_colors: bool = True,
        include_emoji: bool = True,
    ):
        """
        Initialize the formatter.

        Args:
            fmt: Format string (uses default if not provided)
            datefmt: Date format string
            use_colors: Whether to use ANSI colors
            include_emoji: Whether to include emoji prefixes
        """
        super().__init__(fmt=fmt, datefmt=datefmt)
        self.use_colors = use_colors
        self.include_emoji = include_emoji

    def format(self, record: logging.LogRecord) -> str:
        """Format a log record with colors and emoji.

        The record is temporarily mutated (msg/levelname/name) and restored
        in ``finally`` so other handlers see the original values.
        """
        # Store original values before mutating the (shared) record.
        original_msg = record.msg
        original_levelname = record.levelname
        # BUG FIX: record.name was colored below but never restored, so ANSI
        # codes leaked to other handlers and accumulated on repeated formats.
        original_name = record.name

        try:
            # Add emoji prefix if enabled
            if self.include_emoji:
                emoji = LEVEL_EMOJI.get(record.levelno, "")
                record.levelname = f"{emoji} {record.levelname}"

            # Add colors if enabled
            if self.use_colors:
                color = LEVEL_COLORS.get(record.levelno, Colors.RESET)
                record.levelname = f"{color}{record.levelname}{Colors.RESET}"
                record.name = f"{Colors.MODULE}{record.name}{Colors.RESET}"

            # Append structured extra fields to the message, if any
            extra_str = self._format_extra(record)
            if extra_str:
                record.msg = f"{record.msg} | {extra_str}"

            return super().format(record)

        finally:
            # Restore original values so the record is safe to reuse.
            record.msg = original_msg
            record.levelname = original_levelname
            record.name = original_name

    def _format_extra(self, record: logging.LogRecord) -> str:
        """
        Format extra fields from the log record as "key=value" pairs.

        Extra fields are passed via the 'extra' parameter to logging calls:
            logger.info("Message", extra={"key": "value"})
        """
        # Standard LogRecord attributes to exclude
        standard_attrs = {
            'name', 'msg', 'args', 'created', 'filename', 'funcName',
            'levelname', 'levelno', 'lineno', 'module', 'msecs',
            'pathname', 'process', 'processName', 'relativeCreated',
            'stack_info', 'exc_info', 'exc_text', 'thread', 'threadName',
            'taskName', 'message'
        }

        # Collect everything the caller injected via `extra`.
        extra_fields = {
            k: v for k, v in record.__dict__.items()
            if k not in standard_attrs and not k.startswith('_')
        }

        if not extra_fields:
            return ""

        parts = []
        for key, value in extra_fields.items():
            # BUG FIX: bool must be tested before int — bool is a subclass of
            # int, so the original's bool branch was unreachable and True/False
            # rendered as "True"/"False" instead of the intended "true"/"false".
            if isinstance(value, bool):
                parts.append(f"{key}={str(value).lower()}")
            elif isinstance(value, (str, int, float)):
                parts.append(f"{key}={value}")
            else:
                parts.append(f"{key}={repr(value)}")

        return " ".join(parts)
+
+
class JsonFormatter(logging.Formatter):
    """
    JSON formatter for structured logging.

    Outputs each log record as a single JSON line, suitable for:
    - Log aggregation systems (ELK, Splunk)
    - Cloud logging (CloudWatch, Stackdriver)
    - Log parsing and analysis

    Example output:
        {"timestamp": "2024-01-15T10:30:45.123Z", "level": "INFO", "logger": "gepa_optimizer.core", "message": "Starting optimization", "iteration": 5}
    """

    def __init__(
        self,
        include_timestamp: bool = True,
        include_location: bool = False,
    ):
        """
        Initialize JSON formatter.

        Args:
            include_timestamp: Include ISO timestamp
            include_location: Include file/line information
        """
        super().__init__()
        self.include_timestamp = include_timestamp
        self.include_location = include_location

    def format(self, record: logging.LogRecord) -> str:
        """Format record as a single JSON line string."""
        # Local import keeps the module's dependency surface unchanged.
        from datetime import timezone

        log_dict: Dict[str, Any] = {}

        # Timestamp in UTC with trailing "Z"
        if self.include_timestamp:
            # FIX: datetime.utcfromtimestamp() is deprecated since Python 3.12.
            # Build an aware UTC datetime, then drop tzinfo so isoformat()
            # yields the same naive "...Z" string as before.
            ts = datetime.fromtimestamp(record.created, tz=timezone.utc)
            log_dict["timestamp"] = ts.replace(tzinfo=None).isoformat() + "Z"

        # Core fields
        log_dict["level"] = record.levelname
        log_dict["logger"] = record.name
        log_dict["message"] = record.getMessage()

        # Location info
        if self.include_location:
            log_dict["file"] = record.filename
            log_dict["line"] = record.lineno
            log_dict["function"] = record.funcName

        # Exception info
        if record.exc_info:
            log_dict["exception"] = self.formatException(record.exc_info)

        # Extra fields — everything not in the standard LogRecord attribute set
        standard_attrs = {
            'name', 'msg', 'args', 'created', 'filename', 'funcName',
            'levelname', 'levelno', 'lineno', 'module', 'msecs',
            'pathname', 'process', 'processName', 'relativeCreated',
            'stack_info', 'exc_info', 'exc_text', 'thread', 'threadName',
            'taskName', 'message'
        }

        for key, value in record.__dict__.items():
            if key not in standard_attrs and not key.startswith('_'):
                try:
                    # Ensure value is JSON serializable; stringify otherwise.
                    json.dumps(value)
                    log_dict[key] = value
                except (TypeError, ValueError):
                    log_dict[key] = str(value)

        # default=str is a safety net for values injected after the check above
        return json.dumps(log_dict, default=str)
+
+
class CompactFormatter(logging.Formatter):
    """
    Compact formatter for minimal log output.

    Useful for:
    - CI/CD pipelines
    - Reduced log verbosity
    - Quick debugging

    Example output:
        10:30:45 INFO  optimizer: Starting optimization
    """

    def format(self, record: logging.LogRecord) -> str:
        """Render the record as '<HH:MM:SS> <LEVEL> <module>: <message>'."""
        # Time-of-day only (local time), no date component.
        clock = datetime.fromtimestamp(record.created).strftime("%H:%M:%S")
        # Keep just the trailing segment of the dotted logger name.
        leaf = record.name.rsplit(".", 1)[-1]
        # Level name padded to 5 columns so messages line up.
        return f"{clock} {record.levelname:5s} {leaf}: {record.getMessage()}"
+
diff --git a/src/gepa_optimizer/infrastructure/logging/logger.py b/src/gepa_optimizer/infrastructure/logging/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5cd5e11a86747139d3238cb77d1813aa712baf8
--- /dev/null
+++ b/src/gepa_optimizer/infrastructure/logging/logger.py
@@ -0,0 +1,260 @@
+"""
+Core Logger Factory and Configuration.
+
+This module provides the centralized logger factory that should be used
+across all GEPA Optimizer modules. It ensures consistent logging behavior
+and formatting throughout the application.
+
+Design Principles:
+- Single source of truth for logger configuration
+- Lazy initialization (loggers created on first use)
+- Thread-safe logger access
+- Configurable log levels per module
+"""
+
+import logging
+import sys
+from enum import Enum
+from typing import Optional, Dict, Any
+from functools import lru_cache
+
+from .formatters import GepaFormatter
+
# Root logger name for GEPA Optimizer.
# All package loggers are children of this name, so configuring handlers on
# it once covers the whole "gepa_optimizer.*" hierarchy.
GEPA_LOGGER_NAME = "gepa_optimizer"

# Default log format and timestamp format used when none is supplied.
DEFAULT_FORMAT = "%(asctime)s | %(levelname)-8s | %(name)s | %(message)s"
DEFAULT_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
+
+
class LogLevel(str, Enum):
    """Log levels as string-valued enum members (e.g. ``LogLevel.INFO == "INFO"``)."""
    DEBUG = "DEBUG"
    INFO = "INFO"
    WARNING = "WARNING"
    ERROR = "ERROR"
    CRITICAL = "CRITICAL"

    @classmethod
    def from_string(cls, level: str) -> "LogLevel":
        """Parse *level* case-insensitively; unknown names fall back to INFO."""
        normalized = level.upper()
        for member in cls:
            if member.value == normalized:
                return member
        return cls.INFO
+
+
class LoggerConfig:
    """
    Configuration class for GEPA logging.

    This class holds all logging configuration and can be modified
    before calling configure_logging() to customize behavior.

    All attributes are class-level, so this class behaves as a process-wide
    singleton: every module reads and writes the same shared state.
    """

    # Default configuration
    level: LogLevel = LogLevel.INFO
    format: str = DEFAULT_FORMAT
    date_format: str = DEFAULT_DATE_FORMAT

    # Module-specific log levels (for fine-grained control).
    # NOTE: deliberately a shared class-level mutable dict (singleton config).
    module_levels: Dict[str, LogLevel] = {}

    # Output configuration
    log_to_console: bool = True
    log_to_file: Optional[str] = None  # path to log file, or None to disable

    # Formatting options
    use_colors: bool = True
    include_emoji: bool = True  # For visual clarity in development

    @classmethod
    def reset(cls) -> None:
        """Reset configuration to defaults."""
        cls.level = LogLevel.INFO
        cls.format = DEFAULT_FORMAT
        cls.date_format = DEFAULT_DATE_FORMAT
        cls.module_levels = {}
        cls.log_to_console = True
        cls.log_to_file = None
        cls.use_colors = True
        cls.include_emoji = True
+
+
# Process-wide flag: True once configure_logging() has run. While False,
# get_logger() auto-configures the system with defaults on first use.
_logging_configured = False
+
+
def configure_logging(
    level: Optional[str] = None,
    log_file: Optional[str] = None,
    use_colors: bool = True,
    include_emoji: bool = True,
    format_string: Optional[str] = None,
    module_levels: Optional[Dict[str, str]] = None,
) -> None:
    """
    Configure the GEPA logging system.

    This should be called once at application startup. Subsequent calls
    will update the configuration.

    Args:
        level: Global log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        log_file: Optional path to log file
        use_colors: Whether to use colored output in console
        include_emoji: Whether to include emoji prefixes for visual clarity
        format_string: Custom format string (optional)
        module_levels: Dict mapping module names to their specific log levels

    Example:
        configure_logging(
            level="DEBUG",
            log_file="optimization.log",
            module_levels={
                "gepa_optimizer.core.optimizer": "INFO",
                "gepa_optimizer.llms": "DEBUG"
            }
        )
    """
    global _logging_configured

    # Update configuration.
    # NOTE(review): `if level:` / `if log_file:` ignore falsy values, so a
    # previously set log_file cannot be cleared by passing None here;
    # use_colors/include_emoji, by contrast, are ALWAYS overwritten.
    if level:
        LoggerConfig.level = LogLevel.from_string(level)
    if log_file:
        LoggerConfig.log_to_file = log_file
    LoggerConfig.use_colors = use_colors
    LoggerConfig.include_emoji = include_emoji
    if format_string:
        LoggerConfig.format = format_string
    if module_levels:
        LoggerConfig.module_levels = {
            k: LogLevel.from_string(v) for k, v in module_levels.items()
        }

    # Get or create the root GEPA logger; all package loggers inherit from it.
    root_logger = logging.getLogger(GEPA_LOGGER_NAME)
    root_logger.setLevel(getattr(logging, LoggerConfig.level.value))

    # Remove existing handlers to avoid duplicate output on reconfiguration.
    root_logger.handlers.clear()

    # Console handler
    if LoggerConfig.log_to_console:
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(getattr(logging, LoggerConfig.level.value))

        # Use custom formatter (colors/emoji per caller's arguments)
        formatter = GepaFormatter(
            fmt=LoggerConfig.format,
            datefmt=LoggerConfig.date_format,
            use_colors=use_colors,
            include_emoji=include_emoji,
        )
        console_handler.setFormatter(formatter)
        root_logger.addHandler(console_handler)

    # File handler (if configured)
    if LoggerConfig.log_to_file:
        file_handler = logging.FileHandler(LoggerConfig.log_to_file)
        file_handler.setLevel(getattr(logging, LoggerConfig.level.value))

        # File logs don't use colors (ANSI escapes would pollute the file)
        file_formatter = GepaFormatter(
            fmt=LoggerConfig.format,
            datefmt=LoggerConfig.date_format,
            use_colors=False,
            include_emoji=False,
        )
        file_handler.setFormatter(file_formatter)
        root_logger.addHandler(file_handler)

    # Apply module-specific levels on top of the global level.
    for module_name, module_level in LoggerConfig.module_levels.items():
        module_logger = logging.getLogger(module_name)
        module_logger.setLevel(getattr(logging, module_level.value))

    _logging_configured = True

    # Log that configuration is complete (DEBUG so it is quiet by default).
    root_logger.debug(
        f"Logging configured: level={LoggerConfig.level.value}, "
        f"file={LoggerConfig.log_to_file}"
    )
+
+
@lru_cache(maxsize=128)
def get_logger(name: str) -> logging.Logger:
    """
    Get a logger instance for the given module name.

    This is the primary factory function for obtaining loggers.
    All GEPA modules should use this instead of logging.getLogger().
    Results are memoized via lru_cache, so repeated calls with the same
    name skip the configuration checks below.

    Args:
        name: Module name (typically __name__)

    Returns:
        Configured Logger instance

    Example:
        from gepa_optimizer.infrastructure.logging import get_logger

        logger = get_logger(__name__)
        logger.info("Starting process")
        logger.error("Failed to connect", exc_info=True)
    """
    # Auto-configure with defaults on first use so standalone scripts get
    # sensible output without an explicit configure_logging() call.
    # (FIX: dropped the original's dead `if not name.startswith(...): pass`
    # branch and the unnecessary `global` declaration — the flag is only read.)
    if not _logging_configured:
        configure_logging()

    logger = logging.getLogger(name)

    # Honor any module-specific level registered via configure_logging()
    # or set_log_level().
    if name in LoggerConfig.module_levels:
        logger.setLevel(getattr(logging, LoggerConfig.module_levels[name].value))

    return logger
+
+
def set_log_level(level: str, module: Optional[str] = None) -> None:
    """
    Dynamically change log level at runtime.

    Args:
        level: New log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        module: Optional module name. If None, changes global level.

    Example:
        # Enable debug for specific module
        set_log_level("DEBUG", "gepa_optimizer.core.optimizer")

        # Change global level
        set_log_level("WARNING")
    """
    resolved = LogLevel.from_string(level)
    numeric = getattr(logging, resolved.value)

    if module:
        # Module-specific: adjust that logger and remember the override.
        logging.getLogger(module).setLevel(numeric)
        LoggerConfig.module_levels[module] = resolved
        return

    # Global: update the stored config, the root GEPA logger, and every
    # attached handler so the change takes effect immediately.
    LoggerConfig.level = resolved
    root_logger = logging.getLogger(GEPA_LOGGER_NAME)
    root_logger.setLevel(numeric)
    for handler in root_logger.handlers:
        handler.setLevel(numeric)
+
diff --git a/src/gepa_optimizer/llms/__init__.py b/src/gepa_optimizer/llms/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..352f7a77e882131acec1c3084b72bb5e502a8ef0
--- /dev/null
+++ b/src/gepa_optimizer/llms/__init__.py
@@ -0,0 +1,10 @@
+"""
+LLM module for GEPA Optimizer
+"""
+
+from .base_llm import BaseLLMClient
+from .vision_llm import VisionLLMClient
+from .batch_llm import BatchLLMClient
+from .llego_enhanced_llm import LLEGOEnhancedLLMClient
+
+__all__ = ["BaseLLMClient", "VisionLLMClient", "BatchLLMClient", "LLEGOEnhancedLLMClient"]
diff --git a/src/gepa_optimizer/llms/base_llm.py b/src/gepa_optimizer/llms/base_llm.py
new file mode 100644
index 0000000000000000000000000000000000000000..15ffb4e83212922f30edad4d9e18a4b248af6234
--- /dev/null
+++ b/src/gepa_optimizer/llms/base_llm.py
@@ -0,0 +1,56 @@
+"""
+Base LLM client class for all LLM providers.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, Optional, Union
+import logging
+
+logger = logging.getLogger(__name__)
+
class BaseLLMClient(ABC):
    """
    Abstract base class for all LLM clients.

    Defines the minimal contract every provider-specific client must satisfy,
    so callers can swap providers without changing call sites.
    """

    def __init__(self, provider: str, model_name: str, **kwargs):
        """
        Initialize LLM client.

        Args:
            provider: LLM provider (e.g., 'openai', 'anthropic')
            model_name: Specific model name
            **kwargs: Additional provider-specific parameters
        """
        self.provider = provider
        self.model_name = model_name
        # Per-instance logger named "<module>.<ConcreteClass>" for filtering.
        self.logger = logging.getLogger(f"{__name__}.{type(self).__name__}")
        # Provider-specific options are kept verbatim for subclasses to use.
        self.config = kwargs

    @abstractmethod
    def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> Dict[str, Any]:
        """
        Generate response from LLM.

        Args:
            system_prompt: System-level instructions
            user_prompt: User's input prompt
            **kwargs: Additional generation parameters (e.g., image_base64)

        Returns:
            Dictionary with 'content' key containing the generated response
            and additional metadata
        """
        ...

    def get_model_info(self) -> Dict[str, str]:
        """Return identifying metadata (provider, model, class) for logging/debugging."""
        info = {
            'provider': self.provider,
            'model_name': self.model_name,
        }
        info['class'] = type(self).__name__
        return info
diff --git a/src/gepa_optimizer/llms/batch_llm.py b/src/gepa_optimizer/llms/batch_llm.py
new file mode 100644
index 0000000000000000000000000000000000000000..dae4cf1462d0fe037442b59e9190cac09fe54975
--- /dev/null
+++ b/src/gepa_optimizer/llms/batch_llm.py
@@ -0,0 +1,712 @@
+"""
+Batch LLM Client for cost-effective processing using Gemini Batch API.
+
+This client provides 50% cost savings by using Google's Gemini Batch API
+instead of real-time API calls. Ideal for large-scale prompt optimization
+where latency is acceptable.
+
+Features:
+- 50% cost reduction compared to standard API
+- Automatic batching and job management
+- Built-in retry and polling logic
+- Thread-safe operation
+- Comprehensive error handling
+
+Author: GEPA Optimizer Team
+"""
+
+import os
+import json
+import time
+import logging
+import tempfile
+import io
+from pathlib import Path
+from typing import Dict, List, Any, Optional, Tuple
+from .base_llm import BaseLLMClient
+
# Optional dependency: Pillow is used to sniff image formats before upload;
# when absent, format detection is skipped (see PIL_AVAILABLE checks below).
try:
    from PIL import Image
    PIL_AVAILABLE = True
except ImportError:
    PIL_AVAILABLE = False
    Image = None

# Optional dependency: google-genai provides the Gemini Batch API client.
# BatchLLMClient refuses to construct when this is unavailable.
try:
    from google import genai
    from google.genai import types
    GENAI_AVAILABLE = True
except ImportError:
    GENAI_AVAILABLE = False
    genai = None
    types = None
+
+logger = logging.getLogger(__name__)
+
+
+class BatchLLMClient(BaseLLMClient):
+ """
+ Batch LLM client that uses Gemini Batch API for cost-effective processing.
+
+ This client processes multiple requests together in batch jobs, providing:
+ - 50% cost savings vs standard API
+ - No rate limit impact
+ - Automatic job management and polling
+
+ Usage:
+ >>> from gepa_optimizer.llms import BatchLLMClient
+ >>>
+ >>> client = BatchLLMClient(
+ ... provider="google",
+ ... model_name="gemini-2.5-flash",
+ ... api_key="your-key",
+ ... batch_size=20,
+ ... polling_interval=30
+ ... )
+ >>>
+ >>> # Use just like VisionLLMClient - adapter handles the rest!
+ >>> result = client.generate(
+ ... system_prompt="You are a helpful assistant",
+ ... user_prompt="Analyze this image",
+ ... image_base64="..."
+ ... )
+
+ Performance Note:
+ Batch processing adds latency (30s+ polling time) but reduces costs by 50%.
+ Choose this mode for large-scale optimization where cost > speed.
+ """
+
    def __init__(
        self,
        provider: str,
        model_name: str,
        api_key: Optional[str] = None,
        batch_size: int = 20,
        polling_interval: int = 30,
        max_polling_time: int = 3600,
        temp_dir: str = ".gepa_batch_temp",
        **kwargs
    ):
        """
        Initialize Batch LLM Client.

        Args:
            provider: Must be "google" or "gemini"
            model_name: Gemini model (e.g., "gemini-2.5-flash", "gemini-1.5-flash")
            api_key: Google API key (defaults to GEMINI_API_KEY env var)
            batch_size: Number of samples to process per batch job (1-100)
            polling_interval: Seconds between job status checks (default: 30)
            max_polling_time: Maximum seconds to wait for job completion (default: 3600)
            temp_dir: Directory for temporary files (default: ".gepa_batch_temp")
            **kwargs: Additional parameters

        Raises:
            ValueError: If provider is not Google/Gemini, or no API key is found
            ImportError: If google-genai is not installed
        """
        super().__init__(provider=provider, model_name=model_name, **kwargs)

        # Validate provider — the Batch API implementation is Gemini-only.
        if provider.lower() not in ["google", "gemini"]:
            raise ValueError(
                f"BatchLLMClient only supports Google/Gemini provider. Got: {provider}"
            )

        # Check dependencies (google-genai is an optional import at module top)
        if not GENAI_AVAILABLE:
            raise ImportError(
                "google-genai not installed. Install with: pip install google-genai"
            )

        # Configuration
        self.batch_size = batch_size
        self.polling_interval = polling_interval
        self.max_polling_time = max_polling_time
        self.temp_dir = Path(temp_dir)
        # exist_ok tolerates re-use of the same temp dir across runs
        self.temp_dir.mkdir(exist_ok=True)

        # Initialize Gemini client; explicit api_key wins over the key manager.
        # NOTE(review): local import presumably avoids a circular dependency
        # with the utils package — confirm.
        from ..utils.api_keys import APIKeyManager
        self.api_key = api_key or APIKeyManager().get_api_key("google")

        if not self.api_key:
            raise ValueError(
                "Google API key required. Provide via api_key parameter or "
                "set GEMINI_API_KEY environment variable."
            )

        self.client = genai.Client(api_key=self.api_key)

        logger.info(
            f"โ BatchLLMClient initialized: {model_name} "
            f"(batch_size={batch_size}, polling={polling_interval}s)"
        )
+
+ def generate(
+ self,
+ system_prompt: str,
+ user_prompt: str,
+ image_base64: Optional[str] = None,
+ **kwargs
+ ) -> Dict[str, Any]:
+ """
+ Generate response using batch API.
+
+ Note: This method is primarily for compatibility. For batch optimization,
+ the adapter will call generate_batch() directly with multiple requests.
+
+ Args:
+ system_prompt: System-level instructions
+ user_prompt: User's input prompt
+ image_base64: Optional base64 encoded image
+ **kwargs: Additional generation parameters
+
+ Returns:
+ Dict with 'content' key containing generated text
+ """
+ # Single request - process as a batch of 1
+ requests = [{
+ 'system_prompt': system_prompt,
+ 'user_prompt': user_prompt,
+ 'image_base64': image_base64
+ }]
+
+ results = self.generate_batch(requests)
+ return results[0] if results else {"content": "", "error": "No results"}
+
+ def generate_batch(
+ self,
+ requests: List[Dict[str, Any]],
+ timeout_override: Optional[int] = None
+ ) -> List[Dict[str, Any]]:
+ """
+ Process multiple requests in a single batch job.
+
+ This is the main method called by UniversalGepaAdapter during GEPA optimization.
+
+ Args:
+ requests: List of request dicts with keys:
+ - system_prompt: System instructions
+ - user_prompt: User input
+ - image_base64: Optional base64 image
+ timeout_override: Override max_polling_time for this batch
+
+ Returns:
+ List of response dicts with 'content' key
+
+ Raises:
+ RuntimeError: If batch job fails
+ TimeoutError: If polling exceeds timeout
+ """
+ logger.info(f"๐ฆ Processing batch of {len(requests)} requests via Gemini Batch API...")
+
+ start_time = time.time()
+
+ try:
+ # Step 1: Upload images if needed
+ file_uris, mime_types = self._upload_images_for_batch(requests)
+
+ # Step 2: Create JSONL file
+ jsonl_path = self._create_batch_jsonl(requests, file_uris, mime_types)
+
+ # Step 3: Submit batch job
+ batch_job_name = self._submit_batch_job(jsonl_path)
+
+ # Step 4: Wait for completion
+ timeout = timeout_override or self.max_polling_time
+ self._wait_for_batch_completion(batch_job_name, timeout)
+
+ # Step 5: Retrieve results
+ results = self._retrieve_batch_results(batch_job_name)
+
+ # Cleanup
+ jsonl_path.unlink(missing_ok=True)
+
+ elapsed_time = time.time() - start_time
+ logger.info(
+ f"โ Batch processing complete: {len(results)} results in {elapsed_time:.1f}s "
+ f"(~{elapsed_time/len(results):.1f}s per request)"
+ )
+
+ return results
+
+ except Exception as e:
+ elapsed_time = time.time() - start_time
+ logger.error(f"โ Batch processing failed after {elapsed_time:.1f}s: {e}")
+ raise
+
+ def _upload_images_for_batch(self, requests: List[Dict]) -> Tuple[List[Optional[str]], List[Optional[str]]]:
+ """
+ Upload images to Gemini and return file URIs and MIME types.
+
+ Args:
+ requests: List of request dicts
+
+ Returns:
+ Tuple of (file_uris, mime_types) - both are lists with None for requests without images
+ """
+ file_uris = []
+ mime_types = []
+ images_to_upload = sum(1 for r in requests if r.get('image_base64'))
+
+ if images_to_upload > 0:
+ logger.info(f" โฌ๏ธ Uploading {images_to_upload} images to Gemini...")
+
+ for i, request in enumerate(requests):
+ image_base64 = request.get('image_base64')
+
+ if not image_base64:
+ file_uris.append(None)
+ mime_types.append(None)
+ continue
+
+ try:
+ # Decode image data
+ import base64
+ image_data = base64.b64decode(image_base64)
+
+ # Detect image format using Pillow
+ image_format = None
+ if PIL_AVAILABLE:
+ try:
+ img = Image.open(io.BytesIO(image_data))
+ image_format = img.format.lower() if img.format else None
+ except Exception as e:
+ logger.warning(f" โ ๏ธ Could not detect image format: {e}")
+
+ # Map format to extension and MIME type
+ format_map = {
+ 'jpeg': ('.jpg', 'image/jpeg'),
+ 'jpg': ('.jpg', 'image/jpeg'),
+ 'png': ('.png', 'image/png'),
+ 'gif': ('.gif', 'image/gif'),
+ 'webp': ('.webp', 'image/webp'),
+ 'bmp': ('.bmp', 'image/bmp'),
+ 'tiff': ('.tiff', 'image/tiff'),
+ 'tif': ('.tiff', 'image/tiff'),
+ }
+
+ # Get extension and MIME type (default to PNG if unknown)
+ ext, mime_type = format_map.get(image_format, ('.png', 'image/png'))
+
+ if image_format and image_format not in format_map:
+ logger.warning(f" โ ๏ธ Unknown image format '{image_format}' for image {i}, defaulting to PNG")
+ elif not image_format:
+ logger.debug(f" โน๏ธ Could not detect format for image {i}, using PNG")
+
+ # Save to temp file with correct extension
+ temp_file = tempfile.NamedTemporaryFile(
+ delete=False,
+ suffix=ext,
+ dir=self.temp_dir
+ )
+ temp_file.write(image_data)
+ temp_file.close()
+
+ # Upload to Gemini with correct MIME type
+ uploaded_file = self.client.files.upload(
+ file=temp_file.name,
+ config=types.UploadFileConfig(
+ display_name=f"batch_image_{i}_{int(time.time())}{ext}",
+ mime_type=mime_type
+ )
+ )
+
+ logger.debug(f" โ Uploaded image {i} as {mime_type}")
+
+ # Wait for file to be active
+ self._wait_for_file_active(uploaded_file)
+ file_uris.append(uploaded_file.uri)
+ mime_types.append(mime_type)
+
+ # Cleanup temp file
+ Path(temp_file.name).unlink()
+
+ except Exception as e:
+ logger.error(f" โ Failed to upload image {i}: {e}")
+ file_uris.append(None)
+ mime_types.append(None)
+
+ if images_to_upload > 0:
+ successful = sum(1 for uri in file_uris if uri is not None)
+ logger.info(f" โ Uploaded {successful}/{images_to_upload} images successfully")
+
+ return file_uris, mime_types
+
+ def _create_batch_jsonl(
+ self,
+ requests: List[Dict],
+ file_uris: List[Optional[str]],
+ mime_types: List[Optional[str]]
+ ) -> Path:
+ """
+ Create JSONL file for batch job.
+
+ Args:
+ requests: List of request dicts
+ file_uris: List of uploaded file URIs
+ mime_types: List of MIME types for uploaded files
+
+ Returns:
+ Path to created JSONL file
+ """
+ timestamp = int(time.time())
+ jsonl_path = self.temp_dir / f"batch_{timestamp}.jsonl"
+
+ with open(jsonl_path, 'w', encoding='utf-8') as f:
+ for i, (request, file_uri, mime_type) in enumerate(zip(requests, file_uris, mime_types)):
+ # Combine system and user prompts
+ system_prompt = request.get('system_prompt', '')
+ user_prompt = request.get('user_prompt', '')
+ full_prompt = f"{system_prompt}\n\n{user_prompt}".strip()
+
+ # Build request parts
+ parts = [{"text": full_prompt}]
+
+ if file_uri:
+ parts.append({
+ "file_data": {
+ "file_uri": file_uri,
+ "mime_type": mime_type or "image/png" # Use actual MIME type
+ }
+ })
+
+ # Gemini Batch API format according to official docs
+ # Reference: https://ai.google.dev/gemini-api/docs/batch-inference
+ # NOTE: The "request" wrapper is REQUIRED for Gemini 2.5 batch API
+ batch_request = {
+ "custom_id": f"request-{i}",
+ "request": {
+ "contents": [{
+ "role": "user",
+ "parts": parts
+ }]
+ }
+ }
+
+ f.write(json.dumps(batch_request, ensure_ascii=False) + '\n')
+
+ logger.info(f" ๐ Created JSONL file: {jsonl_path.name} ({len(requests)} requests)")
+ return jsonl_path
+
+    def _submit_batch_job(self, jsonl_path: Path) -> str:
+        """
+        Submit batch job to Gemini.
+
+        Validates the JSONL file line-by-line, uploads it (with a string-path
+        fallback when the Path-object upload fails), waits for the upload to
+        become ACTIVE, then creates the batch job.
+
+        Args:
+            jsonl_path: Path to JSONL file
+
+        Returns:
+            Batch job name
+
+        Raises:
+            RuntimeError: If upload or job creation fails (validation errors
+                are also re-wrapped as RuntimeError by the outer handler)
+        """
+        # Upload JSONL file
+        # Try multiple methods as the google-genai SDK can be finicky
+        try:
+            logger.info(f" ๐ค Uploading JSONL file: {jsonl_path.name}")
+
+            # Read and validate file content
+            with open(jsonl_path, 'r', encoding='utf-8') as f:
+                content = f.read()
+            line_count = len(content.strip().split('\n'))
+            logger.debug(f" ๐ JSONL: {len(content)} bytes, {line_count} lines")
+
+            # Validate JSONL format (every line must parse on its own)
+            for line_num, line in enumerate(content.strip().split('\n'), 1):
+                try:
+                    json.loads(line)
+                except json.JSONDecodeError as e:
+                    logger.error(f" โ Invalid JSON at line {line_num}: {e}")
+                    logger.error(f" Content: {line[:100]}...")
+                    raise ValueError(f"Invalid JSONL format at line {line_num}") from e
+
+            # Method 1: Try uploading with Path object
+            logger.info(f" ๐ Upload method 1: Using Path object...")
+            try:
+                jsonl_file = self.client.files.upload(
+                    file=jsonl_path,
+                    config=types.UploadFileConfig(
+                        display_name=f'gepa-batch-{int(time.time())}',
+                        mime_type='application/json'  # Try application/json instead of application/jsonl
+                    )
+                )
+                logger.info(f" โ JSONL file uploaded: {jsonl_file.name}")
+
+            except Exception as e1:
+                logger.warning(f" โ ๏ธ Method 1 failed: {e1}")
+                logger.info(f" ๐ Upload method 2: Using string path...")
+
+                # Method 2: Fallback to string path
+                try:
+                    jsonl_file = self.client.files.upload(
+                        file=str(jsonl_path.absolute()),
+                        config=types.UploadFileConfig(
+                            display_name=f'gepa-batch-{int(time.time())}',
+                            mime_type='application/json'
+                        )
+                    )
+                    logger.info(f" โ JSONL file uploaded (method 2): {jsonl_file.name}")
+                except Exception as e2:
+                    logger.error(f" โ Method 2 also failed: {e2}")
+                    raise e2
+
+        except KeyError as e:
+            # KeyError has historically meant the SDK's response schema changed.
+            logger.error(f"โ KeyError during JSONL upload: {e}")
+            logger.error(f" This suggests the Gemini API response format changed")
+            logger.error(f" Try updating google-genai: pip install --upgrade google-genai")
+            raise RuntimeError(f"Gemini Batch API response format error: {e}") from e
+        except Exception as e:
+            logger.error(f"โ Failed to upload JSONL file: {e}")
+            logger.error(f" File path: {jsonl_path}")
+            logger.error(f" File exists: {jsonl_path.exists()}")
+            logger.error(f" File size: {jsonl_path.stat().st_size if jsonl_path.exists() else 'N/A'} bytes")
+            raise RuntimeError(f"Gemini Batch API file upload failed: {e}") from e
+
+        # Wait for JSONL to be active before the batch job can reference it
+        try:
+            logger.info(f" โณ Waiting for JSONL file to be processed...")
+            self._wait_for_file_active(jsonl_file)
+        except Exception as e:
+            logger.error(f"โ JSONL file processing failed: {e}")
+            raise
+
+        # Create batch job
+        try:
+            logger.info(f" ๐ Creating batch job...")
+            batch_job = self.client.batches.create(
+                model=self.model_name,
+                src=jsonl_file.name,
+                config={'display_name': f'gepa-opt-{int(time.time())}'}
+            )
+
+            logger.info(f" โ Batch job submitted: {batch_job.name}")
+            return batch_job.name
+
+        except Exception as e:
+            logger.error(f"โ Failed to create batch job: {e}")
+            raise RuntimeError(f"Batch job creation failed: {e}") from e
+
+ def _wait_for_batch_completion(self, job_name: str, timeout: int):
+ """
+ Poll batch job until completion.
+
+ Args:
+ job_name: Batch job name
+ timeout: Maximum seconds to wait
+
+ Raises:
+ TimeoutError: If polling exceeds timeout
+ RuntimeError: If batch job fails
+ """
+ logger.info(f" โณ Polling for completion (checking every {self.polling_interval}s)...")
+
+ start_time = time.time()
+ poll_count = 0
+
+ while True:
+ elapsed = time.time() - start_time
+
+ if elapsed > timeout:
+ raise TimeoutError(
+ f"Batch job timeout after {elapsed:.0f}s "
+ f"(max: {timeout}s)"
+ )
+
+ try:
+ batch_job = self.client.batches.get(name=job_name)
+ state = batch_job.state.name
+
+ # Success states
+ if state in ['JOB_STATE_SUCCEEDED', 'SUCCEEDED']:
+ logger.info(f" โ Batch job completed in {elapsed:.0f}s")
+ return
+
+ # Failure states
+ if state in ['JOB_STATE_FAILED', 'FAILED']:
+ raise RuntimeError(f"Batch job failed with state: {state}")
+
+ if state in ['JOB_STATE_CANCELLED', 'CANCELLED']:
+ raise RuntimeError(f"Batch job was cancelled: {state}")
+
+ # Still processing
+ poll_count += 1
+ if poll_count % 5 == 0: # Log every 5 polls
+ logger.info(f" ... still processing ({elapsed:.0f}s elapsed, state: {state})")
+
+ time.sleep(self.polling_interval)
+
+ except (TimeoutError, RuntimeError):
+ raise
+ except Exception as e:
+ logger.warning(f" โ ๏ธ Error checking job status: {e}, retrying...")
+ time.sleep(5)
+
+ def _retrieve_batch_results(self, job_name: str) -> List[Dict[str, Any]]:
+ """
+ Retrieve and parse batch results.
+
+ Args:
+ job_name: Batch job name
+
+ Returns:
+ List of result dicts
+ """
+ batch_job = self.client.batches.get(name=job_name)
+
+ # Check for inline responses (preferred)
+ if hasattr(batch_job.dest, 'inlined_responses') and batch_job.dest.inlined_responses:
+ logger.info(f" ๐ฅ Processing inline responses...")
+ return self._parse_inline_results(batch_job.dest.inlined_responses)
+
+ # Download results file (fallback)
+ if hasattr(batch_job.dest, 'file_name') and batch_job.dest.file_name:
+ logger.info(f" ๐ฅ Downloading results file: {batch_job.dest.file_name}")
+ file_data = self.client.files.download(file=batch_job.dest.file_name)
+ return self._parse_file_results(file_data)
+
+ raise RuntimeError("No results available from batch job")
+
+ def _parse_inline_results(self, inline_responses) -> List[Dict[str, Any]]:
+ """Parse inline batch results."""
+ results = []
+
+ for response_obj in inline_responses:
+ if hasattr(response_obj, 'response') and response_obj.response:
+ text = self._extract_text_from_response(response_obj.response)
+ results.append({
+ "content": text,
+ "role": "assistant",
+ "model": self.model_name,
+ "provider": "google"
+ })
+ else:
+ error_msg = str(getattr(response_obj, 'error', 'Unknown error'))
+ logger.warning(f" โ ๏ธ Response error: {error_msg}")
+ results.append({
+ "content": "",
+ "error": error_msg
+ })
+
+ return results
+
+ def _parse_file_results(self, file_data) -> List[Dict[str, Any]]:
+ """Parse JSONL results file."""
+ if isinstance(file_data, bytes):
+ jsonl_content = file_data.decode('utf-8')
+ else:
+ jsonl_content = file_data
+
+ results = []
+
+ for line_num, line in enumerate(jsonl_content.strip().split('\n'), 1):
+ if not line.strip():
+ continue
+
+ try:
+ result = json.loads(line)
+
+ if 'response' in result:
+ text = self._extract_text_from_dict(result['response'])
+ results.append({
+ "content": text,
+ "role": "assistant",
+ "model": self.model_name,
+ "provider": "google"
+ })
+ else:
+ error_msg = result.get('error', 'Unknown error')
+ logger.warning(f" โ ๏ธ Line {line_num} error: {error_msg}")
+ results.append({
+ "content": "",
+ "error": error_msg
+ })
+
+ except json.JSONDecodeError as e:
+ logger.error(f" โ Line {line_num}: JSON decode error: {e}")
+ results.append({"content": "", "error": f"JSON decode error: {e}"})
+
+ return results
+
+ def _extract_text_from_response(self, response_obj) -> str:
+ """Extract text from response object."""
+ try:
+ # Direct text attribute
+ if hasattr(response_obj, 'text'):
+ return response_obj.text
+
+ # Navigate through candidates
+ if hasattr(response_obj, 'candidates') and response_obj.candidates:
+ candidate = response_obj.candidates[0]
+ if hasattr(candidate, 'content'):
+ content = candidate.content
+ if hasattr(content, 'parts') and content.parts:
+ part = content.parts[0]
+ if hasattr(part, 'text'):
+ return part.text
+
+ # Fallback to string representation
+ return str(response_obj)
+
+ except Exception as e:
+ logger.error(f"Error extracting text from response: {e}")
+ return ""
+
+ def _extract_text_from_dict(self, response_dict: Dict) -> str:
+ """Extract text from response dictionary."""
+ try:
+ # Direct text key
+ if 'text' in response_dict:
+ return response_dict['text']
+
+ # Navigate through candidates
+ if 'candidates' in response_dict and response_dict['candidates']:
+ candidate = response_dict['candidates'][0]
+ if 'content' in candidate and 'parts' in candidate['content']:
+ parts = candidate['content']['parts']
+ if parts and 'text' in parts[0]:
+ return parts[0]['text']
+
+ # Fallback to JSON string
+ return json.dumps(response_dict)
+
+ except Exception as e:
+ logger.error(f"Error extracting text from dict: {e}")
+ return ""
+
+ def _wait_for_file_active(self, uploaded_file, timeout: int = 60):
+ """
+ Wait for uploaded file to become active.
+
+ Args:
+ uploaded_file: Uploaded file object
+ timeout: Maximum seconds to wait
+
+ Raises:
+ TimeoutError: If file processing exceeds timeout
+ RuntimeError: If file processing fails
+ """
+ start_time = time.time()
+
+ while uploaded_file.state.name == "PROCESSING":
+ if time.time() - start_time > timeout:
+ raise TimeoutError(f"File processing timeout: {uploaded_file.name}")
+
+ time.sleep(1)
+ uploaded_file = self.client.files.get(name=uploaded_file.name)
+
+ if uploaded_file.state.name != "ACTIVE":
+ raise RuntimeError(
+ f"File processing failed: {uploaded_file.name} "
+ f"(state: {uploaded_file.state.name})"
+ )
+
+ def get_model_info(self) -> Dict[str, str]:
+ """Get model information for logging and debugging."""
+ return {
+ 'provider': self.provider,
+ 'model_name': self.model_name,
+ 'class': self.__class__.__name__,
+ 'mode': 'batch',
+ 'batch_size': str(self.batch_size),
+ 'polling_interval': f'{self.polling_interval}s'
+ }
+
diff --git a/src/gepa_optimizer/llms/llego_enhanced_llm.py b/src/gepa_optimizer/llms/llego_enhanced_llm.py
new file mode 100644
index 0000000000000000000000000000000000000000..8785cffea3eb2ef716139f07e92b0f62d3d5cde3
--- /dev/null
+++ b/src/gepa_optimizer/llms/llego_enhanced_llm.py
@@ -0,0 +1,1625 @@
+"""
+LLEGO-Enhanced LLM Client Wrapper
+
+This wrapper intercepts LLM calls and uses LLEGO genetic operators
+when generating new prompt candidates during GEPA's reflection phase.
+"""
+
+import logging
+import re
+from typing import Optional, Dict, Any, Callable, List
+from .base_llm import BaseLLMClient
+
+logger = logging.getLogger(__name__)
+
+# Fallback system prompt for sequential generation (when JSON parsing fails)
+# Uses Linear Command structure for reliability when complex JSON generation fails
+# NOTE(review): the blank runs inside this literal look like section delimiters
+# may have been stripped during extraction -- verify against the original file
+# before editing the string itself.
+_FALLBACK_SYSTEM_PROMPT: str = """You are a Prompt Optimization Engine operating in **SAFE MODE**.
+
+
+Rewrite the prompt based on the feedback provided below.
+
+
+
+1. Output **ONLY** the new prompt text.
+2. No JSON. No Explanations. No "Here is the prompt".
+3. The prompt must be fully functional and self-contained.
+4. START directly with the prompt content (e.g., "You are a..." or task instructions).
+5. Preserve the core task/domain - only improve HOW it's described.
+
+
+
+- Be specific and concrete (no vague instructions)
+- Use clear, imperative language
+- Include edge case handling if feedback identifies confusion
+- Ensure the prompt is self-contained and unambiguous
+- Add explicit constraints for format/output if needed
+
+
+
+- Analysis of what went wrong
+- Explanations of your changes
+- Meta-text like "Here's an improved version..."
+- Anything other than the raw prompt text
+
+
+Start of New Prompt:"""
+
+
+class LLEGOEnhancedLLMClient(BaseLLMClient):
+ """
+ Wrapper around BaseLLMClient that uses LLEGO for candidate generation.
+
+ This wrapper detects when GEPA is asking for new prompt candidates
+ and routes those requests through LLEGO's genetic operators instead
+ of standard LLM generation.
+ """
+
+    def __init__(
+        self,
+        base_llm: BaseLLMClient,
+        llego_layer,
+        config=None,
+        verbose: bool = True
+    ):
+        """
+        Initialize LLEGO-enhanced LLM client.
+
+        Args:
+            base_llm: The underlying LLM client (VisionLLMClient, etc.)
+            llego_layer: LLEGOIntegrationLayer instance
+            config: Optional OptimizationConfig for hybrid mode settings
+            verbose: Whether to log LLEGO operations
+        """
+        self.base_llm = base_llm    # wrapped client that performs actual generation
+        self.llego = llego_layer    # genetic-operator layer (may be None)
+        self.config = config
+        self.verbose = verbose
+
+        # Get log level from config (default to INFO)
+        self.log_level = getattr(config, 'log_level', 'INFO') if config else 'INFO'
+
+        # Track context for detecting reflection calls
+        self.reflection_context = {
+            'current_prompt': None,
+            'feedback': None,
+            'in_reflection': False
+        }
+
+        # Queue for hybrid mode candidates (GEPA will call generate() multiple times)
+        self._candidate_queue = []
+        self._hybrid_generation_complete = False
+
+        # ๐ฅ CRITICAL: Queue for adapter-generated candidates (from make_reflective_dataset)
+        # When adapter generates candidates at adapter level, they're stored here
+        # GEPA will call generate() for proposals, and we'll return these candidates
+        self._adapter_generated_candidates = []
+
+        # ๐ฅ FORMAT AWARENESS: Store format info from adapter for use in candidate generation
+        self._detected_format = None  # Will be set by adapter after format detection
+
+        # FIX #5: Circuit breaker for LLEGO failures
+        self._llego_failures = 0           # consecutive-failure counter
+        self._llego_disabled = False       # set True once the breaker trips
+        self._llego_failure_threshold = 3  # Disable after 3 consecutive failures
+
+        logger.info("๐งฌ LLEGO-Enhanced LLM Client initialized")
+        logger.info(f" Base LLM: {base_llm.__class__.__name__}")
+        logger.info(f" LLEGO enabled: {llego_layer is not None}")
+        if config and hasattr(config, 'enable_gepa_reflection_with_llego'):
+            logger.info(f" Hybrid mode: {config.enable_gepa_reflection_with_llego}")
+        logger.debug(f" Log level: {self.log_level}")
+
+ def _should_log_debug(self) -> bool:
+ """
+ Check if DEBUG logging is enabled.
+
+ Returns:
+ True if DEBUG level logging is enabled, False otherwise
+ """
+ return self.log_level == "DEBUG" or (
+ hasattr(logging, 'getLogger') and
+ logging.getLogger().isEnabledFor(logging.DEBUG)
+ )
+
+    def _extract_clean_prompt_from_reflection(self, reflection_output: str) -> str:
+        """
+        ๐ก๏ธ DEFENSIVE FALLBACK: Extract clean prompt if LLM adds analysis despite system prompt instructions.
+
+        NOTE: The system prompt now explicitly instructs the LLM to output ONLY the prompt text.
+        However, this extraction logic serves as a safety net in case the LLM still adds:
+        "Based on the performance analysis...
+        ### Recommendations...
+        ### Revised Prompt Example:
+        [THE ACTUAL PROMPT HERE]
+        ### Conclusion..."
+
+        This is now a defensive measure, not the primary mechanism.
+
+        Args:
+            reflection_output: Full reflection output (should be clean prompt, but may contain analysis)
+
+        Returns:
+            str: Clean, extracted prompt (or original if extraction fails or not needed)
+        """
+        # Non-string / empty input: nothing to extract, return unchanged.
+        if not reflection_output or not isinstance(reflection_output, str):
+            return reflection_output
+
+        # Pattern 1: Look for "Revised Prompt Example:" or "### Revised Prompt Example:"
+        patterns = [
+            r'(?:###\s*)?Revised\s+Prompt\s+(?:Example|:)?\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
+            r'(?:###\s*)?Revised\s+Prompt\s*:\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
+            r'(?:###\s*)?Optimized\s+Prompt\s*:\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
+            r'(?:###\s*)?New\s+Prompt\s*:\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
+            r'(?:Here\s+is|Here\'s)\s+a?\s*refined?\s+(?:version\s+of\s+)?(?:the\s+)?prompt\s*[:\n](.*?)(?:\n###|\n##|\n---|\Z)',
+        ]
+
+        for pattern in patterns:
+            match = re.search(pattern, reflection_output, re.IGNORECASE | re.DOTALL)
+            if match:
+                extracted = match.group(1).strip()
+                # Clean up common artifacts (markdown code fences around the prompt)
+                extracted = re.sub(r'^```(?:plaintext|markdown|text)?\s*\n', '', extracted, flags=re.MULTILINE)
+                extracted = re.sub(r'\n```\s*$', '', extracted, flags=re.MULTILINE)
+                extracted = extracted.strip()
+
+                if len(extracted) > 50:  # Reasonable minimum length for a prompt
+                    logger.debug(f"โ Extracted clean prompt using pattern: {pattern[:50]}...")
+                    logger.debug(f" Original length: {len(reflection_output)} chars")
+                    logger.debug(f" Extracted length: {len(extracted)} chars")
+                    return extracted
+
+        # Pattern 2: If output starts with a quote or prompt-like structure
+        # Look for text that starts with "You are..." and is substantial
+        if 'You are' in reflection_output:
+            # Find the longest continuous block that starts with "You are"
+            prompt_match = re.search(r'(You are[^#]*?)(?:\n###|\n##|###|##|Conclusion|\Z)',
+                                     reflection_output, re.IGNORECASE | re.DOTALL)
+            if prompt_match:
+                extracted = prompt_match.group(1).strip()
+                if len(extracted) > 50:
+                    logger.debug(f"โ Extracted prompt starting with 'You are...'")
+                    return extracted
+
+        # Pattern 3: If the reflection output is actually just a clean prompt (no analysis)
+        # Check if it's relatively short and doesn't contain analysis keywords
+        analysis_keywords = ['recommendation', 'suggestion', 'improvement', 'conclusion',
+                             'optimization', 'analysis', 'feedback']
+        if (len(reflection_output) < 2000 and
+                not any(keyword in reflection_output.lower() for keyword in analysis_keywords)):
+            # Likely a clean prompt, return as-is
+            logger.debug(f"โ Reflection output appears to be a clean prompt (no analysis detected)")
+            return reflection_output.strip()
+
+        # Fallback: Try to extract ANY valid prompt-like text
+        # Look for text that might be a prompt even if not perfectly formatted
+        if 'You are' in reflection_output:
+            # Try to find a substantial block starting with "You are"
+            potential_prompt = re.search(
+                r'(You are(?:[^\.]|\.(?!\s*(?:Here|This|These|The above)))*?)(?:\n\n|\n###|Conclusion|\Z)',
+                reflection_output,
+                re.IGNORECASE | re.DOTALL
+            )
+            if potential_prompt and len(potential_prompt.group(1)) > 100:
+                extracted = potential_prompt.group(1).strip()
+                logger.warning(f"โ ๏ธ Could not extract clean prompt using standard patterns")
+                logger.warning(f" Falling back to 'You are...' block (length: {len(extracted)} chars)")
+                logger.warning(f" This may still contain some analysis text")
+                return extracted
+
+        # Final fallback: If still nothing, return original but log strongly
+        logger.warning(f"โ ๏ธ Could not extract clean prompt from reflection output")
+        logger.warning(f" Output length: {len(reflection_output)} chars")
+        logger.warning(f" Output preview: {reflection_output[:200]}...")
+        logger.warning(f" โ ๏ธ WARNING: Returning original output (may contain analysis text or be invalid)")
+        logger.warning(f" This candidate may perform poorly - consider improving extraction logic")
+        return reflection_output.strip()
+
+ def _parse_json_variations(self, response_text: str, num_expected: int) -> List[str]:
+ """
+ ๐ฅ OPTIMIZED: Parse N prompt variations from JSON format response.
+
+ Uses robust JSON parsing with multiple fallback strategies.
+
+ Handles common LLM output issues:
+ - Markdown code blocks (```json ... ```)
+ - Extra text before/after JSON
+ - Trailing commas
+ - Comments in JSON
+ - Newlines in strings
+ """
+ import json
+ import re
+
+ if not response_text or not isinstance(response_text, str):
+ raise ValueError("Empty or invalid response text")
+
+ # ๐ฅ PREPROCESSING: Clean LLM output
+ cleaned = response_text.strip()
+
+ # Remove BOM and invisible chars
+ cleaned = cleaned.lstrip('\ufeff\u200b\u200c\u200d')
+
+ # Strategy 0: Handle Python dict syntax (single quotes -> double quotes)
+ # LLMs sometimes return Python dict syntax {'key': 'value'} instead of JSON {"key": "value"}
+ if "'variations'" in cleaned or (cleaned.startswith("{'") or cleaned.startswith("{'variations'")):
+ try:
+ import ast
+ # Try to parse as Python literal (handles single quotes, True/False, None)
+ python_dict = ast.literal_eval(cleaned)
+ if isinstance(python_dict, dict) and 'variations' in python_dict:
+ # Convert to JSON-compatible format
+ json_str = json.dumps(python_dict)
+ data = json.loads(json_str)
+ if 'variations' in data:
+ # #region agent log
+ import json as _json_debug
+ import time as _time_debug
+ import os as _os_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "JSON_FIX", "location": "llego_enhanced_llm.py:python_dict_parse", "message": "Successfully parsed Python dict syntax", "data": {"num_expected": num_expected, "parsed_variations": len(data.get('variations', []))}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+ return self._extract_variations_from_json(data, num_expected)
+ except (ValueError, SyntaxError, TypeError) as e:
+ # If ast.literal_eval fails, try string replacement as fallback
+ try:
+ # Simple conversion: replace single quotes with double quotes (with escaping)
+ # This is a heuristic and may not work for all cases
+ converted = cleaned.replace("'", '"')
+ data = json.loads(converted)
+ if 'variations' in data:
+ # #region agent log
+ import json as _json_debug
+ import time as _time_debug
+ import os as _os_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "JSON_FIX", "location": "llego_enhanced_llm.py:python_dict_string_replace", "message": "Parsed Python dict via string replacement", "data": {"num_expected": num_expected, "parsed_variations": len(data.get('variations', []))}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError:
+ pass
+
+ # Strategy 1: Direct JSON parse (cleanest case)
+ try:
+ data = json.loads(cleaned)
+ if 'variations' in data:
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError:
+ pass
+
+ # Strategy 2: Extract from markdown code block
+ # More permissive regex that handles various formats
+ code_block_patterns = [
+ r'```(?:json|JSON)?\s*(\{[\s\S]*?\})\s*```', # Standard markdown
+ r'```\s*(\{[\s\S]*"variations"[\s\S]*\})\s*```', # With "variations" keyword
+ ]
+
+ for pattern in code_block_patterns:
+ json_match = re.search(pattern, cleaned)
+ if json_match:
+ json_str = json_match.group(1)
+ try:
+ data = json.loads(json_str)
+ if 'variations' in data:
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError:
+ # Try repair
+ repaired = self._repair_json_string(json_str)
+ try:
+ data = json.loads(repaired)
+ if 'variations' in data:
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError:
+ pass
+
+ # Strategy 3: Balanced brace extraction (handles nested objects)
+ json_str = self._extract_balanced_json(cleaned)
+ if json_str:
+ try:
+ data = json.loads(json_str)
+ if 'variations' in data:
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError:
+ repaired = self._repair_json_string(json_str)
+ try:
+ data = json.loads(repaired)
+ if 'variations' in data:
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError:
+ pass
+
+ # Strategy 4: Find JSON object with "variations" keyword
+ # Use greedy matching to get the full object
+ json_match = re.search(r'(\{[\s\S]*"variations"[\s\S]*\})', cleaned)
+ if json_match:
+ json_str = json_match.group(1)
+ # Find the balanced JSON within
+ balanced = self._extract_balanced_json(json_str)
+ if balanced:
+ try:
+ data = json.loads(balanced)
+ if 'variations' in data:
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError:
+ repaired = self._repair_json_string(balanced)
+ try:
+ data = json.loads(repaired)
+ if 'variations' in data:
+ return self._extract_variations_from_json(data, num_expected)
+ except json.JSONDecodeError:
+ pass
+
+ # Strategy 5: Fallback to numbered sections
+ logger.warning(f"JSON parsing failed, trying numbered section fallback...")
+ try:
+ return self._parse_numbered_section_variations(response_text, num_expected)
+ except ValueError:
+ pass
+
+ # #region agent log
+ import json as _json_debug
+ import time as _time_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "D", "location": "llego_enhanced_llm.py:json_parse_fail", "message": "JSON parsing failed completely", "data": {"num_expected": num_expected, "response_preview": response_text[:500] if response_text else "EMPTY", "response_length": len(response_text) if response_text else 0}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+
+ raise ValueError(f"Could not parse {num_expected} variations from response")
+
+ def _extract_balanced_json(self, text: str) -> Optional[str]:
+ """Extract JSON with balanced braces."""
+ brace_count = 0
+ start_idx = -1
+ in_string = False
+ escape_next = False
+
+ for i, char in enumerate(text):
+ # Handle string escaping
+ if escape_next:
+ escape_next = False
+ continue
+ if char == '\\' and in_string:
+ escape_next = True
+ continue
+ if char == '"' and not escape_next:
+ in_string = not in_string
+ continue
+
+ # Skip characters inside strings
+ if in_string:
+ continue
+
+ if char == '{':
+ if brace_count == 0:
+ start_idx = i
+ brace_count += 1
+ elif char == '}':
+ brace_count -= 1
+ if brace_count == 0 and start_idx >= 0:
+ return text[start_idx:i+1]
+
+ return None
+
+ def _repair_json_string(self, json_str: str) -> str:
+ """
+ Repair common JSON issues from LLM output.
+
+ Fixes:
+ - Trailing commas
+ - Comments
+ - Unescaped newlines in strings
+ """
+ repaired = json_str
+
+ # Remove trailing commas before } or ]
+ repaired = re.sub(r',\s*}', '}', repaired)
+ repaired = re.sub(r',\s*]', ']', repaired)
+
+ # Remove single-line comments
+ repaired = re.sub(r'//[^\n]*\n', '\n', repaired)
+
+ # Remove multi-line comments
+ repaired = re.sub(r'/\*[\s\S]*?\*/', '', repaired)
+
+ return repaired
+
+ def _extract_variations_from_json(self, data: Dict[str, Any], num_expected: int) -> List[str]:
+ """Extract and validate variations from parsed JSON data."""
+
+ if not isinstance(data, dict):
+ raise ValueError("JSON data is not a dictionary")
+
+ variations_list = data.get('variations', [])
+ if not isinstance(variations_list, list):
+ raise ValueError("'variations' field is not a list")
+
+ # Extract and sort by index
+ variations_with_index = []
+ for var in variations_list:
+ if not isinstance(var, dict):
+ continue
+ index = var.get('index', 0)
+ prompt = var.get('prompt', '')
+ if prompt and isinstance(prompt, str):
+ variations_with_index.append((index, prompt.strip()))
+
+ variations_with_index.sort(key=lambda x: x[0])
+ variations = [v[1] for v in variations_with_index]
+
+ # Validate count
+ if len(variations) < num_expected:
+ logger.warning(f"Only {len(variations)} valid variations found, expected {num_expected}")
+ while len(variations) < num_expected:
+ variations.append(variations[-1] if variations else "")
+
+ variations = variations[:num_expected]
+
+ if not all(v for v in variations):
+ raise ValueError(f"Some variations are empty after parsing")
+
+ return variations
+
+ def _parse_numbered_section_variations(self, response_text: str, num_expected: int) -> List[str]:
+ """Fallback parser: Extract variations from numbered sections."""
+ import re
+
+ variations = []
+
+ pattern1 = r'---\s*VARIATION\s+(\d+)\s*---\s*\n(.*?)(?=\n---\s*VARIATION|\Z)'
+ matches1 = re.findall(pattern1, response_text, re.DOTALL | re.IGNORECASE)
+
+ pattern2 = r'Variation\s+(\d+)\s*:?\s*\n(.*?)(?=\nVariation\s+\d+|$)'
+ matches2 = re.findall(pattern2, response_text, re.DOTALL | re.IGNORECASE)
+
+ pattern3 = r'(\d+)\.\s*\n(.*?)(?=\n\d+\.|$)'
+ matches3 = re.findall(pattern3, response_text, re.DOTALL)
+
+ matches = matches1 if len(matches1) >= num_expected else (matches2 if len(matches2) >= num_expected else matches3)
+
+ if len(matches) >= num_expected:
+ matches.sort(key=lambda x: int(x[0]))
+ variations = [match[1].strip() for match in matches[:num_expected]]
+
+ if len(variations) != num_expected:
+ raise ValueError(f"Numbered section parsing found {len(variations)} variations, expected {num_expected}")
+
+ return variations
+
+ def _is_valid_prompt(self, prompt: str) -> bool:
+ """
+ Validate that extracted text is actually a valid system prompt.
+
+ Uses minimal, conservative filtering: only rejects OBVIOUSLY wrong text.
+ Let evaluation decide on quality - false negatives (rejecting good prompts)
+ are worse than false positives (accepting bad prompts).
+
+ Args:
+ prompt: Extracted text to validate
+
+ Returns:
+ True if appears to be a valid prompt, False if obviously wrong
+ """
+ if not prompt or not prompt.strip():
+ return False
+
+ prompt_lower = prompt.lower().strip()
+
+ # STRONG indicators of analysis text (high confidence rejection)
+ # These are phrases that almost never appear in actual prompts
+ strong_analysis_patterns = [
+ 'in conclusion',
+ 'to summarize',
+ 'based on the analysis',
+ 'the analysis shows',
+ 'here are some suggestions',
+ 'it seems you\'re looking for',
+ ]
+
+ # Check first 200 characters for strong patterns
+ first_200 = prompt_lower[:200]
+ for pattern in strong_analysis_patterns:
+ if pattern in first_200:
+ if self._should_log_debug():
+ logger.debug(f"Rejected prompt: contains analysis pattern '{pattern}'")
+ return False
+
+ # POSITIVE indicators of valid prompt (high confidence acceptance)
+ # These are common prompt starters
+ valid_starters = [
+ 'you are',
+ 'you\'re',
+ 'your task',
+ 'your role',
+ 'analyze',
+ 'identify',
+ 'select',
+ 'determine',
+ 'given',
+ 'when',
+ ]
+
+ # If starts with valid prompt pattern, accept immediately
+ first_100 = prompt_lower[:100]
+ if any(first_100.startswith(starter) for starter in valid_starters):
+ return True
+
+ # DEFAULT: Accept everything else and let evaluation decide
+ # This is conservative - we'd rather evaluate a bad prompt than reject a good one
+ return True
+
+ def set_reflection_context(
+ self,
+ current_prompt: Optional[str] = None,
+ feedback: Optional[Any] = None,
+ in_reflection: bool = False
+ ):
+ """
+ Set context for the next generate() call.
+
+ Args:
+ current_prompt: The prompt being reflected upon
+ feedback: Evaluation feedback
+ in_reflection: Whether we're in reflection mode
+ """
+ self.reflection_context = {
+ 'current_prompt': current_prompt,
+ 'feedback': feedback,
+ 'in_reflection': in_reflection
+ }
+
+ # Reset candidate queue when entering new reflection phase
+ if in_reflection:
+ self._candidate_queue = []
+ self._hybrid_generation_complete = False
+ if self._should_log_debug():
+ logger.debug("๐ Entering LLEGO reflection mode (queue reset)")
+ else:
+ logger.info("๐ Entering LLEGO reflection mode")
+
+ def generate(
+ self,
+ system_prompt: str = "",
+ user_prompt: str = "",
+ image_base64: str = "",
+ **kwargs
+ ) -> Dict[str, Any]:
+ """
+ Generate response, using LLEGO for reflection calls.
+
+ ๐ฅ CRITICAL: This method intercepts ALL LLM calls. For candidate generation,
+ it checks if we have pre-generated candidates from hybrid mode and returns those.
+
+ Args:
+ system_prompt: System prompt
+ user_prompt: User prompt
+ image_base64: Base64-encoded image (if any)
+ **kwargs: Additional arguments
+
+ Returns:
+ Dict with 'content' key containing the generated text
+ """
+ # ๐ DEBUG: Log generate calls (full details at DEBUG level)
+ if self._should_log_debug():
+ logger.debug(f"๐ LLEGO Wrapper: generate() called")
+ logger.debug(f" system_prompt: '{system_prompt[:100]}...' (truncated)")
+ logger.debug(f" user_prompt length: {len(user_prompt)} chars")
+ logger.debug(f" in_reflection: {self.reflection_context['in_reflection']}")
+ logger.debug(f" has_image: {bool(image_base64)}")
+
+ # #region agent log
+ try:
+ import json as _json_debug
+ import time as _time_debug
+ import os as _os_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "INTERCEPTION", "location": "llego_enhanced_llm.py:generate", "message": "Generate called", "data": {"system_prompt_len": len(system_prompt), "user_prompt_len": len(user_prompt), "has_image": bool(image_base64), "has_candidates": len(getattr(self, '_adapter_generated_candidates', [])), "in_reflection": self.reflection_context.get('in_reflection', False)}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ except Exception:
+ pass
+ # #endregion
+
+ # ๐ฅ CRITICAL: Check if we have pre-generated candidates from adapter-level generation
+ # This happens when GEPA calls adapter.llm_client to generate candidates
+ # We intercept and return our pre-generated candidates instead
+ # ๐ฅ NEW: Select BEST candidate instead of FIFO
+ # ๐ฅ FIX: DON'T intercept evaluation calls (those have images!)
+ # Only intercept proposal calls (no images, just asking for new candidate)
+ # ๐ฅ FIX 2: DON'T intercept TEST EVALUATION calls!
+ # Test evaluation has no images but uses the OPTIMIZED prompt to execute tasks
+ # We detect test evaluation by checking if this is a TASK EXECUTION call (not reflection)
+ is_task_execution = (
+ # Task execution prompts contain task instructions, not optimization requests
+ not any(kw in system_prompt.lower() for kw in ['evolutionary', 'mutation', 'variation', 'optimize', 'improve prompt', 'rewrite', 'generate variations']) and
+ # Short prompts are usually task prompts, not optimization prompts
+ len(system_prompt) < 1000 and
+ # User prompt is the actual task input (short), not feedback (long)
+ len(user_prompt) < 2000
+ )
+
+ # Log task execution detection for debugging
+ if is_task_execution and hasattr(self, '_adapter_generated_candidates') and self._adapter_generated_candidates:
+ logger.info(f"๐ NOT intercepting: Task execution detected (not optimization)")
+ logger.debug(f" system_prompt_len={len(system_prompt)}, user_prompt_len={len(user_prompt)}")
+
+ if hasattr(self, '_adapter_generated_candidates') and self._adapter_generated_candidates and not image_base64 and not is_task_execution:
+ # ๐ฅ BEST-CANDIDATE SELECTION: Find candidate with highest Dpareto score
+ # This ensures we use the best candidate for the current iteration
+ best_candidate = None
+ best_score = -float('inf')
+ best_idx = -1
+
+ # Check if candidates have scores stored
+ for idx, cand in enumerate(self._adapter_generated_candidates):
+ if isinstance(cand, dict):
+ # Try to get score from candidate dict
+ score = cand.get('score', -float('inf'))
+
+ # If score not in dict, try to get from Pareto logger
+ if score == -float('inf'):
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+
+ # Look up score in Pareto front or evaluated candidates
+ cand_prompt = cand.get('prompt', '')
+ if cand_prompt:
+ normalized = cand_prompt.strip().strip('"\'')
+ # Check in Pareto front
+ for front_cand in pareto_log.pareto_front:
+ if front_cand.get('prompt', '').strip().strip('"\'') == normalized:
+ score = front_cand.get('score', -float('inf'))
+ break
+
+ # If not in front, check evaluated candidates
+ if score == -float('inf'):
+ for eval_cand in pareto_log.candidates_evaluated:
+ if eval_cand.get('prompt', '').strip().strip('"\'') == normalized:
+ score = eval_cand.get('score', -float('inf'))
+ break
+
+ if score > best_score:
+ best_score = score
+ best_candidate = cand
+ best_idx = idx
+
+ # If no scores found, fall back to FIFO (first candidate)
+ if best_candidate is None and self._adapter_generated_candidates:
+ best_candidate = self._adapter_generated_candidates[0]
+ best_idx = 0
+ logger.info(f"โ ๏ธ No scores found for candidates - using FIFO selection")
+
+ # Remove selected candidate from queue
+ if best_idx >= 0:
+ self._adapter_generated_candidates.pop(best_idx)
+
+ # Important event - keep at INFO
+ if best_score > -float('inf'):
+ logger.info(f"๐ฏ INTERCEPTING GEPA PROPOSAL CALL - Returning BEST candidate (score: {best_score:.4f})!")
+ logger.info(f"๐ฏ Remaining candidates: {len(self._adapter_generated_candidates)}")
+ else:
+ logger.info(f"๐ฏ INTERCEPTING GEPA PROPOSAL CALL - Returning pre-generated candidate!")
+ logger.info(f"๐ฏ Remaining candidates: {len(self._adapter_generated_candidates)}")
+
+ if isinstance(best_candidate, dict) and 'prompt' in best_candidate:
+ prompt = best_candidate['prompt']
+
+ # Detailed logging only in DEBUG mode
+ if self._should_log_debug():
+ logger.debug(f"โ
Pre-generated candidate details:")
+ logger.debug(f"{'โ'*80}")
+ logger.debug(f"{prompt}")
+ logger.debug(f"{'โ'*80}")
+ else:
+ source = best_candidate.get('source', 'unknown')
+ score_info = f" (score: {best_score:.4f})" if best_score > -float('inf') else ""
+ logger.info(f"โ
Candidate length: {len(prompt)} chars, Source: {source}{score_info}")
+
+ return {'content': prompt, 'source': best_candidate.get('source', 'adapter_generated')}
+ elif isinstance(best_candidate, str):
+ if self._should_log_debug():
+ logger.debug(f"โ
Pre-generated candidate (string format):")
+ logger.debug(f"{'โ'*80}")
+ logger.debug(f"{best_candidate}")
+ logger.debug(f"{'โ'*80}")
+ else:
+ logger.info(f"โ
Candidate length: {len(best_candidate)} chars")
+ return {'content': best_candidate, 'source': 'adapter_generated'}
+
+ # ๐ฅ ENHANCED CALL TYPE DETECTION
+ # We need to distinguish between 4 types of calls:
+ # 1. Evaluation calls: Image + task command โ identify element (pass through)
+ # 2. Judge calls: Image + "prompt engineer" โ analyze feedback (pass through)
+ # 3. Proposal calls: No image + feedback โ generate candidate (intercept)
+ # 4. JSON batch calls: JSON generation request (pass through)
+
+ # FIX: DON'T intercept JSON batch generation calls
+ is_json_batch_request = (
+ '"variations"' in system_prompt or
+ 'MUST BE VALID JSON' in system_prompt or
+ 'Output ONLY the JSON object' in system_prompt or
+ '```json' in system_prompt.lower()
+ )
+
+ # FIX: DON'T intercept LLM-as-Judge calls (they analyze feedback with images)
+ is_judge_call = (
+ 'prompt engineer' in system_prompt.lower() or
+ 'analyzing mobile ui automation' in system_prompt.lower() or
+ 'expert prompt engineer' in system_prompt.lower() or
+ ('analyze' in system_prompt.lower() and 'screenshot with numbered bounding boxes' in system_prompt.lower() and image_base64)
+ )
+
+ # Check if this is a reflection call (GEPA asking for new candidate)
+ is_reflection_call = (
+ self.reflection_context['in_reflection'] or
+ self._detect_reflection_call(system_prompt, user_prompt)
+ )
+
+ # Proposal calls are reflection calls WITHOUT images and NOT judge/JSON calls
+ # These are the calls we want to intercept with LLEGO
+ is_proposal_call = (
+ not is_json_batch_request and # Not a JSON generation request
+ not is_judge_call and # Not an LLM-as-Judge analysis
+ not image_base64 and # No image = not an evaluation/judge call
+ (
+ is_reflection_call or
+ 'improve' in system_prompt.lower() or
+ 'optimize' in system_prompt.lower() or
+ 'suggest' in system_prompt.lower() or
+ 'feedback' in system_prompt.lower() or
+ 'reflection' in system_prompt.lower()
+ ) and
+ len(user_prompt) > 100 # Proposal calls have substantial feedback
+ )
+
+ # Detailed call detection logging only in DEBUG mode
+ if self._should_log_debug():
+ logger.debug(f" is_json_batch_request: {is_json_batch_request}")
+ logger.debug(f" is_judge_call: {is_judge_call}")
+ logger.debug(f" is_reflection_call: {is_reflection_call}")
+ logger.debug(f" is_proposal_call: {is_proposal_call}")
+ logger.debug(f" has_image: {bool(image_base64)}")
+ logger.debug(f" has_llego: {self.llego is not None}")
+
+ # Only intercept proposal calls (not judge, not evaluation, not JSON)
+ if is_proposal_call and self.llego:
+ # FIX #5: Check if LLEGO is disabled due to repeated failures
+ if self._llego_disabled:
+ logger.warning("โ ๏ธ LLEGO is disabled (circuit breaker), using base LLM")
+ return self.base_llm.generate(
+ system_prompt=system_prompt,
+ user_prompt=user_prompt,
+ image_base64=image_base64,
+ **kwargs
+ )
+
+ # Important event - keep at INFO
+ logger.info("๐ฅ INTERCEPTING REFLECTION/PROPOSAL CALL FOR CANDIDATE GENERATION")
+ return self._llego_generate(system_prompt, user_prompt, image_base64=image_base64, **kwargs)
+ else:
+ # Standard LLM call (for evaluation, not reflection)
+ if self._should_log_debug():
+ logger.debug(" โ Standard LLM call (evaluation, not reflection)")
+ return self.base_llm.generate(
+ system_prompt=system_prompt,
+ user_prompt=user_prompt,
+ image_base64=image_base64,
+ **kwargs
+ )
+
+ def _clean_reflection_feedback(self, feedback_text: str, max_length: int = 50000) -> str:
+ """
+ Clean reflection feedback by removing base64 images and truncating.
+
+ ๐ฅ CRITICAL: GEPA's feedback can include massive base64 images (7MB+).
+ This function removes them and keeps feedback concise.
+
+ Args:
+ feedback_text: Original feedback (may contain base64)
+ max_length: Maximum length after cleaning (default: 50K chars)
+
+ Returns:
+ Cleaned feedback without base64, within size limits
+ """
+ if not feedback_text:
+ return feedback_text
+
+ # Step 1: Remove very long base64-like sequences (50K+ chars of alphanumeric)
+ base64_pattern = r'[A-Za-z0-9+/=]{5000,}'
+ cleaned = re.sub(base64_pattern, '[IMAGE_DATA_REMOVED]', feedback_text)
+
+ # Step 2: Remove explicit image_base64 references and their values
+ cleaned = re.sub(r'image_base64["\']?\s*[:=]\s*["\']?[A-Za-z0-9+/=]+["\']?',
+ 'image_base64: [REMOVED]', cleaned, flags=re.IGNORECASE)
+
+ # Step 3: Remove detailed_scores sections that might contain base64
+ cleaned = re.sub(r'##\s+detailed_scores[^\n]*\n[^#]*(?:image_base64|base64)[^\n]*(?:\n[^#]*)*',
+ '## detailed_scores: [REMOVED_FOR_BREVITY]', cleaned, flags=re.IGNORECASE | re.MULTILINE)
+
+ # Step 4: Remove any remaining very long strings (likely base64)
+ cleaned = re.sub(r'"[A-Za-z0-9+/=]{10000,}"', '[LARGE_DATA_STRING_REMOVED]', cleaned)
+
+ # Step 5: Truncate if still too long (keep beginning which has most important info)
+ if len(cleaned) > max_length:
+ truncated_size = len(cleaned) - max_length
+ cleaned = cleaned[:max_length] + f"\n\n[TRUNCATED {truncated_size} characters - keeping essential feedback only]"
+ logger.warning(f"โ ๏ธ Reflection feedback truncated: {len(feedback_text)} โ {len(cleaned)} chars")
+
+ return cleaned
+
+ def _detect_reflection_call(self, system_prompt: str, user_prompt: str) -> bool:
+ """
+ Heuristic to detect if this is a reflection call from GEPA.
+
+ GEPA's reflection calls typically contain feedback/error analysis.
+ """
+ reflection_keywords = [
+ 'improve', 'feedback', 'error', 'failure', 'reflection',
+ 'better prompt', 'modify', 'enhance', 'optimize'
+ ]
+
+ combined = (system_prompt + " " + user_prompt).lower()
+ return any(keyword in combined for keyword in reflection_keywords)
+
+ def _llego_generate(
+ self,
+ system_prompt: str,
+ user_prompt: str,
+ image_base64: str = "",
+ **kwargs
+ ) -> Dict[str, Any]:
+ """
+ Use LLEGO (or Hybrid mode) to generate new prompt candidates.
+
+ Args:
+ system_prompt: System prompt
+ user_prompt: User prompt (contains reflection feedback)
+ image_base64: Image data (for reflection, always empty)
+ **kwargs: Additional arguments (may contain image_base64, will be removed)
+
+ Returns:
+ Dict with 'content' key containing a new prompt candidate
+ """
+ try:
+ # ๐ฅ CRITICAL: Remove image_base64 from kwargs to avoid duplicate argument error
+ kwargs.pop('image_base64', None) # Remove if present to avoid conflict
+
+ # ๐ฅ HYBRID MODE: Generate from BOTH GEPA reflection AND LLEGO
+ if (self.config and
+ hasattr(self.config, 'enable_gepa_reflection_with_llego') and
+ self.config.enable_gepa_reflection_with_llego):
+
+ return self._hybrid_generate(system_prompt, user_prompt, image_base64=image_base64, **kwargs)
+
+ # STANDARD LLEGO MODE (LLEGO only)
+ return self._llego_only_generate(system_prompt, user_prompt, image_base64=image_base64, **kwargs)
+
+ except Exception as e:
+ # FIX #5: Circuit breaker - track failures and disable LLEGO if needed
+ self._llego_failures += 1
+
+ logger.error(f"โ LLEGO generation failed ({self._llego_failures}/{self._llego_failure_threshold}): {e}")
+ logger.error("โ ๏ธ Falling back to base LLM")
+
+ if self._llego_failures >= self._llego_failure_threshold:
+ self._llego_disabled = True
+ logger.error(f"๐ซ LLEGO DISABLED - {self._llego_failures} consecutive failures detected")
+ logger.error(" All future requests will use base LLM only")
+
+ import traceback
+ logger.debug(traceback.format_exc())
+
+ # Fallback to base LLM - ensure image_base64 is not in kwargs
+ kwargs.pop('image_base64', None)
+ return self.base_llm.generate(
+ system_prompt=system_prompt,
+ user_prompt=user_prompt,
+ image_base64=image_base64,
+ **kwargs
+ )
+
+ def _hybrid_generate(
+ self,
+ system_prompt: str,
+ user_prompt: str,
+ image_base64: str = "",
+ **kwargs
+ ) -> Dict[str, Any]:
+ """
+ ๐ฅ HYBRID MODE: Generate candidates from BOTH GEPA reflection AND LLEGO operators.
+
+ Smart Compensation Strategy:
+ - When crossover can't run (< 2 parents), compensates with extra GEPA reflection
+ - GEPA is smarter than mutation (uses semantic understanding of feedback)
+ - Crossover only runs when we have 2+ scored parents to combine
+
+ GEPA will call generate() multiple times. On first call, we generate all candidates
+ and queue them. Subsequent calls return from the queue.
+ """
+ # If we already generated candidates, return next from queue
+ if self._hybrid_generation_complete and self._candidate_queue:
+ candidate = self._candidate_queue.pop(0)
+ source = candidate.get('source', 'unknown')
+ logger.info(f"๐ฆ Returning queued candidate (source: {source}, {len(self._candidate_queue)} remaining)")
+ return {'content': candidate['prompt'], 'source': source}
+
+ # First call: Generate ALL candidates
+ from ..utils.clean_logger import get_clean_logger
+ clean_log = get_clean_logger()
+
+ all_candidates = []
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # PHASE 0: Check if crossover will be possible
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+ gepa_pareto_front = pareto_log.pareto_front
+
+ # Determine if we need to compensate for crossover
+ crossover_possible = len(gepa_pareto_front) >= 2
+ n_crossover_config = self.config.n_crossover if hasattr(self.config, 'n_crossover') else 2
+ crossover_compensation = 0 if crossover_possible else n_crossover_config
+
+ if not crossover_possible:
+ logger.info(f"โ ๏ธ Crossover NOT possible (have {len(gepa_pareto_front)} parents, need 2+)")
+ logger.info(f" โ Smart compensation: +{crossover_compensation} extra GEPA reflection candidates")
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # PHASE 1: GEPA REFLECTION (Semantic Understanding)
+ # More GEPA = better, it understands WHY things fail
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ base_gepa_count = self.config.num_gepa_reflection_candidates if hasattr(self.config, 'num_gepa_reflection_candidates') else 3
+
+ # ๐ฅ SMART COMPENSATION: More GEPA when crossover can't run
+ num_gepa = base_gepa_count + crossover_compensation
+
+ logger.info("โ" * 80)
+ logger.info("PHASE 1: GEPA REFLECTION (Semantic Understanding)")
+ if crossover_compensation > 0:
+ logger.info(f"Generating {num_gepa} candidates ({base_gepa_count} base + {crossover_compensation} compensation for skipped crossover)")
+ else:
+ logger.info(f"Generating {num_gepa} candidates")
+ logger.info("โ" * 80)
+
+ # ๐ฅ OPTIMIZED: Single call with JSON format for multiple variations
+ try:
+ # Clean user_prompt before sending to LLM
+ cleaned_user_prompt = self._clean_reflection_feedback(user_prompt)
+
+ # Build diversity requirements based on num_gepa
+ diversity_requirements = self._build_diversity_requirements(num_gepa)
+
+ # ๐ฅ FORMAT AWARENESS: Get format constraint if available
+ format_constraint = ""
+ if self._detected_format and self._detected_format.get('format_constraint'):
+ format_constraint = self._detected_format['format_constraint']
+ logger.info(f"๐ Injecting format constraint into candidate generation")
+ # #region agent log
+ import json as _json_debug
+ import time as _time_debug
+ import os as _os_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "FORMAT_CONSTRAINT", "location": "llego_enhanced_llm.py:format_injection", "message": "Format constraint injected", "data": {"format_type": self._detected_format.get('format_type', 'unknown'), "constraint_length": len(format_constraint), "avg_length": self._detected_format.get('avg_length', 0)}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+ else:
+ format_constraint = "No specific format detected - ensure output is CONCISE and matches expected examples."
+ # #region agent log
+ import json as _json_debug
+ import time as _time_debug
+ import os as _os_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({"hypothesisId": "FORMAT_CONSTRAINT", "location": "llego_enhanced_llm.py:format_injection", "message": "No format constraint available", "data": {"has_detected_format": bool(self._detected_format)}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
+ # #endregion
+
+ # ๐ฅ EVOLUTIONARY PROMPT ENGINEER: Forces radically different mutations
+ # Each variation MUST use a distinct genetic strategy to maximize search space
+ optimization_system_prompt = f"""
+You are an **Evolutionary Prompt Engineer**. Your task is to mutate a [FAILING_PROMPT] into a high-performance instruction set using genetic strategies.
+You must generate {num_gepa} radically different prompt variations based on the [FAILURE_FEEDBACK].
+
+
+
+
+{cleaned_user_prompt}
+
+
+
+
+You MUST use a different strategy for each variation. Assign strategies in order:
+
+1. **STRATEGY A: The Strict Auditor (Constraints)**
+ - Focus: Add "Negative Constraints" (e.g., "Do NOT...", "NEVER...", "FORBIDDEN:").
+ - Use strict XML tagging for the output schema.
+ - Goal: Fix hallucinations and formatting errors.
+
+2. **STRATEGY B: The Reasoning Expert (Chain of Thought)**
+ - Focus: Add a "Reasoning Steps" section.
+ - Instruct the model to "Think step-by-step" before generating the final output.
+ - Goal: Fix logic errors and complex multi-step reasoning failures.
+
+3. **STRATEGY C: The Few-Shot Teacher (Examples)**
+ - Focus: Generate a *synthetic* example of Input -> Correct Output within the prompt.
+ - Goal: Fix understanding of abstract concepts or strict schema requirements.
+
+4. **STRATEGY D: The Role-Player (Persona)**
+ - Focus: Change the persona to a hyper-specific expert (e.g., "Senior Data Engineer at Fortune 500" vs "Coder").
+ - Add domain-specific vocabulary and expertise markers.
+ - Goal: Fix domain-specific terminology errors.
+
+5. **STRATEGY E: The Structure Architect (Format)**
+ - Focus: Add explicit output schema with field-by-field instructions.
+ - Use markdown or XML headers to organize the prompt.
+ - Goal: Fix output structure and field naming errors.
+
+
+
+1. **Self-Contained**: Each variation must be the FULL prompt text (100-500 words), ready to run.
+2. **No Meta-Talk**: Do not explain your strategy inside the prompt. Just output the optimized prompt.
+3. **Preserve Core Task**: Keep the original task/domain - only improve HOW it's described.
+4. **JSON Output**: Follow the schema below exactly.
+5. **ENFORCE OUTPUT FORMAT**: The generated prompt MUST instruct the model to output in the EXACT format shown in examples.
+
+
+
+๐จ THE GENERATED PROMPTS MUST INCLUDE EXPLICIT OUTPUT FORMAT INSTRUCTIONS!
+Common failure: The model generates explanations/prose instead of the required concise format.
+
+{format_constraint}
+
+Your generated prompts MUST include:
+- Explicit instruction to output ONLY in the required format
+- "Do NOT explain", "No reasoning", "Output ONLY [format]" constraints
+- Length constraint to prevent verbose responses
+
+
+
+You MUST output ONLY valid JSON. No comments, no explanations, no markdown code blocks.
+
+Generate exactly {num_gepa} variations in this exact format:
+
+{{
+ "variations": [
+ {{
+ "index": 1,
+ "strategy": "Strict Auditor",
+ "prompt": "[FULL PROMPT TEXT - Complete, self-contained, ready to use]"
+ }},
+ {{
+ "index": 2,
+ "strategy": "Reasoning Expert",
+ "prompt": "[FULL PROMPT TEXT - Complete, self-contained, ready to use]"
+ }}
+ ]
+}}
+
+CRITICAL RULES:
+1. Output ONLY the JSON object - no text before or after
+2. Do NOT use markdown code blocks (no ```json)
+3. Do NOT include comments (no // or /* */)
+4. Ensure all strings are properly escaped
+5. Generate exactly {num_gepa} variations
+6. Each variation must have: index (number), strategy (string), prompt (string)
+
+"""
+
+ # Standard GEPA reflection call
+ call_kwargs = {k: v for k, v in kwargs.items() if k != 'image_base64'}
+ result = self.base_llm.generate(
+ system_prompt=optimization_system_prompt,
+ user_prompt=cleaned_user_prompt,
+ image_base64=image_base64,
+ **call_kwargs
+ )
+
+ if isinstance(result, dict):
+ response_text = result.get("content", str(result))
+ else:
+ response_text = str(result)
+
+ # Parse JSON variations
+ gepa_variations = self._parse_json_variations(response_text, num_gepa)
+
+ # Add all variations to candidates
+ for idx, variation_prompt in enumerate(gepa_variations, 1):
+ # ๐ก๏ธ DEFENSIVE FALLBACK: Extract clean prompt if LLM adds analysis
+ gepa_candidate = self._extract_clean_prompt_from_reflection(variation_prompt)
+
+ # Validate extracted prompt before adding
+ if not self._is_valid_prompt(gepa_candidate):
+ logger.warning(f" โ ๏ธ Variation {idx} appears invalid, skipping")
+ continue
+
+ # ๐ DIAGNOSTIC: Log candidate length to help diagnose scoring issues
+ if self._should_log_debug():
+ logger.debug(f" Candidate {idx} length: {len(gepa_candidate)} chars")
+ logger.debug(f" Candidate {idx} preview: {gepa_candidate[:100]}...")
+
+ all_candidates.append({
+ 'prompt': gepa_candidate,
+ 'source': 'gepa_reflection',
+ 'index': idx
+ })
+
+ clean_log.log_gepa_reflection_candidate(idx, gepa_candidate)
+
+ gepa_count = len(all_candidates)
+ logger.info(f"โ
GEPA Reflection: {gepa_count} candidates generated in single optimized call")
+
+ except Exception as e:
+ logger.error(f"โ Error generating GEPA reflection candidates: {e}")
+ logger.warning(f" Falling back to sequential generation...")
+ import traceback
+ logger.debug(traceback.format_exc())
+
+ # Fallback: Sequential generation (when JSON parsing fails)
+ gepa_count = self._fallback_sequential_gepa_generation(
+ num_gepa, user_prompt, image_base64, kwargs, all_candidates, clean_log
+ )
+
+ if gepa_count > 0:
+ logger.info(f"GEPA Reflection Complete: {gepa_count} candidates")
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # PHASE 2: LLEGO GENETIC OPERATORS
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ logger.info("โ" * 80)
+ logger.info("PHASE 2: LLEGO GENETIC OPERATORS")
+ logger.info("โ" * 80)
+
+ # Extract current prompt from context
+ current_prompt = self.reflection_context.get('current_prompt', '')
+ if not current_prompt:
+ current_prompt = self._extract_prompt_from_feedback(user_prompt)
+
+ if not current_prompt and self.llego.population:
+ current_prompt = self.llego.population[0].prompt
+ logger.info(f" Using population prompt (length: {len(current_prompt)})")
+
+ # Convert GEPA Pareto front to PromptCandidate format (already fetched in Phase 0)
+ pareto_candidates = self.llego._convert_gepa_pareto_to_candidates(gepa_pareto_front)
+ pareto_front = pareto_candidates
+
+ logger.info(f" Pareto front: {len(pareto_front)} candidates with scores")
+ for idx, p in enumerate(pareto_front, 1):
+ notation = p.metadata.get('notation', 'S') if p.metadata else 'S'
+ logger.info(f" {notation}: fitness={p.fitness:.3f}")
+
+ # Create LLM callable for LLEGO genetic operations (crossover/mutation)
+ call_kwargs = {k: v for k, v in kwargs.items() if k != 'image_base64'}
+
+ # LLEGO genetic prompt with SAFETY LOCKS to prevent task drift
+ # Directed mutations ensure prompts improve without losing core functionality
+ genetic_operator_system_prompt = """
+You are a **Prompt Mutation Engine**. Your input is a [PARENT_PROMPT]. Your output is a [MUTATED_CHILD].
+
+
+
+Apply ONE of the following micro-mutations to improve the prompt:
+
+1. **COMPRESS**: Remove fluff words ("please", "ensure to", "kindly"). Make it telegraphic and efficient.
+2. **INTENSIFY**: Capitalize key constraints (e.g., "must return JSON" -> "**MUST** return **VALID JSON**").
+3. **STRUCTURIZE**: Add markdown headers or XML tags to organize a messy prompt.
+4. **CLARIFY**: Expand vague nouns (e.g., "code" -> "production-ready Python code with type hints").
+5. **CONSTRAIN**: Add negative constraints ("Do NOT include explanations", "NEVER output markdown").
+
+
+
+1. **IMMUTABLE CORE**: You MUST NOT change the core task (e.g., do not change "Extract JSON" to "Write a Summary").
+2. **NO EXPLANATION**: Output ONLY the new prompt string. No meta-commentary.
+3. **VALIDITY**: The output must remain a functional system prompt.
+4. **LENGTH LIMIT**: Keep mutations within 20% of original length (no excessive expansion).
+"""
+
+ def llm_callable(genetic_prompt: str) -> str:
+ result = self.base_llm.generate(
+ system_prompt=genetic_operator_system_prompt,
+ user_prompt=genetic_prompt,
+ image_base64="",
+ **call_kwargs
+ )
+ if isinstance(result, dict):
+ return result.get('content', str(result))
+ return str(result)
+
+ # Generate LLEGO offspring (crossover will be skipped if < 2 parents)
+ llego_prompts = self.llego.evolve_generation(
+ llm=llm_callable,
+ pareto_front=pareto_front
+ )
+
+ # Track actual crossover count from LLEGO (it tracks internally now)
+ actual_crossover = getattr(self.llego, '_actual_crossover_count', 0)
+ crossover_skipped = getattr(self.llego, '_crossover_skipped', False)
+
+ crossover_idx = 1
+ mutation_idx = 1
+
+ for i, prompt in enumerate(llego_prompts):
+ if i < actual_crossover:
+ source = 'llego_crossover'
+ clean_log.log_llego_crossover_candidate(crossover_idx, prompt)
+ crossover_idx += 1
+ else:
+ source = 'llego_mutation'
+ clean_log.log_llego_mutation_candidate(mutation_idx, prompt)
+ mutation_idx += 1
+
+ all_candidates.append({
+ 'prompt': prompt,
+ 'source': source,
+ 'index': i + 1
+ })
+
+ mutation_count = len(llego_prompts) - actual_crossover
+ logger.info(f"๐งฌ LLEGO: {actual_crossover} crossover + {mutation_count} mutation = {len(llego_prompts)} candidates")
+ if crossover_skipped:
+ logger.info(f" (Crossover was skipped - compensated with extra GEPA reflection)")
+
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ # SUMMARY
+ # โโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโโ
+ total_gepa = len([c for c in all_candidates if c.get('source') == 'gepa_reflection'])
+ total_crossover = len([c for c in all_candidates if c.get('source') == 'llego_crossover'])
+ total_mutation = len([c for c in all_candidates if c.get('source') == 'llego_mutation'])
+
+ logger.info("โ" * 80)
+ logger.info("CANDIDATE GENERATION SUMMARY")
+ logger.info("โ" * 80)
+ logger.info(f" GEPA Reflection: {total_gepa} candidates (semantic understanding)")
+ logger.info(f" LLEGO Crossover: {total_crossover} candidates (combine best)")
+ logger.info(f" LLEGO Mutation: {total_mutation} candidates (exploration)")
+ logger.info(f" TOTAL: {len(all_candidates)} candidates")
+ if crossover_skipped:
+ logger.info(f" ๐ Note: Crossover skipped (waiting for 2+ scored parents)")
+ logger.info("โ" * 80)
+
+ clean_log.log_candidate_generation_summary()
+
+ # Store in queue (skip first one - return it now)
+ self._candidate_queue = all_candidates[1:] if len(all_candidates) > 1 else []
+ self._hybrid_generation_complete = True
+
+ # Return first candidate
+ if all_candidates:
+ first = all_candidates[0]
+ logger.info(f"๐ค Returning FIRST candidate (source: {first['source']})")
+ return {'content': first['prompt'], 'source': first['source']}
+ else:
+ logger.error("โ No candidates generated!")
+ return {'content': '', 'source': 'error'}
+
+ def _llego_only_generate(
+ self,
+ system_prompt: str,
+ user_prompt: str,
+ image_base64: str = "",
+ **kwargs
+ ) -> Dict[str, Any]:
+ """
+ STANDARD LLEGO MODE: Generate candidates using only LLEGO operators.
+ """
+ # ๐ฅ CRITICAL: Remove image_base64 from kwargs to avoid duplicate argument error
+ kwargs.pop('image_base64', None)
+
+ # ๐ฅ FIX: Clean user_prompt if it contains feedback (might have base64)
+ cleaned_user_prompt = self._clean_reflection_feedback(user_prompt)
+
+ # Extract current prompt from context or user_prompt
+ current_prompt = self.reflection_context.get('current_prompt', '')
+
+ if not current_prompt:
+ # Try to extract from cleaned user_prompt
+ current_prompt = self._extract_prompt_from_feedback(cleaned_user_prompt)
+
+ logger.info(f"๐งฌ LLEGO: Evolving prompt...")
+ if self._should_log_debug():
+ logger.debug(f" Current prompt: '{current_prompt[:100]}...' (length: {len(current_prompt)} chars)")
+ else:
+ logger.info(f" Prompt length: {len(current_prompt)} chars")
+
+ # ๐ฅ FIX 2: Get Pareto front from GEPA (not LLEGO population)
+ # This ensures LLEGO operators use true non-dominated solutions
+ from ..utils.pareto_logger import get_pareto_logger
+ pareto_log = get_pareto_logger()
+ gepa_pareto_front = pareto_log.pareto_front
+
+ # Convert GEPA Pareto front to PromptCandidate format
+ pareto_candidates = self.llego._convert_gepa_pareto_to_candidates(gepa_pareto_front)
+ pareto_front = pareto_candidates
+
+ logger.info(f" Using GEPA Pareto front (size: {len(gepa_pareto_front)})")
+ logger.info(f" Converted to {len(pareto_front)} PromptCandidate objects")
+
+ # Create LLM callable for LLEGO genetic operations
+ # Uses Genetic Mutation Engine prompt for micro-mutations
+ call_kwargs = {k: v for k, v in kwargs.items() if k != 'image_base64'}
+
+ genetic_system_prompt = """You are a **Genetic Mutation Engine** for Text Prompts.
+
+
+Apply a specific micro-mutation to the provided prompt to increase its clarity, strictness, or effectiveness.
+
+
+
+1. **Compress**: Shorten verbose instructions without losing meaning.
+2. **Expand**: Add detail to vague nouns (e.g., "code" -> "production-ready Python 3.10 code").
+3. **Emphasize**: Highlight CRITICAL constraints using caps, bold, or explicit markers.
+4. **Constrain**: Add explicit boundaries (what NOT to do, format rules, length limits).
+5. **Exemplify**: Add a brief example if the task is ambiguous.
+
+
+
+1. Output ONLY the mutated prompt text.
+2. Do NOT change the core intent or task domain.
+3. Do NOT add explanations or meta-commentary.
+4. Apply ONE primary mutation type while preserving all existing strengths.
+"""
+
+ def llm_callable(prompt: str) -> str:
+ # Clean prompt before sending (might contain base64 if from feedback)
+ cleaned_prompt = self._clean_reflection_feedback(prompt)
+ result = self.base_llm.generate(
+ system_prompt=genetic_system_prompt,
+ user_prompt=cleaned_prompt,
+ image_base64="", # Always empty for LLEGO genetic operations
+ **call_kwargs
+ )
+ if isinstance(result, dict):
+ return result.get('content', str(result))
+ return str(result)
+
+ # Generate offspring using LLEGO
+ new_prompts = self.llego.evolve_generation(
+ llm=llm_callable,
+ pareto_front=pareto_front
+ )
+
+ if new_prompts:
+ new_prompt = new_prompts[0]
+ logger.info(f"โ
LLEGO generated new candidate (length: {len(new_prompt)} chars)")
+
+ if self._should_log_debug():
+ logger.debug(f" Full prompt:")
+ logger.debug(f" '{new_prompt}'")
+
+ return {
+ 'content': new_prompt,
+ 'source': 'llego',
+ 'num_candidates': len(new_prompts)
+ }
+ else:
+ logger.warning("โ ๏ธ LLEGO returned no candidates, falling back to base LLM")
+ return self.base_llm.generate(
+ system_prompt=system_prompt,
+ user_prompt=user_prompt,
+ image_base64="",
+ **kwargs
+ )
+
+ def _build_diversity_requirements(self, num_gepa: int) -> str:
+ """
+ Build diversity requirements using research-backed Prompt Design Patterns.
+
+ These are proven strategies from prompt engineering literature:
+ - Chain-of-Thought (CoT)
+ - Few-Shot Learning
+ - Negative Constraints
+ - Persona Pattern
+
+ Args:
+ num_gepa: Number of GEPA variations to generate
+
+ Returns:
+ String with diversity requirements for the optimization prompt
+ """
+ # Research-backed Prompt Design Patterns that solve specific classes of problems
+ strategies = [
+ """
+
+ **STRATEGY: COGNITIVE DECOMPOSITION (Chain-of-Thought)**
+ - **Goal**: Fixes logic/reasoning errors.
+ - **Action**: Add a thinking process section that forces step-by-step reasoning.
+ - **Implementation**: Include instructions like "First analyze..., then identify..., finally conclude..."
+ - **Pattern**: Force the model to "Plan before executing".
+
+ """,
+
+ """
+
+ **STRATEGY: FEW-SHOT SIMULATION (In-Context Learning)**
+ - **Goal**: Fixes formatting/syntax errors and output structure issues.
+ - **Action**: Invent 1-2 realistic "Input -> Output" examples that mirror the expected format.
+ - **Implementation**: Add "Example: Given [input], respond with: [expected output format]"
+ - **Pattern**: Show, don't just tell. Demonstrate the gold standard.
+
+ """,
+
+ """
+
+ **STRATEGY: SEMANTIC CONSTRAINING (Negative Constraints)**
+ - **Goal**: Fixes hallucinations, verbosity, and off-topic responses.
+ - **Action**: Add explicit forbidden actions and boundaries.
+ - **Implementation**: Include "Do NOT explain your reasoning", "Do NOT add preambles", "Do NOT include information not asked for"
+ - **Pattern**: Define the walls, not just the path.
+
+ """,
+
+ """
+
+ **STRATEGY: PERSONA & ROLE HARDENING**
+ - **Goal**: Fixes tone, domain knowledge gaps, and inconsistent behavior.
+ - **Action**: Define a hyper-specific expert role with clear responsibilities.
+ - **Implementation**: Instead of "You are a helpful assistant", use "You are a Senior Data Analyst with 10 years of experience in [domain]"
+ - **Pattern**: Adopt the mental model and rigorous standards of a real expert.
+
+ """,
+
+ """
+
+ **STRATEGY: OUTPUT SCHEMA ENFORCEMENT**
+ - **Goal**: Fixes structural and format compliance issues.
+ - **Action**: Define an explicit output schema with field names and types.
+ - **Implementation**: Include "Your response MUST follow this exact format: {field1: type, field2: type}"
+ - **Pattern**: Leave no ambiguity about what the output should look like.
+
+ """,
+
+ """
+
+ **STRATEGY: SELF-VERIFICATION LOOP**
+ - **Goal**: Fixes errors that could be caught by double-checking.
+ - **Action**: Add instructions for the model to verify its own output.
+ - **Implementation**: Include "Before responding, verify: 1) Does this match the required format? 2) Did I include all requested information?"
+ - **Pattern**: Build in quality control before submission.
+
+ """,
+
+ """
+
+ **STRATEGY: TASK DECOMPOSITION**
+ - **Goal**: Fixes complex tasks that overwhelm the model.
+ - **Action**: Break the task into numbered sub-tasks.
+ - **Implementation**: "Step 1: [subtask]. Step 2: [subtask]. Step 3: Combine results."
+ - **Pattern**: Divide and conquer complexity.
+
+ """
+ ]
+
+ # Select strategies based on num_gepa
+ selected = strategies[:min(num_gepa, len(strategies))]
+
+ requirements = "\n"
+ requirements += "Each variation MUST use a DIFFERENT strategy from the list below:\n"
+ requirements += "\n".join(selected)
+ requirements += "\n"
+
+ requirements += """
+
+
+ 1. Each variation must apply its assigned strategy comprehensively.
+ 2. Each variation must ALSO address ALL issues mentioned in the feedback.
+ 3. The strategies are not mutually exclusive - but the PRIMARY focus of each variation should be its assigned strategy.
+ 4. Do not just add a single line - transform the prompt structure according to the strategy.
+
+"""
+
+ return requirements
+
+ def _fallback_sequential_gepa_generation(
+ self,
+ num_gepa: int,
+ user_prompt: str,
+ image_base64: str,
+ kwargs: dict,
+ all_candidates: list,
+ clean_log
+ ) -> int:
+ """
+ Fallback to sequential generation when JSON parsing fails.
+
+ Args:
+ num_gepa: Number of candidates to generate
+ user_prompt: The feedback/context
+ image_base64: Image data (if any)
+ kwargs: Additional kwargs
+ all_candidates: List to append candidates to
+ clean_log: Logger for clean output
+
+ Returns:
+ Number of candidates generated
+ """
+ generated_count = 0
+
+ for i in range(num_gepa):
+ logger.debug(f"Generating Reflection Candidate #{i+1}/{num_gepa} (fallback mode)...")
+ try:
+ cleaned_user_prompt = self._clean_reflection_feedback(user_prompt)
+
+ # Use research-backed strategy for each variation
+ strategy_prompts = [
+ "\nApply CHAIN-OF-THOUGHT: Add step-by-step reasoning instructions. Force the model to 'think before answering'.\n",
+ "\nApply FEW-SHOT LEARNING: Add 1-2 concrete input/output examples within the prompt. Show, don't just tell.\n",
+ "\nApply NEGATIVE CONSTRAINTS: Add explicit 'Do NOT' rules. Define what the model must avoid.\n",
+ "\nApply PERSONA HARDENING: Define a specific expert role with clear responsibilities and standards.\n",
+ "\nApply OUTPUT SCHEMA: Define the exact output format with field names and types. Leave no ambiguity.\n",
+ ]
+
+ strategy = strategy_prompts[i % len(strategy_prompts)]
+
+ fallback_prompt = f"""You are a Prompt Optimization Engine in **SAFE MODE**.
+
+{strategy}
+
+{_FALLBACK_SYSTEM_PROMPT}"""
+
+ call_kwargs = {k: v for k, v in kwargs.items() if k != 'image_base64'}
+ result = self.base_llm.generate(
+ system_prompt=fallback_prompt,
+ user_prompt=cleaned_user_prompt,
+ image_base64=image_base64,
+ **call_kwargs
+ )
+
+ if isinstance(result, dict):
+ gepa_candidate_raw = result.get("content", str(result))
+ else:
+ gepa_candidate_raw = str(result)
+
+ gepa_candidate = self._extract_clean_prompt_from_reflection(gepa_candidate_raw)
+
+ if not self._is_valid_prompt(gepa_candidate):
+ logger.warning(f" โ ๏ธ Fallback candidate #{i+1} appears invalid, skipping")
+ continue
+
+ all_candidates.append({
+ 'prompt': gepa_candidate,
+ 'source': 'gepa_reflection',
+ 'index': i + 1
+ })
+
+ clean_log.log_gepa_reflection_candidate(i + 1, gepa_candidate)
+ generated_count += 1
+
+ except Exception as fallback_error:
+ logger.error(f"โ Error in fallback generation #{i+1}: {fallback_error}")
+
+ return generated_count
+
+ def _extract_prompt_from_feedback(self, user_prompt: str) -> str:
+ """
+ Try to extract the current prompt from GEPA's reflection feedback.
+
+ Args:
+ user_prompt: The feedback text from GEPA
+
+ Returns:
+ Extracted prompt or empty string
+ """
+ # Look for common patterns in GEPA's feedback
+ if "current prompt:" in user_prompt.lower():
+ lines = user_prompt.split('\n')
+ for i, line in enumerate(lines):
+ if "current prompt:" in line.lower():
+ # Return the next line(s) as the prompt
+ return '\n'.join(lines[i+1:i+10])
+
+ return ""
+
+ # Forward other methods to base LLM
+ def get_model_info(self) -> str:
+ """Get model information."""
+ return f"LLEGO({self.base_llm.get_model_info()})"
+
+ def __getattr__(self, name):
+ """Forward unknown attributes to base LLM."""
+ return getattr(self.base_llm, name)
+
diff --git a/src/gepa_optimizer/llms/vision_llm.py b/src/gepa_optimizer/llms/vision_llm.py
new file mode 100644
index 0000000000000000000000000000000000000000..79f84dac705b901341c722f2955af1b8b473561c
--- /dev/null
+++ b/src/gepa_optimizer/llms/vision_llm.py
@@ -0,0 +1,813 @@
+"""
+Vision LLM Client for GEPA Optimizer
+"""
+
+import json
+import logging
+import time
+from enum import Enum
+import requests
+from typing import Dict, Optional, Any, TYPE_CHECKING, Union
+
+# Assuming APIKeyManager is available from utils
+from ..utils.api_keys import APIKeyManager
+
+# Import ModelConfig only for type checking to avoid circular imports
+if TYPE_CHECKING:
+ from ..models.config import ModelConfig
+
+from .base_llm import BaseLLMClient
+
class ProviderType(str, Enum):
    """Supported LLM API providers.

    Mixes in ``str`` so members compare equal to their lowercase string
    values (e.g. ``ProviderType.OPENAI == "openai"``), which the client
    relies on when matching provider names.
    """
    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    HUGGINGFACE = "huggingface"
    VLLM = "vllm"
    GOOGLE = "google"
    GEMINI = "gemini"  # handled together with GOOGLE by generate()/_get_api_key()
+
class ErrorType(str, Enum):
    """Failure categories attached to GepaLLMError so callers can branch."""
    API_ERROR = "api_error"                # provider-side error or unexpected payload
    VALIDATION_ERROR = "validation_error"  # bad local configuration or arguments
    NETWORK_ERROR = "network_error"        # transport-level request failure
    RATE_LIMIT = "rate_limit"              # HTTP 429 responses
    TIMEOUT = "timeout"                    # request exceeded the configured timeout
+
class GepaLLMError(Exception):
    """Base exception for GEPA LLM related errors"""

    def __init__(self, message: str, error_type: ErrorType, status_code: Optional[int] = None):
        # Keep the raw pieces around so callers can branch on category/status.
        self.message = message
        self.error_type = error_type
        self.status_code = status_code
        super().__init__(self.message)

    def __str__(self):
        category = self.error_type.value
        if not self.status_code:
            return f"{category}: {self.message}"
        return f"{category} (HTTP {self.status_code}): {self.message}"
+
# Module-level logger; VisionLLMClient additionally creates a per-class child.
logger = logging.getLogger(__name__)

# Default endpoint (OpenAI chat completions) used when no base_url is supplied.
OPENAI_API_URL = "https://api.openai.com/v1/chat/completions"
+
+class VisionLLMClient(BaseLLMClient):
+ """
+ A client for interacting with multi-modal Vision LLMs (e.g., OpenAI GPT-4 Vision).
+
+ Example:
+ ```python
+ # Basic usage
+ client = VisionLLMClient(
+ provider="openai",
+ model_name="gpt-4-vision-preview",
+ temperature=0.7,
+ max_tokens=2048
+ )
+
+ # With custom configuration
+ config = ModelConfig(
+ provider="openai",
+ model_name="gpt-4-vision-preview",
+ temperature=0.5,
+ max_tokens=1024
+ )
+ client = VisionLLMClient.from_config(config)
+ ```
+ """
+
+ def __init__(
+ self,
+ provider: Union[str, ProviderType],
+ model_name: str,
+ api_key: Optional[str] = None,
+ base_url: Optional[str] = None,
+ temperature: float = 0.7,
+ max_tokens: int = 2048,
+ top_p: float = 1.0,
+ frequency_penalty: float = 0.0,
+ presence_penalty: float = 0.0,
+ timeout: int = 120, # Increase to 2 minutes for large prompts
+ max_retries: int = 3
+ ):
+ """
+ Initializes the VisionLLMClient with model configuration.
+
+ Args:
+ provider: The provider of the model (e.g., 'openai', 'anthropic')
+ model_name: The name of the multi-modal LLM model to use (e.g., "gpt-4-vision-preview").
+ api_key: Optional API key. If not provided, it will be fetched from APIKeyManager.
+ base_url: Optional base URL for the API endpoint.
+ temperature: Controls randomness in the response generation.
+ max_tokens: Maximum number of tokens to generate.
+ top_p: Controls diversity via nucleus sampling.
+ frequency_penalty: Penalizes repeated tokens.
+ presence_penalty: Penalizes new tokens based on their presence in the text so far.
+ """
+ # Initialize parent class
+ super().__init__(provider=str(provider), model_name=model_name, **{
+ 'api_key': api_key,
+ 'base_url': base_url,
+ 'temperature': temperature,
+ 'max_tokens': max_tokens,
+ 'top_p': top_p,
+ 'frequency_penalty': frequency_penalty,
+ 'presence_penalty': presence_penalty,
+ 'timeout': timeout,
+ 'max_retries': max_retries
+ })
+
+ # Initialize the actual client
+ self._initialize_client(provider, model_name, api_key, base_url, temperature,
+ max_tokens, top_p, frequency_penalty, presence_penalty,
+ timeout, max_retries)
+
    def _initialize_client(self, provider, model_name, api_key, base_url, temperature,
                           max_tokens, top_p, frequency_penalty, presence_penalty,
                           timeout, max_retries):
        """Validate configuration and set up provider, API-key, and HTTP-session state.

        Arguments mirror ``__init__``; this method performs the actual
        validation and attribute assignment.

        Raises:
            GepaLLMError: VALIDATION_ERROR for an unknown provider, empty
                model name, or out-of-range temperature/max_tokens;
                API_ERROR if API-key resolution fails.
        """
        # Input validation — provider must map onto the ProviderType enum.
        try:
            self.provider = ProviderType(provider.lower())
        except ValueError:
            raise GepaLLMError(
                f"Unsupported provider: {provider}. "
                f"Supported providers: {[p.value for p in ProviderType]}",
                ErrorType.VALIDATION_ERROR
            )

        if not model_name:
            raise GepaLLMError("model_name cannot be empty", ErrorType.VALIDATION_ERROR)

        # Accept ints or floats; OpenAI-style range [0, 2].
        if not isinstance(temperature, (int, float)) or not 0 <= temperature <= 2:
            raise GepaLLMError(
                f"temperature must be between 0 and 2, got {temperature}",
                ErrorType.VALIDATION_ERROR
            )

        if not isinstance(max_tokens, int) or max_tokens <= 0:
            raise GepaLLMError(
                f"max_tokens must be a positive integer, got {max_tokens}",
                ErrorType.VALIDATION_ERROR
            )

        # Initialize API key: an explicitly passed key wins, otherwise look
        # one up via APIKeyManager for this provider.
        # NOTE(review): the "No API key found" VALIDATION_ERROR raised inside
        # this try is immediately caught by the `except Exception` below and
        # re-wrapped as API_ERROR, so callers never observe it as a
        # VALIDATION_ERROR — confirm whether that is intended.
        try:
            self.api_key = api_key or APIKeyManager().get_api_key(self.provider.value)
            if not self.api_key:
                raise GepaLLMError(
                    f"No API key found for provider: {self.provider}",
                    ErrorType.VALIDATION_ERROR
                )
        except Exception as e:
            raise GepaLLMError(
                f"Failed to initialize API key: {str(e)}",
                ErrorType.API_ERROR
            ) from e

        self.model_name = model_name
        # Defaults to the OpenAI endpoint even for other providers; SDK-based
        # providers (e.g. Gemini) simply never use base_url.
        self.base_url = base_url or OPENAI_API_URL
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.top_p = top_p
        self.frequency_penalty = frequency_penalty
        self.presence_penalty = presence_penalty
        self.timeout = timeout
        self.max_retries = max_retries
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")

        # Configure session with retry: transparently retries POSTs on
        # transient failures (429/5xx) with exponential backoff.
        self.session = requests.Session()
        retry_strategy = requests.adapters.Retry(
            total=max_retries,
            backoff_factor=1,
            status_forcelist=[429, 500, 502, 503, 504],
            allowed_methods=["POST"]
        )
        adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy)
        self.session.mount("https://", adapter)
        self.session.mount("http://", adapter)

        # No hardcoded model restrictions - user can specify any model name
        # The API provider will validate if the model exists and supports vision
+
+ def _get_api_key(self) -> Optional[str]:
+ """Get API key based on provider"""
+ if self.provider == 'openai':
+ return APIKeyManager().get_api_key('openai')
+ elif self.provider == 'anthropic':
+ return APIKeyManager().get_api_key('anthropic')
+ elif self.provider in ['google', 'gemini']:
+ return APIKeyManager().get_api_key('google')
+ # Add other providers as needed
+ return None
+
+ @classmethod
+ def from_config(cls, config: 'ModelConfig') -> 'VisionLLMClient':
+ """Create a VisionLLMClient from a ModelConfig object.
+
+ Args:
+ config: ModelConfig instance with provider and model settings
+
+ Returns:
+ Configured VisionLLMClient instance
+
+ Example:
+ ```python
+ config = ModelConfig(
+ provider="openai",
+ model_name="gpt-4-vision-preview",
+ temperature=0.7
+ )
+ client = VisionLLMClient.from_config(config)
+ ```
+ """
+ return cls(
+ provider=config.provider,
+ model_name=config.model_name,
+ api_key=config.api_key,
+ base_url=config.base_url,
+ temperature=config.temperature,
+ max_tokens=config.max_tokens,
+ top_p=config.top_p,
+ frequency_penalty=config.frequency_penalty,
+ presence_penalty=config.presence_penalty
+ )
+
+ @classmethod
+ def from_model_string(cls, model_string: str, **kwargs) -> 'VisionLLMClient':
+ """Create a VisionLLMClient from a model string like "provider/model-name".
+
+ Args:
+ model_string: Model identifier in format "provider/model-name" or just "model-name"
+ Examples: "google/gemini-2.0-flash", "openai/gpt-4o", "gemini-1.5-pro"
+ **kwargs: Additional configuration options (temperature, max_tokens, etc.)
+
+ Returns:
+ Configured VisionLLMClient instance
+
+ Example:
+ ```python
+ # With provider
+ client = VisionLLMClient.from_model_string("google/gemini-2.0-flash")
+
+ # Without provider (defaults to openai)
+ client = VisionLLMClient.from_model_string("gpt-4o")
+
+ # With additional options
+ client = VisionLLMClient.from_model_string(
+ "google/gemini-2.0-flash",
+ temperature=0.5,
+ max_tokens=4096
+ )
+ ```
+ """
+ import os
+
+ # Parse "provider/model-name" format
+ if "/" in model_string:
+ provider, model_name = model_string.split("/", 1)
+ else:
+ # Default to openai if no provider specified
+ provider = "openai"
+ model_name = model_string
+
+ # Normalize provider names
+ provider = provider.lower()
+ if provider == "gemini":
+ provider = "google"
+
+ # Get API key from environment if not provided
+ api_key = kwargs.pop('api_key', None)
+ if not api_key:
+ env_var_map = {
+ "openai": "OPENAI_API_KEY",
+ "anthropic": "ANTHROPIC_API_KEY",
+ "google": "GOOGLE_API_KEY",
+ }
+ env_var = env_var_map.get(provider, f"{provider.upper()}_API_KEY")
+ api_key = os.getenv(env_var)
+
+ return cls(
+ provider=provider,
+ model_name=model_name,
+ api_key=api_key,
+ **kwargs
+ )
+
+ def generate(
+ self,
+ system_prompt: str,
+ user_prompt: str,
+ image_base64: Optional[str] = None,
+ **generation_kwargs
+ ) -> Dict[str, Any]:
+ """
+ Generates a response from the Vision LLM.
+
+ Args:
+ system_prompt: The system-level instructions for the LLM.
+ user_prompt: The user's query or task.
+ image_base64: Optional Base64 encoded image string.
+ **generation_kwargs: Additional model-specific generation parameters
+
+ Returns:
+ A dictionary containing the generated response and metadata.
+
+ Raises:
+ GepaLLMError: If there's an error during generation
+
+ Example:
+ ```python
+ response = client.generate(
+ system_prompt="You are a helpful assistant.",
+ user_prompt="What's in this image?",
+ image_base64="base64_encoded_image"
+ )
+ ```
+ """
+ if not system_prompt or not user_prompt:
+ raise GepaLLMError(
+ "system_prompt and user_prompt are required",
+ ErrorType.VALIDATION_ERROR
+ )
+
+ try:
+ if self.provider == ProviderType.OPENAI:
+ return self._generate_openai(system_prompt, user_prompt, image_base64, **generation_kwargs)
+ elif self.provider in [ProviderType.GOOGLE, ProviderType.GEMINI]:
+ return self._generate_google(system_prompt, user_prompt, image_base64, **generation_kwargs)
+ else:
+ raise GepaLLMError(
+ f"Provider {self.provider} is not yet supported",
+ ErrorType.VALIDATION_ERROR
+ )
+ except requests.exceptions.RequestException as e:
+ self.logger.error(f"Network error during generation: {str(e)}")
+ raise GepaLLMError(
+ f"Network error: {str(e)}",
+ ErrorType.NETWORK_ERROR,
+ getattr(e.response, 'status_code', None) if hasattr(e, 'response') else None
+ ) from e
+ except GepaLLMError:
+ raise
+ except Exception as e:
+ self.logger.error(f"Unexpected error during generation: {str(e)}")
+ raise GepaLLMError(
+ f"Generation failed: {str(e)}",
+ ErrorType.API_ERROR
+ ) from e
+
+ def _generate_openai(
+ self,
+ system_prompt: str,
+ user_prompt: str,
+ image_base64: Optional[str] = None,
+ **generation_kwargs
+ ) -> Dict[str, Any]:
+ """
+ Generate response using OpenAI's API with configured parameters.
+
+ Args:
+ system_prompt: System instructions for the model
+ user_prompt: User's input prompt
+ image_base64: Optional base64 encoded image
+
+ Returns:
+ Dictionary containing the API response
+
+ Raises:
+ GepaDependencyError: If API call fails
+ """
+ headers = {
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {self.api_key}",
+ "User-Agent": "GepaOptimizer/1.0 (Python)"
+ }
+
+ messages = [
+ {"role": "system", "content": system_prompt},
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": user_prompt}
+ ]
+ }
+ ]
+
+ if image_base64:
+ # #region agent log
+ import json as _json_debug
+ import time as _time_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ try:
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({
+ "id": f"log_{int(_time_debug.time() * 1000)}",
+ "timestamp": int(_time_debug.time() * 1000),
+ "location": "vision_llm.py:_generate_openai",
+ "message": "Image base64 BEFORE processing",
+ "data": {
+ "image_base64_length": len(image_base64) if image_base64 else 0,
+ "has_data_uri_prefix": image_base64.startswith("data:image") if image_base64 else False,
+ "prefix": image_base64[:50] if image_base64 and len(image_base64) > 50 else image_base64,
+ "is_none": image_base64 is None,
+ "is_empty": image_base64 == "" if image_base64 else True
+ },
+ "sessionId": "debug-session",
+ "runId": "run1",
+ "hypothesisId": "A,C,D"
+ }) + "\n")
+ except Exception:
+ pass
+ # #endregion
+
+ # Detect and extract image format
+ detected_format = "jpeg" # Default fallback
+ clean_base64 = image_base64
+
+ # Extract format from data URI prefix if present
+ if image_base64.startswith("data:image"):
+ # Parse format from prefix: data:image/png;base64,...
+ if "," in image_base64:
+ prefix_part = image_base64.split(",", 1)[0]
+ clean_base64 = image_base64.split(",", 1)[1]
+ # Extract format from "data:image/PNG;base64" or "data:image/png"
+ if "/" in prefix_part and ";" in prefix_part:
+ detected_format = prefix_part.split("/")[1].split(";")[0].lower()
+ elif "/" in prefix_part:
+ detected_format = prefix_part.split("/")[1].lower()
+ else:
+ # Fallback: try to extract format
+ if "/" in image_base64:
+ detected_format = image_base64.split("/")[1].split(";")[0].lower() if ";" in image_base64 else "jpeg"
+ clean_base64 = image_base64.replace("data:image/", "").replace(";base64", "")
+
+ # If no format detected from prefix, try to detect from image data
+ if detected_format == "jpeg" or not detected_format:
+ try:
+ import base64 as b64
+ from PIL import Image
+ import io
+ image_data = b64.b64decode(clean_base64)
+ img = Image.open(io.BytesIO(image_data))
+ if img.format:
+ detected_format = img.format.lower()
+ # Normalize format names
+ if detected_format in ["jpg", "jpeg"]:
+ detected_format = "jpeg"
+ except Exception:
+ # If detection fails, keep default
+ pass
+
+ # Normalize format for data URI (OpenAI accepts: jpeg, png, gif, webp)
+ format_map = {
+ "jpg": "jpeg",
+ "jpeg": "jpeg",
+ "png": "png",
+ "gif": "gif",
+ "webp": "webp",
+ "bmp": "png", # Convert BMP to PNG (OpenAI doesn't support BMP)
+ "tiff": "png", # Convert TIFF to PNG
+ "tif": "png"
+ }
+ final_format = format_map.get(detected_format, "jpeg")
+
+ final_url = f"data:image/{final_format};base64,{clean_base64}"
+
+ # #region agent log
+ try:
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({
+ "id": f"log_{int(_time_debug.time() * 1000)}",
+ "timestamp": int(_time_debug.time() * 1000),
+ "location": "vision_llm.py:_generate_openai",
+ "message": "Image URL AFTER processing",
+ "data": {
+ "detected_format": detected_format,
+ "final_format": final_format,
+ "clean_base64_length": len(clean_base64),
+ "final_url_length": len(final_url),
+ "final_url_prefix": final_url[:60]
+ },
+ "sessionId": "debug-session",
+ "runId": "run1",
+ "hypothesisId": "A,B"
+ }) + "\n")
+ except Exception:
+ pass
+ # #endregion
+
+ messages[1]["content"].append({
+ "type": "image_url",
+ "image_url": {
+ "url": final_url
+ }
+ })
+
+ payload = {
+ "model": self.model_name,
+ "messages": messages,
+ # "temperature": self.temperature,
+ # "max_tokens": self.max_tokens,
+ "top_p": self.top_p,
+ "frequency_penalty": self.frequency_penalty,
+ "presence_penalty": self.presence_penalty
+ }
+
+ self.logger.debug(f"Sending request to {self.base_url} with model {self.model_name}")
+
+ try:
+ self.logger.debug(f"Sending request to {self.model_name}")
+
+ # Make the API request with retry
+ response = self.session.post(
+ self.base_url,
+ headers=headers,
+ json=payload,
+ timeout=300
+ )
+
+ # Handle rate limiting
+ if response.status_code == 429:
+ retry_after = int(response.headers.get('Retry-After', 5))
+ self.logger.warning(f"Rate limited. Retrying after {retry_after} seconds...")
+ time.sleep(retry_after)
+ return self._generate_openai(system_prompt, user_prompt, image_base64, **generation_kwargs)
+
+ response.raise_for_status()
+
+ result = response.json()
+ self.logger.debug(f"Received response from {self.model_name}")
+
+ # Extract and validate the response
+ try:
+ message = result["choices"][0]["message"]
+ llm_response_content = message["content"]
+
+ # Log token usage if available
+ if "usage" in result:
+ usage = result["usage"]
+ self.logger.info(
+ f"Tokens used - Prompt: {usage.get('prompt_tokens', 'N/A')}, "
+ f"Completion: {usage.get('completion_tokens', 'N/A')}, "
+ f"Total: {usage.get('total_tokens', 'N/A')}"
+ )
+
+ # Try to parse JSON if the response looks like JSON
+ if isinstance(llm_response_content, str) and (
+ llm_response_content.startswith('{') or
+ llm_response_content.startswith('[')
+ ):
+ try:
+ return json.loads(llm_response_content)
+ except json.JSONDecodeError:
+ pass
+
+ # Default response format
+ return {
+ "content": llm_response_content,
+ "role": message.get("role", "assistant"),
+ "model": self.model_name,
+ "provider": self.provider.value
+ }
+
+ except (KeyError, IndexError) as e:
+ self.logger.error(f"Unexpected response format: {result}")
+ raise GepaLLMError(
+ f"Unexpected response format from {self.provider} API",
+ ErrorType.API_ERROR,
+ response.status_code
+ ) from e
+
+ except requests.exceptions.HTTPError as e:
+ status_code = e.response.status_code if hasattr(e, 'response') else None
+ error_msg = f"HTTP error {status_code} from {self.provider} API"
+
+ try:
+ error_data = e.response.json()
+ error_msg = error_data.get('error', {}).get('message', error_msg)
+ except Exception:
+ error_msg = str(e)
+
+ self.logger.error(f"{error_msg}: {error_data if 'error_data' in locals() else str(e)}")
+ raise GepaLLMError(
+ error_msg,
+ ErrorType.RATE_LIMIT if status_code == 429 else ErrorType.API_ERROR,
+ status_code
+ ) from e
+
+ except requests.exceptions.Timeout:
+ self.logger.error(f"Request to {self.provider} API timed out after {self.timeout} seconds")
+ raise GepaLLMError(
+ f"Request timed out after {self.timeout} seconds",
+ ErrorType.TIMEOUT
+ )
+
+ except requests.exceptions.RequestException as e:
+ self.logger.error(f"Network error: {str(e)}")
+ raise GepaLLMError(
+ f"Network error: {str(e)}",
+ ErrorType.NETWORK_ERROR
+ ) from e
+
+ except Exception as e:
+ self.logger.error(f"Unexpected error: {str(e)}", exc_info=True)
+ raise GepaLLMError(
+ f"Unexpected error: {str(e)}",
+ ErrorType.API_ERROR
+ ) from e
+
+ def _generate_google(
+ self,
+ system_prompt: str,
+ user_prompt: str,
+ image_base64: Optional[str] = None,
+ **generation_kwargs
+ ) -> Dict[str, Any]:
+ """
+ Generate response using Google Gemini API with configured parameters.
+
+ Args:
+ system_prompt: System instructions for the model
+ user_prompt: User's input prompt
+ image_base64: Optional base64 encoded image
+
+ Returns:
+ Dictionary containing the API response
+
+ Raises:
+ GepaLLMError: If API call fails
+ """
+ try:
+ import google.generativeai as genai
+ import base64
+ from PIL import Image
+ import io
+ except ImportError as e:
+ raise GepaLLMError(
+ f"Required dependencies for Google Gemini not installed: {str(e)}. "
+ f"Please install: pip install google-generativeai Pillow",
+ ErrorType.VALIDATION_ERROR
+ ) from e
+
+ # Configure Gemini
+ genai.configure(api_key=self.api_key)
+
+ # Use the model name directly as specified by the user
+ # No hardcoded mappings or restrictions - fully configurable
+ # The Gemini API will validate if the model exists
+ gemini_model_name = self.model_name
+
+ try:
+ model = genai.GenerativeModel(gemini_model_name)
+ except Exception as e:
+ raise GepaLLMError(
+ f"Failed to initialize Gemini model {gemini_model_name}: {str(e)}",
+ ErrorType.API_ERROR
+ ) from e
+
+ # Prepare content
+ content_parts = []
+
+ # Add system prompt and user prompt
+ full_prompt = f"{system_prompt}\n\n{user_prompt}"
+ content_parts.append(full_prompt)
+
+ # Add image if provided
+ if image_base64:
+ # #region agent log
+ import json as _json_debug
+ import time as _time_debug
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
+ try:
+ with open(_debug_log_path, "a") as _f:
+ _f.write(_json_debug.dumps({
+ "id": f"log_{int(_time_debug.time() * 1000)}",
+ "timestamp": int(_time_debug.time() * 1000),
+ "location": "vision_llm.py:_generate_google",
+ "message": "Image base64 BEFORE processing (Google)",
+ "data": {
+ "image_base64_length": len(image_base64) if image_base64 else 0,
+ "has_data_uri_prefix": image_base64.startswith("data:image") if image_base64 else False,
+ "prefix": image_base64[:50] if image_base64 and len(image_base64) > 50 else image_base64,
+ "is_none": image_base64 is None,
+ "is_empty": image_base64 == "" if image_base64 else True
+ },
+ "sessionId": "debug-session",
+ "runId": "run1",
+ "hypothesisId": "A,C,D"
+ }) + "\n")
+ except Exception:
+ pass
+ # #endregion
+
+ try:
+ # Strip data URI prefix if present (hypothesis A fix)
+ clean_base64 = image_base64
+ if image_base64.startswith("data:image"):
+ # Extract just the base64 part after the comma
+ if "," in image_base64:
+ clean_base64 = image_base64.split(",", 1)[1]
+ else:
+ clean_base64 = image_base64.replace("data:image/", "").replace(";base64", "")
+
+ # Decode base64 image
+ image_data = base64.b64decode(clean_base64)
+ image = Image.open(io.BytesIO(image_data))
+ content_parts.append(image)
+ self.logger.debug(f"Added image to Gemini request")
+ except Exception as e:
+ self.logger.warning(f"Failed to process image for Gemini: {str(e)}")
+ # Continue without image rather than failing
+
+ self.logger.debug(f"Sending request to Gemini model {gemini_model_name}")
+
+ try:
+ # Generate response with retry logic
+ max_retries = 3
+ for attempt in range(max_retries):
+ try:
+ # Configure generation parameters
+ generation_config = genai.types.GenerationConfig(
+ temperature=self.temperature,
+ max_output_tokens=self.max_tokens,
+ top_p=self.top_p,
+ )
+
+ response = model.generate_content(
+ content_parts,
+ generation_config=generation_config
+ )
+
+ # Check if response was blocked
+ if response.prompt_feedback and response.prompt_feedback.block_reason:
+ raise GepaLLMError(
+ f"Gemini blocked the prompt: {response.prompt_feedback.block_reason}",
+ ErrorType.VALIDATION_ERROR
+ )
+
+ # Check if response was blocked
+ if not response.text:
+ if response.candidates and response.candidates[0].finish_reason:
+ finish_reason = response.candidates[0].finish_reason
+ if finish_reason == genai.types.FinishReason.SAFETY:
+ raise GepaLLMError(
+ "Gemini response blocked due to safety concerns",
+ ErrorType.VALIDATION_ERROR
+ )
+ elif finish_reason == genai.types.FinishReason.RECITATION:
+ raise GepaLLMError(
+ "Gemini response blocked due to recitation concerns",
+ ErrorType.VALIDATION_ERROR
+ )
+ raise GepaLLMError(
+ "Gemini returned empty response",
+ ErrorType.API_ERROR
+ )
+
+ self.logger.debug(f"Received response from Gemini model {gemini_model_name}")
+
+ # Log usage information if available
+ if hasattr(response, 'usage_metadata') and response.usage_metadata:
+ usage = response.usage_metadata
+ self.logger.info(
+ f"Tokens used - Prompt: {usage.prompt_token_count}, "
+ f"Completion: {usage.candidates_token_count}, "
+ f"Total: {usage.total_token_count}"
+ )
+
+ # Try to parse JSON if the response looks like JSON
+ response_text = response.text
+ if isinstance(response_text, str) and (
+ response_text.startswith('{') or
+ response_text.startswith('[')
+ ):
+ try:
+ return json.loads(response_text)
+ except json.JSONDecodeError:
+ pass
+
+ # Default response format
+ return {
+ "content": response_text,
+ "role": "assistant",
+ "model": gemini_model_name,
+ "provider": "google"
+ }
+
+ except Exception as e:
+ if attempt < max_retries - 1:
+ self.logger.warning(f"Gemini API attempt {attempt + 1} failed: {str(e)}. Retrying...")
+ time.sleep(2 ** attempt) # Exponential backoff
+ continue
+ else:
+ raise
+
+ except GepaLLMError:
+ raise
+ except Exception as e:
+ self.logger.error(f"Unexpected error with Gemini API: {str(e)}")
+ raise GepaLLMError(
+ f"Gemini API error: {str(e)}",
+ ErrorType.API_ERROR
+ ) from e
diff --git a/src/gepa_optimizer/models/__init__.py b/src/gepa_optimizer/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..1afaf5b607bb0fdddf1d62b08d6348a034e5f8a0
--- /dev/null
+++ b/src/gepa_optimizer/models/__init__.py
@@ -0,0 +1,15 @@
+"""
+Models module for GEPA Optimizer
+"""
+
+from .config import ModelConfig, OptimizationConfig
+from .dataset import DatasetItem
+from .result import OptimizationResult, OptimizedResult
+
+__all__ = [
+ "ModelConfig",
+ "OptimizationConfig",
+ "DatasetItem",
+ "OptimizationResult",
+ "OptimizedResult"
+]
diff --git a/src/gepa_optimizer/models/config.py b/src/gepa_optimizer/models/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f6193d450b891b1d66255966b843fecbc45d25c
--- /dev/null
+++ b/src/gepa_optimizer/models/config.py
@@ -0,0 +1,488 @@
+"""
+Configuration models for GEPA Optimizer
+"""
+
+import os
+from dataclasses import dataclass, field
+from typing import List, Optional, Dict, Any, Union, Tuple
+
@dataclass
class ModelConfig:
    """Provider-agnostic configuration for a single LLM endpoint."""
    provider: str  # Required, e.g. "openai", "anthropic", "huggingface", "vllm"
    model_name: str  # Required: the provider's model identifier
    api_key: str  # Required: credential for the provider
    base_url: Optional[str] = None  # Optional custom endpoint URL
    temperature: float = 0.7
    max_tokens: int = 2048
    top_p: float = 1.0
    frequency_penalty: float = 0.0
    presence_penalty: float = 0.0

    def __post_init__(self):
        """Reject configurations missing any of the mandatory fields."""
        if not self.provider:
            raise ValueError("Provider is required (e.g., 'openai', 'anthropic', 'huggingface')")
        if not self.model_name:
            raise ValueError("Model name is required (e.g., 'gpt-4', 'claude-3-opus')")
        if not self.api_key:
            raise ValueError(f"API key is required for {self.provider} provider")

    @classmethod
    def from_string(cls, model_string: str) -> 'ModelConfig':
        """Build a config from 'provider/model' (bare model names default to OpenAI)."""
        provider, sep, remainder = model_string.partition("/")
        if not sep:
            # No provider prefix given — assume OpenAI.
            provider, remainder = "openai", model_string

        # Resolve the credential from the environment.
        key = cls._get_api_key_for_provider(provider)
        if not key:
            raise ValueError(
                f"No API key found for {provider}. Please set {provider.upper()}_API_KEY environment variable"
            )

        return cls(provider=provider, model_name=remainder, api_key=key)

    @classmethod
    def from_dict(cls, config_dict: dict) -> 'ModelConfig':
        """Build a config from a plain dictionary of field values."""
        return cls(**config_dict)

    def to_dict(self) -> dict:
        """Serialize every field (including the API key) to a dictionary."""
        return {
            name: getattr(self, name)
            for name in (
                'provider', 'model_name', 'api_key', 'base_url',
                'temperature', 'max_tokens', 'top_p',
                'frequency_penalty', 'presence_penalty',
            )
        }

    @staticmethod
    def _get_api_key_for_provider(provider: str) -> Optional[str]:
        """Look up the provider's API key in the environment (None if unset)."""
        known_env_vars = {
            "openai": "OPENAI_API_KEY",
            "anthropic": "ANTHROPIC_API_KEY",
            "huggingface": "HUGGINGFACE_API_KEY",
            "cohere": "COHERE_API_KEY",
            "ai21": "AI21_API_KEY",
            "together": "TOGETHER_API_KEY",
            "replicate": "REPLICATE_API_TOKEN",
            "groq": "GROQ_API_KEY",
            "ollama": "OLLAMA_API_KEY",
        }
        # Unknown providers fall back to the generic <PROVIDER>_API_KEY pattern.
        env_var = known_env_vars.get(provider.lower(), f"{provider.upper()}_API_KEY")
        return os.getenv(env_var)
+
@dataclass
class DataSplitConfig:
    """Configuration for splitting a dataset into train/val/test sets.

    Adaptive splitting: with the default ``small_dataset_strategy='adaptive'``
    the ratios below are overridden based on dataset size:

    - fewer than 15 samples:  70/25/5  (prioritize validation for reliable ranking)
    - 15-49 samples:          60/20/20 (balanced)
    - 50 or more samples:     70/15/15 (more training data)
    """

    # Default split ratios (must sum to 1.0); the adaptive strategy may override.
    train_ratio: float = 0.6  # training share (reflection / feedback examples)
    val_ratio: float = 0.2    # validation share (Pareto candidate selection)
    test_ratio: float = 0.2   # held-out final evaluation share

    # Lower bounds on the number of samples per split.
    min_train_samples: int = 3
    min_val_samples: int = 3  # validation needs enough samples for reliable scores
    min_test_samples: int = 1  # test set is used only once, so one sample suffices

    # How to handle datasets too small for the configured ratios:
    # one of 'adaptive', 'duplicate_val', 'no_test', 'error'.
    small_dataset_strategy: str = 'adaptive'

    def __post_init__(self):
        """Validate the ratios and the small-dataset strategy."""
        ratio_sum = self.train_ratio + self.val_ratio + self.test_ratio
        if not (0.99 <= ratio_sum <= 1.01):  # tolerate small floating point error
            raise ValueError(
                f"Split ratios must sum to 1.0, got {ratio_sum:.3f} "
                f"(train={self.train_ratio}, val={self.val_ratio}, test={self.test_ratio})"
            )

        if self.train_ratio <= 0 or self.val_ratio <= 0 or self.test_ratio < 0:
            raise ValueError("Split ratios must be positive (test_ratio can be 0 to disable)")

        if self.small_dataset_strategy not in {'adaptive', 'duplicate_val', 'no_test', 'error'}:
            raise ValueError(
                f"Invalid small_dataset_strategy: {self.small_dataset_strategy}. "
                f"Must be 'adaptive', 'duplicate_val', 'no_test', or 'error'"
            )

    def get_adaptive_ratios(self, dataset_size: int) -> Tuple[float, float, float]:
        """Return (train, val, test) ratios appropriate for *dataset_size*.

        The validation set is critical — it is used for every candidate
        evaluation — so small datasets allocate proportionally more to it.
        """
        if dataset_size < 15:
            return (0.70, 0.25, 0.05)
        if dataset_size < 50:
            return (0.60, 0.20, 0.20)
        return (0.70, 0.15, 0.15)

    def get_split_indices(self, dataset_size: int) -> Tuple[int, int, int, int]:
        """Compute (train_end, val_end, test_end, dataset_size) slice boundaries.

        Uses adaptive ratios when the strategy is 'adaptive', otherwise the
        configured ratios, and enforces the per-split minimums.

        Raises:
            ValueError: If the dataset is too small for the configured splits
                and the strategy is 'error'.
        """
        if self.small_dataset_strategy == 'adaptive':
            train_ratio, val_ratio, test_ratio = self.get_adaptive_ratios(dataset_size)
        else:
            train_ratio, val_ratio, test_ratio = (
                self.train_ratio, self.val_ratio, self.test_ratio
            )

        minimum_needed = self.min_train_samples + self.min_val_samples
        if dataset_size < minimum_needed and self.small_dataset_strategy == 'error':
            raise ValueError(
                f"Dataset too small ({dataset_size} samples). "
                f"Need at least {minimum_needed} samples."
            )

        # Ideal cut points given the chosen ratios, bounded by the minimums.
        train_end = max(self.min_train_samples, int(dataset_size * train_ratio))
        val_end = train_end + max(self.min_val_samples, int(dataset_size * val_ratio))

        if val_end >= dataset_size:
            # Not enough samples remain for a test split; shrink per strategy.
            if self.small_dataset_strategy in {'adaptive', 'duplicate_val'}:
                # Keep the minimum validation samples, remainder (if any) is test.
                val_end = min(dataset_size, train_end + self.min_val_samples)
                test_end = dataset_size
            elif self.small_dataset_strategy == 'no_test':
                # Skip the test split entirely for small datasets.
                val_end = dataset_size
                test_end = dataset_size
            else:  # 'error'
                raise ValueError(
                    f"Dataset too small ({dataset_size} samples) for train/val/test split. "
                    f"Need at least {self.min_train_samples + self.min_val_samples + self.min_test_samples} samples."
                )
        else:
            test_end = dataset_size

        return train_end, val_end, test_end, dataset_size
+
@dataclass
class OptimizationConfig:
    """Configuration class for the GEPA optimization process.

    The core models and budget parameters are deliberately required (no
    defaults): the user must choose models, iteration/metric-call budgets
    and batch size. String model specs such as "openai/gpt-4" are resolved
    to :class:`ModelConfig` objects in ``__post_init__``.
    """

    # Core models - REQUIRED. Either "provider/model" strings (API key read
    # from the environment) or explicit ModelConfig instances.
    model: Union[str, ModelConfig]
    reflection_model: Union[str, ModelConfig]

    # Optimization budget - REQUIRED; the user decides based on cost/memory.
    max_iterations: int
    max_metric_calls: int
    batch_size: int

    # Dataset splitting configuration (train/val/test).
    data_split: DataSplitConfig = field(default_factory=DataSplitConfig)

    # Number of examples shown per reflection step (separate from the
    # evaluation batch_size; intentionally small).
    reflection_examples: int = 3

    # Optional optimization settings with sensible fallbacks.
    early_stopping: bool = True
    learning_rate: float = 0.01

    # Multi-objective optimization.
    multi_objective: bool = False
    objectives: List[str] = field(default_factory=lambda: ["accuracy"])

    # Advanced settings.
    custom_metrics: Optional[Dict[str, Any]] = None
    use_cache: bool = True
    parallel_evaluation: bool = False

    # Backwards compatibility (deprecated): use data_split instead.
    train_split_ratio: Optional[float] = None
    min_dataset_size: int = 2

    # Cost and budget - user controlled.
    max_cost_usd: Optional[float] = None
    timeout_seconds: Optional[int] = None

    # GEPA-specific optimization parameters (mirroring the GEPA library).
    candidate_selection_strategy: str = 'pareto'  # Pareto candidate selection
    skip_perfect_score: bool = False  # set True to stop early on perfect scores
    reflection_minibatch_size: Optional[int] = None  # defaults to reflection_examples
    perfect_score: float = 1.0  # score treated as "perfect"
    module_selector: str = 'round_robin'  # component selection strategy
    verbose: bool = True  # enable detailed GEPA logging

    # Evaluate the final prompt on the held-out test set.
    evaluate_on_test: bool = True

    # LLEGO genetic operator parameters (optional - for faster convergence).
    # Based on the ICLR 2025 paper "Decision Tree Induction Through LLMs via
    # Semantically-Aware Evolution"; tuned for small datasets (6-10 samples).
    use_llego_operators: bool = False

    # Hybrid mode: combine GEPA reflection + LLEGO operators. When both are
    # enabled, candidates are generated from BOTH sources for diversity.
    enable_gepa_reflection_with_llego: bool = False
    num_gepa_reflection_candidates: int = 3  # GEPA reflection candidates per iteration (valid range 1-5)

    # Fitness-guided crossover parameters.
    alpha: float = 0.05  # fitness extrapolation: target 5% above the best parent
    n_crossover: int = 2  # offspring produced by crossover per iteration

    # Diversity-guided mutation parameters.
    tau: float = 8.0  # diversity temperature (moderate exploration/exploitation)
    nu: int = 3  # parent arity (3 parents suits small populations)
    n_mutation: int = 2  # offspring produced by mutation per iteration

    # Population management (for genetic operators).
    population_size: int = 8  # size of the prompt population

    # LLM-as-Judge configuration.
    use_llm_as_judge: bool = True  # enable LLM-as-Judge feedback
    llm_as_judge_threshold: float = 0.8  # judge is used for scores below this
    llm_as_judge_model: Optional[ModelConfig] = None  # defaults to reflection_model

    # Logging configuration.
    log_level: str = "INFO"  # one of "DEBUG", "INFO", "WARNING", "ERROR"

    def __post_init__(self):
        """Validate and normalize the configuration after initialization.

        Handles the deprecated ``train_split_ratio``, converts string model
        specs to ModelConfig, defaults ``reflection_minibatch_size``, and
        runs requirement/range validation.
        """
        # Backwards compatibility: convert the old 2-way split ratio into a
        # 3-way DataSplitConfig (remainder shared equally by val and test).
        if self.train_split_ratio is not None and self.train_split_ratio != 0.8:
            import warnings
            warnings.warn(
                "train_split_ratio is deprecated. Use data_split=DataSplitConfig(...) instead. "
                "Converting to 3-way split with your ratio.",
                DeprecationWarning,
                stacklevel=2
            )
            remainder = 1.0 - self.train_split_ratio
            self.data_split = DataSplitConfig(
                train_ratio=self.train_split_ratio,
                val_ratio=remainder * 0.5,
                test_ratio=remainder * 0.5
            )

        # Convert string models to ModelConfig objects.
        self.model = self._parse_model_config(self.model, "model")
        self.reflection_model = self._parse_model_config(self.reflection_model, "reflection_model")

        # Default the reflection minibatch size to reflection_examples.
        if self.reflection_minibatch_size is None:
            self.reflection_minibatch_size = self.reflection_examples

        # Validate required parameters.
        self._validate_required_params()

        # Validate value ranges.
        self._validate_ranges()

    def _parse_model_config(self, model: Union[str, ModelConfig], field_name: str) -> ModelConfig:
        """Parse a string model specification into a ModelConfig.

        Accepts "provider/model-name" (a bare model name defaults to OpenAI)
        and resolves the API key from the environment; ModelConfig inputs
        are returned unchanged.
        """
        if isinstance(model, ModelConfig):
            return model

        if isinstance(model, str):
            # Parse "provider/model-name" format.
            if "/" in model:
                provider, model_name = model.split("/", 1)
            else:
                # Default to openai if no provider specified.
                provider = "openai"
                model_name = model

            # Try to get the API key from the environment.
            api_key = self._get_api_key_for_provider(provider)
            if not api_key:
                raise ValueError(
                    f"No API key found for {provider}. Please set environment variable "
                    f"or provide ModelConfig with api_key for {field_name}"
                )

            return ModelConfig(
                provider=provider,
                model_name=model_name,
                api_key=api_key
            )

        raise ValueError(f"{field_name} must be either a string or ModelConfig object")

    def _get_api_key_for_provider(self, provider: str) -> Optional[str]:
        """Get the API key for *provider* from environment variables."""
        return ModelConfig._get_api_key_for_provider(provider)

    def _validate_required_params(self):
        """Ensure the user explicitly supplied every required budget parameter."""
        required_fields = {
            "max_iterations": self.max_iterations,
            "max_metric_calls": self.max_metric_calls,
            "batch_size": self.batch_size,
        }

        for field_name, value in required_fields.items():
            if value is None:
                raise ValueError(f"{field_name} is required and must be specified by user")

    def _validate_ranges(self):
        """Validate numeric parameter ranges and option values."""
        if self.max_iterations <= 0:
            raise ValueError("max_iterations must be positive")

        if self.max_metric_calls <= 0:
            raise ValueError("max_metric_calls must be positive")

        if self.batch_size <= 0:
            raise ValueError("batch_size must be positive")

        if self.reflection_examples <= 0 or self.reflection_examples > 10:
            raise ValueError("reflection_examples must be between 1 and 10 (recommended: 2-5)")

        if self.reflection_minibatch_size <= 0:
            raise ValueError("reflection_minibatch_size must be positive")

        if hasattr(self.model, 'max_tokens') and self.model.max_tokens <= 0:
            raise ValueError("model.max_tokens must be a positive integer")

        # Validate hybrid mode parameters: hybrid mode requires the LLEGO
        # operators to be enabled as well.
        if self.enable_gepa_reflection_with_llego and not self.use_llego_operators:
            raise ValueError("enable_gepa_reflection_with_llego requires use_llego_operators=True")

        if self.num_gepa_reflection_candidates <= 0 or self.num_gepa_reflection_candidates > 5:
            raise ValueError("num_gepa_reflection_candidates must be between 1 and 5 (recommended: 3 for balanced exploration)")

        # Validate log_level.
        valid_log_levels = ["DEBUG", "INFO", "WARNING", "ERROR"]
        if self.log_level.upper() not in valid_log_levels:
            raise ValueError(f"log_level must be one of {valid_log_levels}, got: {self.log_level}")

    def validate_api_connectivity(self) -> Dict[str, bool]:
        """Check that both models carry the information needed to call their APIs.

        NOTE(review): this does not actually hit the network yet — it only
        verifies that provider, model name and API key are present.
        """
        results = {}

        for model_name, model_config in [("model", self.model), ("reflection_model", self.reflection_model)]:
            try:
                # This would be implemented to actually test the API.
                # For now, just check if we have the required info.
                if model_config.api_key and model_config.provider and model_config.model_name:
                    results[model_name] = True
                else:
                    results[model_name] = False
            except Exception:
                results[model_name] = False

        return results

    def get_estimated_cost(self) -> Dict[str, Any]:
        """Return a rough cost-estimation breakdown for this configuration.

        Placeholder: provider pricing is not looked up yet; only the call
        counts that drive cost are reported.
        """
        return {
            "max_calls": self.max_metric_calls,
            "estimated_cost_range": "To be calculated based on provider pricing",
            "cost_factors": {
                "model_calls": self.max_metric_calls,
                "reflection_calls": self.max_iterations,
                "batch_size": self.batch_size
            }
        }

    @classmethod
    def create_example_config(cls, provider: str = "openai") -> str:
        """Return example configuration code for *provider* as a string.

        Known providers: "openai", "anthropic", "mixed"; any other value
        falls back to the OpenAI example.
        """
        examples = {
            "openai": '''
# Example OpenAI Configuration
config = OptimizationConfig(
    model="openai/gpt-4-turbo",  # or ModelConfig(...)
    reflection_model="openai/gpt-4-turbo",
    max_iterations=50,        # Your choice based on budget
    max_metric_calls=300,     # Your choice based on budget
    batch_size=8,             # Your choice based on memory
    early_stopping=True,
    learning_rate=0.01
)
''',
            "anthropic": '''
# Example Anthropic Configuration
config = OptimizationConfig(
    model=ModelConfig(
        provider="anthropic",
        model_name="claude-3-opus-20240229",
        api_key="your-anthropic-key",
        temperature=0.7
    ),
    reflection_model="anthropic/claude-3-sonnet-20240229",
    max_iterations=30,
    max_metric_calls=200,
    batch_size=4
)
''',
            "mixed": '''
# Example Mixed Providers Configuration
config = OptimizationConfig(
    model="openai/gpt-4-turbo",                 # Main model
    reflection_model="anthropic/claude-3-opus", # Reflection model
    max_iterations=25,
    max_metric_calls=250,
    batch_size=6,
    max_cost_usd=100.0,      # Budget limit
    timeout_seconds=3600     # 1 hour limit
)
'''
        }

        return examples.get(provider, examples["openai"])
diff --git a/src/gepa_optimizer/models/dataset.py b/src/gepa_optimizer/models/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..82bf5264c258f4e6cd5c89dbde63f78693761c45
--- /dev/null
+++ b/src/gepa_optimizer/models/dataset.py
@@ -0,0 +1,89 @@
+"""
+Dataset models for GEPA Optimizer
+"""
+
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional
+import uuid
+
@dataclass
class DatasetItem:
    """A single example in a dataset, with metadata and quality tracking."""

    # Unique identifier for the item.
    item_id: str = field(default_factory=lambda: str(uuid.uuid4()))

    # Core data.
    input_data: Any = ""
    expected_output: Optional[str] = None
    image_base64: Optional[str] = None

    # Free-form metadata and labels.
    metadata: Dict[str, Any] = field(default_factory=dict)
    tags: List[str] = field(default_factory=list)

    # Paths of any files this item references.
    file_paths: List[str] = field(default_factory=list)

    # Quality tracking.
    quality_score: float = 1.0  # must lie in [0, 1]
    is_validated: bool = False
    validation_notes: List[str] = field(default_factory=list)

    def __post_init__(self):
        """Reject out-of-range quality scores."""
        if self.quality_score < 0 or self.quality_score > 1:
            raise ValueError("quality_score must be between 0 and 1")

    def add_tag(self, tag: str):
        """Attach *tag* to the item unless it is already present."""
        if tag not in self.tags:
            self.tags.append(tag)

    def mark_validated(self, notes: Optional[List[str]] = None):
        """Flag the item as validated, optionally recording review notes."""
        self.is_validated = True
        if notes:
            self.validation_notes.extend(notes)
+
@dataclass
class ProcessedDataset:
    """A dataset that has been prepared for GEPA optimization."""

    # Identity.
    dataset_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    name: str = "Untitled Dataset"

    # Items and their train/validation partitions.
    items: List[DatasetItem] = field(default_factory=list)
    train_split: List[DatasetItem] = field(default_factory=list)
    val_split: List[DatasetItem] = field(default_factory=list)

    # Provenance and processing details.
    source_info: Dict[str, Any] = field(default_factory=dict)
    processing_stats: Dict[str, Any] = field(default_factory=dict)

    # Aggregate quality metrics (derived in __post_init__).
    total_items: int = 0
    validated_items: int = 0
    avg_quality_score: float = 0.0

    def __post_init__(self):
        """Recompute the aggregate counters from ``items``."""
        self.total_items = len(self.items)

        if self.items:
            self.validated_items = len([item for item in self.items if item.is_validated])
            scores = [item.quality_score for item in self.items]
            self.avg_quality_score = sum(scores) / len(scores)

    def get_stats(self) -> Dict[str, Any]:
        """Summarize size, validation coverage and quality of the dataset."""
        rate = self.validated_items / self.total_items if self.total_items > 0 else 0
        return {
            'total_items': self.total_items,
            'validated_items': self.validated_items,
            'validation_rate': rate,
            'avg_quality_score': self.avg_quality_score,
            'train_size': len(self.train_split),
            'val_size': len(self.val_split),
            'has_expected_outputs': sum(1 for item in self.items if item.expected_output),
        }
diff --git a/src/gepa_optimizer/models/result.py b/src/gepa_optimizer/models/result.py
new file mode 100644
index 0000000000000000000000000000000000000000..95d11cda56bf40a7faae1fea211bd572a9c4dbe5
--- /dev/null
+++ b/src/gepa_optimizer/models/result.py
@@ -0,0 +1,204 @@
+"""
+Result models for GEPA Optimizer
+"""
+
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Dict, Any, Optional, List
+import uuid
+
@dataclass
class OptimizationResult:
    """Full record of one optimization run, including all metadata."""

    # Unique identifier for the optimization session.
    session_id: str = field(default_factory=lambda: str(uuid.uuid4()))

    # The prompts before and after optimization.
    original_prompt: str = ""
    optimized_prompt: str = ""

    # Performance metrics.
    improvement_data: Dict[str, Any] = field(default_factory=dict)
    baseline_metrics: Dict[str, float] = field(default_factory=dict)
    final_metrics: Dict[str, float] = field(default_factory=dict)

    # Process metadata.
    optimization_time: float = 0.0  # seconds
    dataset_size: int = 0
    total_iterations: int = 0

    # Lifecycle: pending -> running -> completed | failed.
    status: str = "pending"
    error_message: Optional[str] = None

    # Timestamps.
    created_at: datetime = field(default_factory=datetime.now)
    completed_at: Optional[datetime] = None

    # Reflection history entries recorded during optimization.
    reflection_history: List[Dict[str, Any]] = field(default_factory=list)

    # Cost and resource usage.
    estimated_cost: Optional[float] = None
    api_calls_made: int = 0

    def mark_completed(self):
        """Transition to the 'completed' state and stamp the finish time."""
        self.completed_at = datetime.now()
        self.status = "completed"

    def mark_failed(self, error: str):
        """Transition to the 'failed' state, recording the error message."""
        self.completed_at = datetime.now()
        self.status = "failed"
        self.error_message = error
+
class OptimizedResult:
    """
    User-facing result class that provides a clean, read-only interface
    over an internal :class:`OptimizationResult`.
    """

    def __init__(self,
                 original_prompt: str = "",
                 optimized_prompt: str = "",
                 improvement_data: Optional[Dict[str, Any]] = None,
                 optimization_time: float = 0.0,
                 dataset_size: int = 0,
                 total_iterations: int = 0,
                 status: str = "pending",
                 error_message: Optional[str] = None,
                 detailed_result: Optional[OptimizationResult] = None,
                 session_id: Optional[str] = None):
        """
        Initialize OptimizedResult with individual parameters.

        Args:
            original_prompt: Original seed prompt
            optimized_prompt: Optimized prompt
            improvement_data: Performance improvement data
            optimization_time: Time taken for optimization (seconds)
            dataset_size: Size of dataset used
            total_iterations: Number of optimization iterations
            status: Optimization status ("pending", "running", "completed", "failed")
            error_message: Error message if failed
            detailed_result: Optional detailed OptimizationResult; when given,
                it takes precedence over the individual parameters above
            session_id: Optional session ID (a UUID is generated if omitted)
        """
        if improvement_data is None:
            improvement_data = {}

        if detailed_result is not None:
            # A full result was supplied: wrap it directly.
            self._result = detailed_result
        else:
            # Build an internal OptimizationResult from the scalar arguments.
            self._result = OptimizationResult(
                session_id=session_id or str(uuid.uuid4()),
                original_prompt=original_prompt,
                optimized_prompt=optimized_prompt,
                improvement_data=improvement_data,
                optimization_time=optimization_time,
                dataset_size=dataset_size,
                total_iterations=total_iterations,
                status=status,
                error_message=error_message
            )

    @property
    def prompt(self) -> str:
        """The optimized prompt ready for production use."""
        return self._result.optimized_prompt

    @property
    def original_prompt(self) -> str:
        """The original seed prompt for reference."""
        return self._result.original_prompt

    @property
    def session_id(self) -> str:
        """Unique session identifier."""
        return self._result.session_id

    @property
    def improvement_data(self) -> Dict[str, Any]:
        """Performance improvement data."""
        return self._result.improvement_data

    @property
    def status(self) -> str:
        """Optimization status."""
        return self._result.status

    @property
    def error_message(self) -> Optional[str]:
        """Error message if optimization failed."""
        return self._result.error_message

    @property
    def is_successful(self) -> bool:
        """Whether optimization completed successfully."""
        return (
            self._result.status == "completed" and
            self._result.error_message is None
        )

    @property
    def optimization_time(self) -> float:
        """Time taken for optimization in seconds."""
        return self._result.optimization_time

    @property
    def dataset_size(self) -> int:
        """Size of dataset used for optimization."""
        return self._result.dataset_size

    @property
    def total_iterations(self) -> int:
        """Total optimization iterations performed."""
        return self._result.total_iterations

    @property
    def estimated_cost(self) -> Optional[float]:
        """Estimated cost in USD."""
        return self._result.estimated_cost

    def get_improvement_summary(self) -> Dict[str, Any]:
        """Get a summary of improvements made."""
        summary = {
            'has_improvement': bool(self._result.improvement_data),
            'optimization_time': self.optimization_time,
            'iterations': self.total_iterations,
            'dataset_size': self.dataset_size
        }

        # Add the improvement percentage only if it was recorded.
        if 'improvement_percent' in self._result.improvement_data:
            summary['improvement_percent'] = self._result.improvement_data['improvement_percent']

        return summary

    def get_reflection_summary(self) -> Dict[str, Any]:
        """Get a summary of the reflection process (first 3 entries)."""
        if not self._result.reflection_history:
            return {'total_reflections': 0}

        return {
            'total_reflections': len(self._result.reflection_history),
            'reflection_points': [
                r.get('summary', 'No summary')
                for r in self._result.reflection_history[:3]  # First 3
            ]
        }

    def get_detailed_result(self) -> OptimizationResult:
        """Get the full detailed result for advanced users."""
        return self._result

    def __str__(self) -> str:
        """One-line summary with a status emoji."""
        # Fixed: the original emoji literal was mis-encoded and split across
        # lines, which broke the string. Restore proper status emojis.
        status_emoji = "\u2705" if self.is_successful else "\u274c" if self.status == "failed" else "\u23f3"
        return f"OptimizedResult({status_emoji} {self.status}, time={self.optimization_time:.2f}s)"

    def __repr__(self) -> str:
        return self.__str__()
diff --git a/src/gepa_optimizer/operators/__init__.py b/src/gepa_optimizer/operators/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d2f3b6b5092b597b55a76fbabb6654f58e51f04d
--- /dev/null
+++ b/src/gepa_optimizer/operators/__init__.py
@@ -0,0 +1,45 @@
+"""
+LLEGO Genetic Operators for GEPA.
+
+This module provides genetic operators for prompt optimization:
+- FitnessGuidedCrossover: Combines high-performing prompts
+- DiversityGuidedMutation: Explores diverse variations
+- LLEGOIntegrationLayer: Manages the genetic algorithm workflow
+
+Based on: Decision Tree Induction Through LLMs via Semantically-Aware Evolution (ICLR 2025)
+"""
+
+# Base interfaces (SOLID: Interface Segregation)
+from .base_operator import (
+ BaseGeneticOperator,
+ BaseCrossoverOperator,
+ BaseMutationOperator,
+)
+
+# Data models
+from .models import (
+ PromptCandidate,
+ PromptMetadata,
+)
+
+# Concrete operators (SOLID: Single Responsibility)
+from .crossover import FitnessGuidedCrossover
+from .mutation import DiversityGuidedMutation
+
+# Integration layer
+from .llego_operators import LLEGOIntegrationLayer
+
+__all__ = [
+ # Base interfaces
+ 'BaseGeneticOperator',
+ 'BaseCrossoverOperator',
+ 'BaseMutationOperator',
+ # Data models
+ 'PromptCandidate',
+ 'PromptMetadata',
+ # Operators
+ 'FitnessGuidedCrossover',
+ 'DiversityGuidedMutation',
+ # Integration
+ 'LLEGOIntegrationLayer',
+]
diff --git a/src/gepa_optimizer/operators/base_operator.py b/src/gepa_optimizer/operators/base_operator.py
new file mode 100644
index 0000000000000000000000000000000000000000..ff3a2772abec80e3800aeec1857dc371cccdb608
--- /dev/null
+++ b/src/gepa_optimizer/operators/base_operator.py
@@ -0,0 +1,107 @@
+"""
+Base Genetic Operator Interface.
+
+Defines the abstract interface for all genetic operators following
+the Interface Segregation Principle (ISP) of SOLID.
+"""
+
+from abc import ABC, abstractmethod
+from typing import List, Callable
+import logging
+
+logger = logging.getLogger(__name__)
+
+
class BaseGeneticOperator(ABC):
    """
    Abstract interface shared by every genetic operator.

    Concrete operators (crossover, mutation, ...) subclass this and provide
    both ``__call__`` (the operation itself) and ``_build_prompt`` (the LLM
    prompt that drives the operation).

    Design notes (SOLID):
    - Single Responsibility: one operator, one operation
    - Open/Closed: new behavior arrives via subclassing, not modification
    - Liskov Substitution: any subclass is a drop-in replacement
    - Interface Segregation: only two required methods
    - Dependency Inversion: operators depend on an abstract LLM callable
    """

    @abstractmethod
    def __call__(self, *args, **kwargs) -> str:
        """Run the genetic operation and return the new prompt text."""
        ...

    @abstractmethod
    def _build_prompt(self, *args, **kwargs) -> str:
        """Compose the LLM prompt that implements this operation."""
        ...
+
+
class BaseCrossoverOperator(BaseGeneticOperator):
    """
    Abstract interface for crossover operators.

    A crossover operator merges two or more parent prompts into an offspring
    prompt that inherits desirable traits from each parent.
    """

    @abstractmethod
    def __call__(
        self,
        parents: List,  # List[PromptCandidate]
        target_fitness: float,
        llm: Callable[[str], str]
    ) -> str:
        """
        Produce an offspring prompt from the given parents.

        Args:
            parents: Parent PromptCandidate objects to combine
            target_fitness: Fitness level the offspring should aim for
            llm: Callable mapping a prompt string to generated text

        Returns:
            str: Offspring prompt
        """
        ...
+
+
class BaseMutationOperator(BaseGeneticOperator):
    """
    Abstract interface for mutation operators.

    A mutation operator perturbs a single parent prompt to explore new
    regions of the search space.
    """

    @abstractmethod
    def __call__(
        self,
        parent,  # PromptCandidate
        population: List,  # List[PromptCandidate]
        llm: Callable[[str], str]
    ) -> str:
        """
        Create a variation of the parent prompt.

        Args:
            parent: PromptCandidate to mutate
            population: Current population, used for diversity guidance
            llm: Callable mapping a prompt string to generated text

        Returns:
            str: Mutated prompt
        """
        ...
+
diff --git a/src/gepa_optimizer/operators/crossover.py b/src/gepa_optimizer/operators/crossover.py
new file mode 100644
index 0000000000000000000000000000000000000000..fff82d7d619550b72e33299a8f78c2ccd28b7e48
--- /dev/null
+++ b/src/gepa_optimizer/operators/crossover.py
@@ -0,0 +1,120 @@
+"""
+Fitness-Guided Crossover Operator.
+
+Adapts LLEGO's fitness-guided crossover for text prompts.
+Based on: Decision Tree Induction Through LLMs via Semantically-Aware Evolution (ICLR 2025)
+"""
+
+from typing import List, Callable, TYPE_CHECKING
+import logging
+
+from .base_operator import BaseCrossoverOperator
+
+if TYPE_CHECKING:
+ from .models import PromptCandidate
+
+logger = logging.getLogger(__name__)
+
+
class FitnessGuidedCrossover(BaseCrossoverOperator):
    """
    Fitness-guided crossover for text prompts.

    Merges the highest-performing parent prompts into one offspring aimed at
    a target fitness level, relying on the LLM's semantic understanding of
    both parents.

    From the LLEGO paper: fitness-guided crossover exploits high-performing
    regions of the search space by combining parents toward a desired
    fitness level f* = f_max + alpha * (f_max - f_min).

    Reference: https://github.com/nicolashuynh/LLEGO
    """

    def __init__(self, alpha: float = 0.1):
        """
        Args:
            alpha: Fitness extrapolation parameter; larger values push the
                offspring target further above the best parent. Default 0.1
                per the LLEGO paper (aim ~10% beyond the best parent).
        """
        self.alpha = alpha
        logger.debug(f"FitnessGuidedCrossover initialized with ฮฑ={alpha}")

    def __call__(
        self,
        parents: List["PromptCandidate"],
        target_fitness: float,
        llm: Callable[[str], str]
    ) -> str:
        """
        Produce an offspring prompt from two or more parents.

        Args:
            parents: Parent candidates (at least 2 required)
            target_fitness: Fitness level the offspring should target
            llm: Language model callable

        Returns:
            str: Offspring prompt

        Raises:
            ValueError: If fewer than 2 parents provided
        """
        if len(parents) < 2:
            raise ValueError("Crossover requires at least 2 parents")

        # Rank best-first so the prompt template sees the strongest parents.
        ranked = sorted(parents, key=lambda cand: cand.fitness, reverse=True)

        logger.debug(f"Crossover: {len(parents)} parents, target fitness={target_fitness:.3f}")

        return llm(self._build_prompt(ranked, target_fitness))

    def _build_prompt(
        self,
        parents: List["PromptCandidate"],
        target_fitness: float
    ) -> str:
        """
        Compose the crossover instruction for the LLM.

        Args:
            parents: Parent candidates sorted best-first
            target_fitness: Fitness the offspring should target

        Returns:
            str: Prompt to send to the LLM
        """
        # Keep parent excerpts short to avoid tripping safety filters.
        MAX_PARENT_LENGTH = 350

        # Describe at most the top two parents.
        parent_descriptions = []
        for i, candidate in enumerate(parents[:2]):
            excerpt = candidate.prompt[:MAX_PARENT_LENGTH]
            if len(candidate.prompt) > MAX_PARENT_LENGTH:
                excerpt += "..."
            parent_descriptions.append(
                f"P{i+1} (f={candidate.fitness:.2f}): {excerpt}\n"
            )

        return f"""Combine these prompts into ONE improved version (target fitness: {target_fitness:.2f}).

{' '.join(parent_descriptions)}
Instructions:
1. Merge the best rules/principles from both parents
2. Organize logic clearly (e.g., "For X tasks: do Y", "If Z: then A")
3. Add structure to handle different cases systematically
4. Keep output format (Element: X, Description:, Reason:)
5. Max 600 chars

Output ONLY the combined prompt:"""
+
diff --git a/src/gepa_optimizer/operators/llego_operators.py b/src/gepa_optimizer/operators/llego_operators.py
new file mode 100644
index 0000000000000000000000000000000000000000..6be082eea011484ab510e6e63789db43a0c06ff6
--- /dev/null
+++ b/src/gepa_optimizer/operators/llego_operators.py
@@ -0,0 +1,364 @@
+"""
+LLEGO Integration Layer for GEPA.
+
+This module provides the integration layer that wraps LLEGO genetic operators
+for use with the GEPA optimization framework.
+
+Based on: Decision Tree Induction Through LLMs via Semantically-Aware Evolution (ICLR 2025)
+GitHub: https://github.com/nicolashuynh/LLEGO
+"""
+
+from typing import List, Callable, Dict, Any, Optional, Literal
+import numpy as np
+import logging
+
+# Import from modular files (SOLID: Single Responsibility)
+from .models import PromptCandidate, PromptMetadata
+from .crossover import FitnessGuidedCrossover
+from .mutation import DiversityGuidedMutation
+
+logger = logging.getLogger(__name__)
+
+
class LLEGOIntegrationLayer:
    """
    Integration layer that wraps LLEGO operators for GEPA.

    This class manages the genetic algorithm workflow:
    - Population initialization
    - Parent selection (fitness-based)
    - Crossover and mutation operations
    - Population management

    Design Principles:
    - Composition over inheritance (uses crossover_op, mutation_op)
    - Single Responsibility: Only manages GA workflow
    - Open/Closed: New operators can be added without modifying this class
    """

    def __init__(
        self,
        alpha: float = 0.05,
        tau: float = 10.0,
        nu: int = 4,
        population_size: int = 10,
        n_crossover: int = 2,
        n_mutation: int = 3
    ):
        """
        Initialize LLEGO integration layer.

        Args:
            alpha: Fitness extrapolation for crossover (default 0.05)
            tau: Diversity temperature for mutation
            nu: Parent arity for diversity sampling
            population_size: Maximum population size
            n_crossover: Number of crossover offspring per generation
            n_mutation: Number of mutation offspring per generation
        """
        # Operators are held by composition so alternatives can be swapped in.
        self.crossover_op = FitnessGuidedCrossover(alpha=alpha)
        self.mutation_op = DiversityGuidedMutation(tau=tau, nu=nu)
        self.population_size = population_size
        self.n_crossover = n_crossover
        self.n_mutation = n_mutation
        self.population: List[PromptCandidate] = []
        self.current_generation = 0

        # Track metadata for prompts generated in current generation
        # NOTE(review): keyed by prompt text, so two identical offspring in
        # one generation share (overwrite) a single metadata entry.
        self._generation_metadata: Dict[str, PromptMetadata] = {}

        logger.debug(f"LLEGO initialized: pop_size={population_size}, crossover={n_crossover}, mutation={n_mutation}")

    def initialize_population(self, seed_prompt: str, initial_fitness: float = 0.5):
        """Initialize population with seed prompt."""
        # Seed starts with a neutral default fitness (0.5) until evaluated.
        seed_candidate = PromptCandidate(
            prompt=seed_prompt,
            fitness=initial_fitness,
            metadata={
                'generation': 0,
                'operator': 'seed',
                'parent_indices': None,
                'parent_prompts': None,
                'target_fitness': None,
                'diversity_score': None,
                'sample_scores': None,
                'num_diverse_parents': None
            }
        )
        # Replaces any existing population (does not append).
        self.population = [seed_candidate]
        logger.debug(f"Population initialized with seed prompt ({len(seed_prompt)} chars)")

    def create_candidate_with_metadata(
        self,
        prompt: str,
        fitness: float,
        generation: int,
        operator: Literal['crossover', 'mutation'],
        parent_indices: Optional[List[int]] = None,
        parent_prompts: Optional[List[str]] = None,
        target_fitness: Optional[float] = None,
        diversity_score: Optional[float] = None,
        sample_scores: Optional[List[float]] = None,
        num_diverse_parents: Optional[int] = None
    ) -> PromptCandidate:
        """Create a PromptCandidate with properly populated metadata."""
        # Every metadata key is written explicitly (None when unknown) so
        # downstream consumers can rely on a uniform schema.
        return PromptCandidate(
            prompt=prompt,
            fitness=fitness,
            metadata={
                'generation': generation,
                'operator': operator,
                'parent_indices': parent_indices,
                'parent_prompts': parent_prompts,
                'target_fitness': target_fitness,
                'diversity_score': diversity_score,
                'sample_scores': sample_scores,
                'num_diverse_parents': num_diverse_parents
            }
        )

    def evolve_generation(
        self,
        llm: Callable[[str], str],
        pareto_front: List[PromptCandidate]
    ) -> List[str]:
        """
        Evolve one generation using LLEGO operators.

        When crossover cannot run (< 2 parents with scores), it is skipped.
        The caller should compensate by generating extra GEPA reflection candidates.

        Args:
            llm: Language model callable
            pareto_front: Current Pareto front (non-dominated prompts with scores)

        Returns:
            List of new prompt candidates to evaluate
        """
        new_prompts = []
        self.current_generation += 1
        # Metadata is reset each generation; get_prompt_metadata() only works
        # for prompts produced by the most recent evolve_generation() call.
        self._generation_metadata = {}

        # Track crossover status for caller to handle compensation
        self._crossover_skipped = False
        self._crossover_deficit = 0
        self._actual_crossover_count = 0

        logger.info(f"๐งฌ LLEGO Generation {self.current_generation}: pareto_front={len(pareto_front)}, population={len(self.population)}")

        # Crossover: Combine BEST parents (requires >= 2 parents WITH SCORES)
        if len(pareto_front) >= 2:
            # Sort by fitness - always use TOP scored parents for crossover
            sorted_front = sorted(pareto_front, key=lambda p: p.fitness, reverse=True)

            for i in range(self.n_crossover):
                # Always use top 2 highest-scored parents
                # NOTE(review): identical parents each round means offspring
                # may be near-duplicates; diversity comes from the mutations.
                parents = sorted_front[:2]
                target_fitness = self._calculate_target_fitness(parents)

                offspring = self.crossover_op(parents, target_fitness, llm)
                new_prompts.append(offspring)
                self._actual_crossover_count += 1

                # Store metadata with parent fitness info
                self._generation_metadata[offspring] = {
                    'generation': self.current_generation,
                    'operator': 'crossover',
                    'parent_indices': [self.population.index(p) for p in parents if p in self.population],
                    'parent_prompts': [p.prompt for p in parents],
                    'parent_fitnesses': [p.fitness for p in parents],
                    'target_fitness': target_fitness,
                    'diversity_score': None,
                    'sample_scores': None,
                    'num_diverse_parents': len(parents)
                }

                logger.info(f"  Oโโ{i+1}: Crossed top parents (f={parents[0].fitness:.3f} ร f={parents[1].fitness:.3f}) โ target f*={target_fitness:.3f}")
        else:
            # Signal that crossover was skipped - caller should compensate with GEPA
            self._crossover_skipped = True
            self._crossover_deficit = self.n_crossover
            logger.info(f"โ ๏ธ Crossover SKIPPED: need 2+ scored parents, have {len(pareto_front)}")
            logger.info(f"   โ Caller should compensate with {self._crossover_deficit} extra GEPA reflection candidates")

        # Mutation: Explore diverse variations (requires >= 1 parent)
        # Use pareto_front if available, otherwise fall back to population
        mutation_source = pareto_front if pareto_front else self.population

        if len(mutation_source) >= 1:
            for i in range(self.n_mutation):
                parent = self._select_parent_for_mutation(mutation_source)

                offspring = self.mutation_op(parent, self.population, llm)
                new_prompts.append(offspring)

                # Parent may come from the Pareto front without being in the
                # local population, in which case no index is recorded.
                parent_idx = self.population.index(parent) if parent in self.population else -1
                self._generation_metadata[offspring] = {
                    'generation': self.current_generation,
                    'operator': 'mutation',
                    'parent_indices': [parent_idx] if parent_idx >= 0 else None,
                    'parent_prompts': [parent.prompt],
                    'parent_fitness': parent.fitness,
                    'target_fitness': None,
                    'diversity_score': None,
                    'sample_scores': None,
                    'num_diverse_parents': min(self.mutation_op.nu, len(self.population))
                }

        # Counts are derived from the metadata map rather than tracked inline.
        crossover_count = len([p for p in new_prompts if self._generation_metadata.get(p, {}).get('operator') == 'crossover'])
        mutation_count = len([p for p in new_prompts if self._generation_metadata.get(p, {}).get('operator') == 'mutation'])

        logger.info(f"๐งฌ LLEGO Generated {len(new_prompts)} candidates: {crossover_count} crossover, {mutation_count} mutation")

        return new_prompts

    def get_prompt_metadata(self, prompt: str) -> Optional[PromptMetadata]:
        """Retrieve metadata for a prompt generated in the current generation."""
        # Only valid until the next evolve_generation() call resets the map.
        return self._generation_metadata.get(prompt)

    def _convert_gepa_pareto_to_candidates(
        self,
        gepa_pareto_front: List[Dict[str, Any]]
    ) -> List[PromptCandidate]:
        """
        Convert GEPA Pareto front entries to PromptCandidate format.

        Args:
            gepa_pareto_front: List of dicts with 'prompt', 'score', 'type', 'notation'

        Returns:
            List of PromptCandidate objects
        """
        if not gepa_pareto_front:
            return []

        # De-duplicate Pareto front
        # First occurrence of each prompt text wins.
        seen_prompts = set()
        deduplicated_front = []

        for entry in gepa_pareto_front:
            if isinstance(entry, dict) and 'prompt' in entry:
                prompt_text = entry['prompt']
                if prompt_text not in seen_prompts:
                    seen_prompts.add(prompt_text)
                    deduplicated_front.append(entry)

        candidates = []

        for idx, entry in enumerate(deduplicated_front):
            try:
                # Malformed entries are silently skipped (best-effort import).
                if not isinstance(entry, dict):
                    continue

                prompt = entry.get('prompt')
                if not prompt or not isinstance(prompt, str):
                    continue

                score = entry.get('score')
                if score is None:
                    continue

                try:
                    fitness = float(score)
                except (ValueError, TypeError):
                    continue

                candidate_type = entry.get('type', 'unknown')
                notation = entry.get('notation', 'S')

                metadata: PromptMetadata = {
                    'generation': self.current_generation,
                    'operator': 'gepa_pareto_front',
                    'parent_indices': None,
                    'parent_prompts': None,
                    'target_fitness': None,
                    'diversity_score': None,
                    'sample_scores': None,
                    'num_diverse_parents': None,
                    'candidate_type': candidate_type,
                    'notation': notation,
                    'prompt_length': len(prompt),
                    'word_count': len(prompt.split()),
                    'from_gepa_pareto': True
                }

                candidate = PromptCandidate(
                    prompt=prompt,
                    fitness=fitness,
                    metadata=metadata
                )

                candidates.append(candidate)

            except Exception as e:
                logger.error(f"Error converting Pareto entry #{idx+1}: {e}")
                continue

        return candidates

    def update_population(self, new_candidates: List[PromptCandidate]):
        """Update population with new evaluated candidates."""
        self.population.extend(new_candidates)

        # Remove duplicates
        # Prompts differing only by surrounding whitespace/quotes are treated
        # as the same candidate; the earliest (higher-ranked) copy is kept.
        seen_prompts = set()
        unique_population = []
        for p in self.population:
            normalized = p.prompt.strip().strip('"\'')
            if normalized not in seen_prompts:
                seen_prompts.add(normalized)
                unique_population.append(p)
        self.population = unique_population

        # Keep top population_size by fitness
        self.population.sort(key=lambda p: p.fitness, reverse=True)
        self.population = self.population[:self.population_size]

        if self.population:
            logger.debug(f"Population updated: {len(self.population)} candidates, best={self.population[0].fitness:.3f}")

    def _select_parents_for_crossover(self, pareto_front: List[PromptCandidate], k: int = 2) -> List[PromptCandidate]:
        """Select top-k parents for crossover."""
        sorted_front = sorted(pareto_front, key=lambda p: p.fitness, reverse=True)
        return sorted_front[:k]

    def _select_parent_for_mutation(self, pareto_front: List[PromptCandidate]) -> PromptCandidate:
        """Select a parent for mutation (fitness-proportionate)."""
        if len(pareto_front) == 1:
            return pareto_front[0]

        # Floor at 0.01 so zero/negative-fitness candidates keep a nonzero
        # selection probability (and the sum stays positive).
        fitnesses = np.array([p.fitness for p in pareto_front])
        fitnesses = np.maximum(fitnesses, 0.01)
        probs = fitnesses / fitnesses.sum()

        idx = np.random.choice(len(pareto_front), p=probs)
        return pareto_front[idx]

    def _calculate_target_fitness(self, parents: List[PromptCandidate]) -> float:
        """Calculate target fitness for crossover using LLEGO formula: f* = f_max + ฮฑ(f_max - f_min)"""
        fitnesses = [p.fitness for p in parents]
        f_max = max(fitnesses)
        f_min = min(fitnesses)

        # Extrapolate above the best parent, capped at the max score of 1.0.
        target_fitness = f_max + self.crossover_op.alpha * (f_max - f_min)
        return min(target_fitness, 1.0)

    def get_best_candidate(self) -> Optional[PromptCandidate]:
        """Get current best prompt."""
        if not self.population:
            return None
        return max(self.population, key=lambda p: p.fitness)

    def get_stats(self) -> Dict[str, Any]:
        """Get population statistics."""
        if not self.population:
            return {"population_size": 0, "best_fitness": 0.0, "avg_fitness": 0.0}

        fitnesses = [p.fitness for p in self.population]
        return {
            "population_size": len(self.population),
            "best_fitness": max(fitnesses),
            "avg_fitness": np.mean(fitnesses),
            "min_fitness": min(fitnesses),
            "fitness_std": np.std(fitnesses)
        }
diff --git a/src/gepa_optimizer/operators/models.py b/src/gepa_optimizer/operators/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..45d92fb0843e843dc205fcd4da3ea64ec87bbe1c
--- /dev/null
+++ b/src/gepa_optimizer/operators/models.py
@@ -0,0 +1,60 @@
+"""
+Data models for LLEGO genetic operators.
+
+Contains the core data structures used across all genetic operators.
+"""
+
+from typing import List, Optional, Literal
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import TypedDict
+
+
class PromptMetadata(TypedDict, total=False):
    """
    Metadata for tracking prompt evolution history and performance.

    This enables debugging, analysis, and visualization of the genetic
    algorithm's evolution process by tracking how each prompt was created
    and its characteristics. All keys are optional (``total=False``).
    """
    generation: int  # Which iteration created this prompt
    # How the prompt was created. 'gepa_pareto_front' covers candidates
    # imported from GEPA's Pareto front (the integration layer stores it),
    # which the original Literal did not declare.
    operator: Literal['seed', 'crossover', 'mutation', 'gepa_pareto_front']
    parent_indices: Optional[List[int]]  # Indices of parent prompts
    parent_prompts: Optional[List[str]]  # Actual parent prompt texts
    parent_fitness: Optional[float]  # Fitness of the single mutation parent
    parent_fitnesses: Optional[List[float]]  # Fitness of each crossover parent
    target_fitness: Optional[float]  # Target fitness for crossover
    diversity_score: Optional[float]  # Diversity from population
    sample_scores: Optional[List[float]]  # Performance per sample
    num_diverse_parents: Optional[int]  # Diverse parents count (mutation)
    created_at: str  # Creation timestamp (ISO 8601)
    prompt_length: int  # Character count
    word_count: int  # Word count
    candidate_type: Optional[str]  # Type for GEPA notation
    notation: Optional[str]  # GEPA notation label for the candidate
    from_gepa_pareto: bool  # True when imported from GEPA's Pareto front
+
+
@dataclass
class PromptCandidate:
    """
    A prompt together with its fitness score and evolution metadata.

    Attributes:
        prompt: The prompt text itself
        fitness: Evaluation fitness score in [0, 1]
        metadata: Bookkeeping about how the prompt was created and performs
    """
    prompt: str
    fitness: float
    metadata: Optional[PromptMetadata] = field(default_factory=dict)

    def __post_init__(self):
        """Normalize metadata and fill in derived statistics."""
        if self.metadata is None:
            self.metadata = {}

        # Derived stats are only written when absent, so values supplied
        # explicitly by the caller always take precedence.
        self.metadata.setdefault('prompt_length', len(self.prompt))
        self.metadata.setdefault('word_count', len(self.prompt.split()))
        self.metadata.setdefault('created_at', datetime.now().isoformat())
+
diff --git a/src/gepa_optimizer/operators/mutation.py b/src/gepa_optimizer/operators/mutation.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc850d93bb20c0e770f48f857e932bf01d6d7394
--- /dev/null
+++ b/src/gepa_optimizer/operators/mutation.py
@@ -0,0 +1,185 @@
+"""
+Diversity-Guided Mutation Operator.
+
+Adapts LLEGO's diversity-guided mutation for text prompts.
+Based on: Decision Tree Induction Through LLMs via Semantically-Aware Evolution (ICLR 2025)
+"""
+
+from typing import List, Callable, TYPE_CHECKING
+import numpy as np
+import logging
+
+from .base_operator import BaseMutationOperator
+
+if TYPE_CHECKING:
+ from .models import PromptCandidate
+
+logger = logging.getLogger(__name__)
+
+
class DiversityGuidedMutation(BaseMutationOperator):
    """
    Diversity-guided mutation for text prompts.

    Generates variations of a parent prompt, steering exploration with
    temperature-controlled sampling of population members that differ most
    from the parent.

    From the LLEGO paper: diversity-guided mutation enables efficient global
    exploration by sampling diverse parents with temperature parameter tau.

    Reference: https://github.com/nicolashuynh/LLEGO
    """

    def __init__(self, tau: float = 10.0, nu: int = 4):
        """
        Args:
            tau: Diversity temperature; larger values flatten the sampling
                distribution (more exploration). Default 10.0 (LLEGO paper).
            nu: Parent arity - number of diverse parents sampled for context.
                Default 4 (LLEGO paper).
        """
        self.tau = tau
        self.nu = nu
        logger.debug(f"DiversityGuidedMutation initialized with ฯ={tau}, ฮฝ={nu}")

    def __call__(
        self,
        parent: "PromptCandidate",
        population: List["PromptCandidate"],
        llm: Callable[[str], str]
    ) -> str:
        """
        Create a mutated variant of the parent prompt.

        Args:
            parent: Candidate to mutate
            population: Current population, used to pick diverse context
            llm: Language model callable

        Returns:
            str: Mutated prompt
        """
        logger.debug(f"Mutation: parent fitness={parent.fitness:.3f}")

        # Diverse population members provide contrastive context for the LLM.
        context_parents = self._sample_diverse_parents(parent, population)
        return llm(self._build_prompt(parent, context_parents))

    def _sample_diverse_parents(
        self,
        parent: "PromptCandidate",
        population: List["PromptCandidate"]
    ) -> List["PromptCandidate"]:
        """
        Pick up to ``nu`` population members, biased toward those most
        different from the parent (softmax over diversity, temperature tau).

        Returns the parent itself when no distinct candidates exist.
        """
        # Score every other candidate by how different it is from the parent.
        scored = [
            (candidate, self._calculate_diversity(parent.prompt, candidate.prompt))
            for candidate in population
            if candidate.prompt != parent.prompt
        ]

        if not scored:
            return [parent]

        # Softmax over diversity scores, tempered by tau.
        weights = np.exp(np.array([d for _, d in scored]) / self.tau)
        weights /= weights.sum()

        picked = np.random.choice(
            len(scored),
            size=min(self.nu, len(scored)),
            replace=False,
            p=weights
        )

        return [scored[i][0] for i in picked]

    def _calculate_diversity(self, prompt1: str, prompt2: str) -> float:
        """
        Jaccard distance between the word sets of two prompts.

        Args:
            prompt1: First prompt
            prompt2: Second prompt

        Returns:
            float: 0 = identical vocabulary, 1 = no words in common
        """
        vocab_a = set(prompt1.lower().split())
        vocab_b = set(prompt2.lower().split())

        overlap = len(vocab_a & vocab_b)
        combined = len(vocab_a | vocab_b)

        similarity = overlap / combined if combined > 0 else 0
        return 1 - similarity

    def _build_prompt(
        self,
        parent: "PromptCandidate",
        diverse_parents: List["PromptCandidate"]
    ) -> str:
        """
        Compose the mutation instruction for the LLM.

        Args:
            parent: Candidate being mutated
            diverse_parents: Diverse candidates shown as contrastive context

        Returns:
            str: Prompt to send to the LLM
        """
        # Truncation keeps the instruction compact and filter-safe.
        MAX_PARENT_LENGTH = 350
        MAX_DIVERSE_LENGTH = 200

        parent_truncated = parent.prompt[:MAX_PARENT_LENGTH]
        if len(parent.prompt) > MAX_PARENT_LENGTH:
            parent_truncated += "..."

        # Show at most two diverse variants as context.
        diversity_context = []
        for i, other in enumerate(diverse_parents[:2]):
            truncated = other.prompt[:MAX_DIVERSE_LENGTH]
            if len(other.prompt) > MAX_DIVERSE_LENGTH:
                truncated += "..."
            diversity_context.append(f"V{i+1}: {truncated}")

        return f"""Create a variation of this prompt with different decision logic (fitness: {parent.fitness:.2f}).

Parent: {parent_truncated}

{chr(10).join(diversity_context) if diversity_context else ""}

Instructions:
1. Explore NEW ways to categorize tasks (e.g., by element type, by action, by hierarchy)
2. Add handling for edge cases the parent might miss
3. Keep the structured, logical approach
4. Keep format (Element: X, Description:, Reason:)
5. Max 600 chars

Output ONLY the new prompt:"""
+
diff --git a/src/gepa_optimizer/types.py b/src/gepa_optimizer/types.py
new file mode 100644
index 0000000000000000000000000000000000000000..5df1de9d2dbdaf50a0e048ded5e2345cf72fc1b6
--- /dev/null
+++ b/src/gepa_optimizer/types.py
@@ -0,0 +1,245 @@
+"""
+Type definitions for GEPA Optimizer.
+
+This module contains type aliases, TypedDicts, and Protocol classes
+used throughout the GEPA Optimizer codebase for strict typing.
+"""
+
+from typing import (
+ Any,
+ Callable,
+ Dict,
+ List,
+ Literal,
+ Optional,
+ Protocol,
+ Tuple,
+ TypedDict,
+ TypeVar,
+ Union,
+)
+
+
+# ============================================================================
+# Dataset Types
+# ============================================================================
+
class DatasetItem(TypedDict, total=False):
    """One sample in GEPA's standard dataset format (all keys optional)."""
    input: str  # Input text/command
    output: str  # Expected output
    image_base64: str  # Base64-encoded image payload, when present
    metadata: Dict[str, Any]  # Arbitrary extra sample information
    reflection_input: str  # Simplified input used during reflection, when present
+
+
# Name of a dataset split; both short and "-set"-suffixed spellings accepted.
DatasetSplit = Literal["train", "val", "test", "trainset", "valset", "testset"]
# A dataset is simply a list of DatasetItem dicts.
DatasetList = List[DatasetItem]

# Train/Val/Test split tuple
DatasetSplitTuple = Tuple[DatasetList, DatasetList, DatasetList]
+
+
+# ============================================================================
+# Evaluation Types
+# ============================================================================
+
class EvaluationResult(TypedDict, total=False):
    """Outcome of evaluating a single sample (all keys optional)."""
    score: float  # Primary score in [0.0, 1.0]
    composite_score: float  # Weighted combination of the individual metrics
    is_match: bool  # True when prediction matches the expected output
    predicted: str  # Model's output
    expected: str  # Ground-truth output
    metrics: Dict[str, float]  # Per-metric score breakdown
    feedback: str  # Human-readable explanation of the score
+
+
class EvaluationSummary(TypedDict):
    """Aggregate statistics over a full evaluation run (all keys required)."""
    total_samples: int
    correct_predictions: int
    accuracy: float
    average_score: float
+
+
+# ============================================================================
+# LLM Types
+# ============================================================================
+
class LLMResponse(TypedDict, total=False):
    """Structured response from an LLM generation call (all keys optional)."""
    content: str  # Generated text
    usage: Dict[str, int]  # Token usage statistics
    model: str  # Model identifier used for the call
    finish_reason: str  # Why generation stopped
    source: str  # Producer tag (gepa_reflection, llego_crossover, etc.)
+
+
class LLMClientProtocol(Protocol):
    """Structural (duck-typed) interface for synchronous LLM clients."""

    def generate(
        self,
        system_prompt: str,
        user_prompt: str,
        image_base64: str = "",
        **kwargs: Any
    ) -> Union[str, Dict[str, Any]]:
        """Generate a response from the LLM."""
        ...
+
+
class BatchLLMClientProtocol(Protocol):
    """Structural interface for LLM clients that support batched requests."""

    def submit_batch(
        self,
        tasks: List[Dict[str, Any]],
        **kwargs: Any
    ) -> str:
        """Submit a batch of tasks. Returns batch ID."""
        ...

    def get_batch_results(
        self,
        batch_id: str,
        **kwargs: Any
    ) -> List[Dict[str, Any]]:
        """Get results for a submitted batch."""
        ...
+
+
+# ============================================================================
+# Evaluator Types
+# ============================================================================
+
class EvaluatorProtocol(Protocol):
    """Structural interface for evaluator implementations."""

    def evaluate(
        self,
        predicted: str,
        expected: str,
        **kwargs: Any
    ) -> Dict[str, float]:
        """Evaluate a prediction against expected output."""
        ...

    def get_composite_score(
        self,
        metrics: Dict[str, float]
    ) -> float:
        """Calculate composite score from individual metrics."""
        ...
+
+
+# ============================================================================
+# Optimization Types
+# ============================================================================
+
class CandidateDict(TypedDict, total=False):
    """A prompt candidate in the optimization process (all keys optional).

    Some producers use 'prompt'/'score'/'type' as aliases for
    'system_prompt'/'fitness'/'source'; both spellings are declared.
    """
    system_prompt: str  # The prompt text
    prompt: str  # Alias for system_prompt
    fitness: float  # Fitness score
    score: float  # Alias for fitness
    source: str  # Where the candidate came from (seed, gepa_reflection, llego_crossover, ...)
    type: str  # Alias for source
    notation: str  # GEPA notation label for the candidate's origin
    index: int  # Candidate index
+
+
class ParetoCandidate(TypedDict):
    """One entry in the Pareto front (all keys required)."""
    prompt: str
    score: float
    type: str
    notation: str
+
+
+class OptimizationState(TypedDict, total=False):
+ """Current state of optimization."""
+ iteration: int
+ best_score: float
+ best_prompt: str
+ pareto_front: List[ParetoCandidate]
+ baseline_score: Optional[float]
+
+
+# ============================================================================
+# Configuration Types
+# ============================================================================
+
class DataSplitConfig(TypedDict, total=False):
    """How to split a dataset into train/val/test (all keys optional)."""
    train_ratio: float
    val_ratio: float
    test_ratio: float
    shuffle: bool  # Whether to shuffle before splitting
    seed: Optional[int]  # RNG seed for reproducible shuffling
+
+
class LLEGOConfig(TypedDict, total=False):
    """Settings for the LLEGO genetic operators (all keys optional)."""
    mode: Literal["hybrid", "llego_only", "disabled"]  # How LLEGO participates
    population_size: int
    num_crossover_candidates: int
    num_mutation_candidates: int
    crossover_enabled: bool
    mutation_enabled: bool
+
+
+# ============================================================================
+# Type Variables
+# ============================================================================
+
# Generic placeholder type.
T = TypeVar("T")
# Dataset-like type: any mapping with string keys.
DatasetT = TypeVar("DatasetT", bound=Dict[str, Any])
# Generic result type for optimization return values.
ResultT = TypeVar("ResultT")
+
+
+# ============================================================================
+# Callback Types
+# ============================================================================
+
# (predicted, expected) -> per-sample EvaluationResult.
EvaluationCallback = Callable[[str, str], EvaluationResult]
# prompt -> generated text.
GenerationCallback = Callable[[str], str]
# (int, int, float) -> None; presumably (current_step, total_steps, score) — confirm at call sites.
ProgressCallback = Callable[[int, int, float], None]
+
+
+# ============================================================================
+# Export
+# ============================================================================
+
# Public API of this module; keep in sync with the definitions above.
__all__ = [
    # Dataset
    "DatasetItem",
    "DatasetSplit",
    "DatasetList",
    "DatasetSplitTuple",
    # Evaluation
    "EvaluationResult",
    "EvaluationSummary",
    "EvaluatorProtocol",
    # LLM
    "LLMResponse",
    "LLMClientProtocol",
    "BatchLLMClientProtocol",
    # Optimization
    "CandidateDict",
    "ParetoCandidate",
    "OptimizationState",
    # Configuration
    "DataSplitConfig",
    "LLEGOConfig",
    # Callbacks
    "EvaluationCallback",
    "GenerationCallback",
    "ProgressCallback",
    # Type Variables
    "T",
    "DatasetT",
    "ResultT",
]
+
diff --git a/src/gepa_optimizer/utils/__init__.py b/src/gepa_optimizer/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec5d3d6813c17c86cf77e1ad00d5c9bff4aae705
--- /dev/null
+++ b/src/gepa_optimizer/utils/__init__.py
@@ -0,0 +1,40 @@
+"""
+Utility functions for GEPA Optimizer
+"""
+
+from .helpers import sanitize_prompt
+from .logging import setup_logging
+from .metrics import calculate_metrics
+from .api_keys import APIKeyManager
+from .exceptions import GepaOptimizerError, GepaDependencyError, InvalidInputError, DatasetError
+from .universal_judge_prompt import (
+ build_universal_judge_prompt,
+ get_universal_judge_system_prompt,
+ format_universal_judge_feedback,
+ build_empty_output_feedback
+)
+from .format_detection import (
+ detect_output_format,
+ build_format_aware_reflection_prompt,
+ generate_format_feedback
+)
+
+# Names re-exported as the package's public utils API (see imports above).
+__all__ = [
+    "sanitize_prompt",
+    "setup_logging",
+    "calculate_metrics",
+    "APIKeyManager",
+    "GepaOptimizerError",
+    "GepaDependencyError",
+    "InvalidInputError",
+    "DatasetError",
+    # Universal judge prompt utilities
+    "build_universal_judge_prompt",
+    "get_universal_judge_system_prompt",
+    "format_universal_judge_feedback",
+    "build_empty_output_feedback",
+    # Format detection utilities
+    "detect_output_format",
+    "build_format_aware_reflection_prompt",
+    "generate_format_feedback"
+]
diff --git a/src/gepa_optimizer/utils/api_keys.py b/src/gepa_optimizer/utils/api_keys.py
new file mode 100644
index 0000000000000000000000000000000000000000..37cec5c12d4768e9bf76240ef78f0a4ab3daa3f3
--- /dev/null
+++ b/src/gepa_optimizer/utils/api_keys.py
@@ -0,0 +1,109 @@
+"""
+API Key Management for GEPA Optimizer
+"""
+
+import os
+from dotenv import load_dotenv
+from typing import Optional, Dict, List
+
class APIKeyManager:
    """Handles API keys securely without hardcoding.

    Keys are discovered once from the environment (optionally populated
    from a ``.env`` file) and can be overridden at runtime. Provider names
    are case-insensitive; ``'google'`` and ``'gemini'`` are aliases for the
    same credential.
    """

    # Provider name -> environment variable consulted by _load_from_env().
    _ENV_VARS: Dict[str, str] = {
        'openai': 'OPENAI_API_KEY',
        'anthropic': 'ANTHROPIC_API_KEY',
        'huggingface': 'HUGGINGFACE_API_KEY',
        'cohere': 'COHERE_API_KEY',
        'ai21': 'AI21_API_KEY',
        'together': 'TOGETHER_API_KEY',
        'replicate': 'REPLICATE_API_TOKEN',
        'groq': 'GROQ_API_KEY',
        'ollama': 'OLLAMA_API_KEY',
        'google': 'GEMINI_API_KEY',
        'gemini': 'GEMINI_API_KEY',
    }

    def __init__(self):
        # Load .env file if present
        load_dotenv()
        self._keys: Dict[str, str] = {}
        self._load_from_env()

    def _load_from_env(self):
        """Load API keys from environment variables"""
        discovered = {
            provider: os.getenv(env_var)
            for provider, env_var in self._ENV_VARS.items()
        }
        # Keep only providers whose variable is actually set and non-empty.
        self._keys.update({name: value for name, value in discovered.items() if value})

    def get_api_key(self, provider: str) -> Optional[str]:
        """Get API key for a specific provider"""
        return self._keys.get(provider.lower())

    def set_api_key(self, provider: str, key: str):
        """Set API key for a provider at runtime"""
        name = provider.lower()
        self._keys[name] = key
        # 'google' and 'gemini' refer to the same credential: keep both
        # entries in sync whenever either one is set.
        alias = {'google': 'gemini', 'gemini': 'google'}.get(name)
        if alias:
            self._keys[alias] = key

    def has_key(self, provider: str) -> bool:
        """Check if API key exists for provider"""
        return provider.lower() in self._keys

    def get_missing_keys(self, providers: List[str]) -> List[str]:
        """Get list of providers missing API keys"""
        return [name for name in providers if not self.has_key(name)]

    def validate_keys(self, providers: List[str]) -> Dict[str, bool]:
        """Validate API keys for multiple providers"""
        return {name: self.has_key(name) for name in providers}

    # Legacy methods for backward compatibility
    def set_openai_key(self, key: str):
        """Set OpenAI API key at runtime"""
        self.set_api_key('openai', key)

    def set_anthropic_key(self, key: str):
        """Set Anthropic API key at runtime"""
        self.set_api_key('anthropic', key)

    def set_google_key(self, key: str):
        """Set Google API key at runtime"""
        self.set_api_key('google', key)

    def set_gemini_key(self, key: str):
        """Set Gemini API key at runtime (alias for Google)"""
        self.set_api_key('google', key)

    def get_openai_key(self) -> str:
        """Get OpenAI key or raise error if missing"""
        key = self.get_api_key('openai')
        if key:
            return key
        raise RuntimeError(
            "OpenAI API key missing. Set via:\n"
            "1. Environment variable: OPENAI_API_KEY=your_key\n"
            "2. .env file: OPENAI_API_KEY=your_key\n"
            "3. Code: api_manager.set_api_key('openai', 'your_key')"
        )

    def get_anthropic_key(self) -> Optional[str]:
        """Get Anthropic key (optional)"""
        return self.get_api_key('anthropic')

    def get_google_key(self) -> Optional[str]:
        """Get Google key (optional)"""
        return self.get_api_key('google')

    def get_gemini_key(self) -> Optional[str]:
        """Get Gemini key (alias for Google)"""
        return self.get_api_key('google')

    def has_required_keys(self) -> bool:
        """Check if required keys are available"""
        return bool(self.get_api_key('openai'))
diff --git a/src/gepa_optimizer/utils/candidate_collector.py b/src/gepa_optimizer/utils/candidate_collector.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f2ce7836104778ec70239fd22c77ff2256100b3
--- /dev/null
+++ b/src/gepa_optimizer/utils/candidate_collector.py
@@ -0,0 +1,313 @@
+"""
+Candidate and Feedback Collector for Presentation
+
+This module collects all candidates generated during optimization along with
+their feedback, scores, and metadata for presentation purposes.
+"""
+
+import json
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, List, Any, Optional
+from dataclasses import dataclass, asdict, field
+
+
@dataclass
class CandidateInfo:
    """Information about a single candidate prompt"""
    iteration: int  # optimization iteration the candidate was produced in
    candidate_id: str  # unique id used to attach scores/feedback later
    source: str  # "GEPA_Reflection", "LLEGO_Crossover", "LLEGO_Mutation", "Seed"
    prompt: str  # full candidate prompt text
    score: Optional[float] = None  # evaluation score, if already known
    feedback: Optional[str] = None  # feedback text, if already known
    feedback_details: Optional[Dict[str, Any]] = None  # structured feedback payload
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())


@dataclass
class IterationInfo:
    """Information about a single optimization iteration"""
    iteration: int
    candidates: List[CandidateInfo] = field(default_factory=list)
    best_candidate: Optional[CandidateInfo] = None  # highest-scoring candidate seen
    best_score: Optional[float] = None
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())


class CandidateCollector:
    """
    Collects all candidates and feedback during optimization for presentation.

    The same ``CandidateInfo`` objects are stored in ``all_candidates`` and in
    the per-iteration ``IterationInfo.candidates`` lists, so mutating a
    candidate found through either view updates both.
    """

    def __init__(self, output_dir: str = "presentation_data"):
        """
        Initialize the collector.

        Args:
            output_dir: Directory to save collected data (created on demand,
                including any missing parent directories)
        """
        self.output_dir = Path(output_dir)
        # FIX: parents=True so nested output paths (e.g. "runs/2024/pres")
        # do not raise FileNotFoundError on creation.
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self.iterations: List[IterationInfo] = []
        self.current_iteration: Optional[IterationInfo] = None
        self.all_candidates: List[CandidateInfo] = []

        # Track seed prompt
        self.seed_prompt: Optional[str] = None

    def set_seed_prompt(self, seed_prompt: str):
        """Set the seed prompt for reference"""
        self.seed_prompt = seed_prompt

    def start_iteration(self, iteration: int):
        """Start tracking a new iteration"""
        self.current_iteration = IterationInfo(iteration=iteration)
        self.iterations.append(self.current_iteration)

    def add_candidate(
        self,
        iteration: int,
        candidate_id: str,
        source: str,
        prompt: str,
        score: Optional[float] = None,
        feedback: Optional[str] = None,
        feedback_details: Optional[Dict[str, Any]] = None
    ):
        """
        Add a candidate to the collection.

        Args:
            iteration: Iteration number
            candidate_id: Unique identifier for the candidate
            source: Source of the candidate ("GEPA_Reflection", "LLEGO_Crossover", etc.)
            prompt: The candidate prompt text
            score: Evaluation score (if available)
            feedback: Feedback text (if available)
            feedback_details: Additional feedback details (if available)
        """
        candidate = CandidateInfo(
            iteration=iteration,
            candidate_id=candidate_id,
            source=source,
            prompt=prompt,
            score=score,
            feedback=feedback,
            feedback_details=feedback_details
        )

        # Attach to the current iteration only when the caller's iteration
        # number matches it (out-of-order additions still land in
        # all_candidates below).
        if self.current_iteration and self.current_iteration.iteration == iteration:
            self.current_iteration.candidates.append(candidate)

            # Update best candidate if this is better
            if score is not None:
                if (self.current_iteration.best_score is None or
                    score > self.current_iteration.best_score):
                    self.current_iteration.best_candidate = candidate
                    self.current_iteration.best_score = score

        # Add to all candidates list
        self.all_candidates.append(candidate)

    def add_feedback(
        self,
        candidate_id: str,
        feedback: str,
        feedback_details: Optional[Dict[str, Any]] = None
    ):
        """
        Add feedback to an existing candidate.

        Args:
            candidate_id: ID of the candidate to update
            feedback: Feedback text
            feedback_details: Additional feedback details
        """
        # CandidateInfo objects are shared between all_candidates and the
        # per-iteration lists (add_candidate appends the same object to
        # both), so updating the first match here updates every view.
        for candidate in self.all_candidates:
            if candidate.candidate_id == candidate_id:
                candidate.feedback = feedback
                candidate.feedback_details = feedback_details
                break

    def add_score(
        self,
        candidate_id: str,
        score: float
    ):
        """
        Add score to an existing candidate.

        Args:
            candidate_id: ID of the candidate to update
            score: Evaluation score
        """
        # Update the shared candidate object (see add_feedback for why one
        # pass over all_candidates is sufficient).
        for candidate in self.all_candidates:
            if candidate.candidate_id == candidate_id:
                candidate.score = score
                break

        # Refresh the per-iteration best-candidate tracking.
        for iteration in self.iterations:
            for candidate in iteration.candidates:
                if candidate.candidate_id == candidate_id:
                    if (iteration.best_score is None or score > iteration.best_score):
                        iteration.best_candidate = candidate
                        iteration.best_score = score
                    break

    def save_to_json(self, filename: Optional[str] = None) -> Path:
        """
        Save collected data to JSON file.

        Args:
            filename: Optional filename (auto-generated if not provided)

        Returns:
            Path to saved file
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"candidates_and_feedback_{timestamp}.json"

        filepath = self.output_dir / filename

        data = {
            "seed_prompt": self.seed_prompt,
            "total_iterations": len(self.iterations),
            "total_candidates": len(self.all_candidates),
            "iterations": [asdict(iter_info) for iter_info in self.iterations],
            "all_candidates": [asdict(candidate) for candidate in self.all_candidates],
            "timestamp": datetime.now().isoformat()
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        return filepath

    def save_to_markdown(self, filename: Optional[str] = None) -> Path:
        """
        Save collected data to Markdown file (presentation-ready format).

        Args:
            filename: Optional filename (auto-generated if not provided)

        Returns:
            Path to saved file
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"candidates_and_feedback_{timestamp}.md"

        filepath = self.output_dir / filename

        with open(filepath, 'w', encoding='utf-8') as f:
            # Header
            f.write("# Optimization Candidates and Feedback\n\n")
            f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            f.write(f"**Total Iterations:** {len(self.iterations)}\n")
            f.write(f"**Total Candidates:** {len(self.all_candidates)}\n\n")

            # Seed Prompt
            if self.seed_prompt:
                f.write("---\n\n")
                f.write("## ๐ฑ Seed Prompt\n\n")
                f.write("```\n")
                f.write(self.seed_prompt)
                f.write("\n```\n\n")

            # Iterations
            for iter_info in self.iterations:
                f.write("---\n\n")
                f.write(f"## ๐ Iteration {iter_info.iteration}\n\n")

                # Best candidate for this iteration
                if iter_info.best_candidate:
                    f.write(f"### ๐ Best Candidate (Score: {iter_info.best_score:.4f})\n\n")
                    f.write(f"**Source:** {iter_info.best_candidate.source}\n\n")
                    f.write(f"**Prompt:**\n```\n")
                    f.write(iter_info.best_candidate.prompt)
                    f.write("\n```\n\n")

                    if iter_info.best_candidate.feedback:
                        f.write(f"**Feedback:**\n\n")
                        f.write(f"{iter_info.best_candidate.feedback}\n\n")

                # All candidates in this iteration
                f.write(f"### ๐ All Candidates ({len(iter_info.candidates)})\n\n")

                for idx, candidate in enumerate(iter_info.candidates, 1):
                    f.write(f"#### Candidate {idx}: {candidate.source}\n\n")
                    f.write(f"**ID:** `{candidate.candidate_id}`\n\n")

                    if candidate.score is not None:
                        f.write(f"**Score:** `{candidate.score:.4f}`\n\n")

                    f.write(f"**Prompt:**\n```\n")
                    f.write(candidate.prompt)
                    f.write("\n```\n\n")

                    if candidate.feedback:
                        f.write(f"**Feedback:**\n\n")
                        f.write(f"{candidate.feedback}\n\n")

                    if candidate.feedback_details:
                        f.write(f"**Feedback Details:**\n\n")
                        f.write("```json\n")
                        f.write(json.dumps(candidate.feedback_details, indent=2))
                        f.write("\n```\n\n")

                    f.write("---\n\n")

            # Summary by source
            f.write("---\n\n")
            f.write("## ๐ Summary by Source\n\n")

            sources = {}
            for candidate in self.all_candidates:
                if candidate.source not in sources:
                    sources[candidate.source] = []
                sources[candidate.source].append(candidate)

            for source, candidates in sources.items():
                f.write(f"### {source} ({len(candidates)} candidates)\n\n")
                for candidate in candidates:
                    # FIX: compare against None so a legitimate score of 0.0
                    # is rendered instead of being reported as "No score".
                    score_str = f"Score: {candidate.score:.4f}" if candidate.score is not None else "No score"
                    f.write(f"- **{candidate.candidate_id}** (Iteration {candidate.iteration}, {score_str})\n")
                f.write("\n")

        return filepath

    def get_summary(self) -> Dict[str, Any]:
        """Get a summary of collected data"""
        sources = {}
        for candidate in self.all_candidates:
            if candidate.source not in sources:
                sources[candidate.source] = 0
            sources[candidate.source] += 1

        # Average is computed over scored candidates only.
        scored_candidates = [c for c in self.all_candidates if c.score is not None]
        avg_score = sum(c.score for c in scored_candidates) / len(scored_candidates) if scored_candidates else None

        return {
            "total_iterations": len(self.iterations),
            "total_candidates": len(self.all_candidates),
            "candidates_by_source": sources,
            "candidates_with_scores": len(scored_candidates),
            "average_score": avg_score,
            "candidates_with_feedback": len([c for c in self.all_candidates if c.feedback])
        }
+
diff --git a/src/gepa_optimizer/utils/clean_logger.py b/src/gepa_optimizer/utils/clean_logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..167c596e412ca1649b214317cef94eaf17bbc75d
--- /dev/null
+++ b/src/gepa_optimizer/utils/clean_logger.py
@@ -0,0 +1,160 @@
+"""
+Clean Logger for GEPA + LLEGO Optimization
+Provides simple, visual logging similar to diagram format.
+
+Uses the centralized logging infrastructure with a custom handler
+for clean, user-friendly console output.
+"""
+
+import logging
+import sys
+from typing import List, Optional
+
# Dedicated logger for clean, user-facing console output.
_clean_output_logger = logging.getLogger("gepa_optimizer.clean_output")


def _setup_clean_logger():
    """Configure the clean-output logger for bare, message-only printing."""
    if _clean_output_logger.handlers:
        # Already configured (e.g. module re-imported) -- nothing to do.
        return
    stream_handler = logging.StreamHandler(sys.stdout)
    stream_handler.setLevel(logging.INFO)
    # No timestamps or level names: the message is the whole output line.
    stream_handler.setFormatter(logging.Formatter("%(message)s"))
    _clean_output_logger.addHandler(stream_handler)
    _clean_output_logger.setLevel(logging.INFO)
    # Keep records out of the root logger so lines are not printed twice.
    _clean_output_logger.propagate = False


# Initialize on module load
_setup_clean_logger()
+
+
class CleanLogger:
    """
    Simple, visual logging for optimization workflow.

    Uses a dedicated logger with minimal formatting to produce
    clean, user-friendly console output. Per-iteration candidate counters
    are kept on the instance and reset by log_iteration_start().
    """

    def __init__(self):
        self.current_iteration = 0
        # Candidates produced this iteration, broken down by source.
        self.gepa_reflection_count = 0
        self.llego_crossover_count = 0
        self.llego_mutation_count = 0
        self._logger = _clean_output_logger

    def _log_candidate(self, label: str, candidate_num: int, prompt: str) -> None:
        """Render one candidate: blank line, header, rule, prompt, rule.

        Extracted because the three public log_*_candidate methods were
        copy-paste duplicates differing only in label and counter.
        """
        self._logger.info("")
        self._logger.info(f"{label} Candidate #{candidate_num}:")
        self._logger.info("โ" * 80)
        if prompt and prompt.strip():
            self._logger.info(prompt)  # Show full prompt at INFO level
        else:
            self._logger.warning("โ ๏ธ Empty candidate prompt!")
        self._logger.info("โ" * 80)

    def log_iteration_start(self, iteration: int, seed_prompt: Optional[str] = None):
        """Log start of new iteration and reset the per-iteration counters."""
        self.current_iteration = iteration
        self.gepa_reflection_count = 0
        self.llego_crossover_count = 0
        self.llego_mutation_count = 0

        self._logger.info("")
        self._logger.info("โ" * 80)
        # FIX: More accurate description - we evaluate first, then generate
        if iteration == 1:
            self._logger.info(f" ITERATION {iteration}: EVALUATING SEED PROMPT")
        else:
            self._logger.info(f" ITERATION {iteration}: EVALUATING & GENERATING CANDIDATES")
        self._logger.info("โ" * 80)

        # NOTE(review): the banner above treats iteration 1 as the seed
        # evaluation, but the seed prompt body is only echoed for
        # iteration == 0 -- confirm which numbering scheme callers use.
        if seed_prompt and iteration == 0:
            self._logger.info("")
            self._logger.info("SEED PROMPT:")
            self._logger.info("โ" * 80)
            self._logger.info(seed_prompt)
            self._logger.info("โ" * 80)

    def log_candidate_generation_summary(self):
        """Log summary of candidates generated this iteration."""
        total = self.gepa_reflection_count + self.llego_crossover_count + self.llego_mutation_count

        self._logger.info("")
        self._logger.info("CANDIDATES GENERATED THIS ITERATION:")
        self._logger.info(f"  GEPA Reflection: {self.gepa_reflection_count}")
        self._logger.info(f"  LLEGO Crossover: {self.llego_crossover_count}")
        self._logger.info(f"  LLEGO Mutation: {self.llego_mutation_count}")
        self._logger.info(f"  TOTAL: {total}")

    def log_gepa_reflection_candidate(self, candidate_num: int, prompt: str):
        """Log a GEPA reflection candidate."""
        self.gepa_reflection_count += 1
        self._log_candidate("GEPA Reflection", candidate_num, prompt)

    def log_llego_crossover_candidate(self, candidate_num: int, prompt: str):
        """Log a LLEGO crossover candidate."""
        self.llego_crossover_count += 1
        self._log_candidate("LLEGO Crossover", candidate_num, prompt)

    def log_llego_mutation_candidate(self, candidate_num: int, prompt: str):
        """Log a LLEGO mutation candidate."""
        self.llego_mutation_count += 1
        self._log_candidate("LLEGO Mutation", candidate_num, prompt)

    def log_evaluation_results(self, candidate_prompts: List[str], scores: List[float]):
        """Log evaluation results (score + 100-char preview) per candidate."""
        self._logger.info("")
        self._logger.info("โ" * 80)
        self._logger.info(" EVALUATION RESULTS")
        self._logger.info("โ" * 80)

        for i, (prompt, score) in enumerate(zip(candidate_prompts, scores), 1):
            self._logger.info("")
            self._logger.info(f"Candidate #{i}:")
            self._logger.info(f"  Score: {score:.4f}")
            self._logger.info(f"  Prompt Preview: {prompt[:100]}...")

    def log_pareto_front_update(self, pareto_size: int, best_score: float):
        """Log Pareto front update."""
        self._logger.info("")
        self._logger.info("โ" * 80)
        self._logger.info(" PARETO FRONT UPDATE")
        self._logger.info("โ" * 80)
        self._logger.info(f"  Front Size: {pareto_size} candidates")
        self._logger.info(f"  Best Score: {best_score:.4f}")

    def log_iteration_summary(self, iteration: int, total_candidates: int, best_score: float):
        """Log iteration summary (counts by source plus the best score)."""
        self._logger.info("")
        self._logger.info("โ" * 80)
        self._logger.info(f" ITERATION {iteration} SUMMARY")
        self._logger.info("โ" * 80)
        self._logger.info(f"  Candidates Evaluated: {total_candidates}")
        self._logger.info(f"  Best Score: {best_score:.4f}")
        self._logger.info(f"  GEPA Reflection: {self.gepa_reflection_count}")
        self._logger.info(f"  LLEGO Crossover: {self.llego_crossover_count}")
        self._logger.info(f"  LLEGO Mutation: {self.llego_mutation_count}")
+
+
+# Global instance
+# Module-level singleton: all callers share one CleanLogger, so the
+# per-iteration counters stay consistent across modules.
+_clean_logger_instance = CleanLogger()
+
+
+def get_clean_logger() -> CleanLogger:
+    """Get global clean logger instance (module-level singleton)."""
+    return _clean_logger_instance
diff --git a/src/gepa_optimizer/utils/exceptions.py b/src/gepa_optimizer/utils/exceptions.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e4e043e6edafbab84261debe19151cf208cfdef
--- /dev/null
+++ b/src/gepa_optimizer/utils/exceptions.py
@@ -0,0 +1,27 @@
+"""
+Custom exceptions for GEPA Optimizer
+"""
+
# All concrete errors derive from GepaOptimizerError so callers can catch
# every library failure with a single except clause.
class GepaOptimizerError(Exception):
    """Base class for all GEPA Optimizer exceptions"""


class GepaDependencyError(GepaOptimizerError):
    """Exception raised for errors related to the GEPA library dependency"""


class InvalidInputError(GepaOptimizerError):
    """Exception raised for invalid user inputs"""


class DatasetError(GepaOptimizerError):
    """Exception raised for errors related to the dataset"""


class TestSetEvaluationError(GepaOptimizerError):
    """Exception raised when test set evaluation fails"""


class ConfigurationError(GepaOptimizerError):
    """Exception raised for invalid configuration"""
diff --git a/src/gepa_optimizer/utils/format_detection.py b/src/gepa_optimizer/utils/format_detection.py
new file mode 100644
index 0000000000000000000000000000000000000000..e574cd3f73c98eafb1cbf2cf5d07c30fdf2e4c65
--- /dev/null
+++ b/src/gepa_optimizer/utils/format_detection.py
@@ -0,0 +1,391 @@
+"""
+Format Detection Utilities for GEPA Optimizer.
+
+This module provides utilities to automatically detect output format patterns
+from expected outputs and generate format constraints for reflection prompts.
+
+Key Features:
+1. Auto-detect JSON, key-value, tabular, or free-text formats
+2. Generate format specifications from examples
+3. Create format constraint strings for prompt injection
+"""
+
+import re
+import json
+from typing import List, Dict, Any, Optional, Tuple
+
+
def detect_output_format(expected_outputs: List[str]) -> Dict[str, Any]:
    """
    Analyze expected outputs to detect the common format pattern.

    Args:
        expected_outputs: List of expected output strings from the dataset

    Returns:
        Dictionary containing:
        - format_type: 'json', 'key_value', 'list', 'structured_text',
          'free_text', or 'unknown'
        - format_spec: Human-readable format specification
        - format_example: Example showing the format (truncated to 200 chars)
        - format_constraint: Constraint text to add to prompts
        - detected_keys: List of keys/fields detected (for structured formats)
        - avg_length: Average length of outputs (to enforce conciseness)
        - max_length: Maximum length of outputs
    """
    # FIX: route the "no outputs at all" case through _create_format_result
    # too, so every return path has the same schema (the inline dict used
    # previously was missing the 'max_length' key).
    if not expected_outputs:
        return _create_format_result('unknown', 'Unknown format', '', [], 0)

    # Filter out empty outputs
    valid_outputs = [o for o in expected_outputs if o and o.strip()]
    if not valid_outputs:
        return _create_format_result('unknown', 'Unknown format', '', [], 0)

    # Length statistics drive the conciseness constraints below.
    avg_length = sum(len(o) for o in valid_outputs) // len(valid_outputs)
    max_length = max(len(o) for o in valid_outputs)

    # Try each detector from most to least specific; first match wins.
    # 1. JSON  2. key-value ("Department: X | Sentiment: Y")
    # 3. bullet/numbered list  4. multi-line structured text
    detectors = (
        _detect_json_format,
        _detect_key_value_format,
        _detect_list_format,
        _detect_structured_text,
    )
    for detector in detectors:
        result = detector(valid_outputs, avg_length, max_length)
        if result:
            return result

    # 5. Default to free text with a length constraint.
    return _create_format_result(
        'free_text',
        f'Free-form text response (typically {avg_length} characters)',
        valid_outputs[0][:100] if valid_outputs else '',
        [],
        avg_length,
        max_length
    )


def _detect_json_format(outputs: List[str], avg_length: int, max_length: int) -> Optional[Dict[str, Any]]:
    """Detect if outputs are JSON format (>=70% parse as JSON objects)."""
    json_count = 0
    all_keys = []

    for output in outputs:
        stripped = output.strip()
        if stripped.startswith('{') and stripped.endswith('}'):
            try:
                parsed = json.loads(stripped)
                if isinstance(parsed, dict):
                    json_count += 1
                    all_keys.extend(parsed.keys())
            except json.JSONDecodeError:
                pass

    # If majority are JSON
    if json_count >= len(outputs) * 0.7:
        # Keep keys that appear in at least half of the JSON outputs.
        key_counts = {}
        for key in all_keys:
            key_counts[key] = key_counts.get(key, 0) + 1

        common_keys = [k for k, v in key_counts.items() if v >= json_count * 0.5]

        # Build format spec
        format_spec = f"JSON object with keys: {', '.join(common_keys)}"
        format_example = outputs[0][:200] if outputs else '{}'

        return _create_format_result(
            'json',
            format_spec,
            format_example,
            common_keys,
            avg_length,
            max_length
        )

    return None


def _detect_key_value_format(outputs: List[str], avg_length: int, max_length: int) -> Optional[Dict[str, Any]]:
    """Detect key-value formats like 'Department: X | Sentiment: Y'."""
    # Common separators for key-value pairs
    separators = ['|', '\n', ';', ',']
    key_patterns = [
        r'([A-Za-z_][A-Za-z0-9_\s]*)\s*[:=]\s*([^|;\n,]+)',  # Key: Value or Key = Value
    ]

    all_keys = []
    kv_count = 0
    detected_separator = None

    for output in outputs:
        # An output counts as key-value when it holds at least 2 pairs.
        for pattern in key_patterns:
            matches = re.findall(pattern, output)
            if len(matches) >= 2:
                kv_count += 1
                for key, _ in matches:
                    all_keys.append(key.strip())

                # Remember the first separator character seen in the output.
                for sep in separators:
                    if sep in output:
                        detected_separator = sep
                        break
                break

    # If majority (>=60%) are key-value
    if kv_count >= len(outputs) * 0.6:
        # Count keys case-insensitively; keep the most frequent ones.
        key_counts = {}
        for key in all_keys:
            normalized = key.strip().lower()
            key_counts[normalized] = key_counts.get(normalized, 0) + 1

        common_keys = [k for k, v in sorted(key_counts.items(), key=lambda x: -x[1])
                      if v >= kv_count * 0.4][:5]  # Top 5 keys

        # Determine the exact format pattern
        sep_display = detected_separator if detected_separator else ' | '
        format_spec = f"Key-value pairs: {sep_display.join([f'{k}: [value]' for k in common_keys])}"
        format_example = outputs[0] if outputs else ''

        return _create_format_result(
            'key_value',
            format_spec,
            format_example,
            common_keys,
            avg_length,
            max_length
        )

    return None


def _detect_list_format(outputs: List[str], avg_length: int, max_length: int) -> Optional[Dict[str, Any]]:
    """Detect bullet/numbered list formats."""
    list_patterns = [
        r'^[-*โข]\s+',  # Bullet points
        r'^\d+[.)]\s+',  # Numbered list
    ]

    list_count = 0

    for output in outputs:
        lines = output.strip().split('\n')
        list_lines = 0
        for line in lines:
            for pattern in list_patterns:
                if re.match(pattern, line.strip()):
                    list_lines += 1
                    break

        if list_lines >= len(lines) * 0.5:  # Majority of lines are list items
            list_count += 1

    # Majority (>=60%) of outputs are list-shaped.
    if list_count >= len(outputs) * 0.6:
        return _create_format_result(
            'list',
            'Bullet or numbered list format',
            outputs[0][:200] if outputs else '',
            [],
            avg_length,
            max_length
        )

    return None


def _detect_structured_text(outputs: List[str], avg_length: int, max_length: int) -> Optional[Dict[str, Any]]:
    """Detect structured text: outputs that consistently span multiple lines."""
    line_counts = [len(o.strip().split('\n')) for o in outputs]
    avg_lines = sum(line_counts) // len(line_counts) if line_counts else 1

    if avg_lines >= 2:
        return _create_format_result(
            'structured_text',
            f'Structured text with ~{avg_lines} lines',
            outputs[0][:200] if outputs else '',
            [],
            avg_length,
            max_length
        )

    return None


def _create_format_result(
    format_type: str,
    format_spec: str,
    format_example: str,
    detected_keys: List[str],
    avg_length: int,
    max_length: int = 0
) -> Dict[str, Any]:
    """Create a standardized format detection result.

    All detectors (and the fallbacks in detect_output_format) funnel through
    this helper so the returned dict always has the same keys.
    """
    # Generate format constraint based on type
    if format_type == 'json':
        constraint = f"""OUTPUT FORMAT REQUIREMENT:
- Return ONLY a valid JSON object
- Required keys: {', '.join(detected_keys) if detected_keys else 'as shown in examples'}
- NO explanations, NO prose, NO markdown code blocks
- Maximum length: ~{max_length} characters
- Example format: {format_example[:150]}"""

    elif format_type == 'key_value':
        constraint = f"""OUTPUT FORMAT REQUIREMENT:
- Return ONLY in key-value format: {format_spec}
- NO explanations, NO reasoning, NO additional text
- Be CONCISE - output should be ~{avg_length} characters max
- Example: {format_example}"""

    elif format_type == 'list':
        constraint = f"""OUTPUT FORMAT REQUIREMENT:
- Return as a bullet or numbered list
- NO explanations before or after the list
- Keep it concise (~{avg_length} characters)"""

    elif format_type == 'structured_text':
        constraint = f"""OUTPUT FORMAT REQUIREMENT:
- Follow the structured format shown in examples
- NO additional explanations or commentary
- Keep output concise (~{avg_length} characters)"""

    else:
        constraint = f"""OUTPUT FORMAT REQUIREMENT:
- Keep response CONCISE and DIRECT
- NO lengthy explanations or reasoning
- Target length: ~{avg_length} characters (max {max_length})
- Match the format/style of the expected examples"""

    return {
        'format_type': format_type,
        'format_spec': format_spec,
        'format_example': format_example[:200] if format_example else '',
        'format_constraint': constraint,
        'detected_keys': detected_keys,
        'avg_length': avg_length,
        'max_length': max_length
    }
+
+
def build_format_aware_reflection_prompt(
    base_prompt: str,
    format_info: Dict[str, Any],
    include_example: bool = True
) -> str:
    """
    Enhance a reflection prompt with format awareness.

    Args:
        base_prompt: The original reflection prompt
        format_info: Format detection result from detect_output_format()
        include_example: Whether to include format example

    Returns:
        Enhanced prompt with format constraints appended; the base prompt
        unchanged when no usable format information is available
    """
    # Nothing reliable to enforce -- leave the prompt untouched.
    if not format_info or format_info.get('format_type') == 'unknown':
        return base_prompt

    parts = [base_prompt]
    parts.append(f"""

๐ฏ CRITICAL FORMAT REQUIREMENT:
The optimized prompt MUST produce outputs that match this EXACT format:

{format_info['format_constraint']}

โ ๏ธ COMMON FAILURE MODES TO AVOID:
1. Generating explanations when only the answer is needed
2. Adding "Here's the analysis..." or similar preambles
3. Producing verbose output when concise is required
4. Wrong structure (e.g., prose instead of key-value pairs)
""")

    if include_example and format_info.get('format_example'):
        parts.append(f"""
๐ EXAMPLE OF CORRECT OUTPUT FORMAT:
{format_info['format_example']}
""")

    # Format guidance goes at the end, after the base prompt's instructions.
    return "".join(parts)
+
+
def generate_format_feedback(
    predicted_output: str,
    expected_output: str,
    format_info: Dict[str, Any]
) -> str:
    """
    Generate specific feedback about format compliance.

    Args:
        predicted_output: What the model actually produced
        expected_output: The ground truth output
        format_info: Format detection result

    Returns:
        Specific format-related feedback, or an empty string when the
        output looks compliant
    """
    predicted_len = len(predicted_output) if predicted_output else 0
    # NOTE(review): expected_output's length is computed but not otherwise
    # used below -- kept for parity; confirm whether callers rely on it.
    expected_len = len(expected_output) if expected_output else 0

    issues = []

    # Length check: flag outputs far longer than the dataset's typical one.
    if format_info.get('avg_length', 0) > 0:
        if predicted_len > format_info['avg_length'] * 3:
            issues.append(f"OUTPUT TOO VERBOSE: Generated {predicted_len} chars, expected ~{format_info['avg_length']} chars")
        elif predicted_len > format_info.get('max_length', predicted_len) * 2:
            issues.append(f"OUTPUT TOO LONG: {predicted_len} chars vs max expected {format_info.get('max_length', 'unknown')}")

    # Structural check, specific to the detected format type.
    format_type = format_info.get('format_type', 'unknown')
    if format_type == 'json':
        try:
            json.loads(predicted_output.strip() if predicted_output else '{}')
        except json.JSONDecodeError:
            issues.append("FORMAT ERROR: Expected JSON but got non-JSON output")
    elif format_type == 'key_value':
        if predicted_output and ':' not in predicted_output:
            issues.append("FORMAT ERROR: Expected key-value pairs (Key: Value) but output lacks this structure")

    # Phrase check: preambles that usually signal an over-explained answer.
    verbose_indicators = [
        'let me', 'i will', 'here is', "here's", 'analysis:', 'step-by-step',
        'first,', 'to begin', 'in order to', 'the following', 'please note'
    ]
    if predicted_output:
        lowered = predicted_output.lower()
        matched = [phrase for phrase in verbose_indicators if phrase in lowered]
        if matched:
            issues.append(f"VERBOSITY WARNING: Output contains explanatory phrases: {', '.join(matched[:3])}")

    if not issues:
        return ""

    return "\n๐จ FORMAT ISSUES DETECTED:\n" + "\n".join(f"  โข {issue}" for issue in issues)
diff --git a/src/gepa_optimizer/utils/helpers.py b/src/gepa_optimizer/utils/helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..cbd57f5d498d68feecf2b293159ffdf665376162
--- /dev/null
+++ b/src/gepa_optimizer/utils/helpers.py
@@ -0,0 +1,23 @@
+"""
+Helper functions for GEPA Optimizer
+"""
+
def sanitize_prompt(prompt: str) -> str:
    """
    Coerce, trim, and default a prompt value.

    Non-string inputs are converted with ``str()``, surrounding whitespace
    is removed, and an empty result falls back to a generic assistant prompt.

    Args:
        prompt: Raw prompt value to clean.

    Returns:
        str: A non-empty, whitespace-trimmed prompt.
    """
    text = prompt if isinstance(prompt, str) else str(prompt)
    text = text.strip()
    # Empty prompts are replaced with a safe, generic default.
    return text or "You are a helpful assistant."
diff --git a/src/gepa_optimizer/utils/llm_judge_prompt.py b/src/gepa_optimizer/utils/llm_judge_prompt.py
new file mode 100644
index 0000000000000000000000000000000000000000..fcab0497cde56ad02b9db146d556594add9881cd
--- /dev/null
+++ b/src/gepa_optimizer/utils/llm_judge_prompt.py
@@ -0,0 +1,322 @@
+"""
+LLM-as-Judge Prompt for Index Caching Use Case
+
+This module provides a specialized LLM-as-Judge prompt template for analyzing
+index caching evaluation results and generating actionable feedback for prompt improvement.
+"""
+
+from typing import Dict, Any, Optional
+
+
def build_index_caching_judge_prompt(
    task_command: str,
    predicted_dict: Dict[str, Any],
    expected_dict: Dict[str, Any],
    predicted_output: str,
    expected_output: str,
    current_prompt: Optional[str] = None,
    evaluation_results: Optional[Dict[str, Any]] = None,
    image_base64: Optional[str] = None
) -> str:
    """
    Build LLM-as-Judge prompt for index caching use case.

    This prompt analyzes why the LLM failed to correctly identify:
    - is_index_based (boolean)
    - index_value (int or null)
    - parent_element_id (string or null)
    - element_id_of_nth_child_of_parent (string or null)
    - selected_element_is_correct (boolean)

    Args:
        task_command: The natural language command
        predicted_dict: Parsed predicted JSON output
        expected_dict: Parsed expected JSON output
        predicted_output: Raw predicted output string
        expected_output: Raw expected output string
        current_prompt: Current system prompt being optimized
        evaluation_results: Full evaluation results with field scores
        image_base64: Optional base64 encoded screenshot

    Returns:
        Formatted judge prompt string
    """

    # Extract field values for comparison.
    # NOTE(review): .get() yields None for missing keys, so an absent field
    # compares as None below — confirm upstream parsing guarantees these keys.
    pred_is_index = predicted_dict.get("is_index_based")
    exp_is_index = expected_dict.get("is_index_based")
    pred_index_val = predicted_dict.get("index_value")
    exp_index_val = expected_dict.get("index_value")
    pred_parent = predicted_dict.get("parent_element_id")
    exp_parent = expected_dict.get("parent_element_id")
    pred_element = predicted_dict.get("element_id_of_nth_child_of_parent")
    exp_element = expected_dict.get("element_id_of_nth_child_of_parent")
    pred_selected = predicted_dict.get("selected_element_is_correct")
    exp_selected = expected_dict.get("selected_element_is_correct")

    # Extract notes/reasoning if available
    pred_notes = predicted_dict.get("notes", "")
    exp_notes = expected_dict.get("notes", "")

    # Get field scores from evaluation results (defaults to 0.0 per field
    # when evaluation_results is None or a key is missing).
    field_scores = {}
    if evaluation_results:
        field_scores = {
            "is_index_based": evaluation_results.get("is_index_based_match", 0.0),
            "index_value": evaluation_results.get("index_value_match", 0.0),
            "parent_element_id": evaluation_results.get("parent_element_id_match", 0.0),
            "element_id_of_nth_child": evaluation_results.get("element_id_of_nth_child_match", 0.0),
            "selected_element_is_correct": evaluation_results.get("selected_element_correct_match", 0.0),
        }

    # Build field-by-field comparison.
    # NOTE(review): the status glyphs below ('โ ...') appear mojibake-encoded —
    # verify this file is read/written as UTF-8 so the intended check/cross
    # marks survive.
    field_comparisons = []

    # 1. is_index_based
    is_index_match = pred_is_index == exp_is_index
    field_comparisons.append(f"""
1. **is_index_based** ({'โ CORRECT' if is_index_match else 'โ WRONG'}):
   - Expected: {exp_is_index}
   - Predicted: {pred_is_index}
   - Score: {field_scores.get('is_index_based', 0.0):.0%}
""")

    # 2. index_value
    index_val_match = pred_index_val == exp_index_val
    field_comparisons.append(f"""
2. **index_value** ({'โ CORRECT' if index_val_match else 'โ WRONG'}):
   - Expected: {exp_index_val}
   - Predicted: {pred_index_val}
   - Score: {field_scores.get('index_value', 0.0):.0%}
""")

    # 3. parent_element_id
    parent_match = pred_parent == exp_parent
    field_comparisons.append(f"""
3. **parent_element_id** ({'โ CORRECT' if parent_match else 'โ WRONG'}):
   - Expected: {exp_parent}
   - Predicted: {pred_parent}
   - Score: {field_scores.get('parent_element_id', 0.0):.0%}
""")

    # 4. element_id_of_nth_child_of_parent
    element_match = pred_element == exp_element
    field_comparisons.append(f"""
4. **element_id_of_nth_child_of_parent** ({'โ CORRECT' if element_match else 'โ WRONG'}):
   - Expected: {exp_element}
   - Predicted: {pred_element}
   - Score: {field_scores.get('element_id_of_nth_child', 0.0):.0%}
""")

    # 5. selected_element_is_correct
    selected_match = pred_selected == exp_selected
    field_comparisons.append(f"""
5. **selected_element_is_correct** ({'โ CORRECT' if selected_match else 'โ WRONG'}):
   - Expected: {exp_selected}
   - Predicted: {pred_selected}
   - Score: {field_scores.get('selected_element_is_correct', 0.0):.0%}
""")

    # Visual analysis instruction (only included when a screenshot is passed).
    visual_instruction = ""
    if image_base64:
        visual_instruction = """
๐ผ๏ธ VISUAL ANALYSIS (You can see the screenshot):
- Look at the annotated screenshot with bounding boxes
- Identify which element is highlighted (the target element)
- Understand the UI structure and hierarchy
- Analyze why the LLM might have misidentified the parent container or nth child
"""

    judge_prompt = f"""You are an expert prompt engineer specializing in mobile UI automation and index-based element selection prompts.

{"You can SEE the mobile app screenshot with annotated bounding boxes." if image_base64 else "You are analyzing text descriptions only (no image provided)."}

TASK: Improve the SYSTEM PROMPT to better guide the LLM in correctly identifying index-based element selection.

CONTEXT:
- Task Command: "{task_command}"

FULL EXPECTED OUTPUT (Ground Truth JSON):
```json
{expected_output}
```

FULL PREDICTED OUTPUT (What the LLM Actually Returned):
```json
{predicted_output}
```

FIELD-BY-FIELD COMPARISON:
{''.join(field_comparisons)}
{visual_instruction if image_base64 else ""}

EXPECTED REASONING (from notes):
{exp_notes if exp_notes else "N/A - No reasoning provided in expected output"}

PREDICTED REASONING (from notes):
{pred_notes if pred_notes else "N/A - No reasoning provided in predicted output"}

CURRENT SYSTEM PROMPT (being optimized):
{current_prompt if current_prompt else "N/A"}

ANALYSIS REQUIRED:

1. **is_index_based Analysis** (CRITICAL):
   - Why did the LLM classify this as {"index-based" if pred_is_index else "non-index-based"} when it should be {"index-based" if exp_is_index else "non-index-based"}?
   - What specific words or patterns in the command "{task_command}" should have led to the correct classification?
   - What instruction in the prompt failed to guide correct classification?
   - What edge case or ambiguity caused the misclassification?

2. **index_value Analysis** (if is_index_based should be true):
   - Why did the LLM extract index_value={pred_index_val} when it should be {exp_index_val}?
   - What ordinal word ("first", "second", "third", etc.) in "{task_command}" should have been converted to {exp_index_val}?
   - Did the LLM fail to recognize the ordinal, or did it count incorrectly?
   - What instruction would help the LLM correctly parse ordinals?

3. **parent_element_id Analysis** (if is_index_based should be true):
   - Why did the LLM identify parent_element_id="{pred_parent}" when it should be "{exp_parent}"?
   - What container in the XML hierarchy should have been identified as the parent?
   - Did the LLM fail to walk up the hierarchy correctly?
   - Did the LLM include non-item children (like headers) in the parent container?
   - What instruction would help the LLM identify the correct parent container?

4. **element_id_of_nth_child_of_parent Analysis** (if is_index_based should be true):
   - Why did the LLM identify element_id_of_nth_child_of_parent="{pred_element}" when it should be "{exp_element}"?
   - What is the outermost component representing the nth item?
   - Did the LLM select a nested child instead of the full item?
   - Did the LLM count items incorrectly (wrong nth position)?
   - What instruction would help the LLM identify the correct outermost item?

5. **selected_element_is_correct Analysis**:
   - Why did the LLM determine selected_element_is_correct={pred_selected} when it should be {exp_selected}?
   - Is the highlighted element actually the correct target for the command?
   - What visual or structural cue did the LLM miss or misinterpret?

6. **Prompt Weakness Identification**:
   - Which specific instruction in the current system prompt is missing, unclear, or misleading?
   - What concept from the expected reasoning should the prompt emphasize more?
   - What edge case handling is missing?
   - What example or clarification would help?

7. **Actionable Prompt Improvement**:
   - What exact instruction should be ADDED to fix each failing field?
   - What should be REMOVED or CLARIFIED?
   - What specific wording would guide the LLM to the correct field values?
   - How can the prompt help the LLM follow the same logic as the expected output?

OUTPUT FORMAT (JSON):
{{
    "is_index_based_error": "Specific explanation of why is_index_based classification was wrong. Reference the command and explain what pattern should have been recognized.",
    "index_value_error": "If index_value was wrong, explain why. What ordinal word should have been converted to which number?",
    "parent_element_id_error": "If parent_element_id was wrong, explain why. What container should have been identified and why?",
    "element_id_of_nth_child_error": "If element_id_of_nth_child_of_parent was wrong, explain why. What item should have been selected and why?",
    "selected_element_correct_error": "If selected_element_is_correct was wrong, explain why. Is the highlighted element actually correct?",
    "key_weakness": "The single most important prompt weakness that caused the most errors",
    "missing_instruction": "What specific instruction should be added to address the key weakness",
    "improvement_suggestion": "Specific, actionable prompt improvement that addresses all field errors",
    "example_instruction": "An example instruction that would help the LLM correctly identify all 5 fields"
}}

CRITICAL: Your analysis must focus on WHY each of the 5 fields was wrong. Be specific about:
- Command interpretation (for is_index_based)
- Ordinal parsing (for index_value)
- XML hierarchy traversal (for parent_element_id and element_id_of_nth_child_of_parent)
- Element correctness assessment (for selected_element_is_correct)

Reference the task command, expected vs predicted values, and provide actionable improvements to the system prompt."""

    return judge_prompt
+
+
def format_index_caching_judge_feedback(
    judge_output: str,
    predicted_dict: Dict[str, Any],
    expected_dict: Dict[str, Any],
    task_command: str,
    field_scores: Dict[str, float]
) -> str:
    """
    Format LLM-as-Judge output into structured feedback.

    Attempts to locate and parse a JSON object in the judge's raw reply; on
    success, renders a structured per-field report. On any parse failure the
    raw judge output is returned verbatim (prefixed), so callers always get
    a non-empty string.

    Args:
        judge_output: Raw output from LLM-as-Judge
        predicted_dict: Parsed predicted JSON
        expected_dict: Parsed expected JSON
        task_command: The task command
        field_scores: Field-by-field scores from evaluation

    Returns:
        Formatted feedback string
    """
    import json
    import re

    # Try to parse JSON from judge output.
    # NOTE(review): this regex tolerates only one level of nested braces, so a
    # judge reply with deeper nesting (or braces inside string values) falls
    # through to the raw-output fallback below.
    json_match = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', judge_output, re.DOTALL)
    if json_match:
        try:
            analysis = json.loads(json_match.group(0))

            # Build formatted feedback. Every analysis field degrades to 'N/A'
            # if the judge omitted it.
            feedback = f"""โ INDEX CACHING EVALUATION FAILURE

๐ FIELD-BY-FIELD ANALYSIS:

๐ is_index_based Error:
   Expected: {expected_dict.get('is_index_based')}, Predicted: {predicted_dict.get('is_index_based')}
   {analysis.get('is_index_based_error', 'N/A')}

๐ index_value Error:
   Expected: {expected_dict.get('index_value')}, Predicted: {predicted_dict.get('index_value')}
   {analysis.get('index_value_error', 'N/A')}

๐ parent_element_id Error:
   Expected: {expected_dict.get('parent_element_id')}, Predicted: {predicted_dict.get('parent_element_id')}
   {analysis.get('parent_element_id_error', 'N/A')}

๐ element_id_of_nth_child_of_parent Error:
   Expected: {expected_dict.get('element_id_of_nth_child_of_parent')}, Predicted: {predicted_dict.get('element_id_of_nth_child_of_parent')}
   {analysis.get('element_id_of_nth_child_error', 'N/A')}

๐ selected_element_is_correct Error:
   Expected: {expected_dict.get('selected_element_is_correct')}, Predicted: {predicted_dict.get('selected_element_is_correct')}
   {analysis.get('selected_element_correct_error', 'N/A')}

๐ KEY WEAKNESS:
{analysis.get('key_weakness', 'N/A')}

๐ก MISSING INSTRUCTION:
{analysis.get('missing_instruction', 'N/A')}

๐ก IMPROVEMENT SUGGESTION:
{analysis.get('improvement_suggestion', 'N/A')}

๐ EXAMPLE INSTRUCTION:
{analysis.get('example_instruction', 'N/A')}

๐ญ CONTEXT:
- Task: "{task_command}"
- Field Scores: is_index_based={field_scores.get('is_index_based', 0.0):.0%}, index_value={field_scores.get('index_value', 0.0):.0%}, parent_element_id={field_scores.get('parent_element_id', 0.0):.0%}, element_id_of_nth_child={field_scores.get('element_id_of_nth_child', 0.0):.0%}, selected_element_is_correct={field_scores.get('selected_element_is_correct', 0.0):.0%}"""

            return feedback
        except json.JSONDecodeError:
            # Malformed JSON from the judge — fall through to raw output.
            pass

    # Fallback to raw output
    return f"LLM-as-Judge Analysis (Index Caching):\n{judge_output}"
+
+
# System prompt for the LLM-as-Judge model. Presumably paired with the user
# message produced by build_index_caching_judge_prompt() — confirm at the
# call site.
INDEX_CACHING_JUDGE_SYSTEM_PROMPT = """You are an expert prompt engineer analyzing mobile UI automation prompts for index-based element selection.

Your task is to analyze why an LLM failed to correctly identify index-based element selection fields and provide actionable feedback to improve the system prompt.

Focus on:
- Command interpretation (is_index_based classification)
- Ordinal parsing (index_value extraction)
- XML hierarchy traversal (parent_element_id and element_id_of_nth_child_of_parent)
- Element correctness assessment (selected_element_is_correct)

You can see the screenshot with annotated bounding boxes if provided. Analyze the visual structure to understand why the LLM made errors."""
+
diff --git a/src/gepa_optimizer/utils/log_parser.py b/src/gepa_optimizer/utils/log_parser.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9889a1e09f0bbbfd1378caf3b230bcc5d527a25
--- /dev/null
+++ b/src/gepa_optimizer/utils/log_parser.py
@@ -0,0 +1,298 @@
+"""
+Log Parser for Extracting Candidates and Feedback
+
+Parses optimization logs to extract candidate prompts, feedback, and scores.
+"""
+
+import re
+from typing import List, Dict, Optional, Tuple
+from pathlib import Path
+
+
class OptimizationLogParser:
    """Parse optimization logs to extract candidates and feedback.

    The parser is purely regex-based: it slices the raw log into
    per-iteration sections, then scans each section for candidate prompts,
    feedback blocks, and numeric scores, associating them by proximity.
    """

    def __init__(self, log_file: str):
        """
        Initialize parser with log file path.

        A missing file is tolerated: ``self.content`` stays empty and every
        extract_* method simply returns no results.

        Args:
            log_file: Path to log file
        """
        self.log_file = Path(log_file)
        self.content = ""
        if self.log_file.exists():
            with open(self.log_file, 'r', encoding='utf-8') as f:
                self.content = f.read()

    def extract_iterations(self) -> List[Dict]:
        """Extract iteration information from logs.

        Returns:
            List of dicts, one per iteration marker found, each with
            'iteration' (int), 'content' (log text from this marker up to
            the next one), and 'start_pos' (character offset in the log).
        """
        # Pattern to find iteration markers: explicit "Iteration N" lines or
        # the optimization-start banner (with or without its emoji prefix).
        iteration_pattern = r'Iteration\s+(\d+)|Starting GEPA optimization|๐ Starting GEPA optimization'

        # Scan for all markers once up front; iteration i's content is the
        # span between marker i and marker i+1. (Previously the full marker
        # list was recomputed inside the loop, making this O(n^2).)
        matches = list(re.finditer(iteration_pattern, self.content))

        iterations = []
        for idx, match in enumerate(matches):
            # Banner markers carry no number; default them to iteration 1.
            iter_num = int(match.group(1)) if match.group(1) else 1

            start_pos = match.start()
            end_pos = matches[idx + 1].start() if idx + 1 < len(matches) else len(self.content)

            iterations.append({
                'iteration': iter_num,
                'content': self.content[start_pos:end_pos],
                'start_pos': start_pos
            })

        return iterations

    def extract_candidates(self, iteration_content: str) -> List[Dict]:
        """
        Extract candidate prompts from iteration content.

        Args:
            iteration_content: Content for a single iteration

        Returns:
            List of candidate dictionaries with 'source', 'prompt', and
            'position' keys, sorted by position in the log section.
        """
        candidates = []

        # Pattern 1: GEPA Reflection candidates
        # Look for "PROPOSED PROMPT" or "๐ PROPOSED PROMPT"
        gepa_patterns = [
            r'๐ PROPOSED PROMPT.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐|๐|$)',
            r'PROPOSED PROMPT.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐|๐|$)',
            r'GEPA REFLECTION.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐|๐|$)',
        ]

        for pattern in gepa_patterns:
            for match in re.finditer(pattern, iteration_content, re.DOTALL):
                prompt = match.group(1).strip()
                # Very short captures are regex noise, not real prompts.
                if prompt and len(prompt) > 20:  # Valid prompt
                    candidates.append({
                        'source': 'GEPA_Reflection',
                        'prompt': prompt,
                        'position': match.start()
                    })

        # Pattern 2: LLEGO Crossover candidates
        crossover_patterns = [
            r'๐งฌ Crossover.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐|๐|$)',
            r'Crossover.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐|๐|$)',
        ]

        for pattern in crossover_patterns:
            for match in re.finditer(pattern, iteration_content, re.DOTALL):
                prompt = match.group(1).strip()
                if prompt and len(prompt) > 20:
                    candidates.append({
                        'source': 'LLEGO_Crossover',
                        'prompt': prompt,
                        'position': match.start()
                    })

        # Pattern 3: LLEGO Mutation candidates
        mutation_patterns = [
            r'๐ฒ Mutation.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐|๐|$)',
            r'Mutation.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐|๐|$)',
        ]

        for pattern in mutation_patterns:
            for match in re.finditer(pattern, iteration_content, re.DOTALL):
                prompt = match.group(1).strip()
                if prompt and len(prompt) > 20:
                    candidates.append({
                        'source': 'LLEGO_Mutation',
                        'prompt': prompt,
                        'position': match.start()
                    })

        # Pattern 4: Generic candidate markers
        # Look for prompts in quotes or code blocks
        generic_patterns = [
            r'"([^"]{50,})"',  # Quoted prompts
            r'```\s*(.*?)\s*```',  # Code blocks
        ]

        for pattern in generic_patterns:
            for match in re.finditer(pattern, iteration_content, re.DOTALL):
                prompt = match.group(1).strip()
                # Check if it looks like a prompt (contains task instructions)
                if (len(prompt) > 50 and
                    any(keyword in prompt.lower() for keyword in
                        ['you are', 'task', 'instruction', 'element', 'identify', 'select'])):
                    # De-duplicate against the explicitly-labeled patterns above.
                    if not any(c['prompt'] == prompt for c in candidates):
                        candidates.append({
                            'source': 'Unknown',
                            'prompt': prompt,
                            'position': match.start()
                        })

        # Sort by position so downstream proximity matching works.
        candidates.sort(key=lambda x: x['position'])

        return candidates

    def extract_feedback(self, iteration_content: str) -> List[Dict]:
        """
        Extract feedback from iteration content.

        Args:
            iteration_content: Content for a single iteration

        Returns:
            List of feedback dictionaries with 'feedback' and 'position'
            keys (plus 'source' for judge-produced feedback), sorted by
            position.
        """
        feedback_list = []

        # Pattern 1: Explicit feedback markers
        feedback_patterns = [
            r'๐ฌ FEEDBACK:\s*(.*?)(?=\n\n|\n๐|\n๐|\n๐ก|$)',
            r'FEEDBACK:\s*(.*?)(?=\n\n|\n๐|\n๐|\n๐ก|$)',
            r'Feedback:\s*(.*?)(?=\n\n|\n๐|\n๐|\n๐ก|$)',
        ]

        for pattern in feedback_patterns:
            for match in re.finditer(pattern, iteration_content, re.DOTALL):
                feedback_text = match.group(1).strip()
                # Skip trivially short captures.
                if feedback_text and len(feedback_text) > 10:
                    feedback_list.append({
                        'feedback': feedback_text,
                        'position': match.start()
                    })

        # Pattern 2: LLM-as-Judge feedback
        judge_patterns = [
            r'LLM-as-Judge.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐|๐|$)',
            r'Judge Feedback.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐|๐|$)',
        ]

        for pattern in judge_patterns:
            for match in re.finditer(pattern, iteration_content, re.DOTALL):
                feedback_text = match.group(1).strip()
                if feedback_text and len(feedback_text) > 10:
                    feedback_list.append({
                        'feedback': feedback_text,
                        'position': match.start(),
                        'source': 'LLM-as-Judge'
                    })

        # Sort by position
        feedback_list.sort(key=lambda x: x['position'])

        return feedback_list

    def extract_scores(self, iteration_content: str) -> List[Dict]:
        """
        Extract scores from iteration content.

        Args:
            iteration_content: Content for a single iteration

        Returns:
            List of score dictionaries with 'score' (float) and 'position'
            keys, sorted by position.
        """
        scores = []

        # Pattern for scores
        score_patterns = [
            r'Score:\s*([\d.]+)',
            r'Average score:\s*([\d.]+)',
            r'๐ฏ SCORE:\s*([\d.]+)',
            r'๐ Score:\s*([\d.]+)',
        ]

        for pattern in score_patterns:
            for match in re.finditer(pattern, iteration_content):
                score_value = float(match.group(1))
                scores.append({
                    'score': score_value,
                    'position': match.start()
                })

        # Sort by position
        scores.sort(key=lambda x: x['position'])

        return scores

    def parse_all(self) -> Dict:
        """
        Parse entire log file and extract all information.

        Each candidate is annotated in place with the nearest following
        score ('score' key) and the nearest following feedback within 5000
        characters ('feedback' key), when available.

        Returns:
            Dictionary with per-iteration details plus flattened
            'all_candidates' / 'all_feedback' lists.
        """
        iterations = self.extract_iterations()

        result = {
            'iterations': [],
            'total_iterations': len(iterations),
            'all_candidates': [],
            'all_feedback': []
        }

        for iter_info in iterations:
            iter_num = iter_info['iteration']
            iter_content = iter_info['content']

            candidates = self.extract_candidates(iter_content)
            feedback = self.extract_feedback(iter_content)
            scores = self.extract_scores(iter_content)

            # Try to associate scores with candidates
            for candidate in candidates:
                # Find nearest score after this candidate
                candidate_pos = candidate['position']
                nearest_score = None
                min_distance = float('inf')

                for score_info in scores:
                    if score_info['position'] > candidate_pos:
                        distance = score_info['position'] - candidate_pos
                        if distance < min_distance:
                            min_distance = distance
                            nearest_score = score_info['score']

                if nearest_score is not None:
                    candidate['score'] = nearest_score

                # Try to associate feedback (same nearest-following rule, but
                # capped so unrelated distant feedback is not attached).
                nearest_feedback = None
                min_distance = float('inf')

                for feedback_info in feedback:
                    if feedback_info['position'] > candidate_pos:
                        distance = feedback_info['position'] - candidate_pos
                        if distance < min_distance and distance < 5000:  # Within reasonable distance
                            min_distance = distance
                            nearest_feedback = feedback_info['feedback']

                if nearest_feedback:
                    candidate['feedback'] = nearest_feedback

            result['iterations'].append({
                'iteration': iter_num,
                'candidates': candidates,
                'feedback': feedback,
                'scores': scores
            })

            result['all_candidates'].extend(candidates)
            result['all_feedback'].extend(feedback)

        return result
+
diff --git a/src/gepa_optimizer/utils/logging.py b/src/gepa_optimizer/utils/logging.py
new file mode 100644
index 0000000000000000000000000000000000000000..401a95751534521d769f664e15639e8240d26be9
--- /dev/null
+++ b/src/gepa_optimizer/utils/logging.py
@@ -0,0 +1,107 @@
+"""
+Logging setup for GEPA Optimizer.
+
+This module provides backward-compatible logging functions that delegate
+to the centralized logging infrastructure.
+
+For new code, prefer importing directly from infrastructure.logging:
+ from gepa_optimizer.infrastructure.logging import get_logger, configure_logging
+"""
+
+import logging
+from pathlib import Path
+from datetime import datetime
+from typing import Optional, Union
+
+# Import from centralized infrastructure
+from ..infrastructure.logging import (
+ get_logger as _get_logger,
+ configure_logging as _configure_logging,
+ LogLevel,
+)
+
+
def setup_logging(
    level: str = "INFO",
    log_file: Optional[Union[str, bool]] = None,
    use_colors: bool = True,
    include_emoji: bool = True,
) -> None:
    """
    Configure logging for GEPA Optimizer with optional file logging.

    Backward-compatible wrapper around the centralized
    ``configure_logging()``; new code should call that directly.

    Args:
        level: Logging level name (e.g. "DEBUG", "INFO", "WARNING").
        log_file: Controls file logging:
            - None: auto-generate a timestamped file under ``logs/``
            - False: console-only logging
            - str: log to the given path (parent dirs created as needed)
        use_colors: Whether console output is colorized.
        include_emoji: Whether log messages include emoji.

    Example:
        setup_logging(level="DEBUG", log_file="optimization.log")
    """
    actual_log_file: Optional[str]

    if log_file is False:
        # File logging explicitly disabled.
        actual_log_file = None
    elif log_file is None:
        # Default: timestamped log file under ./logs
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        log_dir = Path("logs")
        log_dir.mkdir(exist_ok=True)
        actual_log_file = str(log_dir / f"optimization_{timestamp}.log")
    else:
        # Caller-specified path; make sure its directory exists.
        Path(log_file).parent.mkdir(parents=True, exist_ok=True)
        actual_log_file = str(log_file)

    # Delegate the real handler/formatter setup to the central infrastructure.
    _configure_logging(
        level=level,
        log_file=actual_log_file,
        use_colors=use_colors,
        include_emoji=include_emoji,
    )

    # Announce the resulting configuration through the freshly-set-up logger.
    logger = _get_logger(__name__)
    if actual_log_file:
        logger.info(f"Logging to file: {actual_log_file}")
        logger.info(f"Logging configured at {level} level (console + file)")
    else:
        logger.info(f"Logging configured at {level} level (console only)")
+
+
def get_logger(name: str) -> logging.Logger:
    """
    Get a logger for a specific module.

    Thin pass-through to the centralized infrastructure logger factory, kept
    for backward compatibility. New code should use:
        from gepa_optimizer.infrastructure.logging import get_logger

    Args:
        name: Module name (typically __name__)

    Returns:
        Configured Logger instance
    """
    return _get_logger(name)
+
+
# Re-export for convenience: the public surface of this backward-compat shim.
__all__ = [
    "setup_logging",
    "get_logger",
]
diff --git a/src/gepa_optimizer/utils/metrics.py b/src/gepa_optimizer/utils/metrics.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e1bbfb9884fd93d548a93a199cbfafd3d8edd96
--- /dev/null
+++ b/src/gepa_optimizer/utils/metrics.py
@@ -0,0 +1,220 @@
+"""
+Comprehensive metrics calculations for GEPA Optimizer
+"""
+
+from typing import Dict, List, Optional, Any
+import re
+import time
+from collections import Counter
+
def calculate_metrics(original_prompt: str,
                      optimized_prompt: str,
                      performance_data: Optional[Dict[str, Any]] = None) -> Dict[str, float]:
    """
    Calculate comprehensive improvement metrics between original and optimized prompts.

    Covers character/word growth, complexity drift, Jaccard similarity, and
    any numeric GEPA performance data (re-keyed with a ``gepa_`` prefix).

    Args:
        original_prompt: Original seed prompt
        optimized_prompt: GEPA-optimized prompt
        performance_data: Optional performance metrics from GEPA

    Returns:
        Dict[str, float]: Comprehensive metrics dictionary
    """
    orig_len, opt_len = len(original_prompt), len(optimized_prompt)
    orig_words, opt_words = len(original_prompt.split()), len(optimized_prompt.split())

    def _pct_change(new, old):
        # Percent growth relative to the original; 0 when there is no baseline.
        return ((new - old) / old) * 100 if old > 0 else 0.0

    orig_complexity = calculate_text_complexity(original_prompt)
    opt_complexity = calculate_text_complexity(optimized_prompt)

    metrics: Dict[str, float] = {
        'length_change_percent': _pct_change(opt_len, orig_len),
        'original_length': orig_len,
        'optimized_length': opt_len,
        'word_change_percent': _pct_change(opt_words, orig_words),
        'original_words': orig_words,
        'optimized_words': opt_words,
        'original_complexity': orig_complexity,
        'optimized_complexity': opt_complexity,
        'complexity_change': opt_complexity - orig_complexity,
        'similarity_score': calculate_similarity(original_prompt, optimized_prompt),
    }

    # Fold in any numeric GEPA performance data under a 'gepa_' prefix.
    if performance_data:
        for key, value in performance_data.items():
            if isinstance(value, (int, float)):
                metrics[f'gepa_{key}'] = float(value)

    return metrics
+
def calculate_text_complexity(text: str) -> float:
    """
    Calculate a simple complexity score for text.

    Combines average word length, lexical diversity, and average sentence
    length into a single weighted value.

    Args:
        text: Text to analyze

    Returns:
        float: Complexity score rounded to 3 decimals (higher = more complex);
        0.0 for empty or whitespace-only input.
    """
    if not text:
        return 0.0

    words = text.lower().split()
    word_count = len(words)
    if word_count == 0:
        # Whitespace-only input has no measurable complexity.
        return 0.0

    # Sentence boundaries are runs of terminal punctuation.
    sentence_count = len(re.findall(r'[.!?]+', text))

    avg_word_length = len(text) / word_count
    lexical_diversity = len(set(words)) / word_count
    avg_sentence_length = word_count / max(sentence_count, 1)

    # Weighted blend of the three signals.
    score = (
        avg_word_length * 0.3 +
        lexical_diversity * 0.4 +
        avg_sentence_length * 0.3
    )
    return round(score, 3)
+
def calculate_similarity(text1: str, text2: str) -> float:
    """
    Calculate similarity between two texts using simple word overlap.

    Uses the Jaccard index over lowercased word sets.

    Args:
        text1: First text
        text2: Second text

    Returns:
        float: Similarity score between 0 and 1, rounded to 3 decimals.
    """
    # Either text empty -> no meaningful overlap.
    if not text1 or not text2:
        return 0.0

    vocab_a = set(text1.lower().split())
    vocab_b = set(text2.lower().split())

    union_size = len(vocab_a | vocab_b)
    if union_size == 0:
        return 0.0

    return round(len(vocab_a & vocab_b) / union_size, 3)
+
def track_optimization_progress(iteration: int,
                                score: float,
                                improvement: float,
                                time_elapsed: float) -> Dict[str, Any]:
    """
    Track progress during optimization iterations.

    Args:
        iteration: Current iteration number
        score: Current performance score
        improvement: Improvement over baseline
        time_elapsed: Time elapsed in seconds

    Returns:
        Dict[str, Any]: Rounded progress metrics, including score/second.
    """
    # Floor elapsed time to avoid division by zero on instantaneous iterations.
    elapsed_floor = max(time_elapsed, 0.001)
    progress = {
        'iteration': iteration,
        'score': round(score, 4),
        'improvement': round(improvement, 4),
        'time_elapsed': round(time_elapsed, 2),
        'score_per_second': round(score / elapsed_floor, 4),
    }
    return progress
+
def calculate_cost_efficiency(improvement_percent: float,
                              estimated_cost: float) -> Dict[str, float]:
    """
    Calculate cost efficiency metrics.

    Args:
        improvement_percent: Performance improvement percentage
        estimated_cost: Estimated cost in USD

    Returns:
        Dict[str, float]: Cost efficiency metrics; all-zero values (without
        the 'estimated_cost' key) when the cost is zero or negative.
    """
    # No meaningful ratio when cost is free/unknown.
    if estimated_cost <= 0:
        return {'improvement_per_dollar': 0.0, 'cost_efficiency': 0.0}

    per_dollar = improvement_percent / estimated_cost

    # Normalize to a 0-1 efficiency score (10%/dollar or better caps at 1.0).
    normalized = min(per_dollar / 10.0, 1.0)

    return {
        'improvement_per_dollar': round(per_dollar, 3),
        'cost_efficiency': round(normalized, 3),
        'estimated_cost': estimated_cost,
    }
+
def summarize_optimization_results(metrics: Dict[str, float]) -> str:
    """
    Create a human-readable summary of optimization results.

    Args:
        metrics: Metrics dictionary from calculate_metrics

    Returns:
        str: One-line summary covering length, complexity, and similarity.
    """
    # Length change: +/-5% is treated as "similar".
    length_change = metrics.get('length_change_percent', 0)
    if length_change > 5:
        length_note = f"Prompt expanded by {length_change:.1f}%"
    elif length_change < -5:
        length_note = f"Prompt condensed by {abs(length_change):.1f}%"
    else:
        length_note = "Prompt length remained similar"

    # Complexity change: +/-0.1 is treated as "similar".
    complexity_change = metrics.get('complexity_change', 0)
    if complexity_change > 0.1:
        complexity_note = "increased complexity"
    elif complexity_change < -0.1:
        complexity_note = "reduced complexity"
    else:
        complexity_note = "maintained similar complexity"

    # Similarity bands: >0.8 high, >0.5 moderate, else significant change.
    similarity = metrics.get('similarity_score', 1.0)
    if similarity > 0.8:
        similarity_note = f"high similarity to original ({similarity:.2f})"
    elif similarity > 0.5:
        similarity_note = f"moderate changes from original ({similarity:.2f})"
    else:
        similarity_note = f"significant changes from original ({similarity:.2f})"

    return f"Optimization results: {', '.join([length_note, complexity_note, similarity_note])}"
diff --git a/src/gepa_optimizer/utils/pareto_logger.py b/src/gepa_optimizer/utils/pareto_logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..923873788f0f1a9655b4274552ecd48ef6a136a7
--- /dev/null
+++ b/src/gepa_optimizer/utils/pareto_logger.py
@@ -0,0 +1,461 @@
+"""
+Pareto Front Logger - Tracks candidate comparisons and Pareto front updates
+"""
+
+from typing import Dict, List, Optional
+from collections import defaultdict
+import logging
+
+logger = logging.getLogger(__name__)
+
class ParetoLogger:
    """Tracks candidate evaluations and maintains the Pareto front P.

    Responsibilities:
      * Record every candidate evaluation (prompt, score, type, dataset).
      * For candidates evaluated on the 'dpareto' dataset, decide whether
        they enter the Pareto front of non-dominated solutions.
      * Emit detailed log output explaining every accept/reject decision.

    Baseline rule: the seed prompt S₀ is always admitted (it defines the
    baseline score f(S₀)); any other candidate must score strictly above
    the baseline before it is even compared against the front.

    Dominance is single-objective here: a strictly higher score dominates.

    NOTE(review): the decorative glyphs in the log messages were
    reconstructed from a mojibake-damaged original (the original literals
    contained bytes that rendered as broken multi-line strings); the
    information content of every message is preserved.
    """

    def __init__(self):
        # Audit trail of every evaluation seen, on any dataset.
        # Each entry: {'prompt', 'score', 'type', 'dataset'}.
        self.candidates_evaluated: List[Dict] = []
        # Current Pareto front, kept sorted by score (descending).
        # Each entry: {'prompt', 'score', 'type', 'notation'}.
        self.pareto_front: List[Dict] = []
        # f(S₀): seed prompt's score on 'dpareto'; None until the seed
        # (or set_baseline) provides it.
        self.baseline_score: Optional[float] = None
        # Instance-held logger keeps the class usable in isolation.
        self._log = logging.getLogger(__name__)

    @staticmethod
    def _notation_for(candidate_type: Optional[str]) -> str:
        """Map a candidate type to the display notation used in logs.

        startswith() covers the numbered variants (llego_crossover1/2, ...)
        that the original mapped individually.
        """
        if candidate_type == 'gepa_reflection':
            return 'Sᵣ'
        if candidate_type and candidate_type.startswith('llego_crossover'):
            return 'Oₓₒ'
        if candidate_type and candidate_type.startswith('llego_mutation'):
            return 'Oₘᵤₜ'
        if candidate_type == 'seed':
            return 'S₀'
        # Unknown / missing types fall back to the generic notation.
        return 'S'

    def _front_notations(self) -> List[str]:
        """Notations of the current front members, best first."""
        return [c.get('notation', 'S') for c in self.pareto_front]

    def _append_to_front(self, prompt: str, score: float,
                         candidate_type: str, notation: str):
        """Insert a candidate into the front and re-sort by score (desc)."""
        self.pareto_front.append({
            'prompt': prompt,
            'score': score,
            'type': candidate_type,
            'notation': notation
        })
        self.pareto_front.sort(key=lambda c: c['score'], reverse=True)

    def log_candidate_evaluation(self, prompt: str, score: float,
                                 candidate_type: str, dataset_type: str):
        """Record one candidate evaluation.

        Evaluations on the 'dpareto' dataset additionally trigger a
        Pareto-front membership check; other datasets are only recorded.
        """
        self.candidates_evaluated.append({
            'prompt': prompt,
            'score': score,
            'type': candidate_type,
            'dataset': dataset_type
        })
        if dataset_type == 'dpareto':
            self._check_pareto_update(prompt, score, candidate_type)

    def _check_pareto_update(self, prompt: str, score: float, candidate_type: str):
        """Decide whether a 'dpareto' candidate enters the Pareto front.

        CRITICAL RULE: a candidate must beat the baseline f(S₀) to enter
        the Pareto front.  Exception: the seed prompt S₀ itself is always
        added — it *is* the baseline.
        """
        log = self._log
        cand = self._notation_for(candidate_type)

        log.info("\n" + "=" * 80)
        log.info("PARETO FRONT P ANALYSIS - Evaluating %s", cand)
        log.info("=" * 80)
        log.info(f"\n  Evaluating: {cand} with f({cand}) = {score:.4f}")

        if candidate_type == 'seed':
            log.info(f"\n  [OK] {cand} is seed prompt - always added as baseline")
            # Safety net: the adapter should already have set the baseline.
            if self.baseline_score is None:
                self.baseline_score = score
                log.info(f"       Setting baseline: f(S₀) = {score:.4f}")
            # Seed enters unconditionally - no dominance check needed.
            self._append_to_front(prompt, score, candidate_type, cand)
            log.info("\n  [OK] ADDED to Pareto Front P (baseline)")
            log.info(f"       P = {{{', '.join(self._front_notations())}}}")
            self._display_pareto_front()
            return

        if self.baseline_score is None:
            # The baseline must exist before any non-seed candidate can be
            # judged; reject the candidate to preserve correctness.
            # (The original additionally wrote a JSON line to a hard-coded
            # absolute path under one developer's home directory, which
            # raises FileNotFoundError on any other machine - removed.)
            log.error(f"\n  [!!] CRITICAL ERROR: Baseline score not set!")
            log.error(f"       Cannot evaluate {cand} without baseline f(S₀)")
            log.error("       Seed prompt must be evaluated on Dpareto first")
            log.error("       Rejecting candidate to maintain correctness")
            return

        if score <= self.baseline_score:
            log.info(f"\n  [X] {cand} does NOT meet baseline requirement:")
            log.info(f"      f(S₀) = {self.baseline_score:.4f} (baseline)")
            log.info(f"      f({cand}) = {score:.4f}")
            log.info(f"      f({cand}) <= f(S₀) -> NOT ADDED to Pareto front")
            log.info(f"      Difference: {score - self.baseline_score:.4f} (needs to be > 0)")
            return

        log.info(f"\n  [OK] {cand} meets baseline requirement:")
        log.info(f"       f(S₀) = {self.baseline_score:.4f} (baseline)")
        log.info(f"       f({cand}) = {score:.4f} -> can be added to Pareto front")
        log.info(f"       Improvement over baseline: +{score - self.baseline_score:.4f}")

        # Single-objective dominance: a strictly higher score dominates.
        dominated_idx = []
        for i, member in enumerate(self.pareto_front):
            if score > member['score']:
                dominated_idx.append(i)
                log.info(f"\n  [OK] {cand} DOMINATES {member.get('notation', 'S')}: "
                         f"f={score:.4f} > f={member['score']:.4f} "
                         f"(improvement +{score - member['score']:.4f})")

        if dominated_idx:
            # Remove dominated members (reverse order keeps indices valid).
            for i in reversed(dominated_idx):
                removed = self.pareto_front.pop(i)
                log.info(f"    -> Removing {removed.get('notation', 'S')} from P "
                         f"(dominated by {cand})")
            self._append_to_front(prompt, score, candidate_type, cand)
            log.info("\n  [OK] ADDED to Pareto Front P")
            log.info(f"       P = {{{', '.join(self._front_notations())}}}")
        else:
            dominating = [m for m in self.pareto_front if m['score'] > score]
            if dominating:
                # The original logged the first dominator twice (once in the
                # scan, once in a second summary loop); log each once here.
                for member in dominating:
                    log.info(f"\n  [X] {cand} is DOMINATED by {member.get('notation', 'S')}: "
                             f"f={member['score']:.4f} > f={score:.4f}")
                log.info("\n  [X] NOT ADDED to Pareto Front P (dominated)")
            else:
                # Non-dominated (possibly tied with existing members): keep it.
                ties = [m.get('notation', 'S') for m in self.pareto_front
                        if abs(m['score'] - score) < 1e-6]
                self._append_to_front(prompt, score, candidate_type, cand)
                log.info("\n  [OK] ADDED to Pareto Front P (non-dominated)")
                if ties:
                    log.info(f"       f({cand}) = {score:.4f} (same score as {', '.join(ties)})")
                log.info(f"       P = {{{', '.join(self._front_notations())}}}")

        self._display_pareto_front()

    def _display_pareto_front(self):
        """Log the current Pareto front with per-candidate details."""
        log = self._log
        log.info(f"\nCURRENT PARETO FRONT P (Size: |P| = {len(self.pareto_front)}):")
        log.info("-" * 80)

        if not self.pareto_front:
            log.info("  P = {} (Empty - no candidates added yet)")
            log.info("  NOTATION: P = Pareto front (non-dominated solutions)")
            return

        log.info(f"  P = {{{', '.join(self._front_notations())}}}")

        # Human-readable labels for the known candidate types; unknown types
        # fall back to their raw type string.
        type_labels = {
            'seed': 'Seed Prompt',
            'gepa_reflection': 'GEPA Reflection Candidate',
            'llego_crossover': 'LLEGO Crossover Offspring',
            'llego_mutation': 'LLEGO Mutation Offspring',
            'unknown': 'Unknown Candidate',
        }
        for member in self.pareto_front:
            notation = member.get('notation', 'S')
            cand_type = member.get('type', 'unknown')
            label = type_labels.get(cand_type, cand_type)
            log.info(f"\n  {notation}: {label}")
            log.info(f"    f({notation}) = {member['score']:.4f}")
            prompt = member['prompt']
            suffix = '...' if len(prompt) > 150 else ''
            log.info(f"    Prompt ({len(prompt)} chars): {prompt[:150]}{suffix}")

        log.info("\n  NOTATION: P = Pareto front (non-dominated solutions); "
                 "S₀ = seed (baseline), Sᵣ = GEPA reflection, "
                 "Oₓₒ = LLEGO crossover, Oₘᵤₜ = LLEGO mutation")
        log.info("-" * 80)

    def set_baseline(self, score: float):
        """Record the baseline score f(S₀) used to gate front admission."""
        self.baseline_score = score
        # Annotate the seed entry (conventionally the first) if one exists.
        if self.pareto_front:
            self.pareto_front[0]['baseline_score'] = score

    def batch_update_pareto_front(self, candidates_with_scores: List[Dict]) -> List[Dict]:
        """Update the Pareto front with several candidates in one operation.

        Steps:
          1. Filter by baseline (seed always passes; others need
             score > baseline_score).
          2. Drop candidates dominated within the batch itself.
          3. Remove current front members dominated by the batch.
          4. Add surviving batch candidates that no remaining front member
             dominates.

        Args:
            candidates_with_scores: dicts with keys 'prompt', 'score',
                'type' and optionally 'notation' (generated if missing).

        Returns:
            The candidates that were actually added to the Pareto front.
        """
        log = self._log
        if not candidates_with_scores:
            return []

        log.info("\n" + "=" * 80)
        log.info(f"BATCH PARETO FRONT UPDATE - Processing {len(candidates_with_scores)} candidates")
        log.info("=" * 80)

        # Step 1: baseline filter.
        if self.baseline_score is None:
            log.error("Baseline score not set - cannot perform batch update")
            log.error("  Seed prompt must be evaluated on Dpareto first")
            return []

        baseline = self.baseline_score
        survivors = []
        for cand in candidates_with_scores:
            score = cand.get('score', 0.0)
            notation = cand.get('notation', 'S')
            if cand.get('type', 'unknown') == 'seed':
                # The seed *is* the baseline and always passes the filter.
                survivors.append(cand)
            elif score > baseline:
                survivors.append(cand)
                log.info(f"  [OK] {notation} passes baseline: f={score:.4f} > f(S₀)={baseline:.4f}")
            else:
                log.info(f"  [X] {notation} fails baseline: f={score:.4f} <= f(S₀)={baseline:.4f}")

        if not survivors:
            log.info(f"\n  No candidates pass baseline filter (baseline: {baseline:.4f})")
            log.info("  All candidates are worse than or equal to seed prompt")
            return []

        log.info(f"\n  After baseline filter: {len(survivors)}/{len(candidates_with_scores)} candidates remain")

        # Step 2: intra-batch dominance.  Sorted descending, so only an
        # earlier (higher-scored) entry can dominate a later one.
        ranked = sorted(survivors, key=lambda c: c.get('score', 0.0), reverse=True)
        non_dominated = []
        for i, cand in enumerate(ranked):
            score = cand.get('score', 0.0)
            notation = cand.get('notation', 'S')
            dominator = next(
                (o for o in ranked[:i] if o.get('score', 0.0) > score), None)
            if dominator is None:
                non_dominated.append(cand)
                log.info(f"  [OK] {notation} is non-dominated in batch: f={score:.4f}")
            else:
                log.info(f"  [X] {notation} dominated in batch by "
                         f"{dominator.get('notation', 'S')}: "
                         f"f={dominator.get('score', 0.0):.4f} > f={score:.4f}")

        log.info(f"\n  After batch dominance check: {len(non_dominated)}/{len(survivors)} non-dominated candidates")
        if not non_dominated:
            log.info("  No non-dominated candidates in batch")
            return []

        # Step 3: drop front members strictly dominated by the batch's best.
        # (Equivalent to "dominated by any batch survivor" in the
        # single-objective setting.)
        best_new = max(c.get('score', 0.0) for c in non_dominated)
        to_remove = [m for m in self.pareto_front if m.get('score', 0.0) < best_new]
        for member in to_remove:
            self.pareto_front.remove(member)
            log.info(f"  -> Removed {member.get('notation', 'S')} "
                     f"(dominated by batch, f={member.get('score', 0.0):.4f})")

        # Step 4: admit batch survivors not dominated by the remaining front.
        added = []
        for cand in non_dominated:
            score = cand.get('score', 0.0)
            ctype = cand.get('type', 'unknown')
            notation = (cand['notation'] if 'notation' in cand
                        else self._notation_for(ctype))

            blocker = next(
                (m for m in self.pareto_front if m.get('score', 0.0) > score), None)
            if blocker is not None:
                log.info(f"  [X] {notation} dominated by existing "
                         f"{blocker.get('notation', 'S')}: "
                         f"f={blocker.get('score', 0.0):.4f} > f={score:.4f}")
                continue

            self.pareto_front.append({
                'prompt': cand.get('prompt', ''),
                'score': score,
                'type': ctype,
                'notation': notation
            })
            added.append(cand)
            # Mirror the single-candidate path: record the evaluation too.
            self.candidates_evaluated.append({
                'prompt': cand.get('prompt', ''),
                'score': score,
                'type': ctype,
                'dataset': 'dpareto'
            })
            log.info(f"  [OK] {notation} ADDED to Pareto front: f={score:.4f}")

        self.pareto_front.sort(key=lambda c: c.get('score', 0.0), reverse=True)

        log.info(f"\n{'-' * 80}")
        log.info("BATCH UPDATE COMPLETE")
        log.info(f"  Added: {len(added)} candidates")
        log.info(f"  Removed: {len(to_remove)} dominated candidates")
        log.info(f"  Pareto front size: |P| = {len(self.pareto_front)}")
        log.info(f"  P = {{{', '.join(self._front_notations())}}}")
        self._display_pareto_front()
        log.info("-" * 80 + "\n")

        return added
+
# Global instance
# Module-level singleton shared by all callers via get_pareto_logger();
# reset_pareto_logger() swaps it for a fresh instance between runs.
_pareto_logger = ParetoLogger()
+
def get_pareto_logger() -> ParetoLogger:
    """Return the shared module-level ParetoLogger instance.

    All callers in a run share this singleton so their evaluations
    accumulate into one Pareto front; use reset_pareto_logger() to start
    a fresh run.
    """
    return _pareto_logger
+
def reset_pareto_logger() -> ParetoLogger:
    """Replace the global Pareto logger with a fresh instance (for new runs).

    Clears the baseline score, the Pareto front and the evaluation history.

    Returns:
        ParetoLogger: the new (empty) global instance.
    """
    global _pareto_logger
    _pareto_logger = ParetoLogger()
    # The original wrote a JSON debug line to a hard-coded absolute path
    # under one developer's home directory, which raises FileNotFoundError
    # on any other machine; a debug-level log line carries the same info.
    logger.debug(
        "Pareto logger reset: baseline_score=%s, pareto_front_size=%d",
        _pareto_logger.baseline_score, len(_pareto_logger.pareto_front),
    )
    return _pareto_logger
+
diff --git a/src/gepa_optimizer/utils/universal_judge_prompt.py b/src/gepa_optimizer/utils/universal_judge_prompt.py
new file mode 100644
index 0000000000000000000000000000000000000000..d6ab8d9152f41ae6a087ebb66a3b40d38e9e285d
--- /dev/null
+++ b/src/gepa_optimizer/utils/universal_judge_prompt.py
@@ -0,0 +1,317 @@
+"""
+Universal LLM-as-Judge Prompt Builder for ANY prompt optimization use case.
+
+This module provides prompts for semantic comparison and feedback generation
+that work for text, JSON, structured outputs, and any other task type.
+
+NO UI-specific assumptions. NO element IDs. NO bounding boxes.
+Pure semantic and structural comparison for universal prompt optimization.
+"""
+
+from typing import Dict, Any, Optional
+
+
def build_universal_judge_prompt(
    task_input: str,
    predicted_output: str,
    expected_output: str,
    current_prompt: Optional[str] = None,
    evaluation_results: Optional[Dict[str, Any]] = None,
    image_base64: Optional[str] = None
) -> str:
    """
    Build a universal LLM-as-Judge prompt for ANY task type.

    Works for:
    - Text extraction (NER, summarization, translation)
    - JSON generation (structured data extraction)
    - Classification tasks (sentiment, category)
    - Question answering
    - Code generation
    - Multi-modal tasks (with images)

    Args:
        task_input: The input given to the LLM (task/question/text to process)
        predicted_output: What the LLM actually returned
        expected_output: The ground truth / desired output
        current_prompt: The system prompt being optimized
        evaluation_results: Optional evaluation scores; keys read here are
            'composite_score', 'semantic_similarity' and
            'structural_similarity', each defaulting to 0.0 when absent
        image_base64: Optional image for multi-modal tasks; only its
            presence is used here (the image bytes are not embedded in the
            returned text)

    Returns:
        Formatted judge prompt string
    """
    # Substitute explicit placeholders so the judge can tell "model returned
    # nothing" apart from "model returned real text".
    if not predicted_output or predicted_output.strip() == '':
        predicted_display = "[EMPTY - No output generated]"
    else:
        predicted_display = predicted_output

    if not expected_output or expected_output.strip() == '':
        expected_display = "[EMPTY - No expected output provided]"
    else:
        expected_display = expected_output

    # Build evaluation context if available (only when the caller supplied
    # evaluation_results).
    eval_context = ""
    if evaluation_results:
        score = evaluation_results.get('composite_score', 0.0)
        semantic = evaluation_results.get('semantic_similarity', 0.0)
        structural = evaluation_results.get('structural_similarity', 0.0)
        eval_context = f"""
EVALUATION SCORES:
- Composite Score: {score:.2%}
- Semantic Similarity: {semantic:.2%}
- Structural Similarity: {structural:.2%}
"""

    # Image context for multi-modal: a textual note only; the image itself
    # travels out-of-band.
    image_context = ""
    if image_base64:
        image_context = """
NOTE: An image was provided with this task. The LLM should have analyzed the image content.
Consider whether the predicted output accurately reflects the image content.
"""

    # Build the universal judge prompt - OPTIMIZED for complex enterprise use cases
    # Uses 3-Layer Forensic Analysis: Syntax -> Structure -> Semantics
    # NOTE(review): the runs of blank lines inside this template look like
    # section-delimiter tags that were lost when this file was extracted -
    # confirm the exact template layout against the original source.
    judge_prompt = f"""
You are a **Principal Forensic Prompt Auditor**. Your specialty is analyzing failures in Enterprise AI systems.
Your goal is to compare a [PREDICTED_OUTPUT] against an [EXPECTED_OUTPUT] to identify the *exact* root cause of failure in the [SYSTEM_PROMPT].




{task_input}



{current_prompt if current_prompt else "[No system prompt provided - Baseline Test]"}





{expected_display}



{predicted_display}

{eval_context}
{image_context}



You must evaluate the prediction using a 3-Layer Depth approach:

1. **SYNTAX LAYER (Format)**:
   - Is the output valid JSON/XML/Code?
   - Are data types correct? (e.g., string "100" vs number 100).
   - Are required headers or markdown tags present?

2. **STRUCTURAL LAYER (Schema)**:
   - For JSON: Do specific paths match? (e.g., check `orders[0].items[3].price`).
   - For Lists: Is the count correct? Are items in the correct order?
   - **CRITICAL**: Identify the *exact* nested key that failed.

3. **SEMANTIC LAYER (Meaning)**:
   - "Phone" vs "Mobile Device" (Acceptable Synonym).
   - "User is 25" vs "Age: 25" (Acceptable Logic).
   - Hallucinations: Did the model invent data not in the source?



Return a JSON object analyzing the failure. NO preamble.
{{
  "match_status": "FULL_MATCH" | "PARTIAL_MATCH" | "CRITICAL_FAILURE",
  "structural_analysis": {{
    "format_valid": true,
    "schema_compliance": true,
    "deep_diff": ["List specific paths that failed, e.g., 'data.users[0].id expected int, got string'"]
  }},
  "semantic_analysis": {{
    "meaning_preserved": true,
    "hallucinations": ["List specific invented facts"],
    "missed_constraints": ["List specific constraints from prompt that were ignored"]
  }},
  "root_cause_hypothesis": "Why did the prompt fail? (e.g., 'Ambiguity in field naming', 'Lack of negative constraint for X')",
  "surgical_fix": "The EXACT instruction to add/change. (e.g., 'Change: Extract entities -> To: Extract entities and return as JSON list of objects')"
}}
"""

    return judge_prompt
+
+
def get_universal_judge_system_prompt(has_image: bool = False) -> str:
    """Return the system prompt for the universal LLM-as-Judge.

    Args:
        has_image: Whether an image is involved in the task

    Returns:
        System prompt string for the judge
    """
    # Assemble from segments; the image note is appended only when needed.
    segments = ["""You are a **Principal Forensic Prompt Auditor** specializing in Enterprise AI system failures.

Your task is to:
1. Perform 3-Layer Analysis: SYNTAX (format) โ STRUCTURE (schema) โ SEMANTICS (meaning)
2. Identify the EXACT nested path that failed (e.g., `data.items[2].price`)
3. Provide a ROOT CAUSE hypothesis for why the prompt failed
4. Deliver a SURGICAL FIX - the exact instruction to add or change

Key principles:
- DEEP DIFF: Traverse nested JSON structures to find exact failure points
- SEMANTIC FLEXIBILITY: "Phone" == "Mobile Device" (synonyms OK)
- STRICT DATA: Wrong IDs, numbers, or hallucinated facts = CRITICAL_FAILURE
- ROOT CAUSE: Explain WHY the prompt failed (ambiguity? missing constraint?)

Return your analysis as valid JSON only. No preamble."""]

    if has_image:
        segments.append("""

Note: This task involved image analysis. Factor visual content accuracy into your
SEMANTIC LAYER analysis. Did the model correctly interpret the image?""")

    return "".join(segments)
+
+
def format_universal_judge_feedback(
    judge_output: str,
    task_input: str,
    predicted_output: str,
    expected_output: str,
    score: float = 0.0
) -> str:
    """
    Format the LLM-as-Judge output into readable feedback.

    Handles the forensic-analysis JSON schema (structural/semantic layers);
    when no parsable JSON is found, the raw judge output is echoed under a
    header instead.

    NOTE(review): the emoji literals below were reconstructed from a
    mojibake-damaged original whose string literals were broken across
    lines; the status semantics (ok/warn/fail) are unchanged.

    Args:
        judge_output: Raw output from the judge LLM
        task_input: The original task input
        predicted_output: The LLM's predicted output
        expected_output: The expected output
        score: Evaluation score (0..1, rendered as a percentage)

    Returns:
        Formatted feedback string
    """
    import json
    import re

    # Grab the outermost {...} span; judges often wrap JSON in prose.
    json_match = re.search(r'\{[\s\S]*\}', judge_output)

    if json_match:
        try:
            analysis = json.loads(json_match.group(0))

            # Status icon mirrors match_status severity.
            match_status = analysis.get('match_status', 'CRITICAL_FAILURE')
            status_icon = '✅' if match_status == 'FULL_MATCH' else '⚠️' if match_status == 'PARTIAL_MATCH' else '❌'

            # Structural layer: format/schema validity plus exact diff paths.
            structural = analysis.get('structural_analysis', {})
            deep_diff = structural.get('deep_diff', [])
            deep_diff_str = '\n  - '.join(deep_diff) if deep_diff else 'No structural issues'

            # Semantic layer: hallucinations and ignored constraints.
            semantic = analysis.get('semantic_analysis', {})
            hallucinations = semantic.get('hallucinations', [])
            hallucinations_str = '\n  - '.join(hallucinations) if hallucinations else 'None detected'
            missed_constraints = semantic.get('missed_constraints', [])
            missed_str = '\n  - '.join(missed_constraints) if missed_constraints else 'None'

            # Format as detailed, actionable feedback.
            feedback = f"""{status_icon} Forensic Analysis (Score: {score:.2%}) - {match_status}

📋 STRUCTURAL ANALYSIS (Schema Layer):
  Format Valid: {'✅' if structural.get('format_valid', True) else '❌'}
  Schema Compliance: {'✅' if structural.get('schema_compliance', True) else '❌'}
  Deep Diff Issues:
  - {deep_diff_str}

🧠 SEMANTIC ANALYSIS (Meaning Layer):
  Meaning Preserved: {'✅' if semantic.get('meaning_preserved', True) else '❌'}
  Hallucinations:
  - {hallucinations_str}
  Missed Constraints:
  - {missed_str}

🔬 ROOT CAUSE HYPOTHESIS:
{analysis.get('root_cause_hypothesis', 'Unable to determine root cause')}

🔧 SURGICAL FIX:
{analysis.get('surgical_fix', 'No specific fix suggested')}

🎭 CONTEXT:
- Task: "{task_input[:200]}{'...' if len(task_input) > 200 else ''}"
- Expected: {expected_output[:200]}{'...' if len(expected_output) > 200 else ''}
- Predicted: {predicted_output[:200] if predicted_output else '[EMPTY]'}{'...' if predicted_output and len(predicted_output) > 200 else ''}"""

            return feedback

        except json.JSONDecodeError:
            # Malformed JSON: fall through to the raw-echo path below.
            pass

    # Fallback: no parsable JSON - return the raw output with a header.
    return f"""Forensic Analysis (Score: {score:.2%}):

{judge_output}

🎭 CONTEXT:
- Task: "{task_input[:200]}{'...' if len(task_input) > 200 else ''}"
- Expected: {expected_output[:200]}{'...' if len(expected_output) > 200 else ''}
- Predicted: {predicted_output[:200] if predicted_output else '[EMPTY]'}"""
+
+
def build_empty_output_feedback(
    task_input: str,
    expected_output: str,
    current_prompt: Optional[str] = None
) -> str:
    """Generate feedback for the case where the LLM produced no output.

    Args:
        task_input: The task input
        expected_output: What was expected
        current_prompt: The current system prompt

    Returns:
        Feedback explaining the empty output issue
    """
    def _clip(text: str, limit: int) -> str:
        # Truncate long text and mark the cut with an ellipsis.
        return text[:limit] + ('...' if len(text) > limit else '')

    task_part = _clip(task_input, 500)
    expected_part = _clip(expected_output, 500)
    prompt_part = _clip(current_prompt, 300) if current_prompt else '[No prompt provided]'

    return f"""โ CRITICAL: Empty Output Generated

๐ PROBLEM:
The LLM produced NO OUTPUT for this task.

๐ TASK INPUT:
{task_part}

๐ EXPECTED OUTPUT:
{expected_part}

๐ CURRENT PROMPT:
{prompt_part}

๐ LIKELY CAUSES:
1. Prompt is too vague - doesn't clearly specify what output is expected
2. Prompt lacks output format instructions
3. Prompt might be confusing the LLM about what action to take
4. Task input might not align with prompt expectations

๐ก SUGGESTED FIX:
Add explicit output instructions to the prompt:
- "You MUST provide a response for every input"
- "Always output in the following format: ..."
- "Extract and return: [specific fields]"

๐ EXAMPLE IMPROVEMENT:
If extracting JSON, add: "Extract the following fields and return as JSON: [list expected fields from expected output]"
"""
+
diff --git a/src/gepa_optimizer/version.py b/src/gepa_optimizer/version.py
new file mode 100644
index 0000000000000000000000000000000000000000..4aa6349bd4b32a6d902f3d2b32367b0436f33142
--- /dev/null
+++ b/src/gepa_optimizer/version.py
@@ -0,0 +1,5 @@
"""
Version information for GEPA Optimizer
"""

# Single source of truth for the package version string.
__version__ = "0.1.0"