Suhasdev committed on
Commit
cacd4d0
·
0 Parent(s):

Deploy Universal Prompt Optimizer to HF Spaces (clean)

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +35 -0
  2. .gitignore +27 -0
  3. README.md +44 -0
  4. app.py +1563 -0
  5. requirements.txt +23 -0
  6. src/gepa_optimizer.egg-info/PKG-INFO +439 -0
  7. src/gepa_optimizer.egg-info/SOURCES.txt +65 -0
  8. src/gepa_optimizer.egg-info/dependency_links.txt +1 -0
  9. src/gepa_optimizer.egg-info/entry_points.txt +2 -0
  10. src/gepa_optimizer.egg-info/requires.txt +29 -0
  11. src/gepa_optimizer.egg-info/top_level.txt +1 -0
  12. src/gepa_optimizer/__init__.py +295 -0
  13. src/gepa_optimizer/cli.py +239 -0
  14. src/gepa_optimizer/core/__init__.py +8 -0
  15. src/gepa_optimizer/core/base_adapter.py +85 -0
  16. src/gepa_optimizer/core/custom_adapter.py +389 -0
  17. src/gepa_optimizer/core/optimizer.py +1279 -0
  18. src/gepa_optimizer/core/result.py +180 -0
  19. src/gepa_optimizer/core/universal_adapter.py +0 -0
  20. src/gepa_optimizer/data/__init__.py +27 -0
  21. src/gepa_optimizer/data/converters.py +265 -0
  22. src/gepa_optimizer/data/index_caching_loader.py +278 -0
  23. src/gepa_optimizer/data/loaders.py +237 -0
  24. src/gepa_optimizer/data/scroll_dataset_loader.py +334 -0
  25. src/gepa_optimizer/data/validation_dataset_loader.py +376 -0
  26. src/gepa_optimizer/data/validators.py +207 -0
  27. src/gepa_optimizer/evaluation/__init__.py +28 -0
  28. src/gepa_optimizer/evaluation/base_evaluator.py +51 -0
  29. src/gepa_optimizer/evaluation/index_caching_evaluator.py +357 -0
  30. src/gepa_optimizer/evaluation/scroll_evaluator.py +251 -0
  31. src/gepa_optimizer/evaluation/ui_evaluator.py +297 -0
  32. src/gepa_optimizer/evaluation/universal_evaluator.py +911 -0
  33. src/gepa_optimizer/evaluation/validation_evaluator.py +495 -0
  34. src/gepa_optimizer/infrastructure/__init__.py +15 -0
  35. src/gepa_optimizer/infrastructure/logging/__init__.py +43 -0
  36. src/gepa_optimizer/infrastructure/logging/context.py +257 -0
  37. src/gepa_optimizer/infrastructure/logging/formatters.py +259 -0
  38. src/gepa_optimizer/infrastructure/logging/logger.py +260 -0
  39. src/gepa_optimizer/llms/__init__.py +10 -0
  40. src/gepa_optimizer/llms/base_llm.py +56 -0
  41. src/gepa_optimizer/llms/batch_llm.py +712 -0
  42. src/gepa_optimizer/llms/llego_enhanced_llm.py +1625 -0
  43. src/gepa_optimizer/llms/vision_llm.py +813 -0
  44. src/gepa_optimizer/models/__init__.py +15 -0
  45. src/gepa_optimizer/models/config.py +488 -0
  46. src/gepa_optimizer/models/dataset.py +89 -0
  47. src/gepa_optimizer/models/result.py +204 -0
  48. src/gepa_optimizer/operators/__init__.py +45 -0
  49. src/gepa_optimizer/operators/base_operator.py +107 -0
  50. src/gepa_optimizer/operators/crossover.py +120 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+
8
+ # Virtual environments
9
+ venv/
10
+ env/
11
+ ENV/
12
+
13
+ # IDE
14
+ .vscode/
15
+ .idea/
16
+ *.swp
17
+ *.swo
18
+
19
+ # OS
20
+ .DS_Store
21
+ Thumbs.db
22
+
23
+ # Build artifacts
24
+ *.egg-info/
25
+ dist/
26
+ build/
27
+
README.md ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ ---
4
+ title: Universal Prompt Optimizer
5
+ emoji: 🧬
6
+ colorFrom: blue
7
+ colorTo: cyan
8
+ sdk: gradio
9
+ sdk_version: 4.0.0
10
+ app_file: app.py
11
+ pinned: false
12
+ license: mit
13
+ ---
14
+ # Universal Prompt Optimizer
15
+
16
+ A powerful genetic evolutionary prompt optimization tool built with GEPA (Genetic Evolutionary Prompt Agent). Optimize your prompts using genetic algorithms with optional LLEGO crossover for faster convergence.
17
+
18
+ ## Features
19
+
20
+ - 🧬 **Genetic Algorithm Optimization**: Evolve prompts through multiple iterations
21
+ - 🎯 **Multi-Model Support**: Works with OpenAI, Anthropic, Google, and custom models
22
+ - 📊 **Real-time Metrics**: Track optimization progress and improvements
23
+ - 🖼️ **Multi-modal Support**: Include images in your training examples
24
+ - ⚡ **LLEGO Crossover**: Advanced genetic operations for faster convergence
25
+
26
+ ## How to Use
27
+
28
+ 1. **Select Model**: Choose your target LLM (GPT-4, Claude, Gemini, or custom)
29
+ 2. **Enter Seed Prompt**: Describe your task, constraints, and desired output format
30
+ 3. **Add Training Examples**: Provide input/output pairs (images optional)
31
+ 4. **Configure Optimization**: Set evolution rounds, batch size, and enable LLEGO
32
+ 5. **Start Optimization**: Watch as the genetic algorithm evolves your prompt
33
+
34
+ ## API Keys
35
+
36
+ API keys are stored in-session only and never logged. You can provide them in the UI or set them as environment variables:
37
+
38
+ - `OPENAI_API_KEY`
39
+ - `ANTHROPIC_API_KEY`
40
+ - `GOOGLE_API_KEY`
41
+
42
+ ## License
43
+
44
+ MIT License
app.py ADDED
@@ -0,0 +1,1563 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 🚀 Universal Prompt Optimizer - Enhanced Production UI v8.0
3
+ Principal Engineer Edition: Linear/Vercel-style Dark Mode with Premium UX
4
+ """
5
+
6
+ import sys
7
+ import os
8
+ # Add src directory to Python path for gepa_optimizer imports
9
+ sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src'))
10
+
11
+ import gradio as gr
12
+ import json
13
+ import base64
14
+ import io
15
+ import os
16
+ import logging
17
+ import traceback
18
+ import html
19
+ import numpy as np
20
+ from PIL import Image as PILImage
21
+ from typing import List, Dict, Optional, Any, Tuple
22
+ import threading
23
+ from collections import deque
24
+
25
+ # Optional import for URL image downloads
26
+ try:
27
+ import requests
28
+ REQUESTS_AVAILABLE = True
29
+ except ImportError:
30
+ REQUESTS_AVAILABLE = False
31
+
32
# ==========================================
# 0. LOGGING & BACKEND UTILS
# ==========================================
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s"
)
logger = logging.getLogger(__name__)

# Global Candidates Store (Thread-safe)
# Bounded ring buffer of recent prompt candidates, shared between the
# optimization flow and the UI display. Every access must hold 'lock'.
_candidates_store = {
    'candidates': deque(maxlen=100),  # oldest entries drop off past 100
    'lock': threading.Lock(),
    'iteration': 0  # current evolution round; bumped by increment_iteration()
}
47
+
48
def add_candidate_to_store(candidate: Dict[str, Any]):
    """Record one prompt candidate in the shared store (thread-safe).

    Only the known fields are copied out of *candidate*; anything missing
    gets a neutral default so the display code never hits a KeyError.
    """
    with _candidates_store['lock']:
        entry = {
            'iteration': _candidates_store['iteration'],
            'source': candidate.get('source', 'unknown'),
            'prompt': candidate.get('prompt', ''),
            'timestamp': candidate.get('timestamp', ''),
            # 1-based position in the deque; note the deque is bounded, so
            # once full this value stops growing — fine for display use.
            'index': len(_candidates_store['candidates']) + 1,
        }
        _candidates_store['candidates'].append(entry)
57
+
58
def get_candidates_from_store() -> List[Dict[str, Any]]:
    """Return a point-in-time snapshot of the candidate deque as a list."""
    with _candidates_store['lock']:
        snapshot = list(_candidates_store['candidates'])
    return snapshot
61
+
62
def clear_candidates_store():
    """Empty the candidate deque and reset the iteration counter to zero."""
    with _candidates_store['lock']:
        store = _candidates_store
        store['candidates'].clear()
        store['iteration'] = 0
66
+
67
def increment_iteration():
    """Advance the shared evolution-round counter by one (thread-safe)."""
    with _candidates_store['lock']:
        _candidates_store['iteration'] = _candidates_store['iteration'] + 1
70
+
71
+ # ==========================================
72
+ # 1. MOCK BACKEND (Kept as provided)
73
+ # ==========================================
74
# Import the real optimizer if installed; otherwise install a mock backend
# with the same public names so the UI stays fully demoable.
try:
    from gepa_optimizer import quick_optimize_sync, OptimizedResult
    BACKEND_AVAILABLE = True
except ImportError:
    BACKEND_AVAILABLE = False
    from dataclasses import dataclass

    # Minimal stand-in mirroring the fields the UI reads from the real
    # gepa_optimizer.OptimizedResult.
    @dataclass
    class OptimizedResult:
        optimized_prompt: str
        improvement_metrics: dict
        iteration_history: list

    def quick_optimize_sync(seed_prompt, dataset, model, **kwargs):
        """Mock optimizer: sleeps to simulate work, then returns a canned
        OptimizedResult built from the supplied parameters.

        Honors max_iterations / batch_size / use_llego from **kwargs;
        other kwargs (dataset, verbose, ...) are accepted but ignored.
        """
        import time
        iterations = kwargs.get('max_iterations', 5)
        batch_size = kwargs.get('batch_size', 4)
        use_llego = kwargs.get('use_llego', True)

        # Simulate processing time based on iterations
        time.sleep(0.5 * iterations)

        llego_note = "with LLEGO crossover" if use_llego else "standard mutation only"

        return OptimizedResult(
            optimized_prompt=f"""# OPTIMIZED PROMPT FOR {model}
# ----------------------------------------
# Optimization: {iterations} iterations, batch size {batch_size}, {llego_note}

## Task Context
{seed_prompt}

## Refined Instructions
1. Analyse the input constraints strictly.
2. Verify output format against expected schema.
3. Apply chain-of-thought reasoning before answering.
4. Cross-reference with provided examples for consistency.

## Safety & Edge Cases
- If input is ambiguous, ask for clarification.
- Maintain a professional, neutral tone.
- Handle edge cases gracefully with informative responses.""",
            # Fixed demo metrics — not derived from the dataset.
            improvement_metrics={
                "baseline_score": 0.45,
                "final_score": 0.92,
                "improvement": "+104.4%",
                "iterations_run": iterations,
                "candidates_evaluated": iterations * batch_size,
            },
            # Canned history, truncated to the requested iteration count.
            iteration_history=[
                f"Iter 1: Baseline evaluation - Score: 0.45",
                f"Iter 2: Added Chain-of-Thought constraints - Score: 0.62",
                f"Iter 3: Refined output formatting rules - Score: 0.78",
                f"Iter 4: {'LLEGO crossover applied' if use_llego else 'Mutation applied'} - Score: 0.88",
                f"Iter 5: Final refinement - Score: 0.92",
            ][:iterations],
        )
131
+
132
+ # ==========================================
133
+ # 2. HELPER FUNCTIONS
134
+ # ==========================================
135
def gradio_image_to_base64(image_input) -> Optional[str]:
    """Convert a Gradio image input to a PNG data-URL string.

    Accepts a numpy array, a PIL image, or a file-system path.

    Returns:
        A "data:image/png;base64,..." string, or None when the input is
        missing or cannot be converted. Never raises; failures are logged.
    """
    if image_input is None:
        return None

    try:
        pil_image = None

        if isinstance(image_input, np.ndarray):
            try:
                # Validate array shape and dtype
                if image_input.size == 0:
                    logger.warning("Empty image array provided")
                    return None
                pil_image = PILImage.fromarray(image_input)
            except (ValueError, TypeError) as e:
                logger.error(f"Failed to convert numpy array to PIL Image: {str(e)}")
                return None
        elif isinstance(image_input, PILImage.Image):
            pil_image = image_input
        elif isinstance(image_input, str):
            if not os.path.exists(image_input):
                logger.warning(f"Image file not found: {image_input}")
                return None
            try:
                # BUGFIX: verify() leaves the Image object unusable, and the
                # old code then "reopened" it from tobytes() — raw pixel
                # data, not an encoded stream — which always failed. Verify
                # a throwaway handle instead and reopen from the path.
                with PILImage.open(image_input) as probe:
                    probe.verify()
                pil_image = PILImage.open(image_input)
            except (IOError, OSError, SyntaxError) as e:
                logger.error(f"Failed to open image file: {str(e)}")
                return None
        else:
            logger.warning(f"Unsupported image input type: {type(image_input)}")
            return None

        if pil_image is None:
            return None

        try:
            buffered = io.BytesIO()
            pil_image.save(buffered, format="PNG")
            img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
            return f"data:image/png;base64,{img_str}"
        except (IOError, OSError, ValueError) as e:
            logger.error(f"Failed to encode image to base64: {str(e)}")
            return None
    except Exception as e:
        logger.error(f"Unexpected error in image conversion: {str(e)}\n{traceback.format_exc()}")
        return None
191
+
192
def validate_dataset(dataset: List[Dict]) -> Tuple[bool, str]:
    """Check that *dataset* is a non-empty list of {'input','output'} dicts.

    Returns:
        (True, "") when every example is well-formed, otherwise
        (False, reason) describing the first problem found.
    """
    if not isinstance(dataset, list):
        return False, "Dataset must be a list of examples."
    if not dataset:
        return False, "Dataset is empty. Add at least one example."

    for idx, example in enumerate(dataset, start=1):
        if not isinstance(example, dict):
            return False, f"Dataset item {idx} must be a dictionary with 'input' and 'output' keys."
        if "input" not in example or "output" not in example:
            return False, f"Dataset item {idx} is missing required 'input' or 'output' field."
        if not all(isinstance(example.get(key), str) for key in ("input", "output")):
            return False, f"Dataset item {idx} has invalid 'input' or 'output' type (must be strings)."
        # Whitespace-only values are treated the same as empty ones.
        if not example["input"].strip() or not example["output"].strip():
            return False, f"Dataset item {idx} has empty 'input' or 'output' field."

    return True, ""
215
+
216
def validate_model(model: str, custom_model: str) -> Tuple[bool, str]:
    """Validate the model selection and, if 'custom', the custom model ID.

    A custom model ID must look like 'provider/model_name' with both halves
    non-blank. Returns (ok, error_message); the message is empty on success.
    """
    if not model:
        return False, "Please select a foundation model."

    # Only the 'custom' choice needs further checks.
    if model != "custom":
        return True, ""

    if not (custom_model and custom_model.strip()):
        return False, "Custom model selected but no model ID provided."

    provider, sep, model_name = custom_model.strip().partition("/")
    # Exactly one '/' is required: partition must find one and the
    # remainder must not contain another.
    if not sep or "/" in model_name:
        return False, "Custom model ID must be in format 'provider/model_name' (e.g., 'openai/gpt-4')."
    if not provider.strip() or not model_name.strip():
        return False, "Custom model ID provider and model name cannot be empty."

    return True, ""
234
+
235
def validate_api_keys(model: str, api_keys: Dict[str, str]) -> Tuple[bool, str]:
    """Check that the provider key needed by *model* is available.

    A key counts as available if it was entered in the UI (*api_keys*) or
    is already set in the environment. When *api_keys* is falsy the check
    is skipped entirely (keys are assumed to come from the environment).
    """
    if not api_keys:
        return True, ""  # Keys are optional if already set in environment

    # Provider prefix of a namespaced model is used as-is (only bare model
    # names are lowercased), matching the original behavior.
    provider = model.split("/")[0] if "/" in model else model.lower()

    env_var_by_provider = {
        "openai": "OPENAI_API_KEY",
        "anthropic": "ANTHROPIC_API_KEY",
        "google": "GOOGLE_API_KEY",
    }

    env_var = env_var_by_provider.get(provider)
    if env_var is None:
        # Unknown/custom providers are not validated here.
        return True, ""

    ui_key = (api_keys.get(provider) or "").strip()
    if not ui_key and not os.environ.get(env_var):
        return False, f"API key for {provider.capitalize()} is required for model '{model}' but not provided."

    return True, ""
264
+
265
def safe_optimize(seed_prompt, dataset, model, custom_model="", max_iterations=5, max_metric_calls=50, batch_size=4, use_llego=True, api_keys=None):
    """Validate all inputs, then run prompt optimization, never raising.

    Args:
        seed_prompt: Non-empty starting prompt to optimize.
        dataset: List of {'input', 'output'} training examples.
        model: Model id chosen in the UI (or "custom").
        custom_model: Optional override; any non-blank value takes
            precedence over *model*, even when *model* is not "custom".
        max_iterations: Evolution rounds, must be 1-50.
        max_metric_calls: Evaluation budget, must be 10-500.
        batch_size: Candidates per round, must be 1-20.
        use_llego: Whether to enable LLEGO crossover.
        api_keys: Optional {'openai'|'google'|'anthropic': key} dict from
            the UI; non-blank values are exported into os.environ.

    Returns:
        (success, message, result): *result* is an OptimizedResult on
        success, otherwise None with a user-facing error in *message*.
    """
    try:
        # Validate seed prompt
        if not seed_prompt or not isinstance(seed_prompt, str):
            return False, "Seed prompt is required and must be a string.", None

        if not seed_prompt.strip():
            return False, "Seed prompt cannot be empty.", None

        # Validate dataset
        is_valid, msg = validate_dataset(dataset)
        if not is_valid:
            return False, msg, None

        # Determine final model
        final_model = custom_model.strip() if custom_model and custom_model.strip() else model

        # Validate model (dropdown value + custom-model format)
        model_valid, model_msg = validate_model(model, custom_model)
        if not model_valid:
            return False, model_msg, None

        # Validate API keys against the resolved model id
        api_valid, api_msg = validate_api_keys(final_model, api_keys or {})
        if not api_valid:
            return False, api_msg, None

        # Validate optimization parameters (hard bounds enforced here, not
        # just in the UI widgets)
        if not isinstance(max_iterations, int) or max_iterations < 1 or max_iterations > 50:
            return False, "Max iterations must be between 1 and 50.", None

        if not isinstance(max_metric_calls, int) or max_metric_calls < 10 or max_metric_calls > 500:
            return False, "Max metric calls must be between 10 and 500.", None

        if not isinstance(batch_size, int) or batch_size < 1 or batch_size > 20:
            return False, "Batch size must be between 1 and 20.", None

        # Check backend availability; the mock fallback keeps the demo alive
        if not BACKEND_AVAILABLE:
            logger.warning("Backend not available, using mock optimizer")

        # Set API keys from UI if provided (exported so downstream SDKs
        # pick them up from the environment)
        if api_keys:
            try:
                key_mapping = {
                    "openai": "OPENAI_API_KEY",
                    "google": "GOOGLE_API_KEY",
                    "anthropic": "ANTHROPIC_API_KEY",
                }
                for provider, env_var in key_mapping.items():
                    if api_keys.get(provider) and api_keys[provider].strip():
                        os.environ[env_var] = api_keys[provider].strip()
                        # Log the provider only — never the key value
                        logger.info(f"Set {provider} API key from UI")
            except Exception as e:
                logger.error(f"Failed to set API keys: {str(e)}")
                return False, f"Failed to configure API keys: {str(e)}", None

        # Run optimization
        try:
            result = quick_optimize_sync(
                seed_prompt=seed_prompt,
                dataset=dataset,
                model=final_model,
                max_iterations=max_iterations,
                max_metric_calls=max_metric_calls,
                batch_size=batch_size,
                use_llego=use_llego,
                verbose=True,
            )

            # Validate result structure before handing it to the UI
            if not result:
                return False, "Optimization returned no result.", None

            if not hasattr(result, 'optimized_prompt'):
                return False, "Optimization result is missing required fields.", None

            return True, "Success", result

        # Specific failures first, each mapped to a user-friendly message
        except KeyboardInterrupt:
            logger.warning("Optimization interrupted by user")
            return False, "Optimization was interrupted.", None
        except TimeoutError:
            logger.error("Optimization timed out")
            return False, "Optimization timed out. Try reducing max_iterations or max_metric_calls.", None
        except ConnectionError as e:
            logger.error(f"Connection error during optimization: {str(e)}")
            return False, f"Connection error: {str(e)}. Check your internet connection and API keys.", None
        except ValueError as e:
            logger.error(f"Invalid parameter in optimization: {str(e)}")
            return False, f"Invalid configuration: {str(e)}", None
        except Exception as e:
            error_msg = str(e)
            logger.error(f"Optimization failed: {error_msg}\n{traceback.format_exc()}")
            # Provide user-friendly error messages by keyword-sniffing the
            # exception text (heuristic; order matters)
            if "api" in error_msg.lower() or "key" in error_msg.lower():
                return False, f"API error: {error_msg}. Please check your API keys.", None
            elif "rate limit" in error_msg.lower():
                return False, "Rate limit exceeded. Please wait a moment and try again.", None
            elif "quota" in error_msg.lower():
                return False, "API quota exceeded. Please check your account limits.", None
            else:
                return False, f"Optimization failed: {error_msg}", None

    except Exception as e:
        # Last-resort guard: this function must never raise into the UI
        logger.error(f"Unexpected error in safe_optimize: {str(e)}\n{traceback.format_exc()}")
        return False, f"Unexpected error: {str(e)}", None
373
+
374
+ # ==========================================
375
+ # 3. UI LOGIC
376
+ # ==========================================
377
def add_example(input_text, output_text, image_input, current_dataset):
    """Append one training example to the dataset state.

    Raises gr.Error (surfaced as a toast by Gradio) on invalid input.
    Returns (updated_dataset, "", "", None) so the event hook also clears
    the input, output, and image widgets.
    """
    try:
        # Validate inputs
        if not input_text:
            raise gr.Error("Input text is required.")

        if not output_text:
            raise gr.Error("Output text is required.")

        if not isinstance(input_text, str) or not isinstance(output_text, str):
            raise gr.Error("Input and Output must be text strings.")

        input_text = input_text.strip()
        output_text = output_text.strip()

        # Re-check after stripping: whitespace-only entries are rejected
        if not input_text:
            raise gr.Error("Input text cannot be empty.")

        if not output_text:
            raise gr.Error("Output text cannot be empty.")

        # Validate dataset state
        if not isinstance(current_dataset, list):
            raise gr.Error("Dataset state is invalid. Please refresh the page.")

        # Process image with error handling; the image is optional, so a
        # conversion failure degrades to a text-only example
        img_b64 = None
        try:
            img_b64 = gradio_image_to_base64(image_input)
        except Exception as e:
            logger.warning(f"Image processing failed, continuing without image: {str(e)}")
            # Continue without image - it's optional

        # Create new item
        try:
            new_item = {
                "input": input_text,
                "output": output_text,
                "image": img_b64,
                "image_preview": "🖼️ Image" if img_b64 else "-"
            }

            # Validate item structure
            # NOTE(review): a gr.Error raised here is caught by the except
            # below (gr.Error subclasses Exception) and re-wrapped, so the
            # user sees it prefixed with "Failed to add example:".
            if not isinstance(new_item["input"], str) or not isinstance(new_item["output"], str):
                raise gr.Error("Failed to create dataset item: invalid data types.")

            # Mutates the gr.State list in place and returns it
            current_dataset.append(new_item)

            return current_dataset, "", "", None

        except Exception as e:
            logger.error(f"Failed to add example to dataset: {str(e)}")
            raise gr.Error(f"Failed to add example: {str(e)}")

    except gr.Error:
        # Re-raise Gradio errors as-is
        raise
    except Exception as e:
        logger.error(f"Unexpected error in add_example: {str(e)}\n{traceback.format_exc()}")
        raise gr.Error(f"Unexpected error: {str(e)}")
438
+
439
def update_table(dataset):
    """Build display rows for the dataset table.

    Each row is [row_number, input_preview, output_preview, image_marker];
    text previews are truncated to 50 characters. Malformed items are
    skipped so one bad entry never blanks the whole table.
    """
    if not dataset:
        return []
    if not isinstance(dataset, list):
        logger.error(f"Invalid dataset type: {type(dataset)}")
        return []

    def _preview(value):
        # Falsy values (missing key, empty string) render as an empty cell.
        return str(value)[:50] if value else ""

    rows = []
    try:
        for row_num, item in enumerate(dataset, start=1):
            try:
                if not isinstance(item, dict):
                    logger.warning(f"Skipping invalid dataset item {row_num}: not a dictionary")
                    continue
                rows.append([
                    row_num,
                    _preview(item.get("input")),
                    _preview(item.get("output")),
                    str(item.get("image_preview", "-")),
                ])
            except Exception as e:
                logger.warning(f"Error processing dataset item {row_num}: {str(e)}")
                continue
        return rows
    except Exception as e:
        logger.error(f"Error updating table: {str(e)}\n{traceback.format_exc()}")
        return []
470
+
471
def clear_dataset():
    """Reset the dataset state and the table display to empty lists.

    Returns:
        ([], []) — one fresh list for the gr.State, one for the Dataframe.
    """
    # The original wrapped this literal return in try/except, but a tuple
    # of two list literals cannot raise, so the handler was dead code.
    return [], []
478
+
479
def get_candidates_display():
    """Render the candidate store as an HTML card list for the live panel.

    Shows the 10 most recent candidates, newest first. Every value taken
    from a candidate dict is HTML-escaped before interpolation. Returns a
    friendly placeholder when the store is empty and an error card if
    rendering fails entirely.
    """
    try:
        candidates = get_candidates_from_store()

        if not candidates:
            return "<div style='padding: 2rem; text-align: center; color: #6b7280;'><div style='font-size: 3rem; opacity: 0.3; margin-bottom: 1rem;'>🧬</div><p>Waiting for optimization to start...</p></div>"

        if not isinstance(candidates, list):
            logger.error(f"Invalid candidates type: {type(candidates)}")
            return "<div style='padding: 2rem; text-align: center; color: #ef4444;'>Error loading candidates.</div>"

        html_output = "<div style='display: flex; flex-direction: column; gap: 12px;'>"

        # Show last 10 candidates, rendered newest-first
        candidates_to_show = list(candidates)[-10:]
        for c in reversed(candidates_to_show):
            try:
                if not isinstance(c, dict):
                    continue

                iteration = str(c.get('iteration', '?'))
                source = str(c.get('source', 'unknown')).upper()
                # Prompt text is truncated to 200 chars for the card view
                prompt = str(c.get('prompt', ''))[:200]

                # Escape HTML to prevent XSS
                iteration = html.escape(iteration)
                source = html.escape(source)
                prompt = html.escape(prompt)

                html_output += f"""
<div style='background: linear-gradient(135deg, #0f172a 0%, #1e293b 100%); border: 1px solid #334155; border-radius: 8px; padding: 16px; position: relative; overflow: hidden;'>
<div style='position: absolute; top: 0; left: 0; width: 100%; height: 2px; background: linear-gradient(90deg, #06b6d4, #3b82f6);'></div>
<div style='display: flex; justify-content: space-between; align-items: center; margin-bottom: 8px;'>
<span style='font-family: "JetBrains Mono", monospace; font-size: 0.75rem; color: #06b6d4; font-weight: 600;'>ITERATION {iteration}</span>
<span style='background: #1e293b; border: 1px solid #334155; padding: 2px 8px; border-radius: 4px; font-size: 0.7rem; color: #94a3b8;'>{source}</span>
</div>
<div style='font-family: "JetBrains Mono", monospace; font-size: 0.85rem; color: #cbd5e1; line-height: 1.6;'>{prompt}...</div>
</div>
"""
            except Exception as e:
                # One bad candidate should not blank the whole panel
                logger.warning(f"Error rendering candidate: {str(e)}")
                continue

        html_output += "</div>"
        return html_output

    except Exception as e:
        logger.error(f"Error generating candidates display: {str(e)}\n{traceback.format_exc()}")
        return "<div style='padding: 2rem; text-align: center; color: #ef4444;'>Error loading candidates display.</div>"
529
+
530
def run_optimization_flow(seed, dataset, model, custom_model, iter_count, call_count, batch, llego, k_openai, k_google, k_anthropic, progress=gr.Progress()):
    """Drive the end-to-end optimization flow as a Gradio generator.

    Each ``yield`` emits an 8-tuple matching the ``btn_optimize`` outputs:
    (status_panel update, empty_state update, results_panel update,
    status markdown, optimized prompt, metrics dict, history text,
    candidates HTML).

    Args:
        seed: Seed prompt text (required).
        dataset: List of training examples from gr.State (required).
        model: Model id selected in the dropdown (required).
        custom_model: Optional custom model id; resolved inside safe_optimize.
        iter_count: Evolution rounds; coerced to int, default 5.
        call_count: Max LLM metric calls; coerced to int, default 50.
        batch: Candidates per iteration; coerced to int, default 4.
        llego: Whether to enable LLEGO crossover.
        k_openai, k_google, k_anthropic: Provider API keys (in-session only).
        progress: Gradio progress tracker (injected by Gradio).

    Raises:
        gr.Error: on invalid inputs or any optimization failure.
    """
    import time

    try:
        # --- Input validation -------------------------------------------
        if not seed:
            raise gr.Error("Seed prompt is required.")

        if not dataset:
            raise gr.Error("Dataset is required. Add at least one example.")

        if not model:
            raise gr.Error("Model selection is required.")

        # Coerce numeric parameters, falling back to safe defaults.
        try:
            iter_count = int(iter_count) if iter_count else 5
            call_count = int(call_count) if call_count else 50
            batch = int(batch) if batch else 4
        except (ValueError, TypeError) as e:
            raise gr.Error(f"Invalid optimization parameters: {str(e)}")

        # NOTE: the previous `final_model` computation was dead code —
        # safe_optimize() receives `model` and `custom_model` separately
        # and resolves the override itself, so it has been removed.

        # Reset the live-candidates stream; a failure here is non-fatal.
        try:
            clear_candidates_store()
        except Exception as e:
            logger.warning(f"Error clearing candidates store: {str(e)}")

        # API keys stay in-session only; empty string means "not provided".
        # (A literal dict cannot raise, so no try/except is needed here.)
        api_keys = {
            "openai": k_openai if k_openai else "",
            "google": k_google if k_google else "",
            "anthropic": k_anthropic if k_anthropic else ""
        }

        # --- Initial UI state: show status panel, hide the rest ----------
        try:
            yield (
                gr.update(visible=True),
                gr.update(visible=False),
                gr.update(visible=False),
                "🚀 Initializing Genetic Algorithm...",
                "", {}, "", ""
            )
            time.sleep(0.5)  # Brief pause for UI update
        except Exception as e:
            logger.error(f"Error in initial UI update: {str(e)}")
            raise gr.Error(f"Failed to initialize UI: {str(e)}")

        # --- Evolution loop (visual progress only; the actual work
        #     happens in safe_optimize below) ----------------------------
        try:
            for i in range(1, iter_count + 1):
                try:
                    increment_iteration()
                    add_candidate_to_store({
                        "source": "evolution_step",
                        "prompt": f"Candidate {i}: Optimizing instruction clarity and task alignment...",
                        "timestamp": "now"
                    })

                    progress(i/iter_count, desc=f"Evolution Round {i}/{iter_count}")
                    yield (
                        gr.update(), gr.update(), gr.update(),
                        f"🧬 **Evolution Round {i}/{iter_count}**\n\n• Generating {batch} prompt mutations\n• Evaluating fitness scores\n• Selecting top candidates",
                        "", {}, "", get_candidates_display()
                    )
                    time.sleep(0.3)  # Pause to show progress
                except Exception as e:
                    # A failed cosmetic step must not abort the run.
                    logger.warning(f"Error in evolution step {i}: {str(e)}")
                    continue
        except Exception as e:
            logger.error(f"Error in evolution loop: {str(e)}")
            # Fall through to the real optimization attempt.

        # --- Final optimization (the real work) --------------------------
        try:
            success, msg, result = safe_optimize(
                seed_prompt=seed,
                dataset=dataset,
                model=model,
                custom_model=custom_model,
                max_iterations=iter_count,
                max_metric_calls=call_count,
                batch_size=batch,
                use_llego=llego,
                api_keys=api_keys
            )

            if not success:
                # Surface the failure in the status panel before raising.
                yield (
                    gr.update(visible=True),
                    gr.update(visible=False),
                    gr.update(visible=False),
                    f"❌ **Optimization Failed**\n\n{msg}",
                    "", {}, "", get_candidates_display()
                )
                raise gr.Error(msg)

            # Validate result before displaying.
            if not result:
                raise gr.Error("Optimization completed but returned no result.")

            if not hasattr(result, 'optimized_prompt'):
                raise gr.Error("Optimization result is missing required fields.")

            # --- Render results ------------------------------------------
            try:
                optimized_prompt = result.optimized_prompt if result.optimized_prompt else ""
                improvement_metrics = getattr(result, 'improvement_metrics', {})
                iteration_history = getattr(result, 'iteration_history', [])

                # FIX: coerce each history entry via str() — a list with
                # non-string entries previously made "\n".join raise
                # TypeError, which surfaced as "Failed to display results".
                if isinstance(iteration_history, list):
                    history_text = "\n".join(str(entry) for entry in iteration_history)
                else:
                    history_text = str(iteration_history)

                yield (
                    gr.update(visible=False),
                    gr.update(visible=False),
                    gr.update(visible=True),
                    "✅ Optimization Complete",
                    optimized_prompt,
                    improvement_metrics,
                    history_text,
                    get_candidates_display()
                )
            except Exception as e:
                logger.error(f"Error displaying results: {str(e)}")
                raise gr.Error(f"Failed to display results: {str(e)}")

        except gr.Error:
            # Re-raise Gradio errors untouched.
            raise
        except Exception as e:
            logger.error(f"Error in optimization: {str(e)}\n{traceback.format_exc()}")
            raise gr.Error(f"Optimization error: {str(e)}")

    except gr.Error:
        # Re-raise Gradio errors as-is.
        raise
    except KeyboardInterrupt:
        logger.warning("Optimization interrupted by user")
        raise gr.Error("Optimization was interrupted.")
    except Exception as e:
        logger.error(f"Unexpected error in optimization flow: {str(e)}\n{traceback.format_exc()}")
        raise gr.Error(f"Unexpected error: {str(e)}")
687
+
688
# ==========================================
# 4. ENHANCED CSS (Linear/Vercel-style)
# ==========================================
# Single stylesheet string injected into the Gradio app. Design tokens
# (colors, strokes, shadows, radii) are declared once under :root and
# reused by the component rules below.
# NOTE(review): this string is passed to launch() at the bottom of the
# file, but Gradio expects custom CSS via gr.Blocks(css=...) — confirm
# the stylesheet is actually applied.
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&family=JetBrains+Mono:wght@400;500;600&display=swap');

:root {
    --bg0: #070A0F;
    --bg1: #0B1020;
    --bg2: rgba(255,255,255,0.04);
    --bg3: rgba(255,255,255,0.06);

    --stroke0: rgba(148,163,184,0.14);
    --stroke1: rgba(148,163,184,0.22);

    --text0: #EAF0FF;
    --text1: rgba(234,240,255,0.74);
    --text2: rgba(234,240,255,0.56);

    --teal: #06B6D4;
    --blue: #3B82F6;

    --ok: #10B981;
    --okGlow: rgba(16,185,129,0.18);

    --bad: #EF4444;

    --shadow: 0 12px 40px rgba(0,0,0,0.45);
    --shadowSoft: 0 10px 24px rgba(0,0,0,0.32);

    --radius: 14px;
    --radiusSm: 10px;
}

html, body {
    background: radial-gradient(1200px 700px at 20% -10%, rgba(6,182,212,0.13), transparent 55%),
                radial-gradient(1000px 650px at 90% 0%, rgba(59,130,246,0.10), transparent 60%),
                linear-gradient(180deg, var(--bg0) 0%, var(--bg1) 100%);
    color: var(--text0);
    font-family: Inter, system-ui, -apple-system, Segoe UI, Roboto, sans-serif;
}

.gradio-container {
    max-width: 1520px !important;
    padding: 12px 18px !important;
    margin: 0 auto !important;
}

/* --- App shell --- */
.app-shell { min-height: auto !important; }
.topbar {
    padding: 12px 14px 12px 14px;
    margin-bottom: 4px;
    border: 1px solid var(--stroke0);
    border-radius: var(--radius);
    background: linear-gradient(180deg, rgba(255,255,255,0.04) 0%, rgba(255,255,255,0.02) 100%);
    box-shadow: var(--shadowSoft);
}
.topbar-wrap { margin-bottom: 0 !important; }

.brand-row { display: flex; align-items: center; justify-content: space-between; gap: 16px; }
.brand-left { display: flex; align-items: center; gap: 14px; }
.brand-mark {
    width: 44px; height: 44px; border-radius: 12px;
    background: linear-gradient(135deg, rgba(6,182,212,0.26), rgba(59,130,246,0.20));
    border: 1px solid rgba(6,182,212,0.30);
    box-shadow: 0 0 0 4px rgba(6,182,212,0.10);
    display: flex; align-items: center; justify-content: center;
    font-weight: 800;
}
.h1 {
    font-size: 22px; font-weight: 800; letter-spacing: -0.02em;
    margin: 0; line-height: 1.2;
}
.subtitle { margin-top: 4px; color: var(--text1); font-weight: 500; font-size: 13px; }

.status-pill {
    display: inline-flex; align-items: center; gap: 10px;
    padding: 10px 12px; border-radius: 999px;
    background: rgba(255,255,255,0.03);
    border: 1px solid var(--stroke0);
    color: var(--text1);
    font-size: 12px; font-weight: 700; letter-spacing: 0.08em;
    text-transform: uppercase;
}
.dot {
    width: 10px; height: 10px; border-radius: 999px;
    background: var(--ok);
    box-shadow: 0 0 16px rgba(16,185,129,0.40);
    animation: pulse 1.8s ease-in-out infinite;
}
@keyframes pulse { 0%, 100% { transform: scale(1); opacity: 0.95; } 50% { transform: scale(1.18); opacity: 0.70; } }

/* --- Two-column layout helpers --- */
.left-col, .right-col { min-width: 280px; }

/* --- Cards / Sections --- */
.card {
    border-radius: var(--radius);
    background: linear-gradient(180deg, rgba(255,255,255,0.045) 0%, rgba(255,255,255,0.022) 100%);
    border: 1px solid var(--stroke0);
    box-shadow: var(--shadowSoft);
    padding: 16px;
}
.card + .card { margin-top: 14px; }

.card-head {
    display: flex; align-items: center; justify-content: space-between;
    gap: 12px;
    padding-bottom: 12px;
    margin-bottom: 12px;
    border-bottom: 1px solid var(--stroke0);
}
.card-title {
    display: flex; align-items: center; gap: 10px;
    font-size: 13px; font-weight: 800; letter-spacing: 0.12em;
    text-transform: uppercase; color: var(--text1);
}
.step {
    width: 30px; height: 30px; border-radius: 10px;
    background: linear-gradient(135deg, rgba(6,182,212,0.95), rgba(59,130,246,0.95));
    box-shadow: 0 10px 20px rgba(6,182,212,0.18);
    display: flex; align-items: center; justify-content: center;
    color: white; font-weight: 900; font-size: 13px;
}
.hint { color: var(--text2); font-size: 12px; line-height: 1.4; }

.ds-count span {
    display: inline-flex;
    align-items: center;
    padding: 7px 10px;
    border-radius: 999px;
    border: 1px solid var(--stroke0);
    background: rgba(255,255,255,0.02);
    color: var(--text1) !important;
    font-weight: 700;
    font-size: 12px;
}

/* --- Inputs --- */
label { color: var(--text1) !important; font-weight: 650 !important; font-size: 12px !important; }

textarea, input, select {
    background: rgba(255,255,255,0.03) !important;
    border: 1px solid var(--stroke0) !important;
    border-radius: 12px !important;
    color: var(--text0) !important;
    transition: border-color 0.15s ease, box-shadow 0.15s ease, transform 0.15s ease;
}

textarea:focus, input:focus, select:focus {
    outline: none !important;
    border-color: rgba(6,182,212,0.55) !important;
    box-shadow: 0 0 0 4px rgba(6,182,212,0.14) !important;
}

.keybox input { font-family: "JetBrains Mono", ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace !important; }

.seed textarea { min-height: 160px !important; }
.mono textarea { font-family: "JetBrains Mono", ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace !important; font-size: 12.5px !important; }

/* --- Buttons --- */
.cta button {
    width: 100% !important;
    border: 0 !important;
    border-radius: 14px !important;
    padding: 14px 16px !important;
    font-size: 13px !important;
    font-weight: 900 !important;
    letter-spacing: 0.12em !important;
    text-transform: uppercase !important;
    color: white !important;
    background: linear-gradient(135deg, rgba(6,182,212,1) 0%, rgba(59,130,246,1) 100%) !important;
    box-shadow: 0 18px 48px rgba(6,182,212,0.22) !important;
    position: relative !important;
    overflow: hidden !important;
}
.cta button::after {
    content: "";
    position: absolute; inset: -120px;
    background: radial-gradient(closest-side, rgba(255,255,255,0.18), transparent 60%);
    transform: translateX(-40%);
    transition: transform 0.45s ease;
}
.cta button:hover { transform: translateY(-1px); }
.cta button:hover::after { transform: translateX(40%); }
.cta button:active { transform: translateY(0px); }

.btn-secondary button {
    border-radius: 12px !important;
    border: 1px solid var(--stroke1) !important;
    background: rgba(255,255,255,0.03) !important;
    color: var(--text0) !important;
    font-weight: 800 !important;
}
.btn-secondary button:hover { border-color: rgba(6,182,212,0.55) !important; }

.btn-danger button {
    border-radius: 12px !important;
    border: 1px solid rgba(239,68,68,0.55) !important;
    background: rgba(239,68,68,0.06) !important;
    color: rgba(255,170,170,1) !important;
    font-weight: 900 !important;
}

/* --- Dataframe --- */
.dataframe {
    border-radius: 14px !important;
    border: 1px solid var(--stroke0) !important;
    background: rgba(255,255,255,0.02) !important;
    overflow: hidden !important;
}
.dataframe thead th {
    background: rgba(255,255,255,0.04) !important;
    color: var(--text1) !important;
    font-weight: 900 !important;
    font-size: 11px !important;
    letter-spacing: 0.10em !important;
    text-transform: uppercase !important;
    border-bottom: 1px solid var(--stroke0) !important;
}
.dataframe tbody td {
    color: var(--text0) !important;
    font-size: 12px !important;
    border-bottom: 1px solid rgba(148,163,184,0.10) !important;
}
.dataframe tbody tr:hover { background: rgba(255,255,255,0.03) !important; }

/* --- Status / Results --- */
.panel {
    border-radius: var(--radius);
    border: 1px solid var(--stroke0);
    background: linear-gradient(180deg, rgba(255,255,255,0.045), rgba(255,255,255,0.020));
    box-shadow: var(--shadowSoft);
    padding: 16px;
}
.panel-title {
    display: flex; align-items: center; justify-content: space-between;
    gap: 10px;
    padding-bottom: 12px; margin-bottom: 12px;
    border-bottom: 1px solid var(--stroke0);
}
.panel-title h3 { margin: 0; font-size: 13px; letter-spacing: 0.12em; text-transform: uppercase; color: var(--text1); }
.running-pill {
    display: inline-flex; align-items: center; gap: 10px;
    padding: 8px 10px; border-radius: 999px;
    border: 1px solid rgba(6,182,212,0.38);
    background: rgba(6,182,212,0.08);
    color: rgba(153,246,228,0.95);
    font-weight: 900; font-size: 11px; letter-spacing: 0.10em; text-transform: uppercase;
}
.running-dot { width: 9px; height: 9px; border-radius: 99px; background: var(--teal); box-shadow: 0 0 18px rgba(6,182,212,0.45); animation: pulse 1.8s ease-in-out infinite; }

.empty {
    border-radius: var(--radius);
    border: 1px dashed rgba(148,163,184,0.26);
    background: rgba(255,255,255,0.02);
    padding: 28px;
    text-align: center;
    color: var(--text2);
}
.empty .big { font-size: 40px; opacity: 0.22; margin-bottom: 10px; }
.empty .t { color: var(--text1); font-weight: 800; margin-bottom: 6px; }
.empty .s { font-size: 12px; }

.results {
    border-radius: var(--radius);
    border: 1px solid rgba(16,185,129,0.55);
    background: linear-gradient(180deg, rgba(16,185,129,0.12), rgba(255,255,255,0.02));
    box-shadow: 0 0 0 4px rgba(16,185,129,0.10), 0 20px 60px rgba(0,0,0,0.42);
    padding: 16px;
}
.results-banner {
    display: flex; align-items: center; justify-content: space-between;
    gap: 12px;
    padding-bottom: 12px; margin-bottom: 12px;
    border-bottom: 1px solid rgba(16,185,129,0.28);
}
.results-banner .k { display: flex; align-items: center; gap: 10px; }
.results-banner .k .icon {
    width: 36px; height: 36px; border-radius: 12px;
    background: rgba(16,185,129,0.18);
    border: 1px solid rgba(16,185,129,0.45);
    display: flex; align-items: center; justify-content: center;
}
.results-banner .k .title { font-weight: 900; color: rgba(189,255,225,0.98); letter-spacing: 0.06em; text-transform: uppercase; font-size: 12px; }
.results-banner .k .sub { margin-top: 2px; color: rgba(189,255,225,0.70); font-size: 12px; }

.tabs { background: transparent !important; }
.tab-nav button {
    background: transparent !important;
    border: 0 !important;
    border-bottom: 2px solid transparent !important;
    color: var(--text2) !important;
    font-weight: 800 !important;
    padding: 10px 12px !important;
}
.tab-nav button[aria-selected="true"] {
    color: rgba(153,246,228,0.98) !important;
    border-bottom-color: rgba(6,182,212,0.75) !important;
}
.tab-nav button:hover { color: var(--text0) !important; }

.small-note { color: var(--text2); font-size: 12px; }

/* --- Candidates stream --- */
.cand-empty { padding: 28px; text-align: center; color: var(--text2); }
.cand-empty-icon { font-size: 40px; opacity: 0.25; margin-bottom: 10px; }
.cand-empty-title { color: var(--text1); font-weight: 900; margin-bottom: 4px; }
.cand-empty-sub { font-size: 12px; }

.cand-stream { display: flex; flex-direction: column; gap: 10px; }
.cand-card {
    border-radius: 14px;
    border: 1px solid rgba(148,163,184,0.18);
    background: linear-gradient(135deg, rgba(15,23,42,0.85), rgba(2,6,23,0.45));
    overflow: hidden;
}
.cand-topbar { height: 2px; background: linear-gradient(90deg, var(--teal), var(--blue)); }
.cand-header {
    display: flex; align-items: center; justify-content: space-between;
    gap: 10px;
    padding: 10px 12px 0 12px;
}
.cand-iter { font-family: "JetBrains Mono", ui-monospace; font-size: 11px; color: rgba(153,246,228,0.92); font-weight: 800; letter-spacing: 0.08em; }
.cand-pill {
    font-size: 10px; font-weight: 900; letter-spacing: 0.10em;
    padding: 5px 8px; border-radius: 999px;
    border: 1px solid rgba(148,163,184,0.20);
    background: rgba(255,255,255,0.03);
    color: var(--text2);
}
.cand-body {
    padding: 10px 12px 12px 12px;
    font-family: "JetBrains Mono", ui-monospace;
    font-size: 12px;
    line-height: 1.6;
    color: rgba(234,240,255,0.75);
}

/* --- Responsive --- */
@media (max-width: 980px) {
    .gradio-container { padding: 16px 12px !important; }
    .brand-row { flex-direction: column; align-items: flex-start; }
    .status-pill { align-self: stretch; justify-content: center; }
}
"""
1035
+
1036
# Client-side snippet that forces Gradio's dark theme by appending
# ?__theme=dark to the URL and reloading once if it is missing.
# NOTE(review): this is passed to launch() at the bottom of the file, but
# Gradio expects custom JS via gr.Blocks(js=...) — confirm it executes.
FORCE_DARK_JS = """
function forceDarkTheme() {
    try {
        const url = new URL(window.location.href);
        if (url.searchParams.get("__theme") !== "dark") {
            url.searchParams.set("__theme", "dark");
            window.location.replace(url.toString());
        }
    } catch (e) {
        // no-op
    }
}
forceDarkTheme();
"""
1050
+
1051
# ==========================================
# 5. UI CONSTRUCTION (Redesigned)
# ==========================================
APP_TITLE = "Universal Prompt Optimizer"
APP_SUBTITLE = "Genetic Evolutionary Prompt Agent (GEPA)"
STATUS_READY = "System Ready"

with gr.Blocks(
    title="Universal Prompt Optimizer",
    theme=gr.themes.Base(),
    # FIX: custom CSS and JS must be supplied to the gr.Blocks constructor.
    # They were previously passed to launch(), which does not accept them,
    # so the stylesheet and dark-theme script were never applied.
    css=CUSTOM_CSS,
    js=FORCE_DARK_JS
) as app:
    # Session-scoped list of training examples ({input, output, image, ...}).
    dataset_state = gr.State([])

    # TOP BAR: brand mark, title/subtitle, and a static "ready" pill.
    gr.HTML(
        f"""
        <div class="topbar">
            <div class="brand-row">
                <div class="brand-left">
                    <div class="brand-mark">GE</div>
                    <div>
                        <div class="h1">{APP_TITLE}</div>
                        <div class="subtitle">{APP_SUBTITLE}</div>
                    </div>
                </div>
                <div class="status-pill"><span class="dot"></span> {STATUS_READY}</div>
            </div>
        </div>
        """,
        elem_classes=["topbar-wrap"]
    )

    # MAIN LAYOUT: configuration on the left, status/results on the right.
    with gr.Row():

        # LEFT COLUMN: Configuration
        with gr.Column(scale=5):

            # Step 1: model selection and provider credentials.
            with gr.Group(elem_classes=["card"]):
                gr.HTML(
                    """
                    <div class="card-head">
                        <div class="card-title"><div class="step">1</div> Model & Credentials</div>
                        <div class="hint">Select a target model, then provide keys (stored in-session only).</div>
                    </div>
                    """
                )

                with gr.Row():
                    model_select = gr.Dropdown(
                        label="Foundation Model",
                        choices=[
                            "openai/gpt-4o",
                            "openai/gpt-4-turbo",
                            "anthropic/claude-3-5-sonnet",
                            "google/gemini-1.5-pro",
                            "custom"
                        ],
                        value="openai/gpt-4o",
                        scale=2
                    )
                    custom_model_input = gr.Textbox(
                        label="Custom Model ID",
                        placeholder="provider/model_name",
                        scale=1
                    )

                gr.HTML('<div class="subsection-title">API Access Keys</div>')
                gr.Markdown("*Keys are stored in-session only and never logged*", elem_classes=["text-xs"])

                with gr.Row():
                    key_openai = gr.Textbox(
                        label="OpenAI API Key",
                        type="password",
                        placeholder="sk-...",
                        scale=1
                    )
                    key_google = gr.Textbox(
                        label="Google API Key",
                        type="password",
                        placeholder="AIza...",
                        scale=1
                    )
                    key_anthropic = gr.Textbox(
                        label="Anthropic API Key",
                        type="password",
                        placeholder="sk-ant...",
                        scale=1
                    )

            # Step 2: the seed prompt the optimizer evolves.
            with gr.Group(elem_classes=["card"]):
                gr.HTML(
                    """
                    <div class="card-head">
                        <div class="card-title"><div class="step">2</div> Seed Prompt</div>
                        <div class="hint">Describe the task, constraints, output format, and tone.</div>
                    </div>
                    """
                )
                seed_input = gr.Textbox(
                    label="Task Description",
                    placeholder="Example: You are a code reviewer that identifies security vulnerabilities in Python code. Return a JSON report with severity and fixes...",
                    lines=7,
                    max_lines=14,
                    elem_classes=["seed", "mono"]
                )

            # Step 3: training examples (manual entry or bulk JSON import).
            with gr.Group(elem_classes=["card"]):
                gr.HTML(
                    """
                    <div class="card-head">
                        <div class="card-title"><div class="step">3</div> Training Examples</div>
                        <div class="hint">Add a few high-quality I/O pairs (images optional) to shape the optimizer.</div>
                    </div>
                    """
                )

                with gr.Tabs():
                    with gr.Tab("Manual Entry"):
                        with gr.Row():
                            with gr.Column(scale=2):
                                d_in = gr.Textbox(
                                    label="Input / User Prompt",
                                    placeholder="Example user input...",
                                    lines=3
                                )
                                d_out = gr.Textbox(
                                    label="Ideal Output",
                                    placeholder="Expected AI response...",
                                    lines=3
                                )
                            with gr.Column(scale=1):
                                d_img = gr.Image(
                                    label="Attach Image (Optional)",
                                    type="numpy",
                                    height=170
                                )

                        btn_add = gr.Button(
                            "Add Example",
                            elem_classes=["btn-secondary"]
                        )

                    with gr.Tab("Bulk Import (JSON)"):
                        gr.Markdown(
                            "Paste a JSON array like: `[{\"input\": \"...\", \"output\": \"...\"}]`",
                            elem_classes=["small-note"]
                        )
                        bulk_json = gr.Textbox(
                            show_label=False,
                            placeholder='[{"input": "...", "output": "..."}]',
                            lines=6
                        )
                        btn_import = gr.Button(
                            "Import JSON",
                            elem_classes=["btn-secondary"]
                        )

                with gr.Row():
                    gr.HTML("<div class='hint'>Current dataset</div>")
                    ds_count = gr.HTML(
                        "<span style='color: var(--text-secondary);'>0 examples loaded</span>",
                        elem_classes=["ds-count"]
                    )

                ds_table = gr.Dataframe(
                    headers=["ID", "Input", "Output", "Media"],
                    datatype=["number", "str", "str", "str"],
                    row_count=6,
                    column_count=(4, "fixed"),
                    interactive=False
                )

                with gr.Row():
                    btn_clear = gr.Button(
                        "Clear All",
                        elem_classes=["btn-danger"],
                        size="sm"
                    )

            # Step 4: optimization budget controls (prominent, not buried).
            with gr.Group(elem_classes=["card"]):
                gr.HTML(
                    """
                    <div class="card-head">
                        <div class="card-title"><div class="step">4</div> Optimization Controls</div>
                        <div class="hint">Tune evolution budget. Defaults are safe for quick runs.</div>
                    </div>
                    """
                )

                with gr.Row():
                    slider_iter = gr.Slider(
                        minimum=1,
                        maximum=20,
                        value=5,
                        step=1,
                        label="Evolution Rounds",
                        info="Number of genetic iterations"
                    )
                    slider_calls = gr.Slider(
                        minimum=10,
                        maximum=200,
                        value=50,
                        step=10,
                        label="Max LLM Calls",
                        info="Total API call budget"
                    )

                with gr.Row():
                    slider_batch = gr.Slider(
                        minimum=1,
                        maximum=10,
                        value=4,
                        step=1,
                        label="Batch Size",
                        info="Candidates per iteration"
                    )
                    check_llego = gr.Checkbox(
                        value=True,
                        label="Enable LLEGO Crossover",
                        info="Use advanced genetic operations"
                    )

                btn_optimize = gr.Button(
                    "Start Optimization",
                    elem_classes=["cta", "mt-6"]
                )

        # RIGHT COLUMN: status, empty placeholder, and results.
        with gr.Column(scale=5, elem_classes=["right-col"]):
            # STATUS PANEL (hidden until an optimization starts).
            status_panel = gr.Group(visible=False, elem_classes=["panel"])
            with status_panel:
                gr.HTML(
                    """
                    <div class="panel-title">
                        <h3>Optimization status</h3>
                        <div class="running-pill"><span class="running-dot"></span> Running</div>
                    </div>
                    """
                )
                txt_status = gr.Markdown("Initializing genetic algorithm...")

            # EMPTY STATE (visible until the first run).
            empty_state = gr.HTML(
                """
                <div class="empty">
                    <div class="big">🧬</div>
                    <div class="t">Ready to optimize</div>
                    <div class="s">Fill Steps 1–3, then click <b>Start Optimization</b> to begin prompt evolution.</div>
                </div>
                """,
                visible=True
            )

            # RESULTS PANEL (hidden until an optimization succeeds).
            results_panel = gr.Group(visible=False, elem_classes=["results"])
            with results_panel:
                gr.HTML(
                    """
                    <div class="results-banner">
                        <div class="k">
                            <div class="icon">✓</div>
                            <div>
                                <div class="title">Optimization successful</div>
                                <div class="sub">Review the optimized prompt, metrics, and evolution traces.</div>
                            </div>
                        </div>
                    </div>
                    """
                )

                with gr.Tabs():
                    with gr.Tab("Optimized Prompt"):
                        res_prompt = gr.Textbox(
                            label="Optimized Prompt",
                            lines=18,
                            max_lines=28,
                            interactive=False,
                            show_label=True,
                            elem_classes=["mono"]
                        )

                    with gr.Tab("Metrics & Log"):
                        res_metrics = gr.JSON(label="Performance Gains")
                        res_history = gr.TextArea(
                            label="Evolution Log",
                            interactive=False,
                            lines=10
                        )

                    with gr.Tab("🧬 Live Candidates"):
                        gr.Markdown("Real-time stream of generated prompt candidates during optimization:")
                        live_candidates = gr.HTML()
                        btn_refresh_cand = gr.Button(
                            "🔄 Refresh Stream",
                            elem_classes=["secondary-btn"],
                            size="sm"
                        )
1354
+
1355
# ==========================================
# 6. EVENT HANDLERS
# ==========================================

# Dataset Management
def update_dataset_count(dataset):
    """Render the HTML badge showing how many examples are loaded.

    Returns a fallback "0 examples loaded" badge for non-list input and
    an "Error" badge if rendering fails for any reason.
    """
    try:
        if not isinstance(dataset, list):
            return "<span style='color: var(--text-secondary);'>0 examples loaded</span>"
        n = len(dataset)
        suffix = "s" if n != 1 else ""
        return f"<span style='color: var(--text-secondary);'>{n} example{suffix} loaded</span>"
    except Exception as e:
        logger.error(f"Error updating dataset count: {str(e)}")
        return "<span style='color: var(--text-secondary);'>Error</span>"
1370
+
1371
    # Wrap event handlers with error handling
    def safe_add_example(*args):
        """Guarded wrapper around the file-level add_example().

        Receives (input_text, output_text, image, dataset_state) positionally
        from the btn_add wiring. gr.Error is passed through so Gradio shows
        its message; anything else is logged and re-raised as gr.Error.
        """
        try:
            return add_example(*args)
        except gr.Error:
            raise
        except Exception as e:
            logger.error(f"Unexpected error in add_example: {str(e)}")
            raise gr.Error(f"Failed to add example: {str(e)}")

    def safe_update_table(dataset):
        """Guarded wrapper around the file-level update_table().

        Falls back to an empty table (instead of raising) so a render
        failure never blocks the event chain.
        """
        try:
            return update_table(dataset)
        except Exception as e:
            logger.error(f"Error updating table: {str(e)}")
            return []

    def safe_clear_dataset():
        """Guarded wrapper around the file-level clear_dataset().

        Falls back to (empty state, empty table) on failure so the UI
        still resets.
        """
        try:
            return clear_dataset()
        except Exception as e:
            logger.error(f"Error clearing dataset: {str(e)}")
            return [], []
1397
+
1398
    # Add-example chain: append to state, then refresh the table and the
    # count badge from the updated dataset_state.
    btn_add.click(
        safe_add_example,
        inputs=[d_in, d_out, d_img, dataset_state],
        outputs=[dataset_state, d_in, d_out, d_img]
    ).then(
        safe_update_table,
        inputs=[dataset_state],
        outputs=[ds_table]
    ).then(
        update_dataset_count,
        inputs=[dataset_state],
        outputs=[ds_count]
    )

    # Clear chain: wipe state and table, then reset the count badge.
    btn_clear.click(
        safe_clear_dataset,
        outputs=[dataset_state, ds_table]
    ).then(
        lambda: "<span style='color: var(--text-secondary);'>0 examples loaded</span>",
        outputs=[ds_count]
    )
1419
+
1420
# Bulk Import
def import_bulk_json(json_text, current_dataset):
    """Import training examples from a JSON array string.

    Args:
        json_text: Raw JSON text; must parse to a non-empty array of
            objects with non-empty string "input" and "output" fields
            (an optional "image" field is carried through).
        current_dataset: Current example list from gr.State.

    Returns:
        (updated_dataset, ""): a NEW list containing the old examples plus
        the imported ones, and an empty string to clear the JSON textbox.

    Raises:
        gr.Error: on empty input, malformed JSON, wrong structure, or when
            no item could be imported.
    """
    try:
        # Validate inputs
        if not json_text or not json_text.strip():
            raise gr.Error("JSON input is empty. Please provide a JSON array.")

        if not isinstance(current_dataset, list):
            raise gr.Error("Dataset state is invalid. Please refresh the page.")

        # Parse JSON
        try:
            data = json.loads(json_text.strip())
        except json.JSONDecodeError as e:
            raise gr.Error(f"Invalid JSON format: {str(e)}. Please check your JSON syntax.")

        # Validate structure
        if not isinstance(data, list):
            raise gr.Error("JSON must be an array of objects. Example: [{\"input\": \"...\", \"output\": \"...\"}]")

        if len(data) == 0:
            raise gr.Error("JSON array is empty. Add at least one example object.")

        # FIX: append to a copy instead of mutating the caller's list in
        # place — previously a failed import (imported_count == 0 below)
        # could still leave the shared gr.State list partially mutated.
        new_dataset = list(current_dataset)

        # Validate and import items
        imported_count = 0
        errors = []

        for i, item in enumerate(data):
            try:
                if not isinstance(item, dict):
                    errors.append(f"Item {i+1}: not a dictionary")
                    continue

                if "input" not in item or "output" not in item:
                    errors.append(f"Item {i+1}: missing 'input' or 'output' field")
                    continue

                input_val = item["input"]
                output_val = item["output"]

                if not isinstance(input_val, str) or not isinstance(output_val, str):
                    errors.append(f"Item {i+1}: 'input' and 'output' must be strings")
                    continue

                if not input_val.strip() or not output_val.strip():
                    errors.append(f"Item {i+1}: 'input' and 'output' cannot be empty")
                    continue

                # Add valid item
                new_dataset.append({
                    "input": input_val.strip(),
                    "output": output_val.strip(),
                    "image": item.get("image"),  # Optional
                    "image_preview": "🖼️ Image" if item.get("image") else "-"
                })
                imported_count += 1

            except Exception as e:
                errors.append(f"Item {i+1}: {str(e)}")
                logger.warning(f"Error importing item {i+1}: {str(e)}")
                continue

        # Report results
        if imported_count == 0:
            error_msg = "No valid examples imported. "
            if errors:
                error_msg += "Errors: " + "; ".join(errors[:3])
                if len(errors) > 3:
                    error_msg += f" (and {len(errors) - 3} more)"
            raise gr.Error(error_msg)

        if errors:
            # Partial success: log (the UI only shows the updated table).
            warning_msg = f"Imported {imported_count} example(s). "
            if len(errors) <= 3:
                warning_msg += f"Warnings: {'; '.join(errors)}"
            else:
                warning_msg += f"{len(errors)} items had errors."
            logger.warning(warning_msg)

        return new_dataset, ""

    except gr.Error:
        # Re-raise Gradio errors
        raise
    except Exception as e:
        logger.error(f"Unexpected error in import_bulk_json: {str(e)}\n{traceback.format_exc()}")
        raise gr.Error(f"Failed to import JSON: {str(e)}")
1508
+
1509
    # Bulk-import chain: parse/merge JSON into state, clear the textbox,
    # then refresh the table and the count badge.
    btn_import.click(
        import_bulk_json,
        inputs=[bulk_json, dataset_state],
        outputs=[dataset_state, bulk_json]
    ).then(
        safe_update_table,
        inputs=[dataset_state],
        outputs=[ds_table]
    ).then(
        update_dataset_count,
        inputs=[dataset_state],
        outputs=[ds_count]
    )

    # Main Optimization Flow
    # run_optimization_flow is a generator; each yield must emit an 8-tuple
    # matching this outputs list, in order.
    btn_optimize.click(
        run_optimization_flow,
        inputs=[
            seed_input, dataset_state, model_select, custom_model_input,
            slider_iter, slider_calls, slider_batch, check_llego,
            key_openai, key_google, key_anthropic
        ],
        outputs=[
            status_panel, empty_state, results_panel,
            txt_status, res_prompt, res_metrics, res_history, live_candidates
        ]
    )
1536
+
1537
    # Refresh Candidates
    def safe_get_candidates_display():
        """Guarded wrapper around get_candidates_display().

        Returns a static error placeholder (instead of raising) so the
        refresh button can never break the Live Candidates tab.
        """
        try:
            return get_candidates_display()
        except Exception as e:
            logger.error(f"Error refreshing candidates: {str(e)}")
            return "<div style='padding: 2rem; text-align: center; color: #ef4444;'>Error loading candidates.</div>"

    # Manual refresh of the live candidates stream.
    btn_refresh_cand.click(
        safe_get_candidates_display,
        outputs=[live_candidates]
    )
1550
+
1551
# ==========================================
# 7. LAUNCH
# ==========================================
if __name__ == "__main__":
    # FIX: Blocks.launch() does not accept `css`/`js` keyword arguments —
    # passing them raised TypeError at startup. Custom CSS/JS belong on the
    # gr.Blocks(...) constructor where the app is built.
    app.queue().launch(
        server_name="0.0.0.0",  # bind all interfaces (required on HF Spaces)
        server_port=7860,
        share=False,  # Set to False for HF Spaces
        show_error=True
    )
1563
+
requirements.txt ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies - gepa from git
2
+ git+https://github.com/gepa-ai/gepa.git
3
+ numpy>=1.21.0
4
+ pandas>=1.5.0
5
+ pydantic>=2.0.0
6
+ python-dotenv>=1.0.0
7
+
8
+ # HTTP/API clients
9
+ requests>=2.31.0
10
+ aiohttp>=3.8.0
11
+ asyncio-throttle>=1.0.0
12
+
13
+ # LLM Provider SDKs
14
+ openai>=1.0.0
15
+ anthropic>=0.18.0
16
+ google-generativeai>=0.3.0
17
+ google-genai>=0.2.0
18
+
19
+ # Image processing
20
+ Pillow>=9.0.0
21
+
22
+ # Gradio UI (version will be set by README.md sdk_version)
23
+ gradio>=4.0.0
src/gepa_optimizer.egg-info/PKG-INFO ADDED
@@ -0,0 +1,439 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Metadata-Version: 2.4
2
+ Name: gepa-optimizer
3
+ Version: 0.1.0
4
+ Summary: Universal prompt optimization framework based on GEPA
5
+ Home-page: https://github.com/suhasb-dev/Prompt-Optimizer
6
+ Author: Suhas
7
+ Author-email: Suhas <s8hasgrylls@gmail.com>
8
+ License: MIT
9
+ Project-URL: Homepage, https://github.com/suhasb-dev/Prompt-Optimizer
10
+ Project-URL: Repository, https://github.com/suhasb-dev/Prompt-Optimizer
11
+ Project-URL: Documentation, https://suhasb-dev.gitbook.io/gepa-universal-prompt-optimizer/
12
+ Project-URL: Bug Reports, https://github.com/suhasb-dev/Prompt-Optimizer/issues
13
+ Keywords: prompt-optimization,llm,gepa,ai,machine-learning,ui-tree-extraction
14
+ Classifier: Development Status :: 3 - Alpha
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: Intended Audience :: Science/Research
17
+ Classifier: License :: OSI Approved :: MIT License
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.8
20
+ Classifier: Programming Language :: Python :: 3.9
21
+ Classifier: Programming Language :: Python :: 3.10
22
+ Classifier: Programming Language :: Python :: 3.11
23
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
24
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
25
+ Requires-Python: >=3.8
26
+ Description-Content-Type: text/markdown
27
+ License-File: LICENSE
28
+ Requires-Dist: gepa>=0.0.12
29
+ Requires-Dist: pandas>=1.5.0
30
+ Requires-Dist: pydantic>=2.0.0
31
+ Requires-Dist: python-dotenv>=1.0.0
32
+ Requires-Dist: requests>=2.31.0
33
+ Requires-Dist: aiohttp>=3.8.0
34
+ Requires-Dist: asyncio-throttle>=1.0.0
35
+ Requires-Dist: google-generativeai>=0.3.0
36
+ Requires-Dist: Pillow>=9.0.0
37
+ Provides-Extra: dev
38
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
39
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
40
+ Requires-Dist: black>=23.0.0; extra == "dev"
41
+ Requires-Dist: flake8>=6.0.0; extra == "dev"
42
+ Requires-Dist: mypy>=1.0.0; extra == "dev"
43
+ Provides-Extra: docs
44
+ Requires-Dist: sphinx>=5.0.0; extra == "docs"
45
+ Requires-Dist: sphinx-rtd-theme>=1.2.0; extra == "docs"
46
+ Provides-Extra: all
47
+ Requires-Dist: pytest>=7.0.0; extra == "all"
48
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "all"
49
+ Requires-Dist: black>=23.0.0; extra == "all"
50
+ Requires-Dist: flake8>=6.0.0; extra == "all"
51
+ Requires-Dist: mypy>=1.0.0; extra == "all"
52
+ Requires-Dist: sphinx>=5.0.0; extra == "all"
53
+ Requires-Dist: sphinx-rtd-theme>=1.2.0; extra == "all"
54
+ Dynamic: author
55
+ Dynamic: home-page
56
+ Dynamic: license-file
57
+ Dynamic: requires-python
58
+
59
+ # GEPA Optimizer
60
+
61
+ [![PyPI version](https://badge.fury.io/py/gepa-optimizer.svg)](https://badge.fury.io/py/gepa-optimizer)
62
+ [![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/)
63
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
64
+
65
+ A universal prompt optimization framework built on [GEPA](https://arxiv.org/abs/2507.19457) with optional [LLEGO](https://arxiv.org/abs/2503.14217) genetic operators for accelerated convergence.
66
+
67
+ ## Overview
68
+
69
+ GEPA Optimizer provides a modular architecture for optimizing prompts through reflective evolution. It requires custom evaluators and LLM clients, enabling domain-specific optimization for any use case.
70
+
71
+ **Key capabilities:**
72
+ - Multi-modal support (text + vision models)
73
+ - Hybrid GEPA + LLEGO optimization modes
74
+ - Configurable train/val/test data splitting
75
+ - Batch API support for cost reduction
76
+ - Async-first architecture
77
+
78
+ ## Installation
79
+
80
+ ```bash
81
+ pip install gepa-optimizer
82
+ ```
83
+
84
+ **From source:**
85
+ ```bash
86
+ git clone https://github.com/suhasb-dev/Prompt-Optimizer.git
87
+ cd Prompt-Optimizer
88
+ pip install -e .
89
+ ```
90
+
91
+ ## Quick Start
92
+
93
+ ```python
94
+ import asyncio
95
+ from gepa_optimizer import (
96
+ GepaOptimizer,
97
+ OptimizationConfig,
98
+ BaseEvaluator,
99
+ BaseLLMClient
100
+ )
101
+
102
+ # Define custom evaluator
103
+ class MyEvaluator(BaseEvaluator):
104
+ def evaluate(self, predicted: str, expected: str) -> dict:
105
+ score = 1.0 if predicted.strip() == expected.strip() else 0.0
106
+ return {"accuracy": score, "composite_score": score}
107
+
108
+ # Define custom LLM client
109
+ class MyLLMClient(BaseLLMClient):
110
+ def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> dict:
111
+ # Your LLM integration here
112
+ return {"content": "response"}
113
+
114
+ async def main():
115
+ config = OptimizationConfig(
116
+ model="openai/gpt-4o",
117
+ reflection_model="openai/gpt-4o",
118
+ max_iterations=5,
119
+ max_metric_calls=50,
120
+ batch_size=8
121
+ )
122
+
123
+ optimizer = GepaOptimizer(
124
+ config=config,
125
+ llm_client=MyLLMClient("openai", "gpt-4o"),
126
+ evaluator=MyEvaluator()
127
+ )
128
+
129
+ result = await optimizer.train(
130
+ seed_prompt="Your initial prompt",
131
+ dataset=your_dataset
132
+ )
133
+
134
+ print(f"Optimized: {result.prompt}")
135
+ print(f"Score: {result.improvement_data}")
136
+
137
+ asyncio.run(main())
138
+ ```
139
+
140
+ ## Project Structure
141
+
142
+ ```
143
+ src/gepa_optimizer/
144
+ ├── core/ # Core optimization logic
145
+ │ ├── optimizer.py # GepaOptimizer main class
146
+ │   ├── base_adapter.py      # BaseGepaAdapter interface
147
+ │ └── universal_adapter.py
148
+ ├── evaluation/ # Evaluator implementations
149
+ │ ├── base_evaluator.py # BaseEvaluator abstract class
150
+ │ ├── scroll_evaluator.py
151
+ │ ├── validation_evaluator.py
152
+ │ └── index_caching_evaluator.py
153
+ ├── llms/ # LLM client implementations
154
+ │ ├── base_llm.py # BaseLLMClient abstract class
155
+ │ ├── vision_llm.py # VisionLLMClient (OpenAI, Google, Anthropic)
156
+ │ └── batch_llm.py # BatchLLMClient (50% cost savings)
157
+ ├── operators/ # LLEGO genetic operators
158
+ │ └── llego_operators.py # FitnessGuidedCrossover, DiversityGuidedMutation
159
+ ├── data/ # Dataset loaders and converters
160
+ ├── models/ # Configuration and result models
161
+ └── utils/ # Utilities and helpers
162
+ ```
163
+
164
+ ## Configuration
165
+
166
+ ### Basic Configuration
167
+
168
+ ```python
169
+ from gepa_optimizer import OptimizationConfig, ModelConfig
170
+
171
+ config = OptimizationConfig(
172
+ # Required parameters
173
+ model="openai/gpt-4o", # or ModelConfig instance
174
+ reflection_model="openai/gpt-4o",
175
+ max_iterations=10,
176
+ max_metric_calls=100,
177
+ batch_size=8,
178
+
179
+ # Data splitting (train/val/test)
180
+ data_split=DataSplitConfig(
181
+ train_ratio=0.6,
182
+ val_ratio=0.2,
183
+ test_ratio=0.2
184
+ ),
185
+
186
+ # Optional settings
187
+ reflection_examples=3, # Examples per reflection (2-5 recommended)
188
+ evaluate_on_test=True, # Final evaluation on held-out test set
189
+ log_level="INFO" # DEBUG, INFO, WARNING, ERROR
190
+ )
191
+ ```
192
+
193
+ ### LLEGO Genetic Operators
194
+
195
+ Enable LLEGO for faster convergence through fitness-guided crossover and diversity-guided mutation:
196
+
197
+ ```python
198
+ config = OptimizationConfig(
199
+ model="openai/gpt-4o",
200
+ reflection_model="openai/gpt-4o",
201
+ max_iterations=5,
202
+ max_metric_calls=50,
203
+ batch_size=8,
204
+
205
+ # Enable LLEGO
206
+ use_llego_operators=True,
207
+ alpha=0.15, # Fitness extrapolation factor
208
+ tau=10.0, # Diversity temperature
209
+ nu=4, # Parent arity
210
+ n_crossover=2, # Crossover offspring per iteration
211
+ n_mutation=3, # Mutation offspring per iteration
212
+ population_size=15
213
+ )
214
+ ```
215
+
216
+ ### Hybrid Mode (GEPA + LLEGO)
217
+
218
+ Combine GEPA's semantic reflection with LLEGO's structural diversity:
219
+
220
+ ```python
221
+ config = OptimizationConfig(
222
+ model="openai/gpt-4o",
223
+ reflection_model="openai/gpt-4o",
224
+ max_iterations=6,
225
+ max_metric_calls=200,
226
+ batch_size=10,
227
+
228
+ # Hybrid mode
229
+ use_llego_operators=True,
230
+ enable_gepa_reflection_with_llego=True,
231
+ num_gepa_reflection_candidates=3,
232
+ n_crossover=3,
233
+ n_mutation=3
234
+ # Total: 9 candidates per iteration (3 GEPA + 3 crossover + 3 mutation)
235
+ )
236
+ ```
237
+
238
+ ### Batch API (Cost Optimization)
239
+
240
+ Use batch processing for 50% cost reduction:
241
+
242
+ ```python
243
+ from gepa_optimizer.llms import BatchLLMClient
244
+
245
+ llm_client = BatchLLMClient(
246
+ provider="google",
247
+ model_name="gemini-2.5-flash",
248
+ batch_size=20,
249
+ polling_interval=30
250
+ )
251
+
252
+ optimizer = GepaOptimizer(
253
+ config=config,
254
+ llm_client=llm_client,
255
+ evaluator=evaluator
256
+ )
257
+ ```
258
+
259
+ ## Built-in Components
260
+
261
+ ### LLM Clients
262
+
263
+ | Client | Description | Use Case |
264
+ |--------|-------------|----------|
265
+ | `VisionLLMClient` | Multi-modal client for OpenAI, Google, Anthropic | Real-time requests |
266
+ | `BatchLLMClient` | Batch processing client | Cost-sensitive workloads |
267
+
268
+ ### Evaluators
269
+
270
+ | Evaluator | Description |
271
+ |-----------|-------------|
272
+ | `ScrollElementEvaluator` | UI element detection scoring |
273
+ | `ValidationEvaluator` | Screen validation tasks |
274
+ | `IndexCachingEvaluator` | Index-based element selection |
275
+ | `UITreeEvaluator` | UI tree extraction |
276
+
277
+ ### Dataset Loaders
278
+
279
+ | Loader | Description |
280
+ |--------|-------------|
281
+ | `load_scroll_dataset()` | Load scroll detection datasets |
282
+ | `load_validation_split()` | Load validation datasets with splits |
283
+ | `load_index_caching_split()` | Load index caching datasets |
284
+
285
+ ## Creating Custom Components
286
+
287
+ ### Custom Evaluator
288
+
289
+ ```python
290
+ from gepa_optimizer import BaseEvaluator
291
+
292
+ class CustomEvaluator(BaseEvaluator):
293
+ def __init__(self):
294
+ super().__init__(metric_weights={
295
+ "accuracy": 0.5,
296
+ "completeness": 0.3,
297
+ "format": 0.2
298
+ })
299
+
300
+ def evaluate(self, predicted: str, expected: str) -> dict:
301
+ accuracy = self._compute_accuracy(predicted, expected)
302
+ completeness = self._compute_completeness(predicted, expected)
303
+ format_score = self._compute_format(predicted)
304
+
305
+ composite = (
306
+ accuracy * 0.5 +
307
+ completeness * 0.3 +
308
+ format_score * 0.2
309
+ )
310
+
311
+ return {
312
+ "accuracy": accuracy,
313
+ "completeness": completeness,
314
+ "format": format_score,
315
+ "composite_score": composite # Required key
316
+ }
317
+ ```
318
+
319
+ ### Custom LLM Client
320
+
321
+ ```python
322
+ from gepa_optimizer import BaseLLMClient
323
+
324
+ class CustomLLMClient(BaseLLMClient):
325
+ def __init__(self, api_key: str):
326
+ super().__init__(provider="custom", model_name="my-model")
327
+ self.api_key = api_key
328
+
329
+ def generate(
330
+ self,
331
+ system_prompt: str,
332
+ user_prompt: str,
333
+ image_base64: str = None,
334
+ **kwargs
335
+ ) -> dict:
336
+ # Your API call here
337
+ response = call_your_api(system_prompt, user_prompt, image_base64)
338
+ return {"content": response}
339
+ ```
340
+
341
+ ## Examples
342
+
343
+ | File | Description |
344
+ |------|-------------|
345
+ | [`examples/basic_usage.py`](examples/basic_usage.py) | Basic optimization workflow |
346
+ | [`examples/advanced_usage.py`](examples/advanced_usage.py) | Advanced configuration |
347
+ | [`examples/batch_api_example.py`](examples/batch_api_example.py) | Batch API usage |
348
+ | [`examples/gemini_usage.py`](examples/gemini_usage.py) | Google Gemini integration |
349
+
350
+ **Run examples:**
351
+ ```bash
352
+ python examples/basic_usage.py
353
+ ```
354
+
355
+ ## Testing
356
+
357
+ ```bash
358
+ # Run all tests
359
+ pytest tests/
360
+
361
+ # Run unit tests only
362
+ pytest tests/unit/
363
+
364
+ # Run integration tests
365
+ pytest tests/integration/
366
+ ```
367
+
368
+ ## API Reference
369
+
370
+ ### GepaOptimizer
371
+
372
+ ```python
373
+ class GepaOptimizer:
374
+ def __init__(
375
+ self,
376
+ config: OptimizationConfig,
377
+ llm_client: BaseLLMClient,
378
+ evaluator: BaseEvaluator,
379
+ adapter_type: str = "universal"
380
+ )
381
+
382
+ async def train(
383
+ self,
384
+ seed_prompt: str,
385
+ dataset: Union[List, Dict],
386
+ **kwargs
387
+ ) -> OptimizedResult
388
+ ```
389
+
390
+ ### OptimizationConfig
391
+
392
+ | Parameter | Type | Default | Description |
393
+ |-----------|------|---------|-------------|
394
+ | `model` | `str \| ModelConfig` | Required | Target model |
395
+ | `reflection_model` | `str \| ModelConfig` | Required | Reflection model |
396
+ | `max_iterations` | `int` | Required | Maximum optimization iterations |
397
+ | `max_metric_calls` | `int` | Required | Maximum evaluation calls |
398
+ | `batch_size` | `int` | Required | Samples per evaluation batch |
399
+ | `use_llego_operators` | `bool` | `False` | Enable LLEGO genetic operators |
400
+ | `enable_gepa_reflection_with_llego` | `bool` | `False` | Enable hybrid mode |
401
+ | `use_llm_as_judge` | `bool` | `True` | Enable LLM-as-Judge feedback |
402
+ | `log_level` | `str` | `"INFO"` | Logging verbosity |
403
+
404
+ ### OptimizedResult
405
+
406
+ | Attribute | Type | Description |
407
+ |-----------|------|-------------|
408
+ | `prompt` | `str` | Optimized prompt |
409
+ | `original_prompt` | `str` | Initial seed prompt |
410
+ | `improvement_data` | `dict` | Score improvements |
411
+ | `optimization_time` | `float` | Total time in seconds |
412
+ | `is_successful` | `bool` | Optimization success status |
413
+
414
+ ## Environment Variables
415
+
416
+ | Variable | Description |
417
+ |----------|-------------|
418
+ | `OPENAI_API_KEY` | OpenAI API key |
419
+ | `ANTHROPIC_API_KEY` | Anthropic API key |
420
+ | `GOOGLE_API_KEY` | Google AI API key |
421
+
422
+ ## References
423
+
424
+ - **GEPA Paper:** [Reflective Prompt Evolution Can Outperform Reinforcement Learning](https://arxiv.org/abs/2507.19457)
425
+ - **LLEGO Paper:** [Decision Tree Induction Through LLMs via Semantically-Aware Evolution](https://arxiv.org/abs/2503.14217)
426
+ - **GEPA Library:** [github.com/gepa-ai/gepa](https://github.com/gepa-ai/gepa)
427
+
428
+ ## License
429
+
430
+ MIT License - see [LICENSE](LICENSE) for details.
431
+
432
+ ## Contributing
433
+
434
+ Contributions welcome. Please open an issue or submit a pull request.
435
+
436
+ ## Support
437
+
438
+ - **Issues:** [GitHub Issues](https://github.com/suhasb-dev/Prompt-Optimizer/issues)
439
+ - **Documentation:** [GitBook](https://suhasb-dev.gitbook.io/gepa-universal-prompt-optimizer/)
src/gepa_optimizer.egg-info/SOURCES.txt ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ LICENSE
2
+ README.md
3
+ pyproject.toml
4
+ setup.py
5
+ src/gepa_optimizer/__init__.py
6
+ src/gepa_optimizer/cli.py
7
+ src/gepa_optimizer/types.py
8
+ src/gepa_optimizer/version.py
9
+ src/gepa_optimizer.egg-info/PKG-INFO
10
+ src/gepa_optimizer.egg-info/SOURCES.txt
11
+ src/gepa_optimizer.egg-info/dependency_links.txt
12
+ src/gepa_optimizer.egg-info/entry_points.txt
13
+ src/gepa_optimizer.egg-info/requires.txt
14
+ src/gepa_optimizer.egg-info/top_level.txt
15
+ src/gepa_optimizer/core/__init__.py
16
+ src/gepa_optimizer/core/base_adapter.py
17
+ src/gepa_optimizer/core/custom_adapter.py
18
+ src/gepa_optimizer/core/optimizer.py
19
+ src/gepa_optimizer/core/result.py
20
+ src/gepa_optimizer/core/universal_adapter.py
21
+ src/gepa_optimizer/data/__init__.py
22
+ src/gepa_optimizer/data/converters.py
23
+ src/gepa_optimizer/data/index_caching_loader.py
24
+ src/gepa_optimizer/data/loaders.py
25
+ src/gepa_optimizer/data/scroll_dataset_loader.py
26
+ src/gepa_optimizer/data/validation_dataset_loader.py
27
+ src/gepa_optimizer/data/validators.py
28
+ src/gepa_optimizer/evaluation/__init__.py
29
+ src/gepa_optimizer/evaluation/base_evaluator.py
30
+ src/gepa_optimizer/evaluation/index_caching_evaluator.py
31
+ src/gepa_optimizer/evaluation/scroll_evaluator.py
32
+ src/gepa_optimizer/evaluation/ui_evaluator.py
33
+ src/gepa_optimizer/evaluation/universal_evaluator.py
34
+ src/gepa_optimizer/evaluation/validation_evaluator.py
35
+ src/gepa_optimizer/infrastructure/__init__.py
36
+ src/gepa_optimizer/infrastructure/logging/__init__.py
37
+ src/gepa_optimizer/infrastructure/logging/context.py
38
+ src/gepa_optimizer/infrastructure/logging/formatters.py
39
+ src/gepa_optimizer/infrastructure/logging/logger.py
40
+ src/gepa_optimizer/llms/__init__.py
41
+ src/gepa_optimizer/llms/base_llm.py
42
+ src/gepa_optimizer/llms/batch_llm.py
43
+ src/gepa_optimizer/llms/llego_enhanced_llm.py
44
+ src/gepa_optimizer/llms/vision_llm.py
45
+ src/gepa_optimizer/models/__init__.py
46
+ src/gepa_optimizer/models/config.py
47
+ src/gepa_optimizer/models/dataset.py
48
+ src/gepa_optimizer/models/result.py
49
+ src/gepa_optimizer/operators/__init__.py
50
+ src/gepa_optimizer/operators/base_operator.py
51
+ src/gepa_optimizer/operators/crossover.py
52
+ src/gepa_optimizer/operators/llego_operators.py
53
+ src/gepa_optimizer/operators/models.py
54
+ src/gepa_optimizer/operators/mutation.py
55
+ src/gepa_optimizer/utils/__init__.py
56
+ src/gepa_optimizer/utils/api_keys.py
57
+ src/gepa_optimizer/utils/candidate_collector.py
58
+ src/gepa_optimizer/utils/clean_logger.py
59
+ src/gepa_optimizer/utils/exceptions.py
60
+ src/gepa_optimizer/utils/helpers.py
61
+ src/gepa_optimizer/utils/llm_judge_prompt.py
62
+ src/gepa_optimizer/utils/log_parser.py
63
+ src/gepa_optimizer/utils/logging.py
64
+ src/gepa_optimizer/utils/metrics.py
65
+ src/gepa_optimizer/utils/pareto_logger.py
src/gepa_optimizer.egg-info/dependency_links.txt ADDED
@@ -0,0 +1 @@
 
 
1
+
src/gepa_optimizer.egg-info/entry_points.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [console_scripts]
2
+ gepa-optimize = gepa_optimizer.cli:main
src/gepa_optimizer.egg-info/requires.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gepa>=0.0.12
2
+ pandas>=1.5.0
3
+ pydantic>=2.0.0
4
+ python-dotenv>=1.0.0
5
+ requests>=2.31.0
6
+ aiohttp>=3.8.0
7
+ asyncio-throttle>=1.0.0
8
+ google-generativeai>=0.3.0
9
+ Pillow>=9.0.0
10
+
11
+ [all]
12
+ pytest>=7.0.0
13
+ pytest-asyncio>=0.21.0
14
+ black>=23.0.0
15
+ flake8>=6.0.0
16
+ mypy>=1.0.0
17
+ sphinx>=5.0.0
18
+ sphinx-rtd-theme>=1.2.0
19
+
20
+ [dev]
21
+ pytest>=7.0.0
22
+ pytest-asyncio>=0.21.0
23
+ black>=23.0.0
24
+ flake8>=6.0.0
25
+ mypy>=1.0.0
26
+
27
+ [docs]
28
+ sphinx>=5.0.0
29
+ sphinx-rtd-theme>=1.2.0
src/gepa_optimizer.egg-info/top_level.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ gepa_optimizer
src/gepa_optimizer/__init__.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GEPA Universal Prompt Optimizer
3
+
4
+ A modern, modular Python library for universal prompt optimization powered by GEPA.
5
+
6
+ Quick Start (No custom evaluator needed!):
7
+
8
+ from gepa_optimizer import quick_optimize
9
+
10
+ result = await quick_optimize(
11
+ seed_prompt="Your initial prompt",
12
+ dataset=[
13
+ {"input": "task1", "output": "expected1"},
14
+ {"input": "task2", "output": "expected2"},
15
+ ],
16
+ model="openai/gpt-4o" # or any: "google/gemini-1.5-pro", "anthropic/claude-3-5-sonnet-20241022"
17
+ )
18
+ print(result.optimized_prompt)
19
+ """
20
+
21
+ # Core functionality
22
+ from .core import GepaOptimizer
23
+ from .core.base_adapter import BaseGepaAdapter
24
+ from .core.universal_adapter import UniversalGepaAdapter
25
+
26
+ # Configuration and models
27
+ from .models import OptimizationConfig, OptimizationResult, OptimizedResult, ModelConfig
28
+
29
+ # Data processing
30
+ from .data import UniversalConverter, DataLoader, DataValidator
31
+ from .data.scroll_dataset_loader import ScrollDatasetLoader, load_scroll_dataset
32
+ from .data.validation_dataset_loader import ValidationDatasetLoader, load_validation_dataset, load_validation_split
33
+ from .data.index_caching_loader import IndexCachingDatasetLoader, load_index_caching_dataset, load_index_caching_split
34
+
35
+ # LLM clients
36
+ from .llms import VisionLLMClient
37
+ from .llms.base_llm import BaseLLMClient
38
+ from .llms.batch_llm import BatchLLMClient
39
+
40
+ # Evaluators - including Universal Semantic Evaluator (works for ANY task!)
41
+ from .evaluation import (
42
+ BaseEvaluator,
43
+ UniversalSemanticEvaluator,
44
+ create_universal_evaluator,
45
+ UITreeEvaluator,
46
+ ScrollElementEvaluator,
47
+ ValidationEvaluator,
48
+ IndexCachingEvaluator
49
+ )
50
+
51
+ # LLEGO Genetic Operators
52
+ from .operators import (
53
+ # Base interfaces
54
+ BaseGeneticOperator,
55
+ BaseCrossoverOperator,
56
+ BaseMutationOperator,
57
+ # Concrete operators
58
+ FitnessGuidedCrossover,
59
+ DiversityGuidedMutation,
60
+ LLEGOIntegrationLayer,
61
+ # Data models
62
+ PromptCandidate,
63
+ PromptMetadata
64
+ )
65
+
66
+ # Utilities
67
+ from .utils import setup_logging, calculate_metrics, sanitize_prompt, APIKeyManager
68
+ from .utils.exceptions import GepaOptimizerError, GepaDependencyError, InvalidInputError, DatasetError
69
+
70
+ # Logging infrastructure
71
+ from .infrastructure.logging import get_logger, configure_logging, LogContext
72
+
73
+ # Type definitions (for type hints in user code)
74
+ from .types import (
75
+ DatasetItem,
76
+ EvaluationResult,
77
+ LLMResponse,
78
+ CandidateDict,
79
+ LLMClientProtocol,
80
+ EvaluatorProtocol,
81
+ )
82
+
83
+ __version__ = "0.1.0"
84
+
85
+
86
+ # ═══════════════════════════════════════════════════════════════════════════════
87
+ # CONVENIENCE FUNCTION: quick_optimize
88
+ # No evaluator needed - uses Universal Semantic Evaluator automatically
89
+ # ═══════════════════════════════════════════════════════════════════════════════
90
+
91
async def quick_optimize(
    seed_prompt: str,
    dataset: list,
    model: str,
    max_iterations: int = 5,
    max_metric_calls: int = 50,
    batch_size: int = 4,
    use_llego: bool = True,
    verbose: bool = True
) -> OptimizedResult:
    """🚀 Quick prompt optimization - no custom evaluator needed!

    Uses the Universal Semantic Evaluator, which works for ANY task.

    Args:
        seed_prompt: Initial prompt to optimize.
        dataset: List of dicts with 'input' and 'output' (expected) keys;
            an optional 'image' key enables multi-modal tasks.
        model: LLM to use in "provider/model-name" format (REQUIRED).
            Examples:
            - "google/gemini-1.5-pro"
            - "google/gemini-2.5-flash-preview-05-20"
            - "openai/gpt-4o"
            - "openai/gpt-4-turbo"
            - "anthropic/claude-3-5-sonnet-20241022"
        max_iterations: Maximum optimization iterations (default: 5).
        max_metric_calls: Maximum evaluation calls (default: 50).
        batch_size: Samples per evaluation batch (default: 4).
        use_llego: Enable LLEGO genetic operators (default: True).
        verbose: Show progress logs (default: True).

    Returns:
        OptimizedResult with the optimized prompt and improvement metrics.

    Example:
        >>> result = await quick_optimize(
        ...     seed_prompt="Count the objects in the image",
        ...     dataset=[
        ...         {"input": "image1.jpg", "output": "5 objects", "image": "base64..."},
        ...         {"input": "image2.jpg", "output": "3 objects", "image": "base64..."},
        ...     ],
        ...     model="openai/gpt-4o",  # or "google/gemini-1.5-pro", etc.
        ...     max_iterations=3
        ... )
        >>> print(result.optimized_prompt)
    """
    import logging

    if verbose:
        logging.basicConfig(level=logging.INFO)

    # A single client serves both target-model generation and the
    # evaluator's semantic analysis.
    client = VisionLLMClient.from_model_string(model)
    scorer = UniversalSemanticEvaluator(
        llm_client=client,
        use_llm_analysis=True
    )

    # The use_llego flag drives both the genetic operators and hybrid
    # GEPA-reflection mode together.
    run_config = OptimizationConfig(
        model=model,
        reflection_model=model,
        max_iterations=max_iterations,
        max_metric_calls=max_metric_calls,
        batch_size=batch_size,
        use_llego_operators=use_llego,
        enable_gepa_reflection_with_llego=use_llego,
        num_gepa_reflection_candidates=3,
        n_crossover=2,
        n_mutation=2,
        verbose=verbose
    )

    optimizer = GepaOptimizer(
        config=run_config,
        llm_client=client,
        evaluator=scorer
    )

    return await optimizer.train(seed_prompt=seed_prompt, dataset=dataset)
181
+
182
+
183
def quick_optimize_sync(
    seed_prompt: str,
    dataset: list,
    model: str,
    max_iterations: int = 5,
    max_metric_calls: int = 50,
    batch_size: int = 4,
    use_llego: bool = True,
    verbose: bool = True
) -> OptimizedResult:
    """🚀 Synchronous version of quick_optimize.

    Blocks until optimization completes; otherwise identical to
    quick_optimize (see that function for full documentation).

    Args:
        model: LLM model to use in format "provider/model-name" (REQUIRED)
            Examples: "openai/gpt-4o", "google/gemini-1.5-pro",
            "anthropic/claude-3-5-sonnet-20241022"
    """
    import asyncio

    # Forward every parameter unchanged to the async implementation.
    forwarded = dict(
        seed_prompt=seed_prompt,
        dataset=dataset,
        model=model,
        max_iterations=max_iterations,
        max_metric_calls=max_metric_calls,
        batch_size=batch_size,
        use_llego=use_llego,
        verbose=verbose,
    )
    return asyncio.run(quick_optimize(**forwarded))
215
+
216
+
217
# Public API surface: the names exported by `from gepa_optimizer import *`.
__all__ = [
    # 🚀 Quick Start (recommended for new users)
    "quick_optimize",
    "quick_optimize_sync",

    # Core functionality
    "GepaOptimizer",
    "BaseGepaAdapter",
    "UniversalGepaAdapter",

    # Configuration
    "OptimizationConfig",
    "OptimizationResult",
    "OptimizedResult",
    "ModelConfig",

    # Data processing
    "UniversalConverter",
    "DataLoader",
    "DataValidator",

    # Dataset loaders
    "ScrollDatasetLoader",
    "load_scroll_dataset",
    "ValidationDatasetLoader",
    "load_validation_dataset",
    "load_validation_split",
    "IndexCachingDatasetLoader",
    "load_index_caching_dataset",
    "load_index_caching_split",

    # LLM clients
    "VisionLLMClient",
    "BaseLLMClient",
    "BatchLLMClient",

    # Evaluators (Universal recommended for general use)
    "UniversalSemanticEvaluator",
    "create_universal_evaluator",
    "BaseEvaluator",
    "UITreeEvaluator",
    "ScrollElementEvaluator",
    "ValidationEvaluator",
    "IndexCachingEvaluator",

    # LLEGO Genetic Operators - Base interfaces
    "BaseGeneticOperator",
    "BaseCrossoverOperator",
    "BaseMutationOperator",
    # LLEGO Genetic Operators - Concrete implementations
    "FitnessGuidedCrossover",
    "DiversityGuidedMutation",
    "LLEGOIntegrationLayer",
    "PromptCandidate",
    "PromptMetadata",

    # Utilities
    "APIKeyManager",
    "GepaOptimizerError",
    "GepaDependencyError",
    "InvalidInputError",
    "DatasetError",
    "setup_logging",
    "calculate_metrics",
    "sanitize_prompt",

    # Logging infrastructure
    "get_logger",
    "configure_logging",
    "LogContext",

    # Type definitions
    "DatasetItem",
    "EvaluationResult",
    "LLMResponse",
    "CandidateDict",
    "LLMClientProtocol",
    "EvaluatorProtocol",
]
src/gepa_optimizer/cli.py ADDED
@@ -0,0 +1,239 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Command Line Interface for GEPA Optimizer
3
+ """
4
+
5
+ import argparse
6
+ import sys
7
+ import json
8
+ import asyncio
9
+ from pathlib import Path
10
+ from typing import Optional
11
+
12
+ from .core import GepaOptimizer
13
+ from .models import OptimizationConfig, ModelConfig
14
+ from .utils import setup_logging, APIKeyManager
15
+
16
+
17
def main():
    """CLI entry point: parse arguments, build a config, run optimization."""
    args = _build_cli_parser().parse_args()

    # The verbose flag upgrades logging for the entire run.
    setup_logging(level="DEBUG" if args.verbose else "INFO")

    try:
        # Configuration comes from a JSON file when given, else from flags.
        if args.config:
            config = load_config_from_file(args.config)
        else:
            config = create_config_from_args(args)

        # Fail fast if provider API keys are not set.
        validate_api_keys(config)

        optimizer = GepaOptimizer(config=config)

        # train() is a coroutine, so drive it with asyncio.run().
        print(f"🚀 Starting optimization with model: {config.model.model_name}")
        result = asyncio.run(optimizer.train(
            seed_prompt=args.prompt,
            dataset=args.dataset
        ))

        output_results(result, args.output)

        print("✅ Optimization completed successfully!")

    except Exception as e:
        print(f"❌ Error: {str(e)}", file=sys.stderr)
        sys.exit(1)


def _build_cli_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the gepa-optimize command."""
    parser = argparse.ArgumentParser(
        description="GEPA Universal Prompt Optimizer CLI",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  gepa-optimize --model openai/gpt-4-turbo --prompt "Extract UI elements" --dataset data.json
  gepa-optimize --config config.json --prompt "Analyze interface" --dataset images/
        """
    )

    # Required arguments
    parser.add_argument(
        "--prompt",
        required=True,
        help="Initial seed prompt to optimize"
    )
    parser.add_argument(
        "--dataset",
        required=True,
        help="Path to dataset file or directory"
    )

    # Model configuration
    parser.add_argument(
        "--model",
        help="Model specification (e.g., 'openai/gpt-4-turbo')"
    )
    parser.add_argument(
        "--reflection-model",
        help="Reflection model specification"
    )
    parser.add_argument(
        "--config",
        help="Path to configuration JSON file"
    )

    # Optimization parameters
    parser.add_argument(
        "--max-iterations",
        type=int,
        default=10,
        help="Maximum optimization iterations (default: 10)"
    )
    parser.add_argument(
        "--max-metric-calls",
        type=int,
        default=100,
        help="Maximum metric evaluation calls (default: 100)"
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=4,
        help="Batch size for evaluation (default: 4)"
    )

    # GEPA-specific parameters.
    # NOTE(review): these five flags are parsed here but never forwarded by
    # create_config_from_args() into OptimizationConfig — confirm whether
    # they should be wired through or removed.
    parser.add_argument(
        "--candidate-selection-strategy",
        type=str,
        default="pareto",
        choices=["pareto", "best"],
        help="Strategy for selecting candidates (default: pareto)"
    )
    parser.add_argument(
        "--skip-perfect-score",
        action="store_true",
        help="Skip updating candidates with perfect scores"
    )
    parser.add_argument(
        "--reflection-minibatch-size",
        type=int,
        default=None,
        help="Number of examples to use for reflection (default: use batch_size)"
    )
    parser.add_argument(
        "--perfect-score",
        type=float,
        default=1.0,
        help="Perfect score threshold (default: 1.0)"
    )
    parser.add_argument(
        "--module-selector",
        type=str,
        default="round_robin",
        choices=["round_robin", "all"],
        help="Component selection strategy (default: round_robin)"
    )

    # Output options
    parser.add_argument(
        "--output",
        help="Output file path for results (default: stdout)"
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose logging"
    )

    return parser
152
+
153
+
154
def load_config_from_file(config_path: str) -> OptimizationConfig:
    """Load an OptimizationConfig from a JSON file.

    Args:
        config_path: Path to a JSON file whose top-level keys match
            OptimizationConfig fields. Nested ``model`` and
            ``reflection_model`` dicts are promoted to ModelConfig instances.

    Returns:
        The parsed OptimizationConfig.

    Raises:
        FileNotFoundError: If the file does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    path = Path(config_path)
    if not path.exists():
        raise FileNotFoundError(f"Configuration file not found: {config_path}")

    # Read with an explicit encoding so behavior does not depend on the
    # platform's default locale (the original used the implicit default).
    config_data = json.loads(path.read_text(encoding="utf-8"))

    # Promote nested dicts to typed model configs.
    for key in ("model", "reflection_model"):
        if key in config_data and isinstance(config_data[key], dict):
            config_data[key] = ModelConfig(**config_data[key])

    return OptimizationConfig(**config_data)
171
+
172
+
173
def create_config_from_args(args) -> OptimizationConfig:
    """Build an OptimizationConfig from parsed CLI arguments.

    Raises:
        ValueError: If neither --model nor --config was supplied.
    """
    if not args.model:
        raise ValueError("Either --model or --config must be specified")

    # "provider/model-name" strings are parsed by ModelConfig itself.
    primary = ModelConfig.from_string(args.model)
    reflection = (
        ModelConfig.from_string(args.reflection_model)
        if args.reflection_model
        else None
    )

    return OptimizationConfig(
        model=primary,
        reflection_model=reflection,
        max_iterations=args.max_iterations,
        max_metric_calls=args.max_metric_calls,
        batch_size=args.batch_size
    )
192
+
193
+
194
def validate_api_keys(config: OptimizationConfig):
    """Exit the process with a helpful message if provider API keys are missing.

    Checks the main model's provider and, when configured, the reflection
    model's provider against the environment via APIKeyManager.
    """
    api_manager = APIKeyManager()

    required_providers = [config.model.provider]
    if config.reflection_model:
        required_providers.append(config.reflection_model.provider)

    missing_keys = api_manager.get_missing_keys(required_providers)
    if not missing_keys:
        return

    print("❌ Missing API keys for the following providers:")
    for provider in missing_keys:
        print(f"  - {provider.upper()}_API_KEY")
    print("\nPlease set the required environment variables or use a .env file")
    sys.exit(1)
210
+
211
def output_results(result, output_path: Optional[str]):
    """Write optimization results to a JSON file or pretty-print to stdout.

    Args:
        result: Result object exposing ``prompt``, ``original_prompt``,
            ``improvement_data``, ``optimization_time``, ``status`` and
            ``session_id`` attributes.
        output_path: Destination JSON file path, or None to print to stdout.
    """
    output_data = {
        "optimized_prompt": result.prompt,
        "original_prompt": result.original_prompt,
        "improvement_metrics": result.improvement_data,
        "optimization_time": result.optimization_time,
        "status": result.status,
        "session_id": result.session_id
    }

    if output_path:
        # ensure_ascii=False keeps non-ASCII prompt text human-readable;
        # explicit utf-8 avoids locale-dependent encoding.
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2, ensure_ascii=False)
        print(f"📄 Results saved to: {output_path}")
    else:
        print("\n📊 Optimization Results:")
        print(f"Session ID: {result.session_id}")
        print(f"Status: {result.status}")
        print(f"Time: {result.optimization_time:.2f}s")
        print(f"\nOriginal Prompt:\n{result.original_prompt}")
        print(f"\nOptimized Prompt:\n{result.prompt}")

        # Bug fix: the original ran `'improvement_percent' in
        # result.improvement_data` unconditionally, which raises TypeError
        # when improvement_data is None.
        if result.improvement_data and 'improvement_percent' in result.improvement_data:
            print(f"\nImprovement: {result.improvement_data['improvement_percent']:.2f}%")
236
+
237
+
238
+ if __name__ == "__main__":
239
+ main()
src/gepa_optimizer/core/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Core functionality for GEPA Universal Prompt Optimizer
3
+ """
4
+
5
+ from .optimizer import GepaOptimizer
6
+ from .result import ResultProcessor
7
+
8
+ __all__ = ["GepaOptimizer", "ResultProcessor"]
src/gepa_optimizer/core/base_adapter.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Base adapter class for all GEPA adapters.
3
+ """
4
+
5
+ from abc import ABC, abstractmethod
6
+ from typing import Any, Dict, List, Optional
7
+ import logging
8
+ from gepa.core.adapter import GEPAAdapter, EvaluationBatch
9
+
10
+ from ..llms.base_llm import BaseLLMClient
11
+ from ..evaluation.base_evaluator import BaseEvaluator
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
class BaseGepaAdapter(GEPAAdapter, ABC):
    """Abstract foundation for task-specific GEPA adapters.

    Subclasses wire an LLM client and an evaluator into the GEPA framework
    by implementing :meth:`evaluate` and :meth:`make_reflective_dataset`.
    """

    def __init__(self, llm_client: BaseLLMClient, evaluator: BaseEvaluator):
        """Store the LLM client and evaluator after type-checking them.

        Args:
            llm_client: LLM client for generating responses.
            evaluator: Evaluator for scoring predictions.

        Raises:
            TypeError: If either argument has the wrong type.
        """
        if not isinstance(llm_client, BaseLLMClient):
            raise TypeError("llm_client must be an instance of BaseLLMClient")
        if not isinstance(evaluator, BaseEvaluator):
            raise TypeError("evaluator must be an instance of BaseEvaluator")

        self.llm_client = llm_client
        self.evaluator = evaluator
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")

        # Bookkeeping surfaced through get_performance_stats().
        self._evaluation_count = 0
        self._best_score = 0.0
        self._best_candidate = None

    @abstractmethod
    def evaluate(self, batch: List[Dict[str, Any]], candidate: Dict[str, str],
                 capture_traces: bool = False) -> EvaluationBatch:
        """Evaluate a prompt candidate against a batch of data items.

        Args:
            batch: Data items to evaluate.
            candidate: Prompt candidate under evaluation.
            capture_traces: When True, also record detailed trajectories.

        Returns:
            EvaluationBatch with outputs, scores, and optional trajectories.
        """
        pass

    @abstractmethod
    def make_reflective_dataset(self, candidate: Dict[str, str],
                                eval_batch: EvaluationBatch,
                                components_to_update: List[str]) -> Dict[str, List[Dict[str, Any]]]:
        """Build the reflection dataset GEPA uses to improve the candidate.

        Args:
            candidate: Current prompt candidate.
            eval_batch: Results produced by :meth:`evaluate`.
            components_to_update: Component names to build data for.

        Returns:
            Mapping from component name to a list of reflection records.
        """
        pass

    def get_performance_stats(self) -> Dict[str, Any]:
        """Return adapter statistics useful for monitoring."""
        return {
            'evaluation_count': self._evaluation_count,
            'best_score': self._best_score,
            'model_info': self.llm_client.get_model_info(),
            'evaluator_class': self.evaluator.__class__.__name__
        }
+ }
src/gepa_optimizer/core/custom_adapter.py ADDED
@@ -0,0 +1,389 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Custom GEPA Adapter for the GEPA Universal Prompt Optimizer
3
+ """
4
+
5
+ import json
6
+ import logging
7
+ import re
8
+ from typing import Any, Dict, List, Optional
9
+
10
+ # Import ModelConfig
11
+ from ..models import ModelConfig
12
+
13
+ from gepa.core.adapter import GEPAAdapter, EvaluationBatch
14
+ from ..llms.vision_llm import VisionLLMClient
15
+ from ..evaluation.ui_evaluator import UITreeEvaluator
16
+ from .base_adapter import BaseGepaAdapter
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
class CustomGepaAdapter(BaseGepaAdapter):
    """
    Custom adapter for the GEPA Universal Prompt Optimizer.

    Specialized for UI-tree extraction: a vision LLM turns screenshots into
    JSON UI trees, which a UITreeEvaluator scores against ground truth.
    """

    def __init__(self, model_config: 'ModelConfig', metric_weights: Optional[Dict[str, float]] = None):
        """Initialize the custom GEPA adapter with model configuration.

        Args:
            model_config: Model configuration. A bare string is coerced to an
                OpenAI ModelConfig for backward compatibility.
            metric_weights: Optional per-metric weights for the evaluator.
        """
        # Backward compatibility: accept a plain model-name string.
        if not isinstance(model_config, ModelConfig):
            model_config = ModelConfig(
                provider='openai',
                model_name=str(model_config),
                api_key=None
            )

        llm_client = VisionLLMClient(
            provider=model_config.provider,
            model_name=model_config.model_name,
            api_key=model_config.api_key,
            base_url=model_config.base_url,
            temperature=model_config.temperature,
            max_tokens=model_config.max_tokens,
            top_p=model_config.top_p,
            frequency_penalty=model_config.frequency_penalty,
            presence_penalty=model_config.presence_penalty
        )

        evaluator = UITreeEvaluator(metric_weights=metric_weights)

        super().__init__(llm_client, evaluator)

        # Track the last-seen prompt so each new candidate is logged once.
        self._last_candidate = None
        self._evaluation_count = 0

        self.logger.info(f"🚀 Initialized UI Tree adapter with {model_config.provider}/{model_config.model_name}")

    def _parse_json_safely(self, json_str: str) -> Dict[str, Any]:
        """Parse a JSON string into a dict, falling back through several
        recovery strategies; returns {} if nothing works."""
        if not json_str or not isinstance(json_str, str):
            return {}

        # 1) Direct parse.
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            pass

        # 2) JSON inside a markdown code fence.
        json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', json_str, re.DOTALL)
        if json_match:
            try:
                return json.loads(json_match.group(1))
            except json.JSONDecodeError:
                pass

        # 3) First {...} span anywhere in the string.
        json_match = re.search(r'\{.*\}', json_str, re.DOTALL)
        if json_match:
            try:
                return json.loads(json_match.group(0))
            except json.JSONDecodeError:
                pass

        # 4) Heuristic repair, then parse.
        repaired_json = self._repair_json(json_str)
        if repaired_json:
            try:
                return json.loads(repaired_json)
            except json.JSONDecodeError:
                pass

        self.logger.warning(f"Failed to parse JSON: {json_str[:100]}...")
        return {}

    def _repair_json(self, json_str: str) -> str:
        """Attempt to repair common JSON issues (markdown fences, trailing
        commas, unquoted keys); returns "" if repair itself fails."""
        try:
            # Strip markdown formatting.
            json_str = re.sub(r'```(?:json)?\s*', '', json_str)
            json_str = re.sub(r'```\s*$', '', json_str)

            # Keep only the outermost {...} span.
            json_match = re.search(r'\{.*\}', json_str, re.DOTALL)
            if json_match:
                json_str = json_match.group(0)

            # Fix common issues.
            json_str = re.sub(r',\s*}', '}', json_str)  # trailing commas in objects
            json_str = re.sub(r',\s*]', ']', json_str)  # trailing commas in arrays
            # NOTE: this can also touch braces/commas inside string values —
            # acceptable as a last-resort heuristic before giving up.
            json_str = re.sub(r'([{,]\s*)(\w+):', r'\1"\2":', json_str)  # quote unquoted keys

            return json_str
        except Exception as e:
            self.logger.warning(f"🔧 JSON repair failed: {e}")
            return ""

    def evaluate(
        self,
        batch: List[Dict[str, Any]],
        candidate: Dict[str, str],
        capture_traces: bool = False,
    ) -> EvaluationBatch:
        """Evaluate the candidate prompt on a batch of (text, image, ground
        truth) samples and score each prediction with the UI-tree evaluator.

        Args:
            batch: Items with 'input', 'image' (base64) and 'output' keys.
            candidate: Candidate with a 'system_prompt' entry.
            capture_traces: When True, collect per-sample trajectories for
                later reflection.

        Returns:
            EvaluationBatch with raw outputs, composite scores, and optional
            trajectories.
        """
        outputs = []
        scores = []
        trajectories = [] if capture_traces else None

        system_prompt = candidate.get('system_prompt', '')

        # Only log a candidate the first time we see it.
        if self._last_candidate != system_prompt:
            self._evaluation_count += 1
            self.log_proposed_candidate(candidate, self._evaluation_count)
            self._last_candidate = system_prompt

        self.logger.info(f"📊 Evaluating {len(batch)} samples with prompt: '{system_prompt[:50]}...'")

        for i, item in enumerate(batch):
            input_text = item.get('input', '')
            image_base64 = item.get('image', '')
            ground_truth_json = item.get('output', '')

            llm_response = self.llm_client.generate(system_prompt, input_text, image_base64=image_base64)

            # The client may return either a dict with a 'content' key or a
            # raw string; normalize to a string.
            if isinstance(llm_response, dict):
                llm_output_json_str = llm_response.get("content", "")
                if not llm_output_json_str:
                    llm_output_json_str = str(llm_response)
            else:
                llm_output_json_str = str(llm_response) if llm_response else ""

            self.logger.debug(f"🔍 Sample {i+1} - LLM Response Type: {type(llm_response)}")
            self.logger.debug(f"🔍 Sample {i+1} - Response Length: {len(llm_output_json_str)} chars")

            outputs.append(llm_output_json_str)

            llm_output_dict = self._parse_json_safely(llm_output_json_str)
            ground_truth_dict = self._parse_json_safely(ground_truth_json)

            # Default metric values used when parsing failed on either side.
            evaluation_results = {
                "composite_score": 0.0,
                "element_completeness": 0.0,
                "element_type_accuracy": 0.0,
                "text_content_accuracy": 0.0,
                "hierarchy_accuracy": 0.0,
                "style_accuracy": 0.0
            }

            # Score: small non-zero floors keep GEPA's search from flat-lining
            # when parsing fails entirely.
            if not llm_output_dict and not ground_truth_dict:
                composite_score = 0.1
                evaluation_results = {k: 0.1 for k in evaluation_results.keys()}
                self.logger.warning(f"⚠️ Sample {i+1}: Empty results - using default score: {composite_score}")
            elif not llm_output_dict or not ground_truth_dict:
                composite_score = 0.05
                evaluation_results = {k: 0.05 for k in evaluation_results.keys()}
                self.logger.warning(f"⚠️ Sample {i+1}: Incomplete results - using low score: {composite_score}")
            else:
                evaluation_results = self.evaluator.evaluate(llm_output_dict, ground_truth_dict)
                composite_score = evaluation_results["composite_score"]

                llm_children = len(llm_output_dict.get('children', []))
                gt_children = len(ground_truth_dict.get('children', []))

                if composite_score < 0.1:
                    self.logger.warning(f"⚠️ Sample {i+1}: Low score {composite_score:.4f} - LLM: {llm_children} elements, GT: {gt_children} elements")
                    self.logger.debug(f"   Score breakdown: {evaluation_results}")
                else:
                    self.logger.info(f"✅ Sample {i+1}: Score {composite_score:.4f} - LLM: {llm_children} elements, GT: {gt_children} elements")

            scores.append(composite_score)

            if capture_traces:
                trajectories.append({
                    'input_text': input_text,
                    'image_base64': image_base64,
                    'ground_truth_json': ground_truth_json,
                    'llm_output_json': llm_output_json_str,
                    'evaluation_results': evaluation_results
                })

        avg_score = sum(scores) / len(scores) if scores else 0.0

        # Remember the best candidate seen so far.
        if avg_score > self._best_score:
            self._best_score = avg_score
            self._best_candidate = candidate.copy()
            self.logger.info(f"🎯 New best candidate found with score: {avg_score:.4f}")

        self.logger.info(f"📈 Batch evaluation complete - Average score: {avg_score:.4f}")

        return EvaluationBatch(outputs=outputs, scores=scores, trajectories=trajectories)

    def make_reflective_dataset(
        self,
        candidate: Dict[str, str],
        eval_batch: EvaluationBatch,
        components_to_update: List[str],
    ) -> Dict[str, List[Dict[str, Any]]]:
        """Build the per-component reflection dataset GEPA uses to propose
        improved prompts.

        Args:
            candidate: Current prompt candidate.
            eval_batch: Evaluation results (should have been produced with
                capture_traces=True so trajectories are available).
            components_to_update: Component names to build data for.

        Returns:
            Mapping from component name to a list of reflection records.
        """
        reflective_dataset = {}
        system_prompt = candidate.get('system_prompt', '')

        self.logger.info(f"📝 Creating reflection dataset for prompt: '{system_prompt[:100]}...'")

        self._log_reflection_dataset_creation(candidate, eval_batch, components_to_update)

        # Bug fix: trajectories is None when evaluate() ran with
        # capture_traces=False — guard instead of crashing on iteration.
        trajectories = eval_batch.trajectories or []

        for component in components_to_update:
            reflective_dataset[component] = []
            for trace in trajectories:
                feedback = self._generate_feedback(trace['evaluation_results'])
                reflective_dataset[component].append({
                    "current_prompt": system_prompt,
                    "input_text": trace['input_text'],
                    "image_base64": trace['image_base64'],
                    "generated_json": trace['llm_output_json'],
                    "ground_truth_json": trace['ground_truth_json'],
                    "score": trace['evaluation_results']["composite_score"],
                    "feedback": feedback,
                    "detailed_scores": trace['evaluation_results']
                })

        total_samples = sum(len(data) for data in reflective_dataset.values())
        avg_score = sum(trace['score'] for data in reflective_dataset.values() for trace in data) / total_samples if total_samples > 0 else 0.0
        self.logger.info(f"📝 Reflection dataset created - {total_samples} samples, avg score: {avg_score:.4f}")

        return reflective_dataset

    def _generate_feedback(self, evaluation_results: Dict[str, float]) -> str:
        """Turn numeric evaluation metrics into natural-language feedback for
        the reflection model."""
        composite_score = evaluation_results.get("composite_score", 0.0)

        feedback_parts = []

        # Overall quality assessment.
        if composite_score >= 0.8:
            feedback_parts.append("The overall quality is good.")
        elif composite_score >= 0.5:
            feedback_parts.append("The overall quality is moderate.")
        else:
            feedback_parts.append("The overall quality is low. Focus on fundamental accuracy.")

        # Metric-specific hints (0.7 threshold per metric).
        if evaluation_results.get("element_completeness", 0.0) < 0.7:
            feedback_parts.append("Element completeness is low. Ensure all UI elements are captured.")

        if evaluation_results.get("element_type_accuracy", 0.0) < 0.7:
            feedback_parts.append("Element type accuracy is low. Verify correct UI element identification (Button, Text, Image, etc.).")

        if evaluation_results.get("text_content_accuracy", 0.0) < 0.7:
            feedback_parts.append("Text content accuracy is low. Improve text extraction fidelity.")

        if evaluation_results.get("hierarchy_accuracy", 0.0) < 0.7:
            feedback_parts.append("Hierarchy accuracy is low. Ensure correct parent-child relationships.")

        if evaluation_results.get("style_accuracy", 0.0) < 0.7:
            feedback_parts.append("Style accuracy is low. Capture more styling properties (colors, sizes, positioning).")

        return " ".join(feedback_parts)

    def get_best_candidate(self) -> Optional[Dict[str, str]]:
        """Get the best candidate found so far (or None)."""
        return self._best_candidate

    def get_best_score(self) -> float:
        """Get the best average score found so far."""
        return self._best_score

    def log_proposed_candidate(self, candidate: Dict[str, str], iteration: int = 0):
        """
        Log a newly proposed candidate prompt in a banner format.

        Args:
            candidate: The new candidate prompt from GEPA.
            iteration: Current optimization iteration.
        """
        system_prompt = candidate.get('system_prompt', '')

        logger.info("="*80)
        logger.info(f"NEW PROPOSED CANDIDATE (Iteration {iteration})")
        logger.info("="*80)
        logger.info(f"PROPOSED PROMPT:")
        logger.info("-" * 40)
        logger.debug(f'"{system_prompt}"')
        logger.info("-" * 40)
        logger.info(f"Prompt Length: {len(system_prompt)} characters")
        logger.info(f"Word Count: {len(system_prompt.split())} words")
        logger.info("="*80)

    def _log_reflection_dataset_creation(self, candidate: Dict[str, str], eval_batch: EvaluationBatch,
                                         components_to_update: List[str]):
        """
        Log a summary of the reflection dataset creation process.

        Args:
            candidate: Current candidate being evaluated.
            eval_batch: Evaluation results.
            components_to_update: Components being updated.
        """
        system_prompt = candidate.get('system_prompt', '')

        logger.info("="*80)
        logger.info("REFLECTION DATASET CREATION")
        logger.info("="*80)

        logger.info(f"CURRENT PROMPT BEING ANALYZED:")
        logger.info("-" * 40)
        logger.debug(f'"{system_prompt}"')
        logger.info("-" * 40)

        logger.info(f"EVALUATION SUMMARY:")
        logger.info("-" * 40)
        if eval_batch.scores:
            avg_score = sum(eval_batch.scores) / len(eval_batch.scores)
            min_score = min(eval_batch.scores)
            max_score = max(eval_batch.scores)
            logger.info(f"  Average Score: {avg_score:.4f}")
            logger.info(f"  Min Score: {min_score:.4f}")
            logger.info(f"  Max Score: {max_score:.4f}")
            logger.info(f"  Total Samples: {len(eval_batch.scores)}")

        logger.info(f"COMPONENTS TO UPDATE:")
        logger.info("-" * 40)
        for i, component in enumerate(components_to_update, 1):
            logger.info(f"  {i}. {component}")

        if eval_batch.trajectories:
            logger.debug(f"DETAILED ANALYSIS:")
            logger.debug("-" * 40)
            # Only the first 3 samples are detailed to keep logs readable.
            for i, trace in enumerate(eval_batch.trajectories[:3], 1):
                evaluation_results = trace['evaluation_results']
                composite_score = evaluation_results.get("composite_score", 0.0)

                logger.debug(f"  Sample {i} (Score: {composite_score:.4f}):")

                input_text = trace['input_text'][:100] + "..." if len(trace['input_text']) > 100 else trace['input_text']
                logger.debug(f"    Input: \"{input_text}\"")

                predicted_output = trace['llm_output_json'][:100] + "..." if len(trace['llm_output_json']) > 100 else trace['llm_output_json']
                logger.debug(f"    Output: \"{predicted_output}\"")

                logger.debug(f"    Detailed Scores:")
                for metric, score in evaluation_results.items():
                    if metric != "composite_score":
                        logger.debug(f"      {metric.replace('_', ' ').title()}: {score:.4f}")

                feedback = self._generate_feedback(evaluation_results)
                logger.debug(f"    Feedback: \"{feedback}\"")

            if len(eval_batch.trajectories) > 3:
                logger.debug(f"  ... and {len(eval_batch.trajectories) - 3} more samples")

        logger.info("="*80)
src/gepa_optimizer/core/optimizer.py ADDED
@@ -0,0 +1,1279 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Main GepaOptimizer class - the heart of the optimization system
3
+ """
4
+
5
+ import time
6
+ import logging
7
+ from typing import Any, Dict, List, Optional, Union
8
+ import asyncio
9
+ import io
10
+ import sys
11
+ from contextlib import redirect_stdout, redirect_stderr
12
+
13
+ import gepa
14
+ from ..utils.api_keys import APIKeyManager
15
+ from .result import ResultProcessor
16
+ from ..data.converters import UniversalConverter
17
+ from ..models.result import OptimizationResult, OptimizedResult
18
+ from ..models.config import OptimizationConfig, ModelConfig
19
+ from ..utils.helpers import sanitize_prompt
20
+ from ..utils.exceptions import GepaDependencyError, InvalidInputError, DatasetError, GepaOptimizerError
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+ class GepaOptimizer:
25
+ """
26
+ Main class for prompt optimization using GEPA
27
+
28
+ This is the primary interface that users interact with.
29
+ Provides both simple and advanced optimization capabilities.
30
+ """
31
+
32
+ def __init__(self, config: Optional[OptimizationConfig] = None,
33
+ adapter_type: str = "universal",
34
+ custom_adapter: Optional[Any] = None,
35
+ llm_model_name: Optional[str] = None,
36
+ metric_weights: Optional[Dict[str, float]] = None,
37
+ **kwargs):
38
+ """
39
+ Initialize the optimizer
40
+
41
+ Args:
42
+ config: Optimization configuration (required)
43
+ adapter_type: Type of adapter to use ("universal" only - fully configurable)
44
+ custom_adapter: Custom adapter instance (overrides adapter_type)
45
+ llm_model_name: [Deprecated] Use config.model instead. Will be removed in future versions.
46
+ metric_weights: [Deprecated] Not used - evaluator handles metrics. Will be removed in future versions.
47
+ **kwargs: Additional parameters for universal adapter (llm_client, evaluator, etc.)
48
+
49
+ Raises:
50
+ ValueError: If required configuration is missing
51
+ GepaDependencyError: If GEPA library is not available
52
+ """
53
+ if config is None:
54
+ raise ValueError("config parameter is required. Use OptimizationConfig to configure the optimizer.")
55
+
56
+ # Initialize logger first
57
+ self.logger = logging.getLogger(__name__)
58
+
59
+ self.config = config
60
+ self.converter = UniversalConverter(data_split_config=config.data_split)
61
+ self.api_manager = APIKeyManager()
62
+ self.result_processor = ResultProcessor()
63
+
64
+ # Initialize adapter based on configuration
65
+ if custom_adapter:
66
+ # User provided custom adapter
67
+ from .base_adapter import BaseGepaAdapter
68
+ if not isinstance(custom_adapter, BaseGepaAdapter):
69
+ raise TypeError("custom_adapter must be an instance of BaseGepaAdapter")
70
+ self.adapter = custom_adapter
71
+ self.logger.info("Using user-provided custom adapter")
72
+ elif adapter_type == "universal":
73
+ # Universal adapter requires user to provide components
74
+ llm_client = kwargs.get('llm_client')
75
+ evaluator = kwargs.get('evaluator')
76
+
77
+ if not llm_client or not evaluator:
78
+ raise ValueError(
79
+ "llm_client and evaluator are required for universal adapter. "
80
+ "Example: GepaOptimizer(config=config, adapter_type='universal', "
81
+ "llm_client=llm_client, evaluator=evaluator)"
82
+ )
83
+
84
+ from .universal_adapter import UniversalGepaAdapter
85
+ self.adapter = UniversalGepaAdapter(
86
+ llm_client=llm_client,
87
+ evaluator=evaluator,
88
+ data_converter=kwargs.get('data_converter')
89
+ )
90
+ self.logger.info("Using universal adapter")
91
+ else:
92
+ raise ValueError(
93
+ f"Unknown adapter_type: {adapter_type}. "
94
+ f"Only 'universal' is supported. "
95
+ f"Provide llm_client and evaluator when using universal adapter."
96
+ )
97
+
98
+ # Keep backward compatibility
99
+ self.custom_adapter = self.adapter
100
+
101
+ # Log model configuration
102
+ model_info = self.adapter.get_performance_stats()
103
+ self.logger.info(f"Initialized adapter: {model_info}")
104
+
105
+ # Set up logging
106
+ logging.basicConfig(
107
+ level=logging.INFO,
108
+ format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
109
+ )
110
+
111
+ # Validate GEPA availability
112
+ if gepa is None:
113
+ raise GepaDependencyError("GEPA library is not available. Please install it with: pip install gepa")
114
+
115
async def train(self,
                seed_prompt: str,
                dataset: Union[List[Any], str, Dict, Any],
                **kwargs) -> OptimizedResult:
    """
    Main training method for prompt optimization.

    Pipeline:
        1. Validate the seed prompt and model configuration.
        2. Convert the dataset to GEPA format (honoring a user-provided
           train/val/test pre-split when given).
        3. Score the seed prompt on the validation set as the baseline.
        4. Run GEPA optimization and extract the best candidate.
        5. Compute validation-set improvement and (optionally) a held-out
           test-set score.
        6. Package everything into an OptimizedResult.

    Args:
        seed_prompt: Initial prompt to optimize.
        dataset: Training data in any supported format. A dict containing
            'train', 'val' and 'test' keys is treated as a pre-split
            dataset and used as-is (no re-splitting).
        **kwargs: Additional parameters that can override config values.

    Returns:
        OptimizedResult: Optimization result with the improved prompt.
        Failures are not propagated to the caller: a result with
        status='failed' and the original prompt is returned instead.
    """
    start_time = time.time()
    session_id = f"opt_{int(start_time)}_{id(self)}"

    try:
        self.logger.info(f"Starting optimization session: {session_id}")
        self.logger.info(f"Using model: {self.config.model.model_name} (provider: {self.config.model.provider})")

        # NOTE(fix): removed leftover agent-debug instrumentation that appended
        # JSON to a hard-coded developer-local absolute path; because it ran
        # inside this try-block, it made every optimization fail with
        # FileNotFoundError on any machine other than the original developer's.

        # 🔥 FIX E: Reset Pareto logger at start of each optimization run
        from ..utils.pareto_logger import reset_pareto_logger
        reset_pareto_logger()
        self.logger.info("✅ Reset Pareto logger for new optimization run")

        # Update config with any overrides from kwargs
        self._update_config_from_kwargs(kwargs)

        # Step 1: Validate inputs
        self._validate_inputs(seed_prompt)

        # Step 2: Convert dataset to GEPA format with 3-way split
        # 🔥 FIX: Support pre-split datasets (user-provided train/val/test)
        if isinstance(dataset, dict) and all(k in dataset for k in ['train', 'val', 'test']):
            # User provided pre-split dataset - use it directly
            self.logger.info("✅ Detected pre-split dataset - using user's split (no re-splitting)")
            trainset_raw = dataset.get('train', [])
            valset_raw = dataset.get('val', [])
            testset_raw = dataset.get('test', [])

            # Still need to standardize the format (convert to GEPA format)
            trainset = self.converter._standardize(trainset_raw)
            valset = self.converter._standardize(valset_raw)
            testset = self.converter._standardize(testset_raw) if testset_raw else []

            self.logger.info(
                f"Using pre-split dataset: {len(trainset)} train (Dfeedback), "
                f"{len(valset)} val (Dpareto), {len(testset)} test (held-out)"
            )
        else:
            # Standard path: convert and split automatically
            self.logger.info("Converting dataset to GEPA format with 3-way split...")
            trainset, valset, testset = self.converter.convert(
                dataset,
                split_config=self.config.data_split
            )

            # Log split with adaptive strategy info
            split_strategy = self.config.data_split.small_dataset_strategy
            strategy_note = ""
            if split_strategy == 'adaptive':
                total_size = len(trainset) + len(valset) + len(testset)
                train_ratio, val_ratio, test_ratio = self.config.data_split.get_adaptive_ratios(total_size)
                strategy_note = f" (adaptive: {train_ratio*100:.0f}%/{val_ratio*100:.0f}%/{test_ratio*100:.0f}% ratios)"
            self.logger.info(
                f"Dataset split{strategy_note}: {len(trainset)} train (Dfeedback), "
                f"{len(valset)} val (Dpareto), {len(testset)} test (held-out)"
            )

        if not trainset:
            raise DatasetError("Dataset appears to be empty after conversion")

        # Step 3: Create seed candidate
        seed_candidate = self._create_seed_candidate(seed_prompt)

        # 🔥 CRITICAL: Set valset info in adapter BEFORE baseline evaluation
        # so the adapter correctly detects the 'dpareto' dataset type.
        # Direct assignment (not hasattr-guarded) to guarantee the attributes
        # exist; AttributeError covers adapters with __slots__/properties.
        try:
            self.adapter._valset_size = len(valset) if valset else 0
            self.logger.info(f"✅ Set valset_size in adapter: {len(valset) if valset else 0} for Dpareto detection")
        except AttributeError:
            self.logger.warning("⚠️ Could not set _valset_size in adapter - attribute not supported")

        try:
            self.adapter._valset = valset
            self.logger.info(f"✅ Stored valset in adapter ({len(valset) if valset else 0} samples)")
        except AttributeError:
            self.logger.warning("⚠️ Could not set _valset in adapter - attribute not supported")

        # Step 3.5: Calculate baseline score on VALIDATION set (not test set).
        # Optimization uses the validation set for Pareto selection, so the
        # baseline must come from the same set for a fair comparison.
        baseline_val_score = None
        if valset:
            self.logger.info("📊 Evaluating seed prompt on validation set for baseline...")
            # Flag the adapter so it knows this is baseline, not optimization
            try:
                self.adapter._is_baseline_evaluation = True
                self.logger.info("✅ Set baseline evaluation flag in adapter")
            except AttributeError:
                self.logger.warning("⚠️ Could not set _is_baseline_evaluation in adapter")

            try:
                # Evaluate on the same set GEPA will use for Pareto selection
                eval_result = self.adapter.evaluate(
                    batch=valset,
                    candidate=seed_candidate,
                    capture_traces=False
                )
                baseline_val_score = sum(eval_result.scores) / len(eval_result.scores) if eval_result.scores else 0.0
                self.logger.info(f"📊 Baseline validation score: {baseline_val_score:.4f} (on {len(valset)} samples)")

                # Store baseline in adapter for later use
                if hasattr(self.adapter, '_baseline_score'):
                    self.adapter._baseline_score = baseline_val_score

                # 🔥 CRITICAL FIX: Also set baseline in the Pareto logger so
                # candidates can be properly evaluated against the baseline.
                from ..utils.pareto_logger import get_pareto_logger
                pareto_log = get_pareto_logger()
                pareto_log.set_baseline(baseline_val_score)
                self.logger.info(f"✅ Baseline set in Pareto logger: {baseline_val_score:.4f}")

            except Exception as e:
                # Baseline is best-effort: a failed baseline must not abort
                # the optimization itself.
                self.logger.warning(f"Baseline evaluation failed: {e}")
                import traceback
                self.logger.debug(f"Baseline evaluation error: {traceback.format_exc()}")
            finally:
                try:
                    self.adapter._is_baseline_evaluation = False
                    self.logger.debug("✅ Reset baseline evaluation flag - optimization can begin")
                except AttributeError:
                    pass  # Ignore if attribute not supported

        # Step 4: Run GEPA optimization
        self.logger.info("Starting GEPA optimization...")
        gepa_result, actual_iterations = await self._run_gepa_optimization(
            adapter=self.adapter,
            seed_candidate=seed_candidate,
            trainset=trainset,
            valset=valset,
            **kwargs
        )

        # Step 5: Extract best candidate
        best_candidate = self._extract_best_candidate(gepa_result)

        # 🔥 CRITICAL: Extract the actual optimized prompt GEPA found
        self.logger.info(f"\n{'═'*80}")
        self.logger.info(f"📝 EXTRACTING OPTIMIZED PROMPT FROM GEPA RESULT")
        self.logger.info(f"{'═'*80}")
        self.logger.info(f"best_candidate keys: {list(best_candidate.keys()) if isinstance(best_candidate, dict) else 'N/A'}")

        optimized_prompt = best_candidate.get('system_prompt', seed_prompt)
        if not optimized_prompt or optimized_prompt.strip() == '':
            # Fallback: try other keys or use seed prompt
            optimized_prompt = best_candidate.get('prompt', best_candidate.get('text', seed_prompt))

        # 🔥 FIX(precedence): the previous one-liner
        #   best_candidate.get('fitness') or self.adapter.get_best_score() if hasattr(...) else None
        # parsed as "(fitness or get_best_score()) if hasattr(...) else None",
        # so the candidate's fitness was discarded whenever the adapter lacked
        # get_best_score(), and 'or' dropped a legitimate fitness of 0.0.
        best_fitness = best_candidate.get('fitness')
        if best_fitness is None and hasattr(self.adapter, 'get_best_score'):
            best_fitness = self.adapter.get_best_score()
        candidate_source = best_candidate.get('source', 'unknown')

        self.logger.info(f"\n✅ EXTRACTED OPTIMIZED PROMPT:")
        self.logger.info(f"   Source: {candidate_source}")
        if best_fitness is not None:
            self.logger.info(f"   Fitness: f={best_fitness:.4f}")
        self.logger.info(f"   Length: {len(optimized_prompt)} characters")
        self.logger.info(f"   Words: {len(optimized_prompt.split())} words")
        self.logger.info(f"\n📝 FULL OPTIMIZED PROMPT TEXT:")
        self.logger.info(f"{'─'*80}")
        self.logger.info(optimized_prompt)
        self.logger.info(f"{'─'*80}")

        if optimized_prompt != seed_prompt:
            self.logger.info(f"\n✅ SUCCESS: Prompt WAS OPTIMIZED!")
            self.logger.info(f"   Seed length: {len(seed_prompt)} chars")
            self.logger.info(f"   Optimized length: {len(optimized_prompt)} chars")
            self.logger.info(f"   Difference: {len(optimized_prompt) - len(seed_prompt):+d} chars")
            if best_fitness is not None:
                # Default assumed baseline for this log line only; the real
                # improvement metric below uses the measured baseline_val_score.
                baseline_fitness = 0.5
                improvement = best_fitness - baseline_fitness
                improvement_pct = (improvement / baseline_fitness * 100) if baseline_fitness > 0 else 0
                self.logger.info(f"   Fitness: f={best_fitness:.4f} (improvement: {improvement:+.4f} ({improvement_pct:+.1f}%))")
        else:
            self.logger.warning(f"\n⚠️ WARNING: Optimized prompt is IDENTICAL to seed prompt")
            self.logger.warning(f"   This means GEPA didn't modify the prompt during optimization")
            if best_fitness is not None:
                self.logger.warning(f"   Best fitness found: f={best_fitness:.4f}")
                self.logger.warning(f"   💡 Check if LLEGO best candidate is being properly extracted")

        self.logger.info(f"{'═'*80}\n")

        # Step 5.5: Calculate improvement metrics (validation vs validation).
        # Both scores come from the same validation set (Dpareto), which keeps
        # the comparison fair.
        optimized_test_score = None
        improvement_data = {}

        optimized_val_score = best_fitness  # best candidate's fitness is from the validation set (Dpareto)

        if baseline_val_score is not None and optimized_val_score is not None:
            absolute_improvement = optimized_val_score - baseline_val_score
            relative_improvement = (
                (absolute_improvement / baseline_val_score * 100)
                if baseline_val_score > 0 else 0
            )

            improvement_data = {
                'baseline_val_score': baseline_val_score,
                'optimized_val_score': optimized_val_score,
                'absolute_improvement': absolute_improvement,
                'relative_improvement_percent': relative_improvement
            }

            self.logger.info(
                f"📈 Validation improvement: {relative_improvement:+.2f}% "
                f"(baseline val: {baseline_val_score:.4f} → optimized val: {optimized_val_score:.4f})"
            )

        # Step 5.6: Evaluate optimized prompt on test set (if available) for final reporting
        if testset and self.config.evaluate_on_test:
            self.logger.info("📊 Evaluating optimized prompt on test set...")

            # 🔥 CRITICAL FIX: Clear LLEGO candidate queues before test
            # evaluation so the wrapper does not intercept test calls and
            # return stale candidates instead of running the optimized prompt.
            from ..llms.llego_enhanced_llm import LLEGOEnhancedLLMClient
            if hasattr(self.adapter, 'llm_client') and isinstance(self.adapter.llm_client, LLEGOEnhancedLLMClient):
                if hasattr(self.adapter.llm_client, '_adapter_generated_candidates'):
                    self.adapter.llm_client._adapter_generated_candidates = []
                    self.logger.info("✅ Cleared LLEGO candidate queue for clean test evaluation")
                if hasattr(self.adapter.llm_client, '_candidate_queue'):
                    self.adapter.llm_client._candidate_queue = []
                    self.logger.info("✅ Cleared LLEGO hybrid candidate queue for clean test evaluation")

            # Evaluate on test set for final reporting (improvement stays validation-based)
            try:
                optimized_test_score = self._evaluate_candidate_on_testset(
                    best_candidate,
                    testset
                )
                self.logger.info(f"📊 Optimized test score: {optimized_test_score:.4f}")

                # Recorded for reference only; improvement is validation-based
                improvement_data['optimized_test_score'] = optimized_test_score

                if baseline_val_score is not None:
                    test_vs_baseline = (
                        ((optimized_test_score - baseline_val_score) / baseline_val_score * 100)
                        if baseline_val_score > 0 else 0
                    )
                    self.logger.info(
                        f"📊 Test set vs validation baseline: {test_vs_baseline:+.2f}% "
                        f"(baseline val: {baseline_val_score:.4f} → optimized test: {optimized_test_score:.4f})"
                    )
            except Exception as e:
                # Test-set scoring is reporting-only; never fail the run for it
                self.logger.warning(f"Test evaluation failed: {e}")

        # Step 6: Process results
        optimization_time = time.time() - start_time

        processed_result = self.result_processor.process_full_result(
            result=gepa_result,
            original_prompt=seed_prompt,
            optimization_time=optimization_time,
            actual_iterations=actual_iterations,
            test_metrics=improvement_data  # Add test metrics
        )

        # Merge improvement data (values computed here win over processed ones)
        final_improvement_data = {**processed_result.get('improvement_data', {}), **improvement_data}

        # Step 7: Create result objects
        # 🔥 CRITICAL: Use extracted optimized_prompt instead of processed_result
        result = OptimizedResult(
            original_prompt=seed_prompt,
            optimized_prompt=optimized_prompt,  # Use extracted prompt, not processed_result!
            improvement_data=final_improvement_data,
            optimization_time=optimization_time,
            dataset_size=len(trainset) + len(valset) + len(testset),
            total_iterations=processed_result.get('total_iterations', 0),
            status=processed_result.get('status', 'completed'),
            error_message=processed_result.get('error_message'),
            detailed_result=OptimizationResult(
                session_id=session_id,
                original_prompt=seed_prompt,
                optimized_prompt=optimized_prompt,  # Use extracted prompt!
                improvement_data=final_improvement_data,
                optimization_time=optimization_time,
                dataset_size=len(trainset) + len(valset) + len(testset),
                total_iterations=processed_result.get('total_iterations', 0),
                status=processed_result.get('status', 'completed'),
                error_message=processed_result.get('error_message')
            )
        )

        self.logger.info(f"✅ Optimization completed in {optimization_time:.2f}s")
        return result

    except Exception as e:
        optimization_time = time.time() - start_time
        error_msg = f"Optimization failed: {str(e)}"
        # exc_info=True keeps the full traceback in the logs for debugging
        self.logger.error(error_msg, exc_info=True)

        # Return failed result carrying the original prompt
        return OptimizedResult(
            original_prompt=seed_prompt,
            optimized_prompt=seed_prompt,  # Return original on failure
            improvement_data={'error': error_msg},
            optimization_time=optimization_time,
            dataset_size=0,
            total_iterations=0,
            status='failed',
            error_message=error_msg
        )
+ def _update_config_from_kwargs(self, kwargs: Dict[str, Any]) -> None:
449
+ """Update configuration with runtime overrides from kwargs."""
450
+ updated_params = []
451
+
452
+ for key, value in kwargs.items():
453
+ if hasattr(self.config, key):
454
+ setattr(self.config, key, value)
455
+ updated_params.append(f"{key}={value}")
456
+ else:
457
+ self.logger.warning(f"Unknown parameter '{key}' ignored")
458
+
459
+ if updated_params:
460
+ self.logger.info(f"Updated config parameters: {', '.join(updated_params)}")
461
+
462
+ def _validate_inputs(self, seed_prompt: str) -> None:
463
+ """
464
+ Validate input parameters for optimization
465
+
466
+ Args:
467
+ seed_prompt: The seed prompt to validate
468
+
469
+ Raises:
470
+ InvalidInputError: If validation fails
471
+ """
472
+ if not seed_prompt or not isinstance(seed_prompt, str):
473
+ raise InvalidInputError("Seed prompt must be a non-empty string")
474
+
475
+ if len(seed_prompt.strip()) < 10:
476
+ raise InvalidInputError("Seed prompt is too short (minimum 10 characters)")
477
+
478
+ # Validate model configuration
479
+ model_config = self.config.model
480
+ if not hasattr(model_config, 'model_name') or not model_config.model_name:
481
+ raise InvalidInputError("Model name is required")
482
+
483
+ reflection_config = self.config.reflection_model
484
+ if not hasattr(reflection_config, 'model_name') or not reflection_config.model_name:
485
+ raise InvalidInputError("Reflection model name is required")
486
+
487
+ def _clean_reflection_prompt(self, prompt: str, max_length: int = 50000) -> str:
488
+ """
489
+ Clean reflection prompt by removing base64 images and truncating if too long.
490
+
491
+ 🔥 CRITICAL: GEPA's reflective dataset includes base64 images which create
492
+ massive prompts (7MB+) that exceed token limits. This function:
493
+ 1. Strips all base64 image data
494
+ 2. Removes excessive detailed_scores entries
495
+ 3. Truncates to reasonable size
496
+ 4. Preserves essential feedback information
497
+
498
+ Args:
499
+ prompt: Original prompt from GEPA (may contain base64)
500
+ max_length: Maximum length after cleaning (default: 50K chars)
501
+
502
+ Returns:
503
+ Cleaned prompt without base64, within size limits
504
+ """
505
+ import re
506
+
507
+ # Step 1: Remove base64 image strings (typically very long alphanumeric strings)
508
+ # Base64 images are usually 50K+ characters of A-Za-z0-9+/= pattern
509
+ # Look for very long base64-like sequences
510
+ base64_pattern = r'[A-Za-z0-9+/=]{5000,}' # Sequences of 5000+ base64 chars
511
+ cleaned = re.sub(base64_pattern, '[IMAGE_DATA_REMOVED]', prompt)
512
+
513
+ # Step 2: Remove detailed_scores sections that might contain base64 references
514
+ # These are usually in markdown format: "### detailed_scores\n...base64..."
515
+ detailed_scores_pattern = r'### detailed_scores[^\n]*\n[^#]*(?:image_base64|base64)[^\n]*(?:\n[^#]*)*'
516
+ cleaned = re.sub(detailed_scores_pattern, '### detailed_scores: [REMOVED_FOR_BREVITY]', cleaned, flags=re.IGNORECASE | re.MULTILINE)
517
+
518
+ # Step 3: Remove any remaining image_base64 references
519
+ cleaned = re.sub(r'image_base64[^\n]*', 'image_base64: [REMOVED]', cleaned, flags=re.IGNORECASE)
520
+ cleaned = re.sub(r'"[A-Za-z0-9+/=]{10000,}"', '[LARGE_DATA_STRING_REMOVED]', cleaned) # Very long strings likely base64
521
+
522
+ # Step 4: Truncate if still too long (keep the beginning which usually has the most important info)
523
+ if len(cleaned) > max_length:
524
+ # Keep first part (usually contains prompt and key feedback)
525
+ # Add truncation notice
526
+ truncated_size = len(cleaned) - max_length
527
+ cleaned = cleaned[:max_length] + f"\n\n[TRUNCATED {truncated_size} characters of detailed evaluation data]"
528
+ self.logger.warning(f"⚠️ Prompt truncated: {len(prompt)} → {len(cleaned)} chars")
529
+
530
+ return cleaned
531
+
532
+ def _validate_models(self, task_lm, reflection_lm):
533
+ """
534
+ Validate if specified models are supported.
535
+
536
+ Note: No hardcoded restrictions - the API provider will validate model existence.
537
+ This method is kept for potential future validation logic but doesn't restrict users.
538
+ """
539
+ # No hardcoded model restrictions - users can specify any model
540
+ # The API provider will handle validation and return errors if model doesn't exist
541
+ self.logger.debug(f"Using task model: {task_lm}, reflection model: {reflection_lm}")
542
+
543
def _create_seed_candidate(self, seed_prompt: str) -> Dict[str, str]:
    """Build the initial GEPA candidate mapping from a sanitized seed prompt."""
    return {'system_prompt': sanitize_prompt(seed_prompt)}
+ async def _run_gepa_optimization(self, adapter, seed_candidate: Any, trainset: List[Any], valset: List[Any], **kwargs) -> tuple: # Return tuple
549
+ """
550
+ Run GEPA optimization with the given adapter and data
551
+
552
+ Args:
553
+ adapter: Custom adapter for GEPA
554
+ seed_candidate: Initial prompt candidate
555
+ trainset: Training dataset
556
+ valset: Validation dataset
557
+ **kwargs: Additional optimization parameters that can override config
558
+
559
+ Returns:
560
+ Dict with optimization results
561
+
562
+ Raises:
563
+ GepaOptimizerError: If optimization fails
564
+
565
+ Note:
566
+ The following parameters are required in the config:
567
+ - max_metric_calls: Maximum number of metric evaluations
568
+ - batch_size: Batch size for evaluation
569
+ - max_iterations: Maximum number of optimization iterations
570
+ """
571
+ try:
572
+ # Get optimization parameters from config (these are required fields)
573
+ max_metric_calls = self.config.max_metric_calls
574
+ batch_size = self.config.batch_size
575
+ max_iterations = self.config.max_iterations
576
+
577
+ # Create reflection model client
578
+ from ..llms.vision_llm import VisionLLMClient
579
+ base_reflection_lm_client = VisionLLMClient(
580
+ provider=self.config.reflection_model.provider,
581
+ model_name=self.config.reflection_model.model_name,
582
+ api_key=self.config.reflection_model.api_key,
583
+ base_url=self.config.reflection_model.base_url,
584
+ temperature=self.config.reflection_model.temperature,
585
+ max_tokens=self.config.reflection_model.max_tokens,
586
+ top_p=self.config.reflection_model.top_p,
587
+ frequency_penalty=self.config.reflection_model.frequency_penalty,
588
+ presence_penalty=self.config.reflection_model.presence_penalty
589
+ )
590
+ # reflection_lm_client will be set below (may be wrapped with LLEGO)
591
+ reflection_lm_client = base_reflection_lm_client
592
+
593
+ # 🆕 LLEGO Integration: Create enhanced reflection callable
594
+ if self.config.use_llego_operators:
595
+ self.logger.info("🧬 LLEGO genetic operators ENABLED")
596
+ self.logger.info(f" α={self.config.alpha}, τ={self.config.tau}, ν={self.config.nu}")
597
+ self.logger.info(f" Crossover offspring: {self.config.n_crossover}, Mutation offspring: {self.config.n_mutation}")
598
+
599
+ # Import LLEGO operators
600
+ from ..operators.llego_operators import LLEGOIntegrationLayer, PromptCandidate
601
+
602
+ # Initialize LLEGO integration layer
603
+ llego = LLEGOIntegrationLayer(
604
+ alpha=self.config.alpha,
605
+ tau=self.config.tau,
606
+ nu=self.config.nu,
607
+ population_size=self.config.population_size,
608
+ n_crossover=self.config.n_crossover,
609
+ n_mutation=self.config.n_mutation
610
+ )
611
+
612
+ # Initialize with seed prompt
613
+ llego.initialize_population(
614
+ seed_prompt=seed_candidate.get('system_prompt', ''),
615
+ initial_fitness=0.5
616
+ )
617
+
618
+ # 🔥 HYBRID MODE FIX: Wrap reflection_lm_client with LLEGO for hybrid mode
619
+ # This ensures reflection calls go through LLEGO wrapper for candidate generation
620
+ if self.config.enable_gepa_reflection_with_llego:
621
+ self.logger.info("🔥 HYBRID MODE: Wrapping reflection_lm_client with LLEGO")
622
+ from ..llms.llego_enhanced_llm import LLEGOEnhancedLLMClient
623
+
624
+ # Wrap reflection_lm_client with LLEGO so hybrid generation is triggered
625
+ reflection_lm_client = LLEGOEnhancedLLMClient(
626
+ base_llm=base_reflection_lm_client,
627
+ llego_layer=llego,
628
+ config=self.config, # Pass config for hybrid mode!
629
+ verbose=True
630
+ )
631
+ self.logger.info("✅ reflection_lm_client wrapped with LLEGO (hybrid mode enabled)")
632
+
633
+ # 🔥 CRITICAL: Store reflection_lm_client reference in adapter so it can set context
634
+ # This allows make_reflective_dataset to set reflection context on BOTH clients
635
+ if hasattr(adapter, 'reflection_lm_client'):
636
+ adapter.reflection_lm_client = reflection_lm_client
637
+ self.logger.info("✅ Stored reflection_lm_client reference in adapter")
638
+ else:
639
+ # Add reflection_lm_client attribute to adapter
640
+ adapter.reflection_lm_client = reflection_lm_client
641
+ self.logger.info("✅ Added reflection_lm_client attribute to adapter")
642
+
643
+ # 🔥 NEW: Also store config and reflection_lm_client for adapter-level generation
644
+ if hasattr(adapter, '_config'):
645
+ adapter._config = self.config
646
+ self.logger.info("✅ Stored config in adapter for hybrid mode")
647
+ else:
648
+ adapter._config = self.config
649
+ self.logger.info("✅ Added _config attribute to adapter")
650
+
651
+ if hasattr(adapter, '_reflection_lm_client'):
652
+ adapter._reflection_lm_client = reflection_lm_client
653
+ self.logger.info("✅ Stored _reflection_lm_client in adapter for hybrid mode")
654
+ else:
655
+ adapter._reflection_lm_client = reflection_lm_client
656
+ self.logger.info("✅ Added _reflection_lm_client attribute to adapter")
657
+
658
+ # 🔥 CRITICAL FIX: Ensure LLEGO layer is stored in adapter
659
+ # Without this, adapter.llego will be None and population updates are skipped!
660
+ if hasattr(adapter, 'llego'):
661
+ if adapter.llego is None:
662
+ adapter.llego = llego
663
+ self.logger.info("✅ CRITICAL: Set LLEGO layer in adapter (was None)")
664
+ else:
665
+ self.logger.debug("✅ LLEGO layer already set in adapter")
666
+ else:
667
+ # Add llego attribute if it doesn't exist
668
+ adapter.llego = llego
669
+ self.logger.info("✅ CRITICAL: Added LLEGO layer to adapter")
670
+
671
+ # 🔥 CRITICAL: Always set _reflection_lm_client in adapter (even without hybrid mode)
672
+ # This is required for propose_new_texts() to work
673
+ if not hasattr(adapter, '_reflection_lm_client') or adapter._reflection_lm_client is None:
674
+ adapter._reflection_lm_client = reflection_lm_client
675
+ self.logger.info("✅ Set _reflection_lm_client in adapter (required for propose_new_texts)")
676
+
677
+ # 🔥 HYBRID MODE FIX: Inject config into LLEGO wrapper for hybrid mode
678
+ # The adapter already has LLEGO wrapper, we just need to update its config
679
+ if self.config.enable_gepa_reflection_with_llego:
680
+ # HYBRID MODE: Update the LLEGO wrapper's config
681
+ self.logger.info("🔥 HYBRID MODE: Enabling hybrid candidate generation in LLEGO wrapper")
682
+
683
+ # Get the LLM client (may already be wrapped)
684
+ llm_client = self.adapter.llm_client
685
+ from ..llms.llego_enhanced_llm import LLEGOEnhancedLLMClient
686
+
687
+ if isinstance(llm_client, LLEGOEnhancedLLMClient):
688
+ # Already wrapped, just update config
689
+ llm_client.config = self.config
690
+ self.logger.info("✅ Updated LLEGO wrapper with hybrid mode config")
691
+ else:
692
+ # Not wrapped yet, wrap it now with config
693
+ llego_wrapped_llm = LLEGOEnhancedLLMClient(
694
+ base_llm=llm_client,
695
+ llego_layer=llego,
696
+ config=self.config, # ← Pass config for hybrid mode!
697
+ verbose=True
698
+ )
699
+ # Update adapter's LLM client
700
+ self.adapter.llm_client = llego_wrapped_llm
701
+ self.logger.info("✅ Wrapped LLM client with LLEGO (hybrid mode enabled)")
702
+
703
+ adapter = self.adapter
704
+ else:
705
+ # LLEGO-ONLY MODE: Wrap adapter with LLEGO layer (no hybrid)
706
+ self.logger.info("🧬 LLEGO-ONLY MODE: Recreating adapter with LLEGO integration...")
707
+ if hasattr(self, 'adapter') and self.adapter:
708
+ from .universal_adapter import UniversalGepaAdapter
709
+
710
+ # Get original LLM client and evaluator from current adapter
711
+ original_llm = self.adapter.llm_client
712
+ # If it's already wrapped, unwrap it
713
+ if hasattr(original_llm, 'base_llm'):
714
+ original_llm = original_llm.base_llm
715
+
716
+ evaluator = self.adapter.evaluator
717
+ data_converter = self.adapter.data_converter
718
+
719
+ # Recreate adapter with LLEGO (no hybrid mode config)
720
+ from ..llms.llego_enhanced_llm import LLEGOEnhancedLLMClient
721
+ llego_wrapped_llm = LLEGOEnhancedLLMClient(
722
+ base_llm=original_llm,
723
+ llego_layer=llego,
724
+ config=None, # No hybrid mode
725
+ verbose=True
726
+ )
727
+
728
+ adapter = UniversalGepaAdapter(
729
+ llm_client=llego_wrapped_llm,
730
+ evaluator=evaluator,
731
+ data_converter=data_converter,
732
+ llego_layer=llego
733
+ )
734
+ self.logger.info("✅ Adapter recreated with LLEGO-enhanced LLM client")
735
+ else:
736
+ adapter = self.adapter
737
+
738
+ # Create LLEGO-enhanced reflection callable
739
+ # When hybrid mode is enabled, reflection_lm_client is wrapped with LLEGO
740
+ # The wrapper will automatically generate hybrid candidates when called
741
+ def reflection_lm_callable(prompt: str) -> str:
742
+ """
743
+ Reflection callable that delegates to LLEGO-wrapped client.
744
+ In hybrid mode, the wrapper generates candidates from both GEPA and LLEGO.
745
+
746
+ 🔥 CRITICAL: Clean the prompt to remove base64 images and truncate if too long.
747
+ """
748
+ # 🔥 FIX: Clean prompt to remove base64 images and truncate excessive data
749
+ cleaned_prompt = self._clean_reflection_prompt(prompt)
750
+
751
+ self.logger.info(f"\n{'🔥'*40}")
752
+ self.logger.info(f"🔥 reflection_lm_callable CALLED (delegating to LLEGO wrapper)")
753
+ self.logger.info(f"🔥 Original prompt length: {len(prompt)} chars")
754
+ self.logger.info(f"🔥 Cleaned prompt length: {len(cleaned_prompt)} chars")
755
+ self.logger.info(f"🔥 Truncation: {len(prompt) - len(cleaned_prompt)} chars removed")
756
+ self.logger.info(f"🔥 First 200 chars (cleaned): {cleaned_prompt[:200]}...")
757
+ self.logger.info(f"{'🔥'*40}\n")
758
+
759
+ try:
760
+ # 🔥 CRITICAL: Set reflection context BEFORE generating
761
+ # This signals to the LLEGO wrapper that we're in reflection mode
762
+ if isinstance(reflection_lm_client, LLEGOEnhancedLLMClient):
763
+ reflection_lm_client.set_reflection_context(
764
+ current_prompt=cleaned_prompt, # Use cleaned prompt
765
+ feedback=None,
766
+ in_reflection=True # Enable reflection mode
767
+ )
768
+ self.logger.info("✅ Reflection context set on reflection_lm_client")
769
+
770
+ # 🔥 HYBRID MODE: If reflection_lm_client is wrapped with LLEGO,
771
+ # calling generate() will trigger hybrid candidate generation
772
+ # The wrapper handles queuing and returns candidates one by one
773
+
774
+ # 🔥 CRITICAL: System prompt must instruct LLM to generate improved prompt, not feedback
775
+ optimization_system_prompt = """You are an expert prompt engineer specializing in iterative prompt optimization.
776
+
777
+ Your task: Given the CURRENT PROMPT and its EVALUATION FEEDBACK, generate an IMPROVED version of the prompt that addresses all identified issues.
778
+
779
+ Core Requirements:
780
+ 1. OUTPUT ONLY the improved prompt text (no explanations, no analysis, no meta-commentary)
781
+ 2. START directly with the prompt (e.g., "You are a mobile GUI agent..." or similar task-appropriate opening)
782
+ 3. PRESERVE the core task domain and output format requirements
783
+ 4. INTEGRATE improvements from feedback naturally into the prompt structure
784
+ 5. MAINTAIN clarity, specificity, and actionability
785
+
786
+ Quality Standards:
787
+ - Be specific and concrete (avoid vague instructions)
788
+ - Use clear, imperative language for task instructions
789
+ - Include edge case handling if feedback identifies confusion
790
+ - Ensure the prompt is self-contained and unambiguous
791
+
792
+ DO NOT include:
793
+ - Analysis of what went wrong
794
+ - Explanations of your changes
795
+ - Meta-text like "Here's an improved version..." or "Based on feedback..."
796
+ - Recommendations or suggestions (those are already in the feedback)
797
+
798
+ Output the improved prompt directly and only the prompt."""
799
+
800
+ result = reflection_lm_client.generate(
801
+ system_prompt=optimization_system_prompt,
802
+ user_prompt=cleaned_prompt, # Use cleaned prompt (no base64, truncated)
803
+ image_base64=""
804
+ )
805
+
806
+ # Extract content from result
807
+ if isinstance(result, dict):
808
+ candidate = result.get("content", str(result))
809
+ source = result.get("source", "unknown")
810
+ self.logger.info(f"✅ Candidate from {source} (FULL TEXT):")
811
+ self.logger.info(f" '{candidate}'")
812
+ return candidate
813
+ else:
814
+ candidate = str(result)
815
+ self.logger.info(f"✅ Candidate generated (FULL TEXT):")
816
+ self.logger.info(f" '{candidate}'")
817
+ return candidate
818
+
819
+ except Exception as e:
820
+ self.logger.error(f"❌ Error in reflection_lm_callable: {e}")
821
+ import traceback
822
+ self.logger.error(traceback.format_exc())
823
+ # Fallback: return prompt as-is
824
+ return prompt
825
+
826
+ # Set up reflection context for LLEGO wrapper
827
+ if self.config.enable_gepa_reflection_with_llego and isinstance(reflection_lm_client, LLEGOEnhancedLLMClient):
828
+ # Store current prompt in reflection context for LLEGO operators
829
+ reflection_lm_client.set_reflection_context(
830
+ current_prompt=seed_candidate.get('system_prompt', ''),
831
+ feedback=None,
832
+ in_reflection=True
833
+ )
834
+
835
+ else:
836
+ # Standard GEPA reflection (no LLEGO)
837
+ adapter = self.adapter # Use the original adapter
838
+
839
+ # 🔥 CRITICAL: Always set _reflection_lm_client in adapter (even without LLEGO)
840
+ # This is required for propose_new_texts() to work
841
+ if not hasattr(adapter, '_reflection_lm_client') or adapter._reflection_lm_client is None:
842
+ adapter._reflection_lm_client = reflection_lm_client
843
+ self.logger.info("✅ Set _reflection_lm_client in adapter (required for propose_new_texts)")
844
+
845
+ # Define standard reflection callable (no LLEGO enhancement)
846
+ def reflection_lm_callable(prompt: str) -> str:
847
+ """Standard callable wrapper for reflection model that GEPA expects"""
848
+ try:
849
+ # 🔥 CRITICAL: System prompt must instruct LLM to generate improved prompt, not feedback
850
+ optimization_system_prompt = """You are an expert prompt engineer specializing in iterative prompt optimization.
851
+
852
+ Your task: Given the CURRENT PROMPT and its EVALUATION FEEDBACK, generate an IMPROVED version of the prompt that addresses all identified issues.
853
+
854
+ Core Requirements:
855
+ 1. OUTPUT ONLY the improved prompt text (no explanations, no analysis, no meta-commentary)
856
+ 2. START directly with the prompt (e.g., "You are a mobile GUI agent..." or similar task-appropriate opening)
857
+ 3. PRESERVE the core task domain and output format requirements
858
+ 4. INTEGRATE improvements from feedback naturally into the prompt structure
859
+ 5. MAINTAIN clarity, specificity, and actionability
860
+
861
+ Quality Standards:
862
+ - Be specific and concrete (avoid vague instructions)
863
+ - Use clear, imperative language for task instructions
864
+ - Include edge case handling if feedback identifies confusion
865
+ - Ensure the prompt is self-contained and unambiguous
866
+
867
+ DO NOT include:
868
+ - Analysis of what went wrong
869
+ - Explanations of your changes
870
+ - Meta-text like "Here's an improved version..." or "Based on feedback..."
871
+ - Recommendations or suggestions (those are already in the feedback)
872
+
873
+ Output the improved prompt directly and only the prompt."""
874
+
875
+ # For reflection, we only need text generation (no images)
876
+ result = reflection_lm_client.generate(
877
+ system_prompt=optimization_system_prompt,
878
+ user_prompt=prompt,
879
+ image_base64="" # No image for reflection
880
+ )
881
+
882
+ # Extract string content from the result dictionary
883
+ if isinstance(result, dict):
884
+ return result.get("content", str(result))
885
+ else:
886
+ return str(result)
887
+
888
+ except Exception as e:
889
+ self.logger.error(f"Reflection model error: {e}")
890
+ return prompt # Return original prompt on error
891
+ self.logger.info(
892
+ f"Starting GEPA optimization with {max_iterations} iterations, "
893
+ f"batch size {batch_size}, max metric calls: {max_metric_calls}"
894
+ )
895
+ self.logger.info(
896
+ f"GEPA parameters: candidate_selection_strategy=pareto, "
897
+ f"reflection_minibatch_size={batch_size}, "
898
+ f"skip_perfect_score=False, "
899
+ f"module_selector=round_robin"
900
+ )
901
+
902
+ # Prepare optimization parameters with ONLY valid GEPA parameters
903
+ # Note: 'adapter' variable is set above (either LLEGO-enhanced or standard)
904
+ # 🔥 REMOVED: Excessive diagnostic warnings - moved to DEBUG level
905
+ reflection_lm_passed = reflection_lm_callable if self.config.use_llego_operators else None
906
+ if reflection_lm_passed:
907
+ self.logger.debug(f"reflection_lm_callable passed to GEPA (may be ignored in adapter mode)")
908
+
909
+ # #region agent log
910
+ import json as _json_debug
911
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
912
+ with open(_debug_log_path, "a") as _f:
913
+ _f.write(_json_debug.dumps({"hypothesisId": "A", "location": "optimizer.py:gepa_params", "message": "GEPA params construction", "data": {"max_iterations_from_config": max_iterations, "max_metric_calls": max_metric_calls, "batch_size": batch_size}, "timestamp": int(time.time() * 1000), "sessionId": "debug-session"}) + "\n")
914
+ # #endregion
915
+
916
+ gepa_params = {
917
+ 'adapter': adapter, # Use the adapter created above (with or without LLEGO)
918
+ 'seed_candidate': seed_candidate,
919
+ 'trainset': trainset,
920
+ 'valset': valset,
921
+ 'max_metric_calls': max_metric_calls,
922
+ # NOTE: GEPA does NOT have num_iterations - it uses max_metric_calls to control iterations
923
+
924
+ # 🔥 CRITICAL: When using an adapter, GEPA expects:
925
+ # - adapter.make_reflective_dataset() to create feedback data
926
+ # - GEPA's internal proposer to generate candidates from that data
927
+ # - task_lm and reflection_lm must be None (GEPA will use model from adapter)
928
+ 'task_lm': None, # Don't pass - adapter handles this
929
+ 'reflection_lm': reflection_lm_passed, # Pass LLEGO-enhanced reflection (may be ignored!)
930
+
931
+ # Valid GEPA parameters based on actual library
932
+ 'candidate_selection_strategy': 'pareto', # Use Pareto selection
933
+ 'skip_perfect_score': False, # Don't skip perfect scores
934
+ 'reflection_minibatch_size': batch_size, # Use batch size for reflection
935
+ 'perfect_score': 1.0, # Perfect score threshold
936
+ 'module_selector': 'round_robin', # Cycle through components
937
+ 'display_progress_bar': self.config.verbose, # Show progress if verbose
938
+ 'raise_on_exception': True, # Raise exceptions for debugging
939
+ }
940
+
941
+ # 🔥 CRITICAL FIX: Filter kwargs to only include valid GEPA parameters
942
+ # GEPA does NOT accept num_iterations, max_iterations, or other non-GEPA params
943
+ VALID_GEPA_PARAMS = {
944
+ 'seed_candidate', 'trainset', 'valset', 'adapter', 'task_lm', 'reflection_lm',
945
+ 'candidate_selection_strategy', 'skip_perfect_score', 'batch_sampler',
946
+ 'reflection_minibatch_size', 'perfect_score', 'reflection_prompt_template',
947
+ 'module_selector', 'use_merge', 'max_merge_invocations', 'merge_val_overlap_floor',
948
+ 'max_metric_calls', 'stop_callbacks', 'logger', 'run_dir', 'use_wandb',
949
+ 'wandb_api_key', 'wandb_init_kwargs', 'use_mlflow', 'mlflow_tracking_uri',
950
+ 'mlflow_experiment_name', 'track_best_outputs', 'display_progress_bar',
951
+ 'use_cloudpickle', 'seed', 'raise_on_exception', 'val_evaluation_policy'
952
+ }
953
+
954
+ # Only add valid kwargs that aren't already in gepa_params
955
+ for key, value in kwargs.items():
956
+ if key in VALID_GEPA_PARAMS and key not in gepa_params:
957
+ gepa_params[key] = value
958
+ elif key not in VALID_GEPA_PARAMS:
959
+ self.logger.debug(f"⚠️ Filtering out invalid GEPA parameter: {key}")
960
+
961
+ # #region agent log
962
+ with open(_debug_log_path, "a") as _f:
963
+ _f.write(_json_debug.dumps({"hypothesisId": "A", "location": "optimizer.py:gepa_params_final", "message": "Final GEPA params keys", "data": {"params_keys": list(gepa_params.keys()), "max_metric_calls": gepa_params.get('max_metric_calls', 'NOT_PASSED')}, "timestamp": int(time.time() * 1000), "sessionId": "debug-session"}) + "\n")
964
+ # #endregion
965
+
966
+ # 🎯 NEW: Capture GEPA's internal logging for pareto front information
967
+ gepa_output = io.StringIO()
968
+
969
+ # Log iteration start
970
+ from ..utils.clean_logger import get_clean_logger
971
+ clean_log = get_clean_logger()
972
+ clean_log.log_iteration_start(1, seed_prompt=seed_candidate.get('system_prompt', ''))
973
+
974
+ # 🔥 CRITICAL: Pass valset size to adapter for better dataset type detection
975
+ if hasattr(adapter, '_valset_size'):
976
+ adapter._valset_size = len(valset)
977
+ self.logger.debug(f"✅ Set valset_size in adapter: {len(valset)} for Dpareto detection")
978
+
979
+ # 🔥 CRITICAL FIX: Store valset in adapter so we can evaluate generated candidates on it
980
+ # This ensures generated candidates are evaluated on Dpareto for Pareto selection
981
+ if hasattr(adapter, '_valset'):
982
+ adapter._valset = valset
983
+ self.logger.debug(f"✅ Stored valset in adapter ({len(valset)} samples) for Dpareto evaluation of generated candidates")
984
+ else:
985
+ # Add _valset attribute if it doesn't exist
986
+ adapter._valset = valset
987
+ self.logger.debug(f"✅ Added _valset attribute to adapter ({len(valset)} samples)")
988
+
989
+ # Run GEPA optimization (synchronous call wrapped in async)
990
+ result = await asyncio.get_event_loop().run_in_executor(
991
+ None,
992
+ lambda: self._run_gepa_with_logging(gepa_params, gepa_output)
993
+ )
994
+
995
+ # 🎯 NEW: Process and log pareto front information, extract iteration count
996
+ gepa_logs = gepa_output.getvalue()
997
+ actual_iterations = self._log_pareto_front_info(gepa_logs) # Get iteration count
998
+
999
+ return result, actual_iterations # Return both result and iteration count
1000
+ except Exception as e:
1001
+ # Try to extract partial results before failing
1002
+ self.logger.warning(f"GEPA optimization failed: {e}")
1003
+
1004
+ # Check if we have any cached results from the adapter
1005
+ best_candidate = adapter.get_best_candidate()
1006
+ best_score = adapter.get_best_score()
1007
+
1008
+ if best_candidate and best_score > 0:
1009
+ self.logger.info(f"🎯 Using cached best result with score: {best_score:.4f}")
1010
+
1011
+ # Create a mock GEPA result with the best candidate found
1012
+ return {
1013
+ 'best_candidate': best_candidate,
1014
+ 'best_score': best_score,
1015
+ 'partial_result': True,
1016
+ 'error': f'GEPA failed but returning best result found: {str(e)}'
1017
+ }
1018
+ else:
1019
+ # If no cached results, re-raise the error
1020
+ raise GepaOptimizerError(f"GEPA optimization failed: {str(e)}")
1021
+
1022
def _run_gepa_with_logging(self, gepa_params: Dict[str, Any], output_buffer: io.StringIO) -> Any:
    """Run gepa.optimize() while capturing everything it prints.

    Both stdout and stderr are redirected into *output_buffer* so the caller
    can later mine GEPA's textual output (iteration markers, pareto-front
    updates, scores) from the buffer.

    Args:
        gepa_params: Keyword arguments forwarded verbatim to gepa.optimize().
        output_buffer: In-memory sink receiving GEPA's console output.

    Returns:
        Whatever gepa.optimize() returns.
    """
    with redirect_stdout(output_buffer):
        with redirect_stderr(output_buffer):
            return gepa.optimize(**gepa_params)
1027
+
1028
+ def _log_pareto_front_info(self, gepa_logs: str) -> int: # Return int instead of None
1029
+ """Extract and log pareto front information from GEPA logs. Returns max iteration count."""
1030
+ lines = gepa_logs.split('\n')
1031
+ current_iteration = 0
1032
+ max_iteration = 0 # Track max iteration
1033
+
1034
+ for line in lines:
1035
+ # Look for iteration information
1036
+ if 'iteration' in line.lower():
1037
+ # Try to extract iteration number
1038
+ import re
1039
+ iteration_match = re.search(r'iteration\s+(\d+)', line.lower())
1040
+ if iteration_match:
1041
+ current_iteration = int(iteration_match.group(1))
1042
+ max_iteration = max(max_iteration, current_iteration) # Track max
1043
+ # Log iteration change
1044
+ from ..utils.clean_logger import get_clean_logger
1045
+ clean_log = get_clean_logger()
1046
+ if current_iteration > clean_log.current_iteration:
1047
+ clean_log.current_iteration = current_iteration
1048
+
1049
+ # Look for pareto front information
1050
+ if 'pareto front' in line.lower() or 'new program' in line.lower():
1051
+ self.logger.info(f"GEPA Pareto Update: {line.strip()}")
1052
+ elif 'iteration' in line.lower() and ('score' in line.lower() or 'program' in line.lower()):
1053
+ self.logger.debug(f"{line.strip()}")
1054
+ elif 'best' in line.lower() and 'score' in line.lower():
1055
+ self.logger.info(f"{line.strip()}")
1056
+
1057
+ # Look for evaluation information
1058
+ if 'evaluating' in line.lower() and 'candidate' in line.lower():
1059
+ self.logger.debug(f"{line.strip()}")
1060
+
1061
+ self.logger.info(f"GEPA Optimization Complete: {max_iteration} iterations")
1062
+
1063
+ # #region agent log
1064
+ import json as _json_debug
1065
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
1066
+ with open(_debug_log_path, "a") as _f:
1067
+ _f.write(_json_debug.dumps({"hypothesisId": "F", "location": "optimizer.py:gepa_complete", "message": "GEPA optimization complete - iteration count", "data": {"max_iteration_from_logs": max_iteration, "expected_iterations": self.config.max_iterations, "off_by_one": max_iteration != self.config.max_iterations, "gepa_logs_length": len(gepa_logs)}, "timestamp": int(time.time() * 1000), "sessionId": "debug-session"}) + "\n")
1068
+ # #endregion
1069
+
1070
+ return max_iteration # Return the max iteration count
1071
+
1072
def _extract_best_candidate(self, gepa_result: Any) -> Dict[str, str]:
    """
    Extract the best candidate from the GEPA Pareto front (single source of truth).

    Every candidate (GEPA reflection, LLEGO crossover, LLEGO mutation) is
    evaluated on Dpareto, and every non-dominated one is added to the GEPA
    Pareto front — so the best candidate must live there. *gepa_result* is
    consulted only as an edge-case fallback when the front is empty.

    Args:
        gepa_result: Raw result from gepa.optimize() (fallback only)

    Returns:
        Best candidate dictionary with prompt components
    """
    try:
        divider = '═' * 80
        self.logger.info(f"\n{divider}")
        self.logger.info("🔍 EXTRACTING BEST CANDIDATE FROM GEPA PARETO FRONT")
        self.logger.info(f"{divider}")

        from ..utils.pareto_logger import get_pareto_logger
        pareto_log = get_pareto_logger()

        # ---- PRIMARY: highest-scoring entry on the GEPA Pareto front ----
        if pareto_log.pareto_front:
            try:
                top_entry = max(pareto_log.pareto_front, key=lambda entry: entry['score'])
                top_score = top_entry['score']
                top_prompt = top_entry['prompt']
                top_type = top_entry.get('type', 'unknown')
                top_notation = top_entry.get('notation', 'S')

                chosen = {
                    'system_prompt': top_prompt,
                    'fitness': top_score,
                    'source': 'gepa_pareto_front',
                    'candidate_type': top_type,
                    'notation': top_notation
                }

                self.logger.info("✅ SELECTED: Best candidate from GEPA Pareto front")
                self.logger.info(f" Notation: {top_notation}")
                self.logger.info(f" Fitness: f({top_notation})={top_score:.4f}")
                self.logger.info(f" Type: {top_type}")
                self.logger.info(f" Prompt length: {len(top_prompt)} chars")
                self.logger.info(" 💡 GEPA Pareto front is single source of truth (all candidates evaluated on Dpareto)")

                return chosen

            except Exception as e:
                self.logger.error(f"❌ Failed to extract from GEPA Pareto front: {e}")
                import traceback
                self.logger.error(traceback.format_exc())

        # ---- EDGE-CASE FALLBACK: front empty (should not happen) ----
        self.logger.warning("⚠️ GEPA Pareto front is empty - using gepa_result as fallback")
        self.logger.warning(" This should not happen if all candidates are evaluated on Dpareto")

        if hasattr(gepa_result, 'best_candidate'):
            raw_candidate = gepa_result.best_candidate
            if isinstance(raw_candidate, dict):
                fallback_prompt = raw_candidate.get('system_prompt')
            else:
                fallback_prompt = str(raw_candidate)
            fallback_score = getattr(gepa_result, 'best_score', None)

            if fallback_prompt:
                self.logger.info("✅ Using gepa_result.best_candidate as fallback")
                return {
                    'system_prompt': fallback_prompt,
                    'fitness': float(fallback_score) if fallback_score is not None else None,
                    'source': 'gepa_result_fallback',
                    'candidate_type': 'unknown',
                    'notation': 'S'
                }

        # Last resort: nothing anywhere — return an empty prompt.
        self.logger.error("❌ No candidates found anywhere - returning empty prompt")
        return {'system_prompt': ''}

    except Exception as e:
        self.logger.error(f"❌ Error extracting best candidate: {e}")
        import traceback
        self.logger.error(traceback.format_exc())
        return {'system_prompt': ''}
1160
+
1161
def _evaluate_candidate_on_testset(
    self,
    candidate: Dict[str, str],
    testset: List[Dict]
) -> float:
    """
    Evaluate a candidate prompt on the held-out test set.

    Args:
        candidate: Prompt candidate to evaluate
        testset: Test dataset (never shown to GEPA during optimization)

    Returns:
        Average composite score across the test set

    Raises:
        TestSetEvaluationError: If evaluation fails or yields no scores
    """
    from ..utils.exceptions import TestSetEvaluationError

    try:
        # Same evaluation path GEPA uses internally, minus trace capture.
        outcome = self.adapter.evaluate(
            batch=testset,
            candidate=candidate,
            capture_traces=False
        )

        scores = outcome.scores
        if not scores:
            # Deliberately re-wrapped by the except below (historic behavior).
            raise TestSetEvaluationError("No scores returned from test evaluation")

        mean_score = sum(scores) / len(scores)

        self.logger.debug(
            f"Test set evaluation: {len(scores)} samples, "
            f"scores: {scores}, avg: {mean_score:.4f}"
        )

        return mean_score

    except Exception as e:
        raise TestSetEvaluationError(f"Failed to evaluate on test set: {str(e)}")
1204
+
1205
def optimize_sync(self,
                  model: str,
                  seed_prompt: str,
                  dataset: Any,
                  reflection_lm: str,
                  max_metric_calls: int = 150,
                  **kwargs) -> "OptimizedResult":
    """
    Synchronous wrapper around the async optimization entry point.

    Args:
        model: Target model to optimize for
        seed_prompt: Initial prompt to optimize
        dataset: Training data in any format
        reflection_lm: Model for reflection
        max_metric_calls: Budget for optimization attempts
        **kwargs: Additional optimization parameters

    Returns:
        OptimizedResult: Optimization result
    """
    # BUG FIX: the previous implementation did new_event_loop() +
    # set_event_loop() + run_until_complete() + close(), which left a
    # *closed* loop installed as the thread's current event loop — any
    # later asyncio.get_event_loop() in the same thread then returned a
    # dead loop. asyncio.run() creates, runs and tears down the loop
    # cleanly (and restores the thread state).
    return asyncio.run(
        self.train(model, seed_prompt, dataset, reflection_lm, max_metric_calls, **kwargs)
    )
1237
+
1238
+
1239
# Convenience function for quick optimization
def optimize_prompt(
    model: "Union[str, ModelConfig]",
    seed_prompt: str,
    dataset: Any,
    reflection_model: "Optional[Union[str, ModelConfig]]" = None,
    **kwargs
) -> "OptimizedResult":
    """
    Convenience function for quick prompt optimization without creating an
    optimizer instance.

    Args:
        model: Target model configuration
        seed_prompt: Initial prompt to optimize
        dataset: Training data
        reflection_model: Model for reflection (defaults to *model*)
        **kwargs: Additional optimization parameters
            (max_iterations, max_metric_calls, batch_size, ...)

    Returns:
        OptimizedResult: Optimization result
    """
    # Reflection defaults to the target model when not specified.
    if reflection_model is None:
        reflection_model = model

    config = OptimizationConfig(
        model=model,
        reflection_model=reflection_model,
        max_iterations=kwargs.get('max_iterations', 10),
        max_metric_calls=kwargs.get('max_metric_calls', 50),
        batch_size=kwargs.get('batch_size', 4)
    )

    optimizer = GepaOptimizer(config=config)
    # BUG FIX: train() expects (model, seed_prompt, dataset, reflection_lm, ...)
    # — see GepaOptimizer.optimize_sync, which calls it exactly that way.
    # The old call passed only (seed_prompt, dataset), shifting every
    # positional argument one slot to the left (seed_prompt bound as model,
    # dataset as seed_prompt, ...).
    return asyncio.run(
        optimizer.train(model, seed_prompt, dataset, reflection_model, **kwargs)
    )
1274
+
1275
+
1276
+
1277
+
1278
+
1279
+
src/gepa_optimizer/core/result.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Result processing for GEPA Optimizer
3
+ Handles extraction and processing of GEPA optimization results
4
+ """
5
+
6
+ from typing import Any, Dict, Optional
7
+ import logging
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
class ResultProcessor:
    """
    Turns raw GEPA optimization results into clean, structured data.

    All methods are static and defensive: attributes the result object
    lacks are simply skipped, and extraction failures are logged as
    warnings instead of raised.
    """

    @staticmethod
    def extract_optimized_prompt(result: Any) -> str:
        """
        Pull the optimized prompt text out of a GEPA result object.

        Args:
            result: Raw GEPA optimization result

        Returns:
            str: The optimized prompt text
        """
        try:
            if hasattr(result, 'best_candidate'):
                best = result.best_candidate
                if isinstance(best, dict):
                    # Conventional prompt keys, checked in priority order.
                    for prompt_key in ('system_prompt', 'prompt', 'text'):
                        if prompt_key in best:
                            return str(best[prompt_key])
                # Dict without a known key, or a non-dict candidate.
                return str(best)

            # No best_candidate attribute: stringify the whole result.
            return str(result)

        except Exception as e:
            logger.warning(f"Failed to extract optimized prompt: {e}")
            return "Optimization completed (prompt extraction failed)"

    @staticmethod
    def extract_metrics(result: Any) -> Dict[str, Any]:
        """
        Collect performance metrics from a GEPA result.

        Args:
            result: Raw GEPA optimization result

        Returns:
            Dict[str, Any]: Whatever metrics the result exposes
        """
        metrics: Dict[str, Any] = {}

        try:
            # Numeric attributes copied over with an explicit cast.
            for attr_name, cast in (
                ('best_score', float),
                ('baseline_score', float),
                ('improvement', float),
                ('iterations', int),
            ):
                if hasattr(result, attr_name):
                    metrics[attr_name] = cast(getattr(result, attr_name))

            # Derive a percentage improvement when both scores are present.
            if 'best_score' in metrics and 'baseline_score' in metrics:
                base = metrics['baseline_score']
                if base > 0:
                    pct = ((metrics['best_score'] - base) / base) * 100
                    metrics['improvement_percent'] = round(pct, 2)

            if hasattr(result, 'metadata'):
                metrics['metadata'] = result.metadata

        except Exception as e:
            logger.warning(f"Failed to extract metrics: {e}")

        return metrics

    @staticmethod
    def extract_reflection_history(result: Any) -> list:
        """
        Collect the reflection/optimization history from a GEPA result.

        Args:
            result: Raw GEPA optimization result

        Returns:
            list: One entry per recorded iteration (empty if unavailable)
        """
        history = []

        try:
            if hasattr(result, 'optimization_history'):
                for idx, step in enumerate(result.optimization_history):
                    history.append({
                        'iteration': idx,
                        'score': step.get('score', 0.0),
                        'candidate': step.get('candidate', {}),
                        'feedback': step.get('feedback', ''),
                        'improvement': step.get('improvement', 0.0),
                    })

        except Exception as e:
            logger.warning(f"Failed to extract reflection history: {e}")

        return history

    @staticmethod
    def process_full_result(
        result: Any,
        original_prompt: str,
        optimization_time: float,
        actual_iterations: Optional[int] = None,
        test_metrics: Optional[Dict[str, Any]] = None
    ) -> Dict[str, Any]:
        """
        Assemble the complete processed result dictionary.

        Args:
            result: Raw GEPA optimization result
            original_prompt: Original seed prompt
            optimization_time: Wall-clock time spent optimizing
            actual_iterations: Iteration count parsed from GEPA logs, if any
            test_metrics: Held-out test-set metrics, if any

        Returns:
            Dict[str, Any]: Structured result (raw result included too)
        """
        metrics = ResultProcessor.extract_metrics(result)

        # Iteration count priority: explicit log-derived value, then the
        # result object's own attributes, then whatever landed in metrics.
        total_iterations = 0
        try:
            if actual_iterations is not None:
                total_iterations = actual_iterations
            elif hasattr(result, 'iterations'):
                total_iterations = int(result.iterations)
            elif hasattr(result, 'num_iterations'):
                total_iterations = int(result.num_iterations)
            elif hasattr(result, 'optimization_history'):
                total_iterations = len(result.optimization_history)
            elif 'iterations' in metrics:
                total_iterations = metrics['iterations']
        except Exception as e:
            logger.warning(f"Failed to extract iterations: {e}")

        # Test-set metrics are surfaced under improvement_data.
        improvement_data: Dict[str, Any] = {}
        if test_metrics:
            improvement_data.update(test_metrics)

        return {
            'original_prompt': original_prompt,
            'optimized_prompt': ResultProcessor.extract_optimized_prompt(result),
            'metrics': metrics,
            'improvement_data': improvement_data,
            'reflection_history': ResultProcessor.extract_reflection_history(result),
            'optimization_time': optimization_time,
            'total_iterations': total_iterations,
            'status': 'completed',
            'raw_result': result  # kept for advanced users
        }
src/gepa_optimizer/core/universal_adapter.py ADDED
The diff for this file is too large to render. See raw diff
 
src/gepa_optimizer/data/__init__.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data module for GEPA Optimizer
3
+ """
4
+
5
+ from .converters import UniversalConverter
6
+ from .loaders import DataLoader
7
+ from .validators import DataValidator
8
+ from .scroll_dataset_loader import ScrollDatasetLoader, load_scroll_dataset
9
+ from .validation_dataset_loader import ValidationDatasetLoader, load_validation_dataset, load_validation_split
10
+ from .index_caching_loader import IndexCachingDatasetLoader, load_index_caching_dataset, load_index_caching_split
11
+
12
+ __all__ = [
13
+ "UniversalConverter",
14
+ "DataLoader",
15
+ "DataValidator",
16
+ # Scroll dataset
17
+ "ScrollDatasetLoader",
18
+ "load_scroll_dataset",
19
+ # Validation dataset
20
+ "ValidationDatasetLoader",
21
+ "load_validation_dataset",
22
+ "load_validation_split",
23
+ # Index caching dataset
24
+ "IndexCachingDatasetLoader",
25
+ "load_index_caching_dataset",
26
+ "load_index_caching_split",
27
+ ]
src/gepa_optimizer/data/converters.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Universal converter for dataset to GEPA format with 3-way split (train/val/test)
3
+ """
4
+
5
+ import os
6
+ import json
7
+ from typing import Any, List, Tuple, Union, Dict, Optional
8
+ from pathlib import Path
9
+ import pandas as pd
10
+ import logging
11
+
12
+ from .loaders import DataLoader
13
+ from ..utils.exceptions import DatasetError
14
+ from ..models.config import DataSplitConfig
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+ class UniversalConverter:
19
+ """
20
+ Universal converter for datasets to GEPA format.
21
+
22
+ Handles 3-way splitting (train/val/test) with configurable ratios and
23
+ graceful handling of small datasets.
24
+ """
25
+
26
def __init__(self, data_split_config: Optional[DataSplitConfig] = None):
    """
    Initialize the converter with an optional split configuration.

    Args:
        data_split_config: Train/val/test split configuration. Falls back
            to a default DataSplitConfig (60/20/20) when omitted.
    """
    # File formats the converter knows how to ingest.
    self.supported_extensions = [
        '.csv', '.json', '.jsonl',
        '.txt', '.md',
        '.png', '.jpg', '.jpeg',
    ]
    self.loader = DataLoader()
    self.data_split_config = data_split_config or DataSplitConfig()
40
+
41
def convert(
    self,
    dataset: Union[List[Any], str, Any, Dict[str, Any]],
    split_config: Optional[DataSplitConfig] = None
) -> Tuple[List[dict], List[dict], List[dict]]:
    """
    Convert any supported dataset to GEPA format with a 3-way split.

    Args:
        dataset: Input dataset (list, file path, DataFrame, or a
            UI-tree specification dict)
        split_config: Optional split configuration (overrides instance config)

    Returns:
        (trainset, valset, testset) where trainset feeds reflection
        (Dfeedback in the GEPA paper), valset drives Pareto selection
        (Dpareto), and testset is held out from GEPA entirely.

    Raises:
        DatasetError: If the dataset cannot be converted or is too small
    """
    try:
        active_config = split_config or self.data_split_config

        # Dedicated path for the UI-tree dataset specification dict.
        if isinstance(dataset, dict) and dataset.get('type') == 'ui_tree_dataset':
            return self.convert_ui_tree_dataset(
                dataset.get('json_dir', 'json_tree'),
                dataset.get('screenshots_dir', 'screenshots'),
                split_config=active_config
            )

        # Normalize every other shape into a plain list of records.
        if isinstance(dataset, str):
            records = self._load_from_path(dataset)
        elif hasattr(dataset, 'to_dict'):  # pandas DataFrame
            records = dataset.to_dict(orient='records')
        elif isinstance(dataset, list):
            records = dataset
        else:
            records = [dataset]

        logger.info(f"Normalized data length: {len(records)}")
        normalized = self._standardize(records)
        return self._split_three_way(normalized, active_config)

    except (FileNotFoundError, ValueError, TypeError) as e:
        raise DatasetError(f"Failed to convert dataset: {str(e)}")
88
+
89
def _load_from_path(self, path: str) -> List[Any]:
    """Load data from a file path, wrapping the result in a single-item list."""
    file_path = Path(path)
    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {path}")

    ext = file_path.suffix.lower()
    if ext not in self.supported_extensions:
        raise DatasetError(f"Unsupported file extension: {ext}")
    return [self.loader.load(file_path)]
100
+
101
def _standardize(self, data: List[Any]) -> List[dict]:
    """Normalize heterogeneous records to a {'input', 'output', 'image'} schema.

    Handles both UI tree JSON records ({'screenshot', 'ui_tree',
    'expected_output'}) and simple text records ({'input'/'question'/...,
    'output'/'answer'/...}). Non-dict items are coerced to {'input': str(item)}.
    """
    standardized: List[dict] = []
    for raw in data:
        record = raw if isinstance(raw, dict) else {'input': str(raw)}

        if 'ui_tree' in record and 'screenshot' in record:
            # UI tree JSON format: pull text out of the tree node.
            standardized.append({
                'input': record['ui_tree'].get('text', ''),
                'output': record.get('expected_output', ''),
                'image': record.get('screenshot', ''),
            })
        else:
            # Simple text format: probe a list of common key aliases.
            standardized.append({
                'input': self._extract(record, ['input', 'question', 'text', 'prompt']) or '',
                'output': self._extract(record, ['output', 'result', 'response', 'answer', 'expected_output']) or '',
                'image': self._extract(record, ['image', 'image_base64', 'screenshot']) or '',
            })

    return standardized
128
+
129
def _extract(self, d: dict, keys: List[str]) -> Union[str, None]:
    """Return the value of the first key present in *d*, else None."""
    return next((d[key] for key in keys if key in d), None)
135
+
136
def _split_three_way(
    self,
    data: List[dict],
    config: DataSplitConfig
) -> Tuple[List[dict], List[dict], List[dict]]:
    """
    Partition *data* into train, validation, and test sets.

    Args:
        data: Standardized dataset
        config: Split configuration with ratios and strategies

    Returns:
        Tuple of (train, val, test) datasets

    Raises:
        DatasetError: If the dataset is too small for the configured splits
            or the resulting training set is empty
    """
    total = len(data)

    # Report adaptive ratios up front when the adaptive strategy is active.
    if config.small_dataset_strategy == 'adaptive':
        train_ratio, val_ratio, test_ratio = config.get_adaptive_ratios(total)
        logger.info(
            f"📊 Adaptive dataset splitting (strategy: adaptive, size: {total}): "
            f"ratios = {train_ratio*100:.0f}%/{val_ratio*100:.0f}%/{test_ratio*100:.0f}% "
            f"(prioritizes validation for reliable candidate ranking)"
        )

    # Config computes the boundary indices; surface its error as DatasetError.
    try:
        train_end, val_end, test_end, _ = config.get_split_indices(total)
    except ValueError as e:
        logger.error(f"Dataset split error: {e}")
        raise DatasetError(str(e))

    train = data[:train_end]
    val = data[train_end:val_end]
    test = data[val_end:test_end]

    strategy_note = " (adaptive)" if config.small_dataset_strategy == 'adaptive' else ""
    logger.info(
        f"Dataset split{strategy_note}: {len(train)} train ({len(train)/total*100:.1f}%), "
        f"{len(val)} val ({len(val)/total*100:.1f}%), "
        f"{len(test)} test ({len(test)/total*100:.1f}%)"
    )

    # Guard against degenerate splits.
    if not train:
        raise DatasetError("Training set is empty after split")
    if not val:
        logger.warning("Validation set is empty - this may cause issues with Pareto selection")
        val = [train[-1]]  # Fallback: reuse the last training sample
    if not test:
        logger.warning("Test set is empty - final evaluation will not be performed")

    return train, val, test
197
+
198
def _split(self, data: List[dict], ratio: float = 0.8) -> Tuple[List[dict], List[dict]]:
    """
    DEPRECATED: Legacy 2-way split kept for backwards compatibility.

    Prefer _split_three_way() in production code.

    Args:
        data: Standardized dataset
        ratio: Train ratio (0.0-1.0)

    Returns:
        Tuple of (train, val) datasets; val is never empty (falls back to
        the last sample when the split leaves it empty).
    """
    import warnings
    warnings.warn(
        "_split() is deprecated. Use _split_three_way() for 3-way splitting.",
        DeprecationWarning,
        stacklevel=2
    )

    cut = max(1, int(len(data) * ratio))
    return data[:cut], (data[cut:] or data[-1:])
222
+
223
def convert_ui_tree_dataset(
    self,
    json_dir: str,
    screenshots_dir: str,
    split_config: Optional[DataSplitConfig] = None
) -> Tuple[List[dict], List[dict], List[dict]]:
    """
    Convert UI tree dataset (JSON + screenshots) to GEPA format with 3-way split.

    Args:
        json_dir: Directory containing JSON files
        screenshots_dir: Directory containing screenshot images
        split_config: Optional split configuration (overrides instance config)

    Returns:
        Tuple of (train_data, val_data, test_data) in GEPA format

    Raises:
        DatasetError: If dataset cannot be loaded or is invalid
    """
    try:
        # Load paired dataset (one screenshot + JSON tree per sample)
        dataset = self.loader.load_ui_tree_dataset(json_dir, screenshots_dir)

        if not dataset:
            raise DatasetError("No valid image-JSON pairs found")

        logger.info(f"Loaded {len(dataset)} UI tree samples")

        # Use provided config or instance default
        config = split_config or self.data_split_config

        train, val, test = self._split_three_way(dataset, config)

        logger.info(
            f"Split UI tree dataset: {len(train)} train, "
            f"{len(val)} validation, {len(test)} test"
        )
        return train, val, test

    except DatasetError:
        # Already a domain error (raised above or by the splitter);
        # re-raise as-is instead of double-wrapping the message.
        raise
    except Exception as e:
        # Chain the cause so the original traceback is preserved.
        raise DatasetError(f"Failed to convert UI tree dataset: {str(e)}") from e
src/gepa_optimizer/data/index_caching_loader.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Index Caching Dataset Loader
3
+
4
+ Loads index caching dataset from JSON file (note2_debug.json format) and converts to GEPA-compatible format.
5
+ """
6
+
7
+ import os
8
+ import json
9
+ import base64
10
+ import logging
11
+ from typing import List, Dict, Any, Optional
12
+ from pathlib import Path
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
class IndexCachingDatasetLoader:
    """
    Load an index-caching dataset from a JSON manifest (note2_debug.json format).

    Each manifest entry references a command, an element image, an XML dump and
    an ``expected`` result dict:

        [
          {
            "command": "Tap on first option from the suggestion",
            "image": "element_images/....png",
            "xml": "xml/....xml",
            "expected": {"is_index_based": true, "index_value": 1, ...}
          },
          ...
        ]

    Entries are converted to GEPA format:
      - "input": command + embedded XML content (used for evaluation)
      - "reflection_input": just the command (used for reflection)
      - "output": the ``expected`` dict serialized as a JSON string
      - "image_base64": base64-encoded image at the TOP level (for UniversalConverter)
      - "metadata": all original fields plus resolved paths and raw XML
    """

    def __init__(self, json_path: Optional[str] = None, base_dir: Optional[str] = None):
        """
        Initialize the loader.

        Args:
            json_path: Path to the JSON manifest. Falls back to the
                INDEX_CACHING_DATASET_PATH env var, then "./note2_debug.json".
            base_dir: Base directory for resolving relative paths referenced
                by the manifest. Defaults to the manifest's own directory.

        Raises:
            FileNotFoundError: If the manifest file doesn't exist
        """
        if json_path is None:
            json_path = os.getenv("INDEX_CACHING_DATASET_PATH", "./note2_debug.json")

        self.json_path = Path(json_path).resolve()

        if not self.json_path.exists():
            raise FileNotFoundError(
                f"Dataset file not found: {self.json_path}\n"
                f"Make sure note2_debug.json exists in the project root."
            )

        # Relative image/xml paths in the manifest are resolved against this.
        self.base_dir = Path(base_dir if base_dir is not None else self.json_path.parent).resolve()

    def load_dataset(self) -> List[Dict[str, Any]]:
        """
        Load the manifest and convert every entry to GEPA format.

        Returns:
            List of GEPA-format dicts (see class docstring for the shape).

        Raises:
            FileNotFoundError: If a referenced image or XML file is missing
            json.JSONDecodeError: If the manifest is not valid JSON
        """
        manifest = json.loads(self.json_path.read_text(encoding="utf-8"))

        items: List[Dict[str, Any]] = []

        for idx, entry in enumerate(manifest):
            command = entry.get("command", "")
            image_rel = entry.get("image", "")
            xml_rel = entry.get("xml", "")
            expected = entry.get("expected", {})

            # Resolve manifest-relative paths.
            abs_image_path = (self.base_dir / image_rel).resolve()
            abs_xml_path = (self.base_dir / xml_rel).resolve()

            if not abs_image_path.exists():
                raise FileNotFoundError(
                    f"Image file not found: {abs_image_path}\n"
                    f"Entry {idx + 1}: {command}"
                )
            if not abs_xml_path.exists():
                raise FileNotFoundError(
                    f"XML file not found: {abs_xml_path}\n"
                    f"Entry {idx + 1}: {command}"
                )

            image_base64 = base64.b64encode(abs_image_path.read_bytes()).decode("utf-8")
            xml_content = abs_xml_path.read_text(encoding="utf-8")
            expected_json = json.dumps(expected, ensure_ascii=False)

            # Evaluation prompt embeds the XML; reflection only needs the
            # command, since reflection improves the prompt from feedback
            # rather than analyzing XML structure.
            user_prompt = f"{command}\n\nXML Content:\n\n```xml\n{xml_content}\n```"

            items.append({
                "input": user_prompt,
                "reflection_input": command,
                "output": expected_json,
                "image_base64": image_base64,  # TOP LEVEL for UniversalConverter
                "metadata": {
                    "command": command,
                    "image_path": str(image_rel),
                    "xml_path": str(xml_rel),
                    "abs_image_path": str(abs_image_path),
                    "abs_xml_path": str(abs_xml_path),
                    "xml_content": xml_content,
                    "expected": expected,
                    "dataset_index": idx,
                },
            })

        return items

    def load_split(
        self,
        train_ratio: float = 0.6,
        val_ratio: float = 0.4
    ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
        """
        Load the dataset and split it into train/val sets (no test set).

        Args:
            train_ratio: Fraction for the training set (default: 0.6)
            val_ratio: Fraction for the validation set (default: 0.4)

        Returns:
            Tuple of (train_set, val_set)

        Raises:
            ValueError: If the two ratios don't sum to 1.0 (within 0.01)
        """
        if abs(train_ratio + val_ratio - 1.0) > 0.01:
            raise ValueError(
                f"Split ratios must sum to 1.0, got {train_ratio + val_ratio:.3f}"
            )

        items = self.load_dataset()
        cut = int(len(items) * train_ratio)
        return items[:cut], items[cut:]
205
+
206
+
207
def load_index_caching_dataset(
    json_path: Optional[str] = None,
    base_dir: Optional[str] = None
) -> List[Dict[str, Any]]:
    """
    Convenience wrapper: build an IndexCachingDatasetLoader and return the
    full dataset in GEPA format.

    Args:
        json_path: Path to JSON file
        base_dir: Base directory for resolving relative paths

    Returns:
        List of dataset items in GEPA format
    """
    return IndexCachingDatasetLoader(json_path=json_path, base_dir=base_dir).load_dataset()
223
+
224
+
225
def load_index_caching_split(
    json_path: Optional[str] = None,
    base_dir: Optional[str] = None,
    train_ratio: float = 0.6,
    val_ratio: float = 0.4
) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Convenience wrapper: load the index caching dataset and split it into
    train/val sets (no test set is produced).

    Args:
        json_path: Path to JSON file
        base_dir: Base directory for resolving relative paths
        train_ratio: Ratio for training set
        val_ratio: Ratio for validation set

    Returns:
        Tuple of (train_set, val_set) - no test set
    """
    loader = IndexCachingDatasetLoader(json_path=json_path, base_dir=base_dir)
    return loader.load_split(train_ratio=train_ratio, val_ratio=val_ratio)
245
+
246
+
247
# Example usage / smoke test when run as a script
if __name__ == "__main__":
    print("🚀 Testing Index Caching Dataset Loader...")

    try:
        demo_loader = IndexCachingDatasetLoader(json_path="./note2_debug.json")
        demo_data = demo_loader.load_dataset()

        print(f"\n✅ Loaded {len(demo_data)} items")

        # Show the first item, if any, to eyeball the converted shape.
        if demo_data:
            first = demo_data[0]
            meta = first['metadata']
            print(f"\n📝 Sample Item:")
            print(f"   Command: {first['input']}")
            print(f"   Image path: {meta['image_path']}")
            print(f"   XML path: {meta['xml_path']}")
            print(f"   Expected: {first['output'][:100]}...")
            print(f"   Image base64 length: {len(first['image_base64'])}")
            print(f"   XML content length: {len(meta.get('xml_content', ''))}")

        # Exercise the 2-way split as well.
        train_part, val_part = demo_loader.load_split()
        print(f"\n📊 Dataset Split:")
        print(f"   Training: {len(train_part)} samples")
        print(f"   Validation: {len(val_part)} samples")
        print(f"   Test: Not used (no test set)")

    except Exception as e:
        print(f"❌ Error: {e}")
278
+
src/gepa_optimizer/data/loaders.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data loading utilities for various file formats
3
+ """
4
+
5
+ import json
6
+ import base64
7
+ import pandas as pd
8
+ from typing import Any, Optional, Union, List , Dict
9
+ from pathlib import Path
10
+ import logging
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
class DataLoader:
    """
    Load data from local files in a handful of common formats.

    Supports tabular (.csv, .xlsx), structured (.json, .jsonl), plain text
    (.txt, .md) and image (.png, .jpg, .jpeg) files. Every loader logs and
    returns None on failure rather than raising.
    """

    def __init__(self):
        # Extensions the generic load() entry point can dispatch on.
        self.supported_formats = [
            '.csv', '.json', '.jsonl', '.txt', '.md', '.xlsx',
            '.png', '.jpg', '.jpeg'
        ]

    def load(self, source: Union[str, Path], format_hint: Optional[str] = None) -> Optional[Any]:
        """
        Load data from any supported source.

        Args:
            source: File path or data source
            format_hint: Optional format hint to override auto-detection

        Returns:
            Loaded data or None if failed
        """
        try:
            path = Path(source)
            if not path.exists():
                logger.error(f"File not found: {source}")
                return None

            # Explicit hint wins; otherwise detect from the file extension.
            file_format = format_hint or path.suffix.lower()

            handlers = {
                '.csv': self.load_csv,
                '.json': self.load_json,
                '.jsonl': self.load_jsonl,
                '.txt': self.load_text,
                '.md': self.load_text,
                '.xlsx': self.load_excel,
                '.png': self.load_image_base64,
                '.jpg': self.load_image_base64,
                '.jpeg': self.load_image_base64,
            }
            handler = handlers.get(file_format)
            if handler is None:
                logger.warning(f"Unsupported format: {file_format}")
                return None
            return handler(path)

        except Exception as e:
            logger.error(f"Failed to load data from {source}: {str(e)}")
            return None

    def load_csv(self, path: Union[str, Path]) -> Optional[pd.DataFrame]:
        """Load CSV file as pandas DataFrame"""
        try:
            frame = pd.read_csv(path)
            logger.info(f"Loaded CSV with {len(frame)} rows and {len(frame.columns)} columns")
            return frame
        except Exception as e:
            logger.error(f"Failed to load CSV {path}: {str(e)}")
            return None

    def load_json(self, path: Union[str, Path]) -> Optional[Any]:
        """Load JSON file"""
        try:
            with open(path, 'r', encoding='utf-8') as fh:
                payload = json.load(fh)

            if isinstance(payload, list):
                logger.info(f"Loaded JSON with {len(payload)} items")
            else:
                logger.info("Loaded JSON object")

            return payload
        except Exception as e:
            logger.error(f"Failed to load JSON {path}: {str(e)}")
            return None

    def load_jsonl(self, path: Union[str, Path]) -> Optional[List[Dict]]:
        """Load JSONL (JSON Lines) file; invalid lines are skipped with a warning."""
        try:
            records: List[Dict] = []
            with open(path, 'r', encoding='utf-8') as fh:
                for line_num, raw_line in enumerate(fh, 1):
                    stripped = raw_line.strip()
                    if not stripped:
                        continue
                    try:
                        records.append(json.loads(stripped))
                    except json.JSONDecodeError as e:
                        logger.warning(f"Invalid JSON on line {line_num}: {str(e)}")

            logger.info(f"Loaded JSONL with {len(records)} items")
            return records
        except Exception as e:
            logger.error(f"Failed to load JSONL {path}: {str(e)}")
            return None

    def load_text(self, path: Union[str, Path]) -> Optional[str]:
        """Load plain text file"""
        try:
            with open(path, 'r', encoding='utf-8') as fh:
                text = fh.read()

            logger.info(f"Loaded text file with {len(text)} characters")
            return text
        except Exception as e:
            logger.error(f"Failed to load text {path}: {str(e)}")
            return None

    def load_excel(self, path: Union[str, Path]) -> Optional[pd.DataFrame]:
        """Load Excel file as pandas DataFrame"""
        try:
            frame = pd.read_excel(path)
            logger.info(f"Loaded Excel with {len(frame)} rows and {len(frame.columns)} columns")
            return frame
        except Exception as e:
            logger.error(f"Failed to load Excel {path}: {str(e)}")
            return None

    def load_image_base64(self, path: Union[str, Path]) -> Optional[str]:
        """Load image file and encode as Base64 string"""
        try:
            with open(path, 'rb') as fh:
                encoded = base64.b64encode(fh.read()).decode('utf-8')
            logger.info(f"Loaded image {path} and encoded to Base64")
            return encoded
        except Exception as e:
            logger.error(f"Failed to load image {path}: {str(e)}")
            return None

    def is_supported_format(self, file_path: Union[str, Path]) -> bool:
        """Check if file format is supported"""
        return Path(file_path).suffix.lower() in self.supported_formats

    def get_file_info(self, file_path: Union[str, Path]) -> Dict[str, Any]:
        """Get information about a file"""
        path = Path(file_path)

        if not path.exists():
            return {'exists': False}

        return {
            'exists': True,
            'size': path.stat().st_size,
            'format': path.suffix.lower(),
            'supported': self.is_supported_format(path),
            'name': path.name,
            'stem': path.stem,
            'parent': str(path.parent)
        }

    def load_ui_tree_dataset(self, json_dir: str, screenshots_dir: str) -> List[Dict[str, Any]]:
        """
        Pair JSON UI-tree files with same-named screenshot images.

        Args:
            json_dir: Directory containing JSON files (e.g., "json_tree")
            screenshots_dir: Directory containing screenshot images (e.g., "screenshots")

        Returns:
            List of dictionaries with 'input', 'output', and 'image' keys

        Raises:
            FileNotFoundError: If either directory does not exist
        """
        json_root = Path(json_dir)
        shots_root = Path(screenshots_dir)

        if not json_root.exists():
            raise FileNotFoundError(f"JSON directory not found: {json_dir}")
        if not shots_root.exists():
            raise FileNotFoundError(f"Screenshots directory not found: {screenshots_dir}")

        json_files = list(json_root.glob("*.json"))
        logger.info(f"Found {len(json_files)} JSON files in {json_dir}")

        pairs: List[Dict[str, Any]] = []

        for json_file in json_files:
            stem = json_file.stem

            # Look for a same-named screenshot with any accepted extension.
            image_file = None
            for ext in ['.jpg', '.jpeg', '.png']:
                candidate = shots_root / f"{stem}{ext}"
                if candidate.exists():
                    image_file = candidate
                    break

            if not image_file:
                logger.warning(f"No corresponding image found for {json_file.name}")
                continue

            try:
                tree = self.load_json(json_file)
                if not tree:
                    logger.warning(f"Failed to load JSON: {json_file}")
                    continue

                encoded = self.load_image_base64(image_file)
                if not encoded:
                    logger.warning(f"Failed to load image: {image_file}")
                    continue

                pairs.append({
                    'input': 'Extract UI elements from this screenshot and provide the complete UI tree structure',
                    'output': json.dumps(tree, indent=2),  # serialize tree as string
                    'image': encoded
                })
                logger.debug(f"Loaded pair: {json_file.name} + {image_file.name}")

            except Exception as e:
                logger.error(f"Error loading {json_file.name}: {str(e)}")
                continue

        logger.info(f"Successfully loaded {len(pairs)} image-JSON pairs")
        return pairs
src/gepa_optimizer/data/scroll_dataset_loader.py ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Scroll Element Dataset Loader for Drizz Mobile App Testing
3
+
4
+ Loads screenshots with bounding boxes and commands to identify scroll elements.
5
+ Converts to GEPA-compatible format for prompt optimization.
6
+ """
7
+
8
+ import base64
9
+ import random
10
+ import logging
11
+ from typing import List, Dict, Any, Tuple, Optional
12
+ from pathlib import Path
13
+
14
+ logger = logging.getLogger(__name__)
15
+
16
+
17
+ class ScrollDatasetLoader:
18
+ """
19
+ GENERIC dataset loader for image-based tasks.
20
+
21
+ This is a LIBRARY class - NO hardcoded assumptions about:
22
+ - What the task is (OCR, element detection, classification, etc.)
23
+ - Input format (questions, commands, descriptions, etc.)
24
+ - Output format (IDs, text, JSON, etc.)
25
+
26
+ Users define their dataset in the test script and pass it here.
27
+
28
+ Dataset format per item: (image_filename, input_text, expected_output)
29
+
30
+ Example usage (ANY task):
31
+ # Define YOUR dataset in YOUR test script
32
+ my_dataset = [
33
+ ("img1.png", "What is the main color?", "blue"),
34
+ ("img2.png", "Count the objects", "5"),
35
+ ("img3.png", "Describe the scene", "A cat on a sofa"),
36
+ ]
37
+
38
+ # Pass to loader
39
+ loader = ScrollDatasetLoader(
40
+ images_dir="images",
41
+ dataset_config=my_dataset
42
+ )
43
+ data = loader.load_dataset()
44
+ """
45
+
46
def __init__(
    self,
    images_dir: str = "images",
    dataset_config: Optional[List[Tuple[str, str, str]]] = None
):
    """
    Set up the loader with an image directory and a user-supplied dataset spec.

    Args:
        images_dir: Directory containing images
        dataset_config: List of (image_filename, input_text, expected_output)
            tuples. REQUIRED — the library ships no defaults to stay generic.

    Raises:
        FileNotFoundError: If images_dir doesn't exist
        ValueError: If dataset_config is None
    """
    self.images_dir = Path(images_dir)

    if not self.images_dir.exists():
        raise FileNotFoundError(f"Images directory not found: {images_dir}")

    if dataset_config is None:
        raise ValueError(
            "dataset_config is required. This is a library class - define your "
            "dataset in the test script:\n"
            "   dataset = [('img1.png', 'your input', 'expected output'), ...]\n"
            "   loader = ScrollDatasetLoader(images_dir='...', dataset_config=dataset)"
        )

    self.dataset_config = dataset_config
77
+
78
def load_dataset(self) -> List[Dict[str, Any]]:
    """
    Build the GEPA-format dataset from the configured (image, input, output) triples.

    Missing or unreadable images are skipped with a warning. Each produced item
    carries the base64 image at the TOP level (so UniversalConverter can find it)
    plus a metadata dict including the element_id extracted from the expected
    output (int, or None when extraction fails).

    Returns:
        List of GEPA-format items:
        {"input": ..., "output": ..., "image_base64": ..., "metadata": {...}}

    Raises:
        ValueError: If no configured image could be loaded
    """
    items: List[Dict[str, Any]] = []

    for image_filename, input_text, expected_output in self.dataset_config:
        image_path = self.images_dir / image_filename

        if not image_path.exists():
            logger.warning(f"Image not found: {image_path}")
            continue

        try:
            encoded = self._encode_image(image_path)
        except Exception as e:
            logger.warning(f"Error encoding {image_filename}: {e}")
            continue

        # Extract element_id for robust downstream evaluation (may be None).
        element_id = self._extract_element_id(expected_output)
        if element_id is None:
            logger.warning(f"Could not extract element_id from '{expected_output}' in {image_filename}")

        # Completely generic item: the library makes no assumptions about
        # what the task is or what the input/output text formats mean.
        items.append({
            "input": input_text,
            "output": expected_output,
            "image_base64": encoded,  # TOP LEVEL for the converter
            "metadata": {
                "image_path": str(image_path),
                "input_text": input_text,
                "expected_output": expected_output,
                "image_filename": image_filename,
                "element_id": element_id,
            },
        })

    if not items:
        raise ValueError("No valid images found in dataset")

    logger.info(f"Loaded {len(items)} scroll element detection samples")
    return items
150
+
151
def _extract_element_id(self, expected_output: str) -> Optional[int]:
    """
    Pull a numeric element ID out of an expected-output string.

    Accepted forms: "Element: 4", "Element 4", a bare number, or a full
    reasoning string like "Element: 4, Description: ...". Only IDs in the
    range 1-100 (plausible UI element IDs) are accepted.

    Args:
        expected_output: Full expected output string with reasoning

    Returns:
        Element ID as an int, or None if no acceptable ID is found
    """
    import re

    if not expected_output:
        return None

    # Prefer explicit "Element ..." references (case insensitive).
    explicit_patterns = (
        r'element[:\s]+(\d+)',   # "Element: 4" or "Element 4"
        r'\belement\s+(\d+)\b',  # "element 4" with word boundaries
    )
    for pattern in explicit_patterns:
        hit = re.search(pattern, expected_output, re.IGNORECASE)
        if hit is None:
            continue
        try:
            candidate = int(hit.group(1))
        except (ValueError, IndexError):
            continue
        if 1 <= candidate <= 100:
            return candidate

    # Fallback: first standalone 1-3 digit number in an acceptable range.
    fallback = re.search(r'\b(\d{1,3})\b', expected_output)
    if fallback is not None:
        try:
            candidate = int(fallback.group(1))
        except ValueError:
            candidate = None
        if candidate is not None and 1 <= candidate <= 100:
            return candidate

    return None
201
+
202
def _encode_image(self, image_path: Path) -> str:
    """
    Return the file at *image_path* encoded as a base64 ASCII string.

    Args:
        image_path: Path to image file

    Returns:
        Base64 encoded image string
    """
    raw = image_path.read_bytes()
    return base64.b64encode(raw).decode('utf-8')
215
+
216
+ def split_dataset(
217
+ self,
218
+ dataset: List[Dict[str, Any]],
219
+ train_size: int = 4,
220
+ val_size: int = 1,
221
+ test_size: int = 1,
222
+ shuffle: bool = True,
223
+ seed: Optional[int] = None
224
+ ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
225
+ """
226
+ Split dataset into train, validation, and test sets.
227
+
228
+ 🔥 NEW: Added shuffling support to ensure different image distribution
229
+ across splits, preventing hard images from always landing in validation set.
230
+
231
+ Args:
232
+ dataset: Complete dataset
233
+ train_size: Number of samples for training (default: 4)
234
+ val_size: Number of samples for validation (default: 1)
235
+ test_size: Number of samples for test (default: 1)
236
+ shuffle: Whether to shuffle dataset before splitting (default: True)
237
+ seed: Random seed for reproducible shuffling (default: None = random)
238
+
239
+ Returns:
240
+ Tuple of (train_set, val_set, test_set)
241
+ """
242
+ n = len(dataset)
243
+
244
+ # Validate split sizes
245
+ total_size = train_size + val_size + test_size
246
+ if total_size > n:
247
+ logger.warning(f"Requested split ({total_size}) exceeds dataset size ({n}). Adjusting split proportionally...")
248
+ ratio = n / total_size
249
+ train_size = int(train_size * ratio)
250
+ val_size = int(val_size * ratio)
251
+ test_size = n - train_size - val_size
252
+
253
+ # 🔥 CRITICAL: Shuffle dataset to ensure different image distribution
254
+ # This prevents the same hard images from always being in validation set
255
+ dataset_copy = dataset.copy() # Don't modify original
256
+ if shuffle:
257
+ if seed is not None:
258
+ random.seed(seed)
259
+ logger.debug(f"Shuffling dataset with seed={seed} for reproducible splits")
260
+ else:
261
+ logger.debug(f"Shuffling dataset randomly (no seed)")
262
+ random.shuffle(dataset_copy)
263
+ else:
264
+ logger.warning(f"Not shuffling dataset - using original order")
265
+
266
+ # Split shuffled dataset
267
+ train_set = dataset_copy[:train_size]
268
+ val_set = dataset_copy[train_size:train_size + val_size]
269
+ test_set = dataset_copy[train_size + val_size:train_size + val_size + test_size]
270
+
271
+ logger.info(f"Dataset split: {len(train_set)} train, {len(val_set)} val, {len(test_set)} test")
272
+
273
+ # Log which images are in each split for debugging
274
+ if shuffle:
275
+ train_images = [item['metadata'].get('image_filename', 'N/A') for item in train_set]
276
+ val_images = [item['metadata'].get('image_filename', 'N/A') for item in val_set]
277
+ test_images = [item['metadata'].get('image_filename', 'N/A') for item in test_set]
278
+ print(f" Train images: {train_images[:5]}{'...' if len(train_images) > 5 else ''}")
279
+ print(f" Val images: {val_images}")
280
+ print(f" Test images: {test_images[:5]}{'...' if len(test_images) > 5 else ''}")
281
+
282
+ return train_set, val_set, test_set
283
+
284
+
285
def load_scroll_dataset(
    images_dir: str = "images",
    dataset_config: List[Tuple[str, str, str]] = None,
    split: bool = True
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Convenience wrapper to load an image-based dataset (task-agnostic).

    Args:
        images_dir: Directory containing images
        dataset_config: List of (image_filename, input_text, expected_output) tuples
        split: Whether to split into train/val/test

    Returns:
        If split=True: (train_set, val_set, test_set)
        If split=False: (full_dataset, [], [])

    Example (works for ANY task):
        dataset_config = [
            ("img1.png", "What color is the sky?", "blue"),
            ("img2.png", "Count the dogs", "2"),
        ]
        train, val, test = load_scroll_dataset(
            images_dir="images",
            dataset_config=dataset_config
        )
    """
    loader = ScrollDatasetLoader(images_dir, dataset_config=dataset_config)
    full_dataset = loader.load_dataset()
    if not split:
        return full_dataset, [], []
    return loader.split_dataset(full_dataset)
319
+
320
+
321
# Example usage (for testing the library loader itself)
if __name__ == "__main__":
    # This module is a library: the actual dataset definition lives in the
    # caller's test script, so we only print a short usage example here.
    usage_lines = (
        "🚀 Testing Scroll Dataset Loader...",
        "⚠️ NOTE: This is a library class. Define your dataset in your test script.",
        "\nExample:",
        " dataset_config = [",
        " ('image1.png', 'Scroll down by 50%', '3'),",
        " ('image2.png', 'Swipe left', '4'),",
        " ]",
        " train, val, test = load_scroll_dataset(",
        " images_dir='images',",
        " dataset_config=dataset_config",
        " )",
    )
    for usage_line in usage_lines:
        print(usage_line)
+
src/gepa_optimizer/data/validation_dataset_loader.py ADDED
@@ -0,0 +1,376 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Validation Dataset Loader for UI Validation Use Case
3
+
4
+ Loads validation datapoints from SQLite database and converts to GEPA-compatible format.
5
+ Supports filtering by data_type (trainset/valset/testset) and confirmed status.
6
+ """
7
+
8
+ import os
9
+ import sqlite3
10
+ import base64
11
+ import logging
12
+ from typing import List, Dict, Any, Optional, Literal
13
+ from pathlib import Path
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
class ValidationDatasetLoader:
    """
    Loads validation dataset from SQLite database.

    Database schema:
    - validation_data: id, image_id, command, result (0/1), reasoning, data_type, confirmed, created_at
    - images: image_id, mime, bytes (BLOB), created_at

    Converts to GEPA format:
    - input: command text (seed prompt will be provided in test script)
    - output: "true" or "false" (converted from 0/1)
    - image_base64: base64 encoded image (TOP LEVEL for UniversalConverter)
    - metadata: All original fields plus converted values

    Note: The seed prompt is NOT stored in database - it will be provided in the test script.
    The input field contains just the command, and the image is at top level.
    """

    def __init__(
        self,
        db_path: Optional[str] = None,
        confirmed_only: bool = True
    ):
        """
        Initialize validation dataset loader.

        Args:
            db_path: Path to SQLite database file.
                     Default: "./validation_data.db" or from VD_DB_PATH env var
            confirmed_only: If True, only load datapoints where confirmed=1.
                           Default: True (only manually reviewed data)

        Raises:
            FileNotFoundError: If database file doesn't exist
            sqlite3.Error: If database connection fails
        """
        # Get database path from env or use default
        if db_path is None:
            db_path = os.getenv("VD_DB_PATH", "./validation_data.db")

        # Resolve to an absolute path so error messages are unambiguous.
        self.db_path = Path(db_path).resolve()

        # Fail fast: sqlite3.connect() would otherwise silently create an
        # empty database file on first use, masking a misconfigured path.
        if not self.db_path.exists():
            raise FileNotFoundError(
                f"Database file not found: {self.db_path}\n"
                f"Make sure validation_data_ui_server_async.py has been run at least once to create the database."
            )

        self.confirmed_only = confirmed_only

    def load_dataset(
        self,
        data_type: Optional[Literal["trainset", "valset", "testset"]] = None,
        confirmed_only: Optional[bool] = None
    ) -> List[Dict[str, Any]]:
        """
        Load dataset from database and convert to GEPA format.

        Args:
            data_type: Filter by data_type. If None, loads all types.
                      Options: "trainset", "valset", "testset"
            confirmed_only: Override instance default. If True, only load confirmed datapoints.
                           If None, uses instance default (self.confirmed_only)

        Returns:
            List of dataset items in GEPA format:
            [
                {
                    "input": "Validate Submit button is visible",  # Command only (seed prompt in test script)
                    "output": "true",  # or "false" (converted from 0/1)
                    "image_base64": "<base64_encoded_image>",  # TOP LEVEL (image + command together)
                    "metadata": {
                        "id": 1,
                        "image_id": "abc123...",
                        "command": "Validate Submit button is visible",
                        "result": True,  # Boolean
                        "result_int": 1,  # Original 0/1
                        "reasoning": "Detailed explanation...",
                        "data_type": "trainset",
                        "confirmed": True,
                        "created_at": "2024-01-01 12:00:00"
                    }
                },
                ...
            ]

        Note: Seed prompt is provided separately in test script, not in database.

        Raises:
            sqlite3.Error: If database query fails
            ValueError: If no datapoints found matching criteria
        """
        # Use provided confirmed_only or instance default
        use_confirmed = confirmed_only if confirmed_only is not None else self.confirmed_only

        conn = sqlite3.connect(str(self.db_path))
        conn.row_factory = sqlite3.Row  # Access columns by name
        dataset = []

        try:
            # Build query with filters.
            # "WHERE 1=1" lets the optional filters below append with "AND"
            # unconditionally; values go through parameter binding (no SQL injection).
            query = """
                SELECT
                    v.id,
                    v.image_id,
                    v.command,
                    v.result,
                    v.reasoning,
                    v.data_type,
                    v.confirmed,
                    v.created_at,
                    i.mime,
                    i.bytes
                FROM validation_data v
                INNER JOIN images i ON v.image_id = i.image_id
                WHERE 1=1
            """
            params = []

            # Add filters
            if use_confirmed:
                query += " AND v.confirmed = 1"

            if data_type:
                query += " AND v.data_type = ?"
                params.append(data_type)

            query += " ORDER BY v.id ASC"

            # Execute query
            cursor = conn.execute(query, params)
            rows = cursor.fetchall()

            # An empty result is treated as a configuration error: the caller
            # expects data, so describe exactly which filters produced nothing.
            if not rows:
                filter_msg = []
                if use_confirmed:
                    filter_msg.append("confirmed=1")
                if data_type:
                    filter_msg.append(f"data_type='{data_type}'")

                filter_str = " with filters: " + ", ".join(filter_msg) if filter_msg else ""
                raise ValueError(
                    f"No datapoints found{filter_str} in database: {self.db_path}\n"
                    f"Make sure you have generated and saved datapoints using the validation UI."
                )

            # Convert rows to GEPA format
            for row in rows:
                # Convert 0/1 to "true"/"false" string for GEPA
                result_str = "true" if row["result"] == 1 else "false"

                # Encode image bytes to base64
                image_base64 = base64.b64encode(row["bytes"]).decode("utf-8")

                # Create GEPA format item
                # Input: command (seed prompt will be provided in test script)
                # Image: separate at top level (image_base64)
                # Output: "true" or "false" (converted from 0/1)
                dataset_item = {
                    "input": row["command"],  # Just the command - seed prompt will be in test script
                    "output": result_str,  # "true" or "false" (string)
                    "image_base64": image_base64,  # TOP LEVEL for UniversalConverter (image + command together)
                    "metadata": {
                        "id": row["id"],
                        "image_id": row["image_id"],
                        "command": row["command"],  # Keep original for reference
                        "result": bool(row["result"]),  # Boolean for reference
                        "result_int": row["result"],  # Original 0/1 for reference
                        "reasoning": row["reasoning"],
                        "data_type": row["data_type"],
                        "confirmed": bool(row["confirmed"]),
                        "created_at": row["created_at"],
                        "mime": row["mime"],
                    }
                }

                dataset.append(dataset_item)

            # Log summary
            data_type_str = f" ({data_type})" if data_type else ""
            confirmed_str = " (confirmed only)" if use_confirmed else " (all)"
            logger.info(f"Loaded {len(dataset)} validation datapoints{data_type_str}{confirmed_str}")

            return dataset

        finally:
            # Always release the connection, even when the query raises.
            conn.close()

    def load_split_dataset(
        self,
        confirmed_only: Optional[bool] = None
    ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
        """
        Load dataset split by data_type (trainset/valset/testset).

        Convenience method that loads all three splits at once.
        Note: each call to load_dataset() opens its own DB connection, and
        each split raises ValueError if it is empty (see load_dataset).

        Args:
            confirmed_only: Override instance default. If True, only load confirmed datapoints.

        Returns:
            Tuple of (train_set, val_set, test_set) in GEPA format

        Example:
            loader = ValidationDatasetLoader(db_path="./validation_data.db")
            train, val, test = loader.load_split_dataset()
        """
        train_set = self.load_dataset(data_type="trainset", confirmed_only=confirmed_only)
        val_set = self.load_dataset(data_type="valset", confirmed_only=confirmed_only)
        test_set = self.load_dataset(data_type="testset", confirmed_only=confirmed_only)

        logger.info(f"Dataset Split Summary: Training={len(train_set)}, Validation={len(val_set)}, Test={len(test_set)}, Total={len(train_set) + len(val_set) + len(test_set)}")

        return train_set, val_set, test_set

    def get_dataset_stats(self) -> Dict[str, Any]:
        """
        Get statistics about the dataset in the database.

        Returns:
            Dictionary with dataset statistics:
            {
                "total": 100,
                "confirmed": 95,
                "unconfirmed": 5,
                "by_data_type": {
                    "trainset": 70,
                    "valset": 15,
                    "testset": 15
                },
                "by_result": {
                    "true": 50,
                    "false": 50
                }
            }
        """
        conn = sqlite3.connect(str(self.db_path))
        conn.row_factory = sqlite3.Row

        try:
            stats = {}

            # Total counts
            total = conn.execute("SELECT COUNT(*) FROM validation_data").fetchone()[0]
            confirmed = conn.execute("SELECT COUNT(*) FROM validation_data WHERE confirmed = 1").fetchone()[0]
            stats["total"] = total
            stats["confirmed"] = confirmed
            stats["unconfirmed"] = total - confirmed

            # By data_type
            data_type_rows = conn.execute("""
                SELECT data_type, COUNT(*) as count
                FROM validation_data
                GROUP BY data_type
            """).fetchall()
            stats["by_data_type"] = {row["data_type"]: row["count"] for row in data_type_rows}

            # By result (true/false)
            result_rows = conn.execute("""
                SELECT result, COUNT(*) as count
                FROM validation_data
                GROUP BY result
            """).fetchall()
            stats["by_result"] = {
                "true": sum(row["count"] for row in result_rows if row["result"] == 1),
                "false": sum(row["count"] for row in result_rows if row["result"] == 0)
            }

            return stats

        finally:
            # Always release the connection, even if a query raised.
            conn.close()
290
+
291
+
292
def load_validation_dataset(
    db_path: Optional[str] = None,
    data_type: Optional[Literal["trainset", "valset", "testset"]] = None,
    confirmed_only: bool = True
) -> List[Dict[str, Any]]:
    """
    Convenience function to load the validation dataset.

    Args:
        db_path: Path to SQLite database file. Default: "./validation_data.db"
        data_type: Filter by data_type. If None, loads all types.
        confirmed_only: If True, only load confirmed datapoints.

    Returns:
        List of dataset items in GEPA format

    Example:
        # Load all confirmed training data
        train_data = load_validation_dataset(data_type="trainset", confirmed_only=True)

        # Load all confirmed data
        all_data = load_validation_dataset(confirmed_only=True)
    """
    # Build the loader, then delegate straight to it.
    return ValidationDatasetLoader(
        db_path=db_path,
        confirmed_only=confirmed_only,
    ).load_dataset(data_type=data_type, confirmed_only=confirmed_only)
317
+
318
+
319
def load_validation_split(
    db_path: Optional[str] = None,
    confirmed_only: bool = True
) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Convenience function to load the validation dataset split by data_type.

    Args:
        db_path: Path to SQLite database file. Default: "./validation_data.db"
        confirmed_only: If True, only load confirmed datapoints.

    Returns:
        Tuple of (train_set, val_set, test_set) in GEPA format

    Example:
        train, val, test = load_validation_split(confirmed_only=True)
    """
    # Build the loader, then delegate straight to it.
    return ValidationDatasetLoader(
        db_path=db_path,
        confirmed_only=confirmed_only,
    ).load_split_dataset(confirmed_only=confirmed_only)
338
+
339
+
340
# Example usage and testing
# Smoke-tests the loader against a real local database; requires the DB file
# created by validation_data_ui_server_async.py to exist and contain data.
if __name__ == "__main__":
    print("🚀 Testing Validation Dataset Loader...")

    try:
        # Uses default db_path ("./validation_data.db" or VD_DB_PATH env var).
        loader = ValidationDatasetLoader()

        # Get stats
        print("\n📊 Dataset Statistics:")
        stats = loader.get_dataset_stats()
        print(f" Total: {stats['total']}")
        print(f" Confirmed: {stats['confirmed']}")
        print(f" Unconfirmed: {stats['unconfirmed']}")
        print(f" By data_type: {stats['by_data_type']}")
        print(f" By result: {stats['by_result']}")

        # Load split dataset
        print("\n📦 Loading split dataset...")
        train, val, test = loader.load_split_dataset()

        # Show sample
        if train:
            sample = train[0]
            print(f"\n📝 Sample Training Item:")
            print(f" Input: {sample['input']}")
            print(f" Output: {sample['output']}")
            print(f" Image ID: {sample['metadata']['image_id'][:8]}...")
            print(f" Data Type: {sample['metadata']['data_type']}")
            print(f" Result: {sample['metadata']['result']} (int: {sample['metadata']['result_int']})")

    # Expected failure modes: the DB file doesn't exist yet, or a split is
    # empty (load_dataset raises ValueError). Print guidance instead of a
    # traceback for both.
    except FileNotFoundError as e:
        print(f"❌ {e}")
        print("\n💡 Make sure validation_data_ui_server_async.py has been run to create the database.")
    except ValueError as e:
        print(f"❌ {e}")
        print("\n💡 Generate and save some datapoints using the validation UI first.")
376
+
src/gepa_optimizer/data/validators.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data validation utilities for GEPA optimizer
3
+ """
4
+
5
+ from typing import List, Dict, Any, Optional, Tuple
6
+ import logging
7
+
8
+ logger = logging.getLogger(__name__)
9
+
10
class DataValidator:
    """
    Validates datasets for completeness and GEPA compatibility.

    A dataset is a list of dicts, each with a required 'input' string and an
    (expected) 'output' string, plus optional 'metadata', 'id', and 'tags'.
    Validation methods collect errors and return (is_valid, errors) tuples
    instead of raising, so callers can report every problem at once.
    """

    def __init__(self):
        # Fields every item should carry; 'output' may be empty but is
        # expected for supervised optimization.
        self.required_fields = ['input', 'output']
        self.optional_fields = ['metadata', 'id', 'tags']

    def validate_dataset(self, dataset: List[Dict[str, Any]]) -> Tuple[bool, List[str]]:
        """
        Validate entire dataset.

        Args:
            dataset: List of data items to validate

        Returns:
            Tuple[bool, List[str]]: (is_valid, list_of_errors)
        """
        errors: List[str] = []

        # FIX: check the container type before emptiness, so an empty
        # non-list (e.g. {} or ()) is reported as the wrong type rather
        # than as an empty dataset.
        if not isinstance(dataset, list):
            errors.append("Dataset must be a list")
            return False, errors

        if not dataset:
            errors.append("Dataset is empty")
            return False, errors

        # Validate each item, collecting (not short-circuiting) errors.
        for idx, item in enumerate(dataset):
            errors.extend(self.validate_item(item, idx))

        # Need at least two items to produce a meaningful train/val split.
        if len(dataset) < 2:
            errors.append("Dataset should have at least 2 items for proper train/val split")

        # Log validation results
        if errors:
            logger.warning(f"Dataset validation failed with {len(errors)} errors")
        else:
            logger.info(f"Dataset validation passed for {len(dataset)} items")

        return len(errors) == 0, errors

    def validate_item(self, item: Dict[str, Any], index: Optional[int] = None) -> List[str]:
        """
        Validate a single dataset item.

        Args:
            item: Single data item to validate
            index: Optional item index for error reporting

        Returns:
            List[str]: List of validation errors (empty if the item is valid)
        """
        errors: List[str] = []
        item_ref = f"item {index}" if index is not None else "item"

        # A non-dict item cannot be inspected further; stop here.
        if not isinstance(item, dict):
            errors.append(f"{item_ref}: Must be a dictionary")
            return errors

        # 'input' is required and must be a non-empty string.
        if 'input' not in item:
            errors.append(f"{item_ref}: Missing required 'input' field")
        elif not isinstance(item['input'], str):
            errors.append(f"{item_ref}: 'input' field must be a string")
        elif not item['input'].strip():
            errors.append(f"{item_ref}: 'input' field cannot be empty")

        # 'output' may be absent or empty (unsupervised items), but when
        # present it must be a string.
        if 'output' in item:
            if not isinstance(item['output'], str):
                errors.append(f"{item_ref}: 'output' field must be a string")

        # Validate metadata if present
        if 'metadata' in item and not isinstance(item['metadata'], dict):
            errors.append(f"{item_ref}: 'metadata' field must be a dictionary")

        return errors

    def validate_gepa_format(self, gepa_data: List[Dict[str, Any]]) -> Tuple[bool, List[str]]:
        """
        Validate data already converted to GEPA format.

        GEPA items require 'input', 'expected_output', and a dict 'metadata'.

        Args:
            gepa_data: Data in GEPA format

        Returns:
            Tuple[bool, List[str]]: (is_valid, list_of_errors)
        """
        errors: List[str] = []

        if not gepa_data:
            errors.append("GEPA dataset is empty")
            return False, errors

        for idx, item in enumerate(gepa_data):
            if 'input' not in item:
                errors.append(f"GEPA item {idx}: Missing 'input' field")

            if 'expected_output' not in item:
                errors.append(f"GEPA item {idx}: Missing 'expected_output' field")

            if 'metadata' not in item:
                errors.append(f"GEPA item {idx}: Missing 'metadata' field")
            elif not isinstance(item['metadata'], dict):
                errors.append(f"GEPA item {idx}: 'metadata' must be a dictionary")

        return len(errors) == 0, errors

    def validate_split(self, trainset: List[Dict], valset: List[Dict]) -> Tuple[bool, List[str]]:
        """
        Validate a train/validation split.

        Both sets must be non-empty, and the training fraction should fall
        between 50% and 95% of the combined data.

        Args:
            trainset: Training data
            valset: Validation data

        Returns:
            Tuple[bool, List[str]]: (is_valid, list_of_errors)
        """
        errors: List[str] = []

        if not trainset:
            errors.append("Training set is empty")

        if not valset:
            errors.append("Validation set is empty")

        # Check proportions (skip when both sets are empty to avoid /0).
        total_size = len(trainset) + len(valset)
        if total_size > 0:
            train_ratio = len(trainset) / total_size
            if train_ratio < 0.5:
                errors.append(f"Training set too small: {train_ratio:.2%} of total data")
            elif train_ratio > 0.95:
                errors.append(f"Validation set too small: {1-train_ratio:.2%} of total data")

        return len(errors) == 0, errors

    def get_dataset_stats(self, dataset: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Get statistics about the dataset.

        Args:
            dataset: Dataset to analyze

        Returns:
            Dict[str, Any]: Counts and average input/output lengths, plus a
            'valid' heuristic flag (non-empty and <50% empty inputs).
        """
        if not dataset:
            return {'total_items': 0, 'valid': False}

        stats = {
            'total_items': len(dataset),
            'has_output': sum(1 for item in dataset if item.get('output')),
            'avg_input_length': 0,
            'avg_output_length': 0,
            'empty_inputs': 0,
            'empty_outputs': 0
        }

        input_lengths = []
        output_lengths = []

        # Non-dict items and non-string fields are skipped rather than
        # counted, mirroring validate_item's tolerance.
        for item in dataset:
            if isinstance(item, dict):
                input_text = item.get('input', '')
                output_text = item.get('output', '')

                if isinstance(input_text, str):
                    input_lengths.append(len(input_text))
                    if not input_text.strip():
                        stats['empty_inputs'] += 1

                if isinstance(output_text, str):
                    output_lengths.append(len(output_text))
                    if not output_text.strip():
                        stats['empty_outputs'] += 1

        if input_lengths:
            stats['avg_input_length'] = sum(input_lengths) / len(input_lengths)

        if output_lengths:
            stats['avg_output_length'] = sum(output_lengths) / len(output_lengths)

        # Determine if dataset looks valid: non-empty, and less than 50%
        # of items have blank inputs.
        stats['valid'] = (
            stats['total_items'] > 0 and
            stats['empty_inputs'] < stats['total_items'] * 0.5
        )

        return stats
src/gepa_optimizer/evaluation/__init__.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Evaluation module for GEPA Optimizer
3
+
4
+ Includes:
5
+ - UniversalSemanticEvaluator: Works for ANY task (recommended for general use)
6
+ - BaseEvaluator: Abstract base class for custom evaluators
7
+ - Task-specific evaluators for specialized use cases
8
+ """
9
+
10
+ from .base_evaluator import BaseEvaluator
11
+ from .universal_evaluator import UniversalSemanticEvaluator, create_universal_evaluator
12
+ from .ui_evaluator import UITreeEvaluator
13
+ from .scroll_evaluator import ScrollElementEvaluator
14
+ from .validation_evaluator import ValidationEvaluator
15
+ from .index_caching_evaluator import IndexCachingEvaluator
16
+
17
+ __all__ = [
18
+ # Universal (recommended)
19
+ "UniversalSemanticEvaluator",
20
+ "create_universal_evaluator",
21
+ # Base class
22
+ "BaseEvaluator",
23
+ # Task-specific
24
+ "UITreeEvaluator",
25
+ "ScrollElementEvaluator",
26
+ "ValidationEvaluator",
27
+ "IndexCachingEvaluator",
28
+ ]
src/gepa_optimizer/evaluation/base_evaluator.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Base evaluator class for all evaluation strategies.
3
+ """
4
+
5
+ from abc import ABC, abstractmethod
6
+ from typing import Any, Dict, Optional
7
+ import logging
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
class BaseEvaluator(ABC):
    """
    Abstract base class for all evaluation strategies.

    Enforces a consistent interface while leaving the actual evaluation
    logic fully customizable for any use case.
    """

    def __init__(self, metric_weights: Optional[Dict[str, float]] = None):
        """
        Initialize evaluator with optional metric weights.

        Args:
            metric_weights: Optional weights for different metrics.
                If None, subclasses should provide defaults.
        """
        self.metric_weights = metric_weights or {}
        logger_name = f"{__name__}.{self.__class__.__name__}"
        self.logger = logging.getLogger(logger_name)

    @abstractmethod
    def evaluate(self, predicted: Any, expected: Any) -> Dict[str, float]:
        """
        Evaluate predicted output against expected output.

        Args:
            predicted: The model's predicted output
            expected: The ground truth expected output

        Returns:
            Dictionary mapping metric names to scores. Must include a
            'composite_score' key for GEPA integration.
        """
        ...

    def validate_weights(self) -> bool:
        """Check that the configured metric weights sum to approximately 1.0."""
        if not self.metric_weights:
            return True

        # Allow small floating point errors around 1.0.
        deviation = abs(sum(self.metric_weights.values()) - 1.0)
        return deviation < 0.01
src/gepa_optimizer/evaluation/index_caching_evaluator.py ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Index Caching Evaluator for Index-Based Element Selection Use Case
3
+
4
+ Evaluates predicted index caching results against expected results.
5
+ Compares all 5 fields with equal weight:
6
+ - is_index_based
7
+ - index_value
8
+ - parent_element_id
9
+ - element_id_of_nth_child_of_parent
10
+ - selected_element_is_correct
11
+ """
12
+
13
+ from typing import Dict, Any, Optional
14
+ import json
15
+ import re
16
+ import logging
17
+
18
+ from .base_evaluator import BaseEvaluator
19
+
20
+
21
+ class IndexCachingEvaluator(BaseEvaluator):
22
+ """
23
+ Evaluator for index caching use case.
24
+
25
+ Features:
26
+ - Compares all 5 fields with equal weight (20% each)
27
+ - Parses JSON from LLM response
28
+ - Handles null values correctly
29
+ - Returns detailed field-by-field comparison
30
+ """
31
+
32
+ def __init__(self, metric_weights: Optional[Dict[str, float]] = None):
33
+ """
34
+ Initialize index caching evaluator.
35
+
36
+ Args:
37
+ metric_weights: Weights for evaluation metrics
38
+ Default: Equal weight for all 5 fields (0.2 each)
39
+ """
40
+ # Each field gets 20% weight (5 fields * 0.2 = 1.0)
41
+ default_weights = {
42
+ "is_index_based_match": 0.2,
43
+ "index_value_match": 0.2,
44
+ "parent_element_id_match": 0.2,
45
+ "element_id_of_nth_child_match": 0.2,
46
+ "selected_element_correct_match": 0.2,
47
+ }
48
+
49
+ weights = metric_weights or default_weights
50
+ super().__init__(metric_weights=weights)
51
+
52
+ def evaluate(self, predicted: str, expected: str) -> Dict[str, float]:
53
+ """
54
+ Evaluate predicted index caching result against expected result.
55
+
56
+ Args:
57
+ predicted: LLM's output (JSON string with all 5 fields)
58
+ expected: Expected output (JSON string or dict with all 5 fields)
59
+
60
+ Returns:
61
+ Dictionary with evaluation metrics:
62
+ {
63
+ "is_index_based_match": 1.0 or 0.0,
64
+ "index_value_match": 1.0 or 0.0,
65
+ "parent_element_id_match": 1.0 or 0.0,
66
+ "element_id_of_nth_child_match": 1.0 or 0.0,
67
+ "selected_element_correct_match": 1.0 or 0.0,
68
+ "composite_score": 0.0 to 1.0,
69
+ "predicted_output": str,
70
+ "expected_output": str,
71
+ "field_scores": {...},
72
+ "evaluation_reason": str
73
+ }
74
+ """
75
+ if not predicted or not expected:
76
+ return {
77
+ "is_index_based_match": 0.0,
78
+ "index_value_match": 0.0,
79
+ "parent_element_id_match": 0.0,
80
+ "element_id_of_nth_child_match": 0.0,
81
+ "selected_element_correct_match": 0.0,
82
+ "composite_score": 0.0,
83
+ "predicted_output": str(predicted).strip() if predicted else "",
84
+ "expected_output": str(expected).strip() if expected else "",
85
+ "field_scores": {},
86
+ "evaluation_reason": "❌ Empty or missing input/output"
87
+ }
88
+
89
+ # Parse expected (could be JSON string or dict)
90
+ try:
91
+ if isinstance(expected, str):
92
+ expected_dict = json.loads(expected)
93
+ else:
94
+ expected_dict = expected
95
+ except (json.JSONDecodeError, TypeError):
96
+ # If expected is already a dict from dataset
97
+ expected_dict = expected if isinstance(expected, dict) else {}
98
+
99
+ # Parse predicted (must be JSON string)
100
+ try:
101
+ predicted_dict = self._parse_json_response(predicted)
102
+ except Exception as e:
103
+ # Log the actual response for debugging
104
+ response_preview = predicted[:200] if predicted else "(empty)"
105
+ self.logger.warning(f"Failed to parse predicted JSON: {e}")
106
+ self.logger.warning(f"Response preview: {response_preview}...")
107
+ predicted_dict = {}
108
+
109
+ # NOTE: "notes" field is present in the output but is NOT used for scoring or reflection
110
+ # It's kept for reference but ignored in evaluation
111
+
112
+ # Compare each field (only the 5 core fields, ignoring "notes")
113
+ field_scores = {}
114
+ field_reasons = []
115
+
116
+ # 1. is_index_based (boolean)
117
+ pred_is_index = predicted_dict.get("is_index_based")
118
+ exp_is_index = expected_dict.get("is_index_based")
119
+ is_index_match = (pred_is_index == exp_is_index) if (pred_is_index is not None and exp_is_index is not None) else False
120
+ field_scores["is_index_based"] = 1.0 if is_index_match else 0.0
121
+ field_reasons.append(f"is_index_based: {pred_is_index} vs {exp_is_index} → {'✅' if is_index_match else '❌'}")
122
+
123
+ # 2. index_value (int or null)
124
+ pred_index_val = predicted_dict.get("index_value")
125
+ exp_index_val = expected_dict.get("index_value")
126
+ # Handle null/None comparison
127
+ index_val_match = (pred_index_val == exp_index_val) or (pred_index_val is None and exp_index_val is None)
128
+ field_scores["index_value"] = 1.0 if index_val_match else 0.0
129
+ field_reasons.append(f"index_value: {pred_index_val} vs {exp_index_val} → {'✅' if index_val_match else '❌'}")
130
+
131
+ # 3. parent_element_id (string or null)
132
+ pred_parent = predicted_dict.get("parent_element_id")
133
+ exp_parent = expected_dict.get("parent_element_id")
134
+ # Handle null/None comparison
135
+ parent_match = (pred_parent == exp_parent) or (pred_parent is None and exp_parent is None)
136
+ field_scores["parent_element_id"] = 1.0 if parent_match else 0.0
137
+ field_reasons.append(f"parent_element_id: {pred_parent} vs {exp_parent} → {'✅' if parent_match else '❌'}")
138
+
139
+ # 4. element_id_of_nth_child_of_parent (string or null)
140
+ pred_element = predicted_dict.get("element_id_of_nth_child_of_parent")
141
+ exp_element = expected_dict.get("element_id_of_nth_child_of_parent")
142
+ # Handle null/None comparison
143
+ element_match = (pred_element == exp_element) or (pred_element is None and exp_element is None)
144
+ field_scores["element_id_of_nth_child_of_parent"] = 1.0 if element_match else 0.0
145
+ field_reasons.append(f"element_id_of_nth_child: {pred_element} vs {exp_element} → {'✅' if element_match else '❌'}")
146
+
147
+ # 5. selected_element_is_correct (boolean)
148
+ pred_selected = predicted_dict.get("selected_element_is_correct")
149
+ exp_selected = expected_dict.get("selected_element_is_correct")
150
+ selected_match = (pred_selected == exp_selected) if (pred_selected is not None and exp_selected is not None) else False
151
+ field_scores["selected_element_is_correct"] = 1.0 if selected_match else 0.0
152
+ field_reasons.append(f"selected_element_is_correct: {pred_selected} vs {exp_selected} → {'✅' if selected_match else '❌'}")
153
+
154
+ # Calculate composite score (weighted average)
155
+ composite_score = (
156
+ field_scores["is_index_based"] * 0.2 +
157
+ field_scores["index_value"] * 0.2 +
158
+ field_scores["parent_element_id"] * 0.2 +
159
+ field_scores["element_id_of_nth_child_of_parent"] * 0.2 +
160
+ field_scores["selected_element_is_correct"] * 0.2
161
+ )
162
+
163
+ # Build evaluation reason
164
+ all_match = composite_score == 1.0
165
+ reason = "✅ All fields match!" if all_match else f"❌ Partial match ({composite_score:.1%})"
166
+ reason += "\n" + "\n".join(f" {r}" for r in field_reasons)
167
+
168
+ # Log evaluation details
169
+ self.logger.info(f"\n{'─'*70}")
170
+ self.logger.info(f"📊 INDEX CACHING EVALUATION")
171
+ self.logger.info(f"{'─'*70}")
172
+ self.logger.info(f" 🎯 COMPOSITE SCORE: {composite_score:.2f} ({composite_score:.1%})")
173
+ for field, score in field_scores.items():
174
+ status = "✅" if score == 1.0 else "❌"
175
+ self.logger.info(f" {status} {field}: {score:.0f}")
176
+ self.logger.info(f"{'─'*70}\n")
177
+
178
+ return {
179
+ "is_index_based_match": field_scores["is_index_based"],
180
+ "index_value_match": field_scores["index_value"],
181
+ "parent_element_id_match": field_scores["parent_element_id"],
182
+ "element_id_of_nth_child_match": field_scores["element_id_of_nth_child_of_parent"],
183
+ "selected_element_correct_match": field_scores["selected_element_is_correct"],
184
+ "composite_score": composite_score,
185
+ "predicted_output": predicted,
186
+ "expected_output": json.dumps(expected_dict) if isinstance(expected_dict, dict) else str(expected),
187
+ "predicted_dict": predicted_dict,
188
+ "expected_dict": expected_dict,
189
+ "field_scores": field_scores,
190
+ "evaluation_reason": reason
191
+ }
192
+
193
+ def _parse_json_response(self, response: str) -> Dict[str, Any]:
194
+ """
195
+ Parse JSON from LLM response, handling markdown code blocks and various formats.
196
+
197
+ Args:
198
+ response: LLM response string (may contain markdown)
199
+
200
+ Returns:
201
+ Parsed JSON dictionary (empty dict if parsing fails)
202
+ """
203
+ if not response or not isinstance(response, str):
204
+ return {}
205
+
206
+ response = response.strip()
207
+
208
+ # If response is empty, return empty dict
209
+ if not response:
210
+ return {}
211
+
212
+ # Strategy 1: Try to extract JSON from markdown code block
213
+ json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response, re.DOTALL)
214
+ if json_match:
215
+ try:
216
+ json_str = json_match.group(1).strip()
217
+ return json.loads(json_str)
218
+ except json.JSONDecodeError:
219
+ pass
220
+
221
+ # Strategy 2: Find JSON object in response (handle nested braces)
222
+ json_start = response.find('{')
223
+ if json_start != -1:
224
+ # Find matching closing brace
225
+ brace_count = 0
226
+ json_end = json_start
227
+ for i in range(json_start, len(response)):
228
+ if response[i] == '{':
229
+ brace_count += 1
230
+ elif response[i] == '}':
231
+ brace_count -= 1
232
+ if brace_count == 0:
233
+ json_end = i + 1
234
+ break
235
+
236
+ if brace_count == 0:
237
+ json_str = response[json_start:json_end]
238
+ try:
239
+ return json.loads(json_str)
240
+ except json.JSONDecodeError:
241
+ pass
242
+
243
+ # Strategy 3: Try to find any JSON-like structure (more lenient)
244
+ # Look for patterns like {"key": "value"} even if not perfectly formatted
245
+ json_pattern = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', response, re.DOTALL)
246
+ if json_pattern:
247
+ try:
248
+ return json.loads(json_pattern.group(0))
249
+ except json.JSONDecodeError:
250
+ pass
251
+
252
+ # Strategy 4: Try parsing entire response as JSON
253
+ try:
254
+ return json.loads(response)
255
+ except json.JSONDecodeError:
256
+ pass
257
+
258
+ # If all strategies fail, return empty dict
259
+ self.logger.debug(f"Could not parse JSON from response: {response[:100]}...")
260
+ return {}
261
+
262
+ def get_evaluation_summary(self, results: list) -> Dict[str, Any]:
263
+ """
264
+ Get summary statistics for a batch of evaluations.
265
+
266
+ Args:
267
+ results: List of evaluation result dictionaries
268
+
269
+ Returns:
270
+ Summary statistics including accuracy per field and overall
271
+ """
272
+ if not results:
273
+ return {
274
+ "total_samples": 0,
275
+ "overall_accuracy": 0.0,
276
+ "field_accuracies": {},
277
+ "perfect_matches": 0
278
+ }
279
+
280
+ total = len(results)
281
+ perfect_matches = sum(1 for r in results if r.get("composite_score", 0.0) == 1.0)
282
+ overall_accuracy = perfect_matches / total if total > 0 else 0.0
283
+
284
+ # Calculate accuracy per field
285
+ field_accuracies = {
286
+ "is_index_based": sum(1 for r in results if r.get("is_index_based_match", 0.0) == 1.0) / total,
287
+ "index_value": sum(1 for r in results if r.get("index_value_match", 0.0) == 1.0) / total,
288
+ "parent_element_id": sum(1 for r in results if r.get("parent_element_id_match", 0.0) == 1.0) / total,
289
+ "element_id_of_nth_child": sum(1 for r in results if r.get("element_id_of_nth_child_match", 0.0) == 1.0) / total,
290
+ "selected_element_is_correct": sum(1 for r in results if r.get("selected_element_correct_match", 0.0) == 1.0) / total,
291
+ }
292
+
293
+ return {
294
+ "total_samples": total,
295
+ "overall_accuracy": overall_accuracy,
296
+ "field_accuracies": field_accuracies,
297
+ "perfect_matches": perfect_matches,
298
+ "partial_matches": total - perfect_matches
299
+ }
300
+
301
+
302
# Example usage and testing
# Manual smoke test: run this module directly to sanity-check the evaluator
# against a few hand-written predicted/expected pairs (no test framework needed).
if __name__ == "__main__":
    print("🚀 Testing Index Caching Evaluator...")

    evaluator = IndexCachingEvaluator()

    # Test cases
    test_cases = [
        # (predicted, expected, should_be_perfect)
        (
            '{"is_index_based": true, "index_value": 1, "parent_element_id": "aaaabf", "element_id_of_nth_child_of_parent": "aaaabg", "selected_element_is_correct": true}',
            {"is_index_based": True, "index_value": 1, "parent_element_id": "aaaabf", "element_id_of_nth_child_of_parent": "aaaabg", "selected_element_is_correct": True},
            True
        ),
        (
            '{"is_index_based": false, "index_value": null, "parent_element_id": null, "element_id_of_nth_child_of_parent": null, "selected_element_is_correct": true}',
            {"is_index_based": False, "index_value": None, "parent_element_id": None, "element_id_of_nth_child_of_parent": None, "selected_element_is_correct": True},
            True
        ),
        (
            '{"is_index_based": true, "index_value": 3, "parent_element_id": null, "element_id_of_nth_child_of_parent": "aaaaaw", "selected_element_is_correct": true}',
            {"is_index_based": True, "index_value": 3, "parent_element_id": None, "element_id_of_nth_child_of_parent": "aaaaaw", "selected_element_is_correct": True},
            True
        ),
        (
            '{"is_index_based": true, "index_value": 2, "parent_element_id": "aaaabf", "element_id_of_nth_child_of_parent": "aaaabg", "selected_element_is_correct": true}',
            {"is_index_based": True, "index_value": 1, "parent_element_id": "aaaabf", "element_id_of_nth_child_of_parent": "aaaabg", "selected_element_is_correct": True},
            False  # index_value mismatch
        ),
    ]

    print("\n📝 Running test cases:")
    print("-" * 80)

    results = []
    for predicted, expected, should_be_perfect in test_cases:
        result = evaluator.evaluate(predicted, expected)
        # A perfect match requires all five fields to agree (composite == 1.0).
        is_perfect = result["composite_score"] == 1.0

        # ✅ means the evaluator's verdict agreed with the case's expectation.
        status = "✅" if is_perfect == should_be_perfect else "❌"
        print(f"{status} Test: Perfect match = {is_perfect} (expected {should_be_perfect})")
        print(f" Score: {result['composite_score']:.2f}")
        print()

        results.append(result)

    # Summary
    # Aggregate the batch with the evaluator's own summary helper.
    print("\n📊 Summary:")
    summary = evaluator.get_evaluation_summary(results)
    print(f" Total: {summary['total_samples']}")
    print(f" Perfect matches: {summary['perfect_matches']}")
    print(f" Overall accuracy: {summary['overall_accuracy']:.1%}")
    print(f" Field accuracies:")
    for field, acc in summary['field_accuracies'].items():
        print(f" {field}: {acc:.1%}")
357
+
src/gepa_optimizer/evaluation/scroll_evaluator.py ADDED
@@ -0,0 +1,251 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GENERIC String Match Evaluator
3
+
4
+ Compares predicted output against expected output (simple string comparison).
5
+ NO assumptions about what the output represents (IDs, text, JSON, etc.).
6
+
7
+ Let GEPA discover the correct output format through evolution and feedback!
8
+ """
9
+
10
+ from typing import Dict, Any
11
+
12
+ try:
13
+ from .base_evaluator import BaseEvaluator
14
+ except ImportError:
15
+ # For standalone testing
16
+ import sys
17
+ from pathlib import Path
18
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent))
19
+ from gepa_optimizer.evaluation.base_evaluator import BaseEvaluator
20
+
21
+
22
class ScrollElementEvaluator(BaseEvaluator):
    """
    GENERIC evaluator - just compares strings!

    NO assumptions about:
    - Output format (element IDs, text, JSON, etc.)
    - Output structure
    - What the task is

    GEPA will learn the correct format through feedback and evolution.
    """

    def __init__(self, metric_weights: Dict[str, float] = None):
        """
        Initialize evaluator.

        Args:
            metric_weights: Weights for evaluation metrics
                Default: {"output_match": 1.0}
        """
        default_weights = {
            "output_match": 1.0  # Simple string comparison
        }

        weights = metric_weights or default_weights
        super().__init__(metric_weights=weights)

    @staticmethod
    def _extract_element_number(text: str):
        """
        Extract the first element number from free-form text.

        Tries three strategies in order of decreasing specificity:
          A. "Element: X" / "Element X" (explicit 'element' followed by ':' or space)
          B. the word "element" followed by a number anywhere in the text
          C. any bare number (last resort)

        Args:
            text: Text to scan for an element number.

        Returns:
            The re.Match whose group(1) is the number, or None if no number found.
        """
        import re  # local import: `re` is not imported at module level in this file

        # Strategy A: "Element: X" or "Element X" (explicit format)
        match = re.search(r'element[:\s]+(\d+)', text, re.IGNORECASE)
        if match:
            return match

        # Strategy B: "element X" anywhere in the text
        match = re.search(r'\belement\s+(\d+)\b', text, re.IGNORECASE)
        if match:
            return match

        # Strategy C: just find ANY number if other strategies fail (last resort)
        return re.search(r'\b(\d+)\b', text)

    def evaluate(self, predicted: str, expected: str) -> Dict[str, float]:
        """
        Binary evaluation with element ID extraction.

        Phase 1 Implementation:
        - Extracts element IDs using regex patterns (flexible format support)
        - Uses INTEGER comparison for robustness (prevents "4" vs "14" bugs)
        - Binary scoring: correct element = 1.0, wrong/missing = 0.0

        Scoring Strategy:
        1. Extract element ID from both predicted and expected outputs
        2. Compare using integer arithmetic (not string comparison)
        3. Return 1.0 if match, 0.0 otherwise (no partial credit)

        Args:
            predicted: LLM's output (may include verbose explanation)
            expected: Expected output (may include verbose explanation)

        Returns:
            Dictionary with evaluation metrics and extracted element IDs
        """
        # FIX: the logger was previously imported and re-created up to three
        # times inside this method; build it once up front instead.
        import logging
        logger = logging.getLogger(__name__)

        if not predicted or not expected:
            return {
                "content_match": 0.0,
                "output_match": 0.0,
                "composite_score": 0.0,
                "predicted_output": str(predicted).strip() if predicted else "",
                "expected_output": str(expected).strip() if expected else "",
                "predicted_element": "None",
                "expected_element": "None",
                "evaluation_reason": "❌ Empty or missing input/output"
            }

        predicted_str = str(predicted).strip()
        expected_str = str(expected).strip()

        # 1. Extract element numbers from both sides (shared helper — the
        #    extraction logic was previously duplicated inline for each side).
        pred_match = self._extract_element_number(predicted_str)
        exp_match = self._extract_element_number(expected_str)

        # 2. Score the comparison
        if not exp_match:
            # Expected doesn't have element pattern - fallback to exact match
            content_score = 1.0 if predicted_str.lower() == expected_str.lower() else 0.0
        elif not pred_match:
            # Predicted doesn't have element number - WRONG
            content_score = 0.0
        else:
            # Both have element pattern - compare using INTEGER comparison
            pred_element = pred_match.group(1)
            exp_element = exp_match.group(1)

            # 🔥 Phase 1: Use INTEGER comparison for robustness
            # This prevents bugs like "4" != "14" string comparison issues
            try:
                pred_num = int(pred_element)
                exp_num = int(exp_element)

                # Integer comparison (more robust than string)
                content_score = 1.0 if pred_num == exp_num else 0.0

                # Log comparison for debugging
                if pred_num != exp_num:
                    logger.debug(f"Element mismatch: predicted={pred_num}, expected={exp_num}")

            except (ValueError, TypeError) as e:
                # Fallback to string comparison if conversion fails
                logger.warning(f"Could not convert elements to integers: {e}, using string comparison")
                content_score = 1.0 if pred_element == exp_element else 0.0

        # 3. Binary score and reason
        if content_score == 1.0:
            composite_score = 1.0
            reason = "✅ Correct! Element number matches"
        else:
            composite_score = 0.0
            if pred_match and exp_match:
                reason = "❌ Wrong element number (predicted different element)"
            else:
                reason = "❌ Missing or invalid element number"

        pred_element = pred_match.group(1) if pred_match else "None"
        exp_element = exp_match.group(1) if exp_match else "None"

        # Detailed logging for transparency
        logger.info(f"\n{'─'*70}")
        logger.info(f"📊 EVALUATION DETAILS")
        logger.info(f"{'─'*70}")
        logger.info(f" Expected: '{expected_str}' (Element: {exp_element})")
        logger.info(f" Predicted: '{predicted_str}' (Element: {pred_element})")
        logger.info(f" {'─'*66}")
        logger.info(f" 🎯 SCORE: {composite_score:.2f} - {reason}")
        logger.info(f"{'─'*70}\n")

        return {
            "content_match": content_score,
            "output_match": composite_score,  # This is what GEPA uses
            "composite_score": composite_score,
            "predicted_output": predicted_str,
            "expected_output": expected_str,
            "predicted_element": pred_element,
            "expected_element": exp_element,
            "evaluation_reason": reason
        }

    def get_evaluation_summary(self, results: list) -> Dict[str, Any]:
        """
        Get summary statistics for a batch of evaluations.

        Args:
            results: List of evaluation result dictionaries

        Returns:
            Summary statistics
        """
        if not results:
            return {
                "total_samples": 0,
                "accuracy": 0.0,
                "correct_predictions": 0
            }

        total = len(results)
        # A prediction counts as correct only on an exact element match (1.0).
        correct = sum(1 for r in results if r.get("output_match", 0.0) == 1.0)
        accuracy = correct / total if total > 0 else 0.0

        return {
            "total_samples": total,
            "accuracy": accuracy,
            "correct_predictions": correct,
            "incorrect_predictions": total - correct
        }
209
+ }
210
+
211
+
212
# Example usage and testing
# Manual smoke test: run this module directly to exercise the evaluator
# against a spread of output formats (bare number, "Element: X", prose, etc.).
if __name__ == "__main__":
    print("🚀 Testing Scroll Element Evaluator...")

    evaluator = ScrollElementEvaluator()

    # Test cases
    # Each tuple: (predicted text, expected text, should the evaluator match them?)
    test_cases = [
        ("4", "4", True),
        ("Element: 4", "4", True),
        ("Element 4", "4", True),
        ("The element to interact with is 4", "4", True),
        ("Element ID: 4", "4", True),
        ("Click on element 4 to scroll", "4", True),
        ("5", "4", False),
        ("Element: 5", "4", False),
        ("No element found", "4", False),
        ("", "4", False),
    ]

    print("\n📝 Running test cases:")
    print("-" * 80)

    results = []
    for predicted, expected, should_match in test_cases:
        result = evaluator.evaluate(predicted, expected)
        # Binary scoring: only an exact element match yields composite 1.0.
        match = result["composite_score"] == 1.0

        # ✅ means the evaluator's verdict agreed with the case's expectation.
        status = "✅" if match == should_match else "❌"
        print(f"{status} Predicted: '{predicted}' | Expected: '{expected}' | Match: {match}")

        results.append(result)

    # Summary
    # Aggregate the batch with the evaluator's own summary helper.
    print("\n📊 Summary:")
    summary = evaluator.get_evaluation_summary(results)
    print(f" Total: {summary['total_samples']}")
    print(f" Correct: {summary['correct_predictions']}")
    print(f" Accuracy: {summary['accuracy']:.1%}")
251
+
src/gepa_optimizer/evaluation/ui_evaluator.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ UI Tree Evaluator for GEPA Optimizer
3
+ """
4
+
5
+ import json
6
+ import logging
7
+ import difflib
8
+ from typing import Any, Dict, List, Optional
9
+
10
+ from .base_evaluator import BaseEvaluator
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
class UITreeEvaluator(BaseEvaluator):
    """
    Comprehensive evaluator for UI tree extraction quality.

    Compares a predicted UI tree (nested dicts with "type", "text", "style" and
    "children" keys) against a ground-truth tree across five weighted metrics:
    completeness, type accuracy, text accuracy, hierarchy accuracy and style
    accuracy. All logging goes through ``self.logger`` for consistency.
    """

    def __init__(self, metric_weights: Optional[Dict[str, float]] = None):
        """
        Initializes the UITreeEvaluator with configurable metric weights.

        Args:
            metric_weights: A dictionary of weights for different metrics.
                            If None, default weights will be used.
        """
        # Set default weights for UI tree evaluation
        default_weights = {
            "element_completeness": 0.3,   # How many elements are captured
            "element_type_accuracy": 0.25, # Correct element types (Button, Text, etc.)
            "text_content_accuracy": 0.2,  # Text content matches
            "hierarchy_accuracy": 0.15,    # Parent-child relationships
            "style_accuracy": 0.1,         # Style properties captured
        }

        # Use provided weights or defaults
        weights = metric_weights or default_weights

        # Initialize parent class
        super().__init__(metric_weights=weights)

        # Normalize weights so the composite score stays in [0, 1]
        self._normalize_weights()

    def _normalize_weights(self):
        """Normalize weights to sum to 1.0 (no-op warning if all weights are zero)."""
        total_weight = sum(self.metric_weights.values())
        if total_weight > 0:
            self.metric_weights = {k: v / total_weight for k, v in self.metric_weights.items()}
        else:
            self.logger.warning("Total metric weight is zero. Scores will be zero.")

    def evaluate(self, predicted_json: Dict[str, Any], expected_json: Dict[str, Any]) -> Dict[str, float]:
        """
        Generates a weighted composite score from individual metrics.

        Args:
            predicted_json: The JSON generated by the LLM.
            expected_json: The ground truth JSON.

        Returns:
            A dictionary of individual metric scores and the composite score.
        """
        scores = {
            "element_completeness": self.calculate_element_completeness(predicted_json, expected_json),
            "element_type_accuracy": self.calculate_element_type_accuracy(predicted_json, expected_json),
            "text_content_accuracy": self.calculate_text_content_accuracy(predicted_json, expected_json),
            "hierarchy_accuracy": self.calculate_hierarchy_accuracy(predicted_json, expected_json),
            "style_accuracy": self.calculate_style_accuracy(predicted_json, expected_json),
        }

        composite_score = sum(scores[metric] * self.metric_weights.get(metric, 0) for metric in scores)

        # Add small improvement bonus for better prompts (encourage GEPA to accept improvements)
        # This helps GEPA recognize even tiny improvements.
        # BUGFIX: previously the bonus was added to a local variable AFTER the
        # composite had been stored in `scores`, so it never reached the caller.
        # Apply it before storing so the returned score actually carries it.
        if composite_score > 0.05:  # If we have any meaningful content
            composite_score = min(composite_score + 0.001, 1.0)  # Small bonus to encourage acceptance

        scores["composite_score"] = composite_score

        # Add detailed logging for debugging
        self.logger.debug(f"Evaluation scores: {scores}")
        self.logger.debug(f"Composite score: {composite_score:.4f}")

        return scores

    def calculate_element_completeness(self, predicted: Dict, expected: Dict) -> float:
        """
        Calculates how many UI elements are captured in the predicted JSON.
        This is the most important metric for UI tree extraction.
        """
        def _count_elements(node):
            """Count total elements in the tree (node itself plus all descendants)."""
            if not isinstance(node, dict):
                return 0
            count = 1  # Count current node
            for child in node.get("children", []):
                count += _count_elements(child)
            return count

        try:
            predicted_count = _count_elements(predicted)
            expected_count = _count_elements(expected)

            if expected_count == 0:
                return 1.0 if predicted_count == 0 else 0.0

            # Score based on how many elements are captured
            completeness_ratio = predicted_count / expected_count

            # Give bonus for capturing more elements (up to 1.0)
            # Penalize heavily for missing elements
            if completeness_ratio >= 1.0:
                return 1.0  # Perfect or better
            elif completeness_ratio >= 0.8:
                return completeness_ratio  # Good coverage
            elif completeness_ratio >= 0.5:
                return completeness_ratio * 0.8  # Moderate coverage with penalty
            else:
                return completeness_ratio * 0.5  # Poor coverage with heavy penalty

        except Exception as e:
            self.logger.warning(f"Error calculating element completeness: {e}")
            return 0.0

    def calculate_element_type_accuracy(self, predicted: Dict, expected: Dict) -> float:
        """
        Calculates element type accuracy by comparing the 'type' attribute of corresponding nodes.
        Focuses on common UI element types like Button, Text, Image, etc.
        """
        def _get_all_types(node):
            # Collect every non-None "type" value in the tree (pre-order).
            if not isinstance(node, dict):
                return []
            types = [node.get("type")]
            for child in node.get("children", []):
                types.extend(_get_all_types(child))
            return [t for t in types if t is not None]

        try:
            predicted_types = _get_all_types(predicted)
            expected_types = _get_all_types(expected)

            if not expected_types:
                return 1.0 if not predicted_types else 0.5

            if not predicted_types:
                return 0.0

            # Count matching types with frequency consideration
            expected_type_counts = {}
            for t in expected_types:
                expected_type_counts[t] = expected_type_counts.get(t, 0) + 1

            predicted_type_counts = {}
            for t in predicted_types:
                predicted_type_counts[t] = predicted_type_counts.get(t, 0) + 1

            # Calculate accuracy based on type matches; credit each type only
            # up to the number of occurrences expected.
            total_matches = 0
            for type_name, expected_count in expected_type_counts.items():
                predicted_count = predicted_type_counts.get(type_name, 0)
                total_matches += min(predicted_count, expected_count)

            return total_matches / len(expected_types) if expected_types else 0.0

        except Exception as e:
            self.logger.warning(f"Error calculating element type accuracy: {e}")
            return 0.0

    def calculate_hierarchy_accuracy(self, predicted: Dict, expected: Dict) -> float:
        """
        Calculates hierarchy accuracy by comparing parent-child relationships.
        """
        def _get_hierarchy_structure(node, parent_type="ROOT"):
            """Extract hierarchy structure as (parent_type, child_type) pairs."""
            if not isinstance(node, dict):
                return []

            current_type = node.get("type", "unknown")
            hierarchy = [(parent_type, current_type)]

            for child in node.get("children", []):
                hierarchy.extend(_get_hierarchy_structure(child, current_type))

            return hierarchy

        try:
            predicted_hierarchy = _get_hierarchy_structure(predicted)
            expected_hierarchy = _get_hierarchy_structure(expected)

            if not expected_hierarchy:
                return 1.0 if not predicted_hierarchy else 0.5

            if not predicted_hierarchy:
                return 0.0

            # Count matching hierarchy relationships (set-based: each distinct
            # parent/child type pair counts once regardless of frequency).
            expected_hierarchy_set = set(expected_hierarchy)
            predicted_hierarchy_set = set(predicted_hierarchy)

            matches = len(expected_hierarchy_set.intersection(predicted_hierarchy_set))
            total_expected = len(expected_hierarchy_set)

            return matches / total_expected if total_expected > 0 else 0.0

        except Exception as e:
            self.logger.warning(f"Error calculating hierarchy accuracy: {e}")
            return 0.0

    def calculate_text_content_accuracy(self, predicted: Dict, expected: Dict) -> float:
        """
        Calculates text content accuracy by comparing the 'text' attribute of corresponding nodes.
        """
        def _get_all_texts(node):
            # Collect every non-empty "text" value in the tree.
            if not isinstance(node, dict):
                return []
            texts = [node.get("text")]
            for child in node.get("children", []):
                texts.extend(_get_all_texts(child))
            return [t for t in texts if t is not None and str(t).strip()]

        try:
            predicted_texts = _get_all_texts(predicted)
            expected_texts = _get_all_texts(expected)

            if not expected_texts:
                return 1.0 if not predicted_texts else 0.5  # Partial credit if predicted has texts but expected doesn't

            if not predicted_texts:
                return 0.0  # No predicted texts, so no match

            # Each predicted text is credited with its best fuzzy match against
            # any expected text (SequenceMatcher ratio in [0, 1]).
            total_similarity = 0.0
            for p_text in predicted_texts:
                best_similarity = 0.0
                for e_text in expected_texts:
                    similarity = difflib.SequenceMatcher(None, str(p_text).strip(), str(e_text).strip()).ratio()
                    best_similarity = max(best_similarity, similarity)
                total_similarity += best_similarity

            # Average best-match similarity over all predicted texts.
            # (Both lists are guaranteed non-empty by the early returns above,
            # so the previously-present empty-list branches here were unreachable
            # dead code and have been removed.)
            return total_similarity / len(predicted_texts)
        except Exception as e:
            self.logger.warning(f"Error calculating text content accuracy: {e}")
            return 0.0

    def calculate_style_accuracy(self, predicted: Dict, expected: Dict) -> float:
        """
        Calculates style accuracy by comparing style properties.
        """
        def _get_all_styles(node):
            """Extract all style dicts from the tree."""
            if not isinstance(node, dict):
                return []

            styles = []
            if "style" in node and isinstance(node["style"], dict):
                styles.append(node["style"])

            for child in node.get("children", []):
                styles.extend(_get_all_styles(child))

            return styles

        try:
            predicted_styles = _get_all_styles(predicted)
            expected_styles = _get_all_styles(expected)

            if not expected_styles:
                return 1.0 if not predicted_styles else 0.5

            if not predicted_styles:
                return 0.0

            # Calculate style property overlap: each expected (name, value) pair
            # is matched against any predicted style dict that carries it.
            total_style_properties = 0
            matching_properties = 0

            for exp_style in expected_styles:
                for prop_name, prop_value in exp_style.items():
                    total_style_properties += 1

                    # Find matching property in predicted styles
                    for pred_style in predicted_styles:
                        if prop_name in pred_style and pred_style[prop_name] == prop_value:
                            matching_properties += 1
                            break

            return matching_properties / total_style_properties if total_style_properties > 0 else 0.0

        except Exception as e:
            self.logger.warning(f"Error calculating style accuracy: {e}")
            return 0.0
297
+ return 0.0
src/gepa_optimizer/evaluation/universal_evaluator.py ADDED
@@ -0,0 +1,911 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Universal Semantic Evaluator for ANY prompt optimization use case.
3
+
4
+ This evaluator uses LLM-powered semantic analysis to compare predicted vs expected outputs,
5
+ enabling prompt optimization for ANY task without requiring custom evaluator code.
6
+
7
+ Key Features:
8
+ - Semantic understanding (not just string matching)
9
+ - Works with text, JSON, numbers, structured outputs
10
+ - Provides rich feedback for GEPA reflection
11
+ - No task-specific assumptions
12
+ """
13
+
14
+ import json
15
+ import re
16
+ import logging
17
+ from typing import Dict, Any, Optional, List
18
+ from difflib import SequenceMatcher
19
+
20
+ from .base_evaluator import BaseEvaluator
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ class UniversalSemanticEvaluator(BaseEvaluator):
26
+ """
27
+ Universal evaluator using LLM for semantic comparison.
28
+
29
+ Works for ANY task without hardcoded assumptions:
30
+ - Text outputs: "The answer is 42" vs "42"
31
+ - JSON outputs: {"count": 23} vs {"count": 22}
32
+ - Structured data: Lists, nested objects
33
+ - Multi-modal: Image descriptions, analysis results
34
+
35
+ Evaluation Strategy:
36
+ 1. Quick checks (exact match, empty handling)
37
+ 2. Structural comparison (for JSON/structured data)
38
+ 3. LLM semantic analysis (for meaning understanding)
39
+ 4. Combine into composite score with rich feedback
40
+ """
41
+
42
+ def __init__(
43
+ self,
44
+ llm_client=None,
45
+ use_llm_analysis: bool = True,
46
+ semantic_weight: float = 0.6,
47
+ structural_weight: float = 0.25,
48
+ exact_match_bonus: float = 0.15,
49
+ metric_weights: Optional[Dict[str, float]] = None
50
+ ):
51
+ """
52
+ Initialize Universal Semantic Evaluator.
53
+
54
+ Args:
55
+ llm_client: LLM client for semantic analysis (optional, falls back to heuristics)
56
+ use_llm_analysis: Whether to use LLM for semantic comparison
57
+ semantic_weight: Weight for semantic similarity (0.0-1.0)
58
+ structural_weight: Weight for structural similarity (0.0-1.0)
59
+ exact_match_bonus: Bonus weight for exact matches (0.0-1.0)
60
+ metric_weights: Optional custom weights (overrides above)
61
+ """
62
+ default_weights = metric_weights or {
63
+ "semantic_similarity": semantic_weight,
64
+ "structural_similarity": structural_weight,
65
+ "exact_match": exact_match_bonus
66
+ }
67
+ super().__init__(metric_weights=default_weights)
68
+
69
+ self.llm_client = llm_client
70
+ self.use_llm_analysis = use_llm_analysis and llm_client is not None
71
+
72
+ # Cache for LLM analysis to reduce API calls
73
+ self._analysis_cache: Dict[str, Dict] = {}
74
+
75
+ logger.info(f"🎯 Universal Semantic Evaluator initialized")
76
+ logger.info(f" LLM analysis: {'enabled' if self.use_llm_analysis else 'disabled (using heuristics)'}")
77
+ logger.info(f" Weights: semantic={semantic_weight}, structural={structural_weight}, exact={exact_match_bonus}")
78
+
79
+ def evaluate(self, predicted: Any, expected: Any) -> Dict[str, float]:
80
+ """
81
+ Evaluate predicted output against expected output using semantic understanding.
82
+
83
+ Args:
84
+ predicted: The model's predicted output (string, dict, or any serializable type)
85
+ expected: The ground truth expected output
86
+
87
+ Returns:
88
+ Dictionary with metrics including 'composite_score' (required for GEPA)
89
+ """
90
+ # Convert to strings for comparison
91
+ predicted_str = self._to_string(predicted)
92
+ expected_str = self._to_string(expected)
93
+
94
+ # Initialize result
95
+ result = {
96
+ "composite_score": 0.0,
97
+ "exact_match": 0.0,
98
+ "semantic_similarity": 0.0,
99
+ "structural_similarity": 0.0,
100
+ "predicted_output": predicted_str[:500], # Truncate for logging
101
+ "expected_output": expected_str[:500],
102
+ "analysis": {},
103
+ "improvement_feedback": ""
104
+ }
105
+
106
+ # Handle empty/missing outputs
107
+ if not predicted_str or not predicted_str.strip():
108
+ result["improvement_feedback"] = "❌ Output is EMPTY. The prompt must instruct the model to produce output."
109
+ result["analysis"] = {"status": "empty_predicted"}
110
+ return result
111
+
112
+ if not expected_str or not expected_str.strip():
113
+ result["improvement_feedback"] = "⚠️ Expected output is empty - cannot evaluate."
114
+ result["analysis"] = {"status": "empty_expected"}
115
+ result["composite_score"] = 0.5 # Neutral score
116
+ return result
117
+
118
+ # ─────────────────────────────────────────────────────
119
+ # STEP 1: Exact Match Check (Fast Path)
120
+ # ─────────────────────────────────────────────────────
121
+ normalized_pred = self._normalize(predicted_str)
122
+ normalized_exp = self._normalize(expected_str)
123
+
124
+ if normalized_pred == normalized_exp:
125
+ result["exact_match"] = 1.0
126
+ result["semantic_similarity"] = 1.0
127
+ result["structural_similarity"] = 1.0
128
+ result["composite_score"] = 1.0
129
+ result["improvement_feedback"] = "✅ Perfect match! Output exactly matches expected."
130
+ result["analysis"] = {"status": "exact_match"}
131
+ return result
132
+
133
+ # ─────────────────────────────────────────────────────
134
+ # STEP 1.5: FORMAT MISMATCH DETECTION (CRITICAL FIX)
135
+ # ─────────────────────────────────────────────────────
136
+ # 🔥 CRITICAL: Detect when expected is JSON but predicted is narrative text
137
+ # This causes catastrophically low scores and needs explicit handling
138
+ expected_is_json = self._try_parse_json(expected_str) is not None
139
+ predicted_is_json = self._try_parse_json(predicted_str) is not None
140
+
141
+ format_mismatch = expected_is_json and not predicted_is_json
142
+ if format_mismatch:
143
+ # Expected JSON but got narrative - this is a CRITICAL format error
144
+ # Give partial credit for semantic content but penalize heavily for format
145
+ result["analysis"]["format_mismatch"] = True
146
+ result["improvement_feedback"] = (
147
+ "❌ FORMAT ERROR: Expected JSON output but received narrative text. "
148
+ "The prompt MUST enforce JSON output format. "
149
+ "Add explicit instructions like: 'Output ONLY valid JSON, no explanations.' "
150
+ "Consider adding: 'Do NOT write prose or explanations.'"
151
+ )
152
+ # Still evaluate semantic content but cap the score
153
+ # This gives feedback for improving the prompt
154
+ logger.warning(f"⚠️ Format mismatch: expected JSON ({len(expected_str)} chars), got narrative ({len(predicted_str)} chars)")
155
+
156
+ # ─────────────────────────────────────────────────────
157
+ # STEP 2: Structural Comparison (for JSON/structured data)
158
+ # ─────────────────────────────────────────────────────
159
+ structural_result = self._compare_structure(predicted_str, expected_str)
160
+ result["structural_similarity"] = structural_result["score"]
161
+ result["analysis"]["structural"] = structural_result.get("details", {})
162
+
163
+ # ─────────────────────────────────────────────────────
164
+ # STEP 3: Semantic Analysis
165
+ # ─────────────────────────────────────────────────────
166
+ if self.use_llm_analysis:
167
+ semantic_result = self._llm_semantic_analysis(predicted_str, expected_str)
168
+ else:
169
+ semantic_result = self._heuristic_semantic_analysis(predicted_str, expected_str)
170
+
171
+ result["semantic_similarity"] = semantic_result["score"]
172
+ result["analysis"]["semantic"] = semantic_result.get("details", {})
173
+ result["improvement_feedback"] = semantic_result.get("feedback", "")
174
+
175
+ # ─────────────────────────────────────────────────────
176
+ # STEP 4: Compute Composite Score
177
+ # ─────────────────────────────────────────────────────
178
+ weights = self.metric_weights
179
+ composite = (
180
+ result["semantic_similarity"] * weights.get("semantic_similarity", 0.6) +
181
+ result["structural_similarity"] * weights.get("structural_similarity", 0.25) +
182
+ result["exact_match"] * weights.get("exact_match", 0.15)
183
+ )
184
+
185
+ # 🔥 CRITICAL FIX: Apply format mismatch penalty
186
+ # If expected JSON but got narrative, cap the score to encourage format compliance
187
+ if result.get("analysis", {}).get("format_mismatch"):
188
+ # Cap at 0.3 to indicate "partial semantic match but wrong format"
189
+ # This ensures format-correct outputs always score higher
190
+ composite = min(composite, 0.30)
191
+ logger.debug(f"📊 Format mismatch penalty applied: score capped at {composite:.3f}")
192
+
193
+ result["composite_score"] = min(max(composite, 0.0), 1.0)
194
+
195
+ # Add score breakdown to feedback
196
+ if not result["improvement_feedback"]:
197
+ result["improvement_feedback"] = self._generate_default_feedback(result)
198
+
199
+ # Log evaluation
200
+ logger.debug(f"📊 Evaluation: composite={result['composite_score']:.3f}, "
201
+ f"semantic={result['semantic_similarity']:.3f}, "
202
+ f"structural={result['structural_similarity']:.3f}")
203
+
204
+ # #region agent log
205
+ try:
206
+ import json as _json_debug
207
+ import time as _time_debug
208
+ import os as _os_debug
209
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
210
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
211
+ with open(_debug_log_path, "a") as _f:
212
+ _f.write(_json_debug.dumps({"hypothesisId": "G", "location": "universal_evaluator.py:final_score", "message": "Final evaluation score breakdown", "data": {"composite": result["composite_score"], "semantic": result["semantic_similarity"], "structural": result["structural_similarity"], "exact_match": result["exact_match"], "format_mismatch": result.get("analysis", {}).get("format_mismatch", False), "predicted_preview": predicted_str[:150] if predicted_str else "EMPTY", "expected_preview": expected_str[:150] if expected_str else "EMPTY"}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
213
+ except Exception as _e:
214
+ pass # Silent fail for instrumentation
215
+ # #endregion
216
+
217
+ return result
218
+
219
+ def _to_string(self, value: Any) -> str:
220
+ """Convert any value to string for comparison."""
221
+ if value is None:
222
+ return ""
223
+ if isinstance(value, str):
224
+ return value.strip()
225
+ if isinstance(value, dict):
226
+ try:
227
+ return json.dumps(value, sort_keys=True, indent=2)
228
+ except (TypeError, ValueError):
229
+ return str(value)
230
+ if isinstance(value, (list, tuple)):
231
+ try:
232
+ return json.dumps(list(value), sort_keys=True)
233
+ except (TypeError, ValueError):
234
+ return str(value)
235
+ return str(value).strip()
236
+
237
+ def _normalize(self, text: str) -> str:
238
+ """Normalize text for comparison (lowercase, whitespace)."""
239
+ # Lowercase and normalize whitespace
240
+ normalized = ' '.join(text.lower().split())
241
+ # Remove common punctuation that doesn't affect meaning
242
+ normalized = re.sub(r'[.,;:!?\'"]+$', '', normalized)
243
+ return normalized
244
+
245
+ def _compare_structure(self, predicted: str, expected: str) -> Dict[str, Any]:
246
+ """
247
+ Compare structural similarity (especially for JSON/structured outputs).
248
+
249
+ Returns:
250
+ Dict with 'score' (0.0-1.0) and 'details'
251
+ """
252
+ result = {"score": 0.0, "details": {}}
253
+
254
+ # Try to parse as JSON
255
+ pred_json = self._try_parse_json(predicted)
256
+ exp_json = self._try_parse_json(expected)
257
+
258
+ if pred_json is not None and exp_json is not None:
259
+ # Both are valid JSON - do structural comparison
260
+ return self._compare_json_structures(pred_json, exp_json)
261
+
262
+ # Fallback: Compare as text structure
263
+ return self._compare_text_structure(predicted, expected)
264
+
265
+ def _try_parse_json(self, text: str) -> Optional[Any]:
266
+ """
267
+ Try to parse text as JSON with robust extraction.
268
+
269
+ 🔥 FIX: LLMs often wrap JSON in markdown code blocks or add extra text.
270
+ This method now handles multiple formats:
271
+ - Direct JSON
272
+ - ```json ... ``` blocks
273
+ - ``` ... ``` blocks (no language tag)
274
+ - JSON embedded in prose
275
+ - Escaped newlines and quotes
276
+ """
277
+ if not text or not isinstance(text, str):
278
+ return None
279
+
280
+ # 🔥 PREPROCESSING: Clean common LLM output issues
281
+ cleaned = text.strip()
282
+
283
+ # Remove BOM and other invisible characters
284
+ cleaned = cleaned.lstrip('\ufeff\u200b\u200c\u200d')
285
+
286
+ # Strategy 1: Try direct parse (cleanest case)
287
+ try:
288
+ return json.loads(cleaned)
289
+ except json.JSONDecodeError:
290
+ pass
291
+
292
+ # Strategy 2: Extract JSON from markdown code block (```json ... ```)
293
+ # More permissive regex that handles optional language tags
294
+ json_match = re.search(r'```(?:json|JSON)?\s*([\{|\[].*?[\}|\]])\s*```', cleaned, re.DOTALL)
295
+ if json_match:
296
+ try:
297
+ return json.loads(json_match.group(1))
298
+ except json.JSONDecodeError:
299
+ pass
300
+
301
+ # Strategy 3: Find JSON using balanced brace matching (handles nested objects)
302
+ def extract_balanced_json(s: str, start_char: str, end_char: str) -> Optional[str]:
303
+ """Extract JSON with balanced braces/brackets."""
304
+ count = 0
305
+ start_idx = -1
306
+ for i, char in enumerate(s):
307
+ if char == start_char:
308
+ if count == 0:
309
+ start_idx = i
310
+ count += 1
311
+ elif char == end_char:
312
+ count -= 1
313
+ if count == 0 and start_idx >= 0:
314
+ return s[start_idx:i+1]
315
+ return None
316
+
317
+ # Try to find JSON object
318
+ json_obj = extract_balanced_json(cleaned, '{', '}')
319
+ if json_obj:
320
+ try:
321
+ return json.loads(json_obj)
322
+ except json.JSONDecodeError:
323
+ # Try to repair common issues
324
+ repaired = self._repair_json(json_obj)
325
+ try:
326
+ return json.loads(repaired)
327
+ except json.JSONDecodeError:
328
+ pass
329
+
330
+ # Try to find JSON array
331
+ json_arr = extract_balanced_json(cleaned, '[', ']')
332
+ if json_arr:
333
+ try:
334
+ return json.loads(json_arr)
335
+ except json.JSONDecodeError:
336
+ repaired = self._repair_json(json_arr)
337
+ try:
338
+ return json.loads(repaired)
339
+ except json.JSONDecodeError:
340
+ pass
341
+
342
+ return None
343
+
344
+ def _repair_json(self, json_str: str) -> str:
345
+ """
346
+ Attempt to repair common JSON issues from LLM output.
347
+
348
+ Fixes:
349
+ - Trailing commas before } or ]
350
+ - Single quotes instead of double quotes
351
+ - Unquoted keys
352
+ - Comments (// and /* */)
353
+ """
354
+ repaired = json_str
355
+
356
+ # Remove trailing commas
357
+ repaired = re.sub(r',\s*}', '}', repaired)
358
+ repaired = re.sub(r',\s*]', ']', repaired)
359
+
360
+ # Remove single-line comments
361
+ repaired = re.sub(r'//[^\n]*', '', repaired)
362
+
363
+ # Remove multi-line comments
364
+ repaired = re.sub(r'/\*.*?\*/', '', repaired, flags=re.DOTALL)
365
+
366
+ # Replace single quotes with double quotes (but be careful with apostrophes)
367
+ # Only replace when it looks like a JSON delimiter
368
+ def replace_single_quotes(match):
369
+ content = match.group(0)
370
+ # Skip if it looks like an apostrophe in a word
371
+ if re.match(r"'\w+'\s*:", content) or re.match(r":\s*'[^']*'", content):
372
+ return content.replace("'", '"')
373
+ return content
374
+
375
+ # Basic single quote replacement for keys
376
+ repaired = re.sub(r"'([^']+)'\s*:", r'"\1":', repaired)
377
+
378
+ return repaired
379
+
380
+ def _compare_json_structures(self, pred: Any, exp: Any) -> Dict[str, Any]:
381
+ """Compare two JSON structures."""
382
+ result = {"score": 0.0, "details": {"type": "json", "matches": [], "mismatches": []}}
383
+
384
+ if type(pred) != type(exp):
385
+ result["details"]["mismatches"].append(f"Type mismatch: predicted={type(pred).__name__}, expected={type(exp).__name__}")
386
+ result["score"] = 0.2 # Some credit for being JSON
387
+ return result
388
+
389
+ if isinstance(pred, dict) and isinstance(exp, dict):
390
+ return self._compare_dicts(pred, exp)
391
+ elif isinstance(pred, list) and isinstance(exp, list):
392
+ return self._compare_lists(pred, exp)
393
+ else:
394
+ # Primitive types
395
+ if pred == exp:
396
+ result["score"] = 1.0
397
+ result["details"]["matches"].append(f"Values match: {pred}")
398
+ else:
399
+ result["score"] = self._value_similarity(pred, exp)
400
+ result["details"]["mismatches"].append(f"Value mismatch: predicted={pred}, expected={exp}")
401
+ return result
402
+
403
+ def _compare_dicts(self, pred: dict, exp: dict) -> Dict[str, Any]:
404
+ """
405
+ Compare two dictionaries with CASE-INSENSITIVE key matching.
406
+
407
+ 🔥 FIX: LLMs often produce keys like 'Category' when expected is 'category'.
408
+ This method now normalizes keys before comparison for fair scoring.
409
+ """
410
+ result = {"score": 0.0, "details": {"type": "dict", "matches": [], "mismatches": [], "missing_keys": [], "extra_keys": []}}
411
+
412
+ # 🔥 NORMALIZE: Convert all keys to lowercase for comparison
413
+ # Also handle common variations like underscores vs camelCase
414
+ def normalize_key(key: str) -> str:
415
+ """Normalize key: lowercase, underscores to nothing, strip spaces."""
416
+ return re.sub(r'[_\s-]', '', str(key).lower())
417
+
418
+ # Build normalized key mappings
419
+ pred_normalized = {normalize_key(k): (k, v) for k, v in pred.items()}
420
+ exp_normalized = {normalize_key(k): (k, v) for k, v in exp.items()}
421
+
422
+ pred_norm_keys = set(pred_normalized.keys())
423
+ exp_norm_keys = set(exp_normalized.keys())
424
+
425
+ # Check for missing/extra keys (using normalized comparison)
426
+ missing_norm = exp_norm_keys - pred_norm_keys
427
+ extra_norm = pred_norm_keys - exp_norm_keys
428
+ common_norm = pred_norm_keys & exp_norm_keys
429
+
430
+ # Convert back to original key names for reporting
431
+ missing = [exp_normalized[k][0] for k in missing_norm]
432
+ extra = [pred_normalized[k][0] for k in extra_norm]
433
+
434
+ result["details"]["missing_keys"] = missing
435
+ result["details"]["extra_keys"] = extra
436
+
437
+ if not exp_norm_keys:
438
+ result["score"] = 1.0 if not pred_norm_keys else 0.5
439
+ return result
440
+
441
+ # Score based on key overlap (normalized)
442
+ key_score = len(common_norm) / len(exp_norm_keys) if exp_norm_keys else 1.0
443
+
444
+ # Score based on value matches
445
+ value_scores = []
446
+ for norm_key in common_norm:
447
+ pred_orig_key, pred_val = pred_normalized[norm_key]
448
+ exp_orig_key, exp_val = exp_normalized[norm_key]
449
+
450
+ if pred_val == exp_val:
451
+ value_scores.append(1.0)
452
+ result["details"]["matches"].append(f"{exp_orig_key}: {exp_val}")
453
+ else:
454
+ sim = self._value_similarity(pred_val, exp_val)
455
+ value_scores.append(sim)
456
+ if sim < 0.8:
457
+ result["details"]["mismatches"].append(f"{exp_orig_key}: predicted={pred_val}, expected={exp_val}")
458
+
459
+ value_score = sum(value_scores) / len(value_scores) if value_scores else 0.0
460
+
461
+ # Combine scores
462
+ result["score"] = 0.3 * key_score + 0.7 * value_score
463
+
464
+ # Penalty for missing keys (reduced from 0.1 to 0.05 per key)
465
+ if missing:
466
+ result["score"] *= (1 - 0.05 * len(missing))
467
+
468
+ result["score"] = max(0.0, min(1.0, result["score"]))
469
+ return result
470
+
471
+ def _compare_lists(self, pred: list, exp: list) -> Dict[str, Any]:
472
+ """Compare two lists."""
473
+ result = {"score": 0.0, "details": {"type": "list", "length_match": False, "item_matches": 0}}
474
+
475
+ if not exp:
476
+ result["score"] = 1.0 if not pred else 0.5
477
+ return result
478
+
479
+ result["details"]["length_match"] = len(pred) == len(exp)
480
+
481
+ # Compare items (order-sensitive)
482
+ matches = 0
483
+ for i, exp_item in enumerate(exp):
484
+ if i < len(pred):
485
+ if pred[i] == exp_item:
486
+ matches += 1
487
+ else:
488
+ # Check if item exists elsewhere
489
+ if exp_item in pred:
490
+ matches += 0.5 # Partial credit for wrong position
491
+
492
+ result["details"]["item_matches"] = matches
493
+ result["score"] = matches / len(exp)
494
+
495
+ # Penalty for length mismatch
496
+ if len(pred) != len(exp):
497
+ len_ratio = min(len(pred), len(exp)) / max(len(pred), len(exp))
498
+ result["score"] *= (0.7 + 0.3 * len_ratio)
499
+
500
+ return result
501
+
502
+ def _value_similarity(self, pred: Any, exp: Any) -> float:
503
+ """
504
+ Calculate similarity between two values.
505
+
506
+ 🔥 ENHANCED: Now handles:
507
+ - Case-insensitive string comparison
508
+ - Semantic similarity for common variations
509
+ - Underscore/space/dash normalization
510
+ - Numeric comparison with tolerance
511
+ """
512
+ # Same value (exact match)
513
+ if pred == exp:
514
+ return 1.0
515
+
516
+ # Numeric comparison
517
+ try:
518
+ pred_num = float(pred)
519
+ exp_num = float(exp)
520
+ if exp_num == 0:
521
+ return 1.0 if pred_num == 0 else 0.0
522
+ # Relative error with tolerance
523
+ error = abs(pred_num - exp_num) / abs(exp_num)
524
+ return max(0.0, 1.0 - error)
525
+ except (ValueError, TypeError):
526
+ pass
527
+
528
+ # String comparison with normalization
529
+ pred_str = str(pred).strip()
530
+ exp_str = str(exp).strip()
531
+
532
+ # Case-insensitive exact match
533
+ if pred_str.lower() == exp_str.lower():
534
+ return 0.98 # Slight penalty for case mismatch
535
+
536
+ # Normalize strings (remove underscores, spaces, dashes for comparison)
537
+ def normalize_str(s: str) -> str:
538
+ return re.sub(r'[_\s\-]+', '', s.lower())
539
+
540
+ pred_norm = normalize_str(pred_str)
541
+ exp_norm = normalize_str(exp_str)
542
+
543
+ if pred_norm == exp_norm:
544
+ return 0.95 # Good match despite formatting differences
545
+
546
+ # Check if one contains the other (partial match)
547
+ if pred_norm in exp_norm or exp_norm in pred_norm:
548
+ ratio = min(len(pred_norm), len(exp_norm)) / max(len(pred_norm), len(exp_norm))
549
+ return 0.7 + (0.2 * ratio) # 0.7-0.9 for partial matches
550
+
551
+ # 🔥 SEMANTIC SIMILARITY: Check for common equivalent terms
552
+ semantic_equivalents = {
553
+ # Priority levels
554
+ 'low': ['low', 'minor', 'trivial', 'p3', 'p4'],
555
+ 'medium': ['medium', 'normal', 'moderate', 'p2'],
556
+ 'high': ['high', 'important', 'major', 'p1', 'critical', 'urgent'],
557
+ # Boolean variations
558
+ 'true': ['true', 'yes', '1', 'on', 'enabled'],
559
+ 'false': ['false', 'no', '0', 'off', 'disabled'],
560
+ # Status variations
561
+ 'success': ['success', 'succeeded', 'completed', 'done', 'passed'],
562
+ 'failure': ['failure', 'failed', 'error', 'crashed'],
563
+ 'pending': ['pending', 'waiting', 'queued', 'in_progress', 'processing'],
564
+ }
565
+
566
+ for canonical, equivalents in semantic_equivalents.items():
567
+ pred_match = any(eq in pred_norm for eq in equivalents)
568
+ exp_match = any(eq in exp_norm for eq in equivalents)
569
+ if pred_match and exp_match:
570
+ return 0.85 # Semantic match
571
+
572
+ # Sequence matching (character-level similarity)
573
+ ratio = SequenceMatcher(None, pred_str.lower(), exp_str.lower()).ratio()
574
+
575
+ # 🔥 WORD-LEVEL SIMILARITY: Check word overlap
576
+ pred_words = set(re.findall(r'\w+', pred_str.lower()))
577
+ exp_words = set(re.findall(r'\w+', exp_str.lower()))
578
+
579
+ if pred_words and exp_words:
580
+ word_overlap = len(pred_words & exp_words) / max(len(pred_words), len(exp_words))
581
+ # Combine character and word similarity
582
+ return max(ratio, word_overlap * 0.9)
583
+
584
+ def _compare_text_structure(self, predicted: str, expected: str) -> Dict[str, Any]:
585
+ """Compare text structure when not JSON."""
586
+ result = {"score": 0.0, "details": {"type": "text"}}
587
+
588
+ # Word overlap
589
+ pred_words = set(predicted.lower().split())
590
+ exp_words = set(expected.lower().split())
591
+
592
+ if not exp_words:
593
+ result["score"] = 1.0 if not pred_words else 0.5
594
+ return result
595
+
596
+ overlap = len(pred_words & exp_words)
597
+ result["details"]["word_overlap"] = overlap
598
+ result["details"]["expected_words"] = len(exp_words)
599
+
600
+ # Jaccard similarity
601
+ union = len(pred_words | exp_words)
602
+ result["score"] = overlap / union if union > 0 else 0.0
603
+
604
+ return result
605
+
606
    def _llm_semantic_analysis(self, predicted: str, expected: str) -> Dict[str, Any]:
        """
        Use LLM for semantic analysis of predicted vs expected.

        Uses XML-delimited prompt structure to prevent context bleeding
        and Multi-Dimensional Scoring (Semantics vs. Syntax).

        Falls back to _heuristic_semantic_analysis when the LLM call fails
        or its reply contains no parseable JSON.

        Returns:
            Dict with 'score' (0.0-1.0), 'details', and 'feedback'
        """
        # Check cache
        # NOTE(review): hash()-based keys can collide and vary per process
        # (PYTHONHASHSEED); consider keying on the strings themselves or a
        # cryptographic digest — confirm before relying on cross-run caching.
        cache_key = f"{hash(predicted)}:{hash(expected)}"
        if cache_key in self._analysis_cache:
            return self._analysis_cache[cache_key]

        result = {"score": 0.0, "details": {}, "feedback": ""}

        try:
            # Truncate for token limits but preserve enough context
            expected_truncated = expected[:10000]
            predicted_truncated = predicted[:10000]

            # OPTIMIZED: Penalty-based scoring with self-verification
            # Starts at 1.0 and deducts for failures - more consistent than subjective scoring
            analysis_prompt = f"""<system_role>
You are a **Semantic Logic Engine** tasked with grading AI performance.
You must compare a [PREDICTED] output against a [EXPECTED] truth.
</system_role>

<input_data>
<expected_output>
{expected_truncated}
</expected_output>

<predicted_output>
{predicted_truncated}
</predicted_output>
</input_data>

<scoring_algorithm>
Calculate the score based on these STRICT rules. Start with 1.0 and deduct penalties.

1. **Information Completeness (Max -0.5)**:
   - If key facts/fields are missing, deduct proportional to importance.
   - If a nested JSON field is missing, deduct 0.1 per field.

2. **Accuracy & Hallucination (Max -1.0)**:
   - If factual numbers/IDs are wrong: Score = 0 immediately.
   - If the model invents information NOT in the input: Deduct 0.3.

3. **Format Compliance (Max -0.3)**:
   - If JSON is requested but Markdown is returned: Deduct 0.3.
   - If keys are lowercase instead of snake_case: Deduct 0.1.

4. **Semantic Equivalence (No Penalty)**:
   - Synonyms are ACCEPTED (e.g., "Purchase" == "Buy").
   - Formatting differences (whitespace) are IGNORED.
</scoring_algorithm>

<self_verification>
Before finalizing the score, ask: "If I used the predicted output in code expecting the original output, would the code crash?"
- If YES (Crash) -> Score must be < 0.5.
- If NO (Safe) -> Score can be high.
</self_verification>

<output_schema>
Return JSON ONLY:
{{
  "semantic_similarity": 0.0-1.0,
  "structural_similarity": 0.0-1.0,
  "verdict": "PERFECT" | "ACCEPTABLE" | "FORMAT_ERROR" | "DATA_CORRUPTION",
  "critical_failures": ["List specific failures that caused score < 1.0"],
  "penalty_breakdown": {{"completeness": -0.0, "accuracy": -0.0, "format": -0.0}},
  "fix_directive": "Imperative command to fix the prompt"
}}
</output_schema>
"""

            # image_base64 is passed empty: this is a text-only comparison,
            # but the client's generate() signature expects the argument.
            response = self.llm_client.generate(
                system_prompt="You are a Semantic Logic Engine. Calculate scores using penalty-based deduction from 1.0. Respond only with valid JSON.",
                user_prompt=analysis_prompt,
                image_base64=""
            )

            # Clients may return either a dict with a 'content' field or a raw string.
            content = response.get("content", str(response)) if isinstance(response, dict) else str(response)

            # Parse JSON response
            analysis = self._extract_json_from_response(content)

            if analysis:
                # Extract semantic similarity (primary score)
                semantic_sim = float(analysis.get("semantic_similarity", 0.5))
                structural_sim = float(analysis.get("structural_similarity", semantic_sim))

                # Compute weighted score based on verdict (updated for new schema)
                verdict = analysis.get("verdict", "ACCEPTABLE")
                # Maps both the current verdict vocabulary and legacy names to
                # a multiplier; unknown verdicts get a neutral 0.5.
                verdict_multiplier = {
                    "PERFECT": 1.0,
                    "ACCEPTABLE": 0.85,
                    "FORMAT_ERROR": 0.6,  # New: was WRONG_FORMAT
                    "DATA_CORRUPTION": 0.1,  # New: replaces WRONG_CONTENT + HALLUCINATION
                    # Legacy support
                    "WRONG_FORMAT": 0.6,
                    "WRONG_CONTENT": 0.3,
                    "HALLUCINATION": 0.1
                }.get(verdict, 0.5)

                # Final score: weighted combination
                result["score"] = min(1.0, semantic_sim * 0.6 + structural_sim * 0.3 + verdict_multiplier * 0.1)

                # Extract penalty breakdown if available
                penalty_breakdown = analysis.get("penalty_breakdown", {})
                critical_failures = analysis.get("critical_failures", [])

                result["details"] = {
                    "verdict": verdict,
                    "semantic_similarity": semantic_sim,
                    "structural_similarity": structural_sim,
                    "critical_failures": critical_failures,
                    "penalty_breakdown": penalty_breakdown,
                    # Legacy field support
                    "key_matches": analysis.get("key_matches", []),
                    "key_differences": analysis.get("key_differences", critical_failures),
                    "value_errors": analysis.get("value_errors", {}),
                    "reasoning": analysis.get("reasoning", "")
                }
                result["feedback"] = analysis.get("fix_directive", "")
            else:
                # Fallback if JSON parsing fails
                result = self._heuristic_semantic_analysis(predicted, expected)

            # Cache result
            self._analysis_cache[cache_key] = result

        except Exception as e:
            # Any LLM/client failure degrades gracefully to the heuristic path.
            logger.warning(f"LLM semantic analysis failed: {e}, falling back to heuristics")
            result = self._heuristic_semantic_analysis(predicted, expected)

        return result
745
+
746
+ def _extract_json_from_response(self, content: str) -> Optional[Dict]:
747
+ """Extract JSON from LLM response."""
748
+ # Try to find JSON in response
749
+ json_match = re.search(r'\{[\s\S]*\}', content)
750
+ if json_match:
751
+ try:
752
+ return json.loads(json_match.group(0))
753
+ except json.JSONDecodeError:
754
+ pass
755
+ return None
756
+
757
+ def _heuristic_semantic_analysis(self, predicted: str, expected: str) -> Dict[str, Any]:
758
+ """
759
+ Heuristic-based semantic analysis when LLM is not available.
760
+
761
+ Uses multiple signals:
762
+ - Word overlap (Jaccard)
763
+ - Sequence matching (SequenceMatcher)
764
+ - Number extraction and comparison
765
+ - Key phrase matching
766
+ """
767
+ result = {"score": 0.0, "details": {}, "feedback": ""}
768
+
769
+ pred_lower = predicted.lower()
770
+ exp_lower = expected.lower()
771
+
772
+ # 1. Sequence similarity
773
+ seq_sim = SequenceMatcher(None, pred_lower, exp_lower).ratio()
774
+
775
+ # 2. Word overlap (Jaccard)
776
+ pred_words = set(pred_lower.split())
777
+ exp_words = set(exp_lower.split())
778
+ jaccard = len(pred_words & exp_words) / len(pred_words | exp_words) if (pred_words | exp_words) else 0.0
779
+
780
+ # 3. Number comparison
781
+ pred_nums = re.findall(r'-?\d+\.?\d*', predicted)
782
+ exp_nums = re.findall(r'-?\d+\.?\d*', expected)
783
+
784
+ num_score = 1.0
785
+ num_errors = []
786
+ if exp_nums:
787
+ matches = 0
788
+ for exp_num in exp_nums:
789
+ if exp_num in pred_nums:
790
+ matches += 1
791
+ else:
792
+ # Check for close matches
793
+ try:
794
+ exp_val = float(exp_num)
795
+ for pred_num in pred_nums:
796
+ pred_val = float(pred_num)
797
+ if abs(pred_val - exp_val) <= 1: # Off by 1
798
+ matches += 0.9
799
+ num_errors.append(f"Number close: expected {exp_num}, got {pred_num}")
800
+ break
801
+ else:
802
+ num_errors.append(f"Number missing: expected {exp_num}")
803
+ except ValueError:
804
+ pass
805
+ num_score = matches / len(exp_nums) if exp_nums else 1.0
806
+
807
+ # 4. Key entity extraction (simple approach)
808
+ # Look for capitalized words, quoted strings, etc.
809
+ pred_entities = set(re.findall(r'\b[A-Z][a-z]+\b', predicted))
810
+ exp_entities = set(re.findall(r'\b[A-Z][a-z]+\b', expected))
811
+ entity_overlap = len(pred_entities & exp_entities) / len(exp_entities) if exp_entities else 1.0
812
+
813
+ # Combine scores
814
+ result["score"] = (
815
+ 0.3 * seq_sim +
816
+ 0.25 * jaccard +
817
+ 0.25 * num_score +
818
+ 0.2 * entity_overlap
819
+ )
820
+
821
+ result["details"] = {
822
+ "sequence_similarity": seq_sim,
823
+ "word_overlap": jaccard,
824
+ "number_accuracy": num_score,
825
+ "entity_overlap": entity_overlap,
826
+ "number_errors": num_errors
827
+ }
828
+
829
+ # Generate feedback
830
+ feedback_parts = []
831
+ if jaccard < 0.5:
832
+ feedback_parts.append("Low word overlap - output may be missing key terms.")
833
+ if num_errors:
834
+ feedback_parts.append(f"Number issues: {'; '.join(num_errors[:3])}")
835
+ if entity_overlap < 0.5 and exp_entities:
836
+ missing = exp_entities - pred_entities
837
+ feedback_parts.append(f"Missing entities: {', '.join(list(missing)[:3])}")
838
+
839
+ if feedback_parts:
840
+ result["feedback"] = " | ".join(feedback_parts)
841
+ else:
842
+ result["feedback"] = "Output is semantically similar but not exact match."
843
+
844
+ return result
845
+
846
+ def _generate_default_feedback(self, result: Dict) -> str:
847
+ """Generate default feedback based on scores."""
848
+ score = result["composite_score"]
849
+ semantic = result["semantic_similarity"]
850
+ structural = result["structural_similarity"]
851
+
852
+ if score >= 0.9:
853
+ return "✅ Excellent match! Minor differences only."
854
+ elif score >= 0.7:
855
+ return f"⚠️ Good match (semantic={semantic:.0%}, structural={structural:.0%}). Some differences to address."
856
+ elif score >= 0.5:
857
+ return f"⚠️ Partial match (semantic={semantic:.0%}, structural={structural:.0%}). Significant differences found."
858
+ else:
859
+ return f"❌ Poor match (semantic={semantic:.0%}, structural={structural:.0%}). Major issues to fix."
860
+
861
def get_evaluation_summary(self, results: List[Dict]) -> Dict[str, Any]:
    """Aggregate a batch of evaluation results into summary statistics.

    Args:
        results: Per-sample evaluation dicts (reads "composite_score",
            "semantic_similarity", "structural_similarity"; each
            defaults to 0.0 when absent).

    Returns:
        Summary dict: accuracy (share of samples with composite score
        >= 0.8), per-metric averages, and min/max composite score. An
        empty batch yields an all-zero summary.
    """
    if not results:
        return {
            "total_samples": 0,
            "accuracy": 0.0,
            "avg_semantic_similarity": 0.0,
            "avg_structural_similarity": 0.0
        }

    count = len(results)
    composite = [item.get("composite_score", 0.0) for item in results]
    semantic = [item.get("semantic_similarity", 0.0) for item in results]
    structural = [item.get("structural_similarity", 0.0) for item in results]

    return {
        "total_samples": count,
        "accuracy": sum(1 for value in composite if value >= 0.8) / count,
        "avg_composite_score": sum(composite) / count,
        "avg_semantic_similarity": sum(semantic) / count,
        "avg_structural_similarity": sum(structural) / count,
        "min_score": min(composite),
        "max_score": max(composite),
    }
893
+
894
+
895
+ # Convenience function to create evaluator
896
def create_universal_evaluator(llm_client=None) -> UniversalSemanticEvaluator:
    """Factory for a configured UniversalSemanticEvaluator.

    Args:
        llm_client: Optional LLM client. When provided, LLM-based
            semantic analysis is enabled; otherwise the evaluator falls
            back to heuristic analysis.

    Returns:
        Configured UniversalSemanticEvaluator instance.
    """
    enable_llm = llm_client is not None
    return UniversalSemanticEvaluator(llm_client=llm_client, use_llm_analysis=enable_llm)
911
+
src/gepa_optimizer/evaluation/validation_evaluator.py ADDED
@@ -0,0 +1,495 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Validation Evaluator for UI Validation Use Case
3
+
4
+ Evaluates predicted validation results (true/false) against expected results.
5
+ Extracts reasoning from both predicted and expected outputs for LLM-as-judge feedback.
6
+ """
7
+
8
+ from typing import Dict, Any, Optional
9
+ import re
10
+ import logging
11
+
12
+ try:
13
+ from .base_evaluator import BaseEvaluator
14
+ except ImportError:
15
+ # For standalone testing
16
+ import sys
17
+ from pathlib import Path
18
+ sys.path.insert(0, str(Path(__file__).parent.parent.parent))
19
+ from gepa_optimizer.evaluation.base_evaluator import BaseEvaluator
20
+
21
+
22
+ class ValidationEvaluator(BaseEvaluator):
23
+ """
24
+ Evaluator for validation use case (true/false results).
25
+
26
+ Features:
27
+ - Normalizes boolean formats ("true"/"True"/"1" → True, "false"/"False"/"0" → False)
28
+ - Extracts reasoning from both predicted and expected outputs (REQUIRED for LLM-as-judge)
29
+ - Binary scoring: correct boolean = 1.0, wrong = 0.0
30
+ - Returns reasoning in evaluation results for LLM-as-judge feedback
31
+ """
32
+
33
def __init__(self, metric_weights: Optional[Dict[str, float]] = None):
    """Initialize the validation evaluator.

    Args:
        metric_weights: Optional weight overrides for evaluation
            metrics. Defaults to {"output_match": 1.0}.
    """
    # The binary boolean comparison carries the full weight by default.
    fallback_weights = {"output_match": 1.0}
    super().__init__(metric_weights=metric_weights or fallback_weights)
47
+
48
def evaluate(self, predicted: str, expected: str) -> Dict[str, float]:
    """Score a predicted validation verdict against the expected one.

    Scoring is binary: both outputs are normalized to booleans and must
    match exactly (1.0) or not (0.0). Reasoning text and output-structure
    metadata are extracted from both sides so an LLM-as-judge can produce
    targeted feedback downstream.

    Args:
        predicted: LLM output (boolean verdict, possibly with reasoning).
        expected: Expected output ("true"/"false", possibly with reasoning).

    Returns:
        Dict of metrics, extracted booleans/reasoning, and structure
        metadata (see keys in the return statement below).
    """
    # Guard: nothing to compare.
    if not predicted or not expected:
        return {
            "output_match": 0.0,
            "composite_score": 0.0,
            "predicted_output": str(predicted).strip() if predicted else "",
            "expected_output": str(expected).strip() if expected else "",
            "predicted_boolean": None,
            "expected_boolean": None,
            "predicted_reasoning": "",
            "expected_reasoning": "",
            "evaluation_reason": "❌ Empty or missing input/output"
        }

    pred_text = str(predicted).strip()
    exp_text = str(expected).strip()

    # Normalize both sides to booleans and pull out any reasoning text.
    pred_flag = self._normalize_to_bool(pred_text)
    pred_why = self._extract_reasoning(pred_text)
    exp_flag = self._normalize_to_bool(exp_text)
    exp_why = self._extract_reasoning(exp_text)

    # Detect and compare the output structure of both sides.
    exp_shape = self._detect_output_structure(exp_text)
    pred_shape = self._detect_output_structure(pred_text)
    same_shape = (exp_shape['format'] == pred_shape['format'])

    # Binary scoring on the extracted booleans.
    if pred_flag is None or exp_flag is None:
        score = 0.0
        reason = "❌ Could not extract boolean value"
        if pred_flag is None:
            reason += " from predicted output"
        if exp_flag is None:
            reason += " from expected output"
    elif pred_flag == exp_flag:
        score = 1.0
        reason = f"✅ Correct! Result matches (both are {exp_flag})"
        if not same_shape:
            # Verdict is right but the output shape diverges -- note it.
            reason += f" (but format differs: expected {exp_shape['format']}, got {pred_shape['format']})"
    else:
        score = 0.0
        reason = f"❌ Wrong result (predicted: {pred_flag}, expected: {exp_flag})"

    # Emit a structured trace of this evaluation.
    self.logger.info(f"\n{'─'*70}")
    self.logger.info(f"📊 VALIDATION EVALUATION")
    self.logger.info(f"{'─'*70}")
    self.logger.info(f" Expected: '{exp_text[:100]}...' → {exp_flag}")
    self.logger.info(f" Predicted: '{pred_text[:100]}...' → {pred_flag}")
    self.logger.info(f" {'─'*66}")
    self.logger.info(f" 🎯 SCORE: {score:.2f} - {reason}")
    if pred_why:
        self.logger.info(f" 📝 Predicted Reasoning: {pred_why[:150]}...")
    if exp_why:
        self.logger.info(f" 📝 Expected Reasoning: {exp_why[:150]}...")
    self.logger.info(f" 📐 Expected Format: {exp_shape['format']} (reasoning: {exp_shape['reasoning_quality']})")
    self.logger.info(f" 📐 Predicted Format: {pred_shape['format']} (reasoning: {pred_shape['reasoning_quality']})")
    if not same_shape:
        self.logger.warning(f" ⚠️ OUTPUT STRUCTURE MISMATCH!")
    self.logger.info(f"{'─'*70}\n")

    return {
        "output_match": score,
        "composite_score": score,  # GEPA consumes this field
        "predicted_output": pred_text,
        "expected_output": exp_text,
        "predicted_boolean": pred_flag,
        "expected_boolean": exp_flag,
        "predicted_reasoning": pred_why,  # REQUIRED for LLM-as-judge
        "expected_reasoning": exp_why,  # REQUIRED for LLM-as-judge
        "evaluation_reason": reason,
        # Structure metadata for LLM-as-judge
        "expected_structure": exp_shape,
        "predicted_structure": pred_shape,
        "output_structure_match": same_shape,
        "expected_has_reasoning": exp_shape['has_reasoning'],
        "predicted_has_reasoning": pred_shape['has_reasoning'],
        "reasoning_quality_gap": exp_shape['reasoning_quality'] + " → " + pred_shape['reasoning_quality']
    }
164
+
165
+ def _normalize_to_bool(self, value: str) -> Optional[bool]:
166
+ """
167
+ Normalize various formats to boolean.
168
+
169
+ Handles:
170
+ - "true", "True", "TRUE" → True
171
+ - "false", "False", "FALSE" → False
172
+ - "1", "0" → True, False
173
+ - "yes", "no" → True, False
174
+ - "correct", "incorrect" → True, False
175
+ - JSON: {"result": true} → True
176
+ - Text with boolean: "The result is true because..." → True
177
+
178
+ Args:
179
+ value: String that may contain a boolean value
180
+
181
+ Returns:
182
+ Boolean value or None if cannot be determined
183
+ """
184
+ if not value:
185
+ return None
186
+
187
+ value_lower = value.lower().strip()
188
+
189
+ # Direct boolean strings
190
+ if value_lower in ("true", "1", "yes", "correct", "valid", "pass"):
191
+ return True
192
+ if value_lower in ("false", "0", "no", "incorrect", "invalid", "fail"):
193
+ return False
194
+
195
+ # JSON format: {"action": "TRUE"} or {"action": "FALSE"} or {"action": "LOADING"}
196
+ # This handles the production prompt's JSON output format
197
+ # Match both quoted and unquoted values, case-insensitive
198
+ action_match = re.search(r'["\']?action["\']?\s*:\s*["\']?(true|false|loading)["\']?', value_lower)
199
+ if action_match:
200
+ action_value = action_match.group(1).lower()
201
+ if action_value == "true":
202
+ return True
203
+ elif action_value == "false":
204
+ return False
205
+ elif action_value == "loading":
206
+ # Treat LOADING as False for validation purposes (screen not ready)
207
+ return False
208
+
209
+ # Also try to parse full JSON structure if present (more robust)
210
+ try:
211
+ import json
212
+ # Try to find and parse JSON object
213
+ json_start = value.find('{')
214
+ if json_start != -1:
215
+ # Try to extract JSON from the response
216
+ for end_idx in range(len(value), json_start, -1):
217
+ try:
218
+ json_str = value[json_start:end_idx]
219
+ data = json.loads(json_str)
220
+ # Check for "action" field (production prompt format)
221
+ if "action" in data:
222
+ action_val = str(data["action"]).upper()
223
+ if action_val == "TRUE":
224
+ return True
225
+ elif action_val == "FALSE":
226
+ return False
227
+ elif action_val == "LOADING":
228
+ return False # Treat as False
229
+ # Check for "result" field (alternative format)
230
+ if "result" in data:
231
+ result_val = data["result"]
232
+ if isinstance(result_val, bool):
233
+ return result_val
234
+ elif isinstance(result_val, str):
235
+ return result_val.lower() in ("true", "1", "yes")
236
+ except (json.JSONDecodeError, KeyError, ValueError):
237
+ continue
238
+ except Exception:
239
+ pass # Fall through to other extraction methods
240
+
241
+ # JSON format: {"result": true} or {"result": false}
242
+ json_match = re.search(r'["\']?result["\']?\s*:\s*(true|false)', value_lower)
243
+ if json_match:
244
+ return json_match.group(1) == "true"
245
+
246
+ # Pattern: "result is true" or "result: true"
247
+ pattern_match = re.search(r'result[:\s]+(true|false)', value_lower)
248
+ if pattern_match:
249
+ return pattern_match.group(1) == "true"
250
+
251
+ # Pattern: "is true" or "is false" (standalone)
252
+ is_match = re.search(r'\b(is|are)\s+(true|false)\b', value_lower)
253
+ if is_match:
254
+ return is_match.group(2) == "true"
255
+
256
+ # Pattern: "true" or "false" as standalone word (not in other words)
257
+ standalone_match = re.search(r'\b(true|false)\b', value_lower)
258
+ if standalone_match:
259
+ return standalone_match.group(1) == "true"
260
+
261
+ # Last resort: check if "true" appears before "false" in text
262
+ true_pos = value_lower.find("true")
263
+ false_pos = value_lower.find("false")
264
+
265
+ if true_pos != -1 and false_pos != -1:
266
+ # Both found - use the one that appears first
267
+ return true_pos < false_pos
268
+ elif true_pos != -1:
269
+ return True
270
+ elif false_pos != -1:
271
+ return False
272
+
273
+ # Cannot determine
274
+ return None
275
+
276
def _detect_output_structure(self, output: str) -> Dict[str, Any]:
    """Classify the structural components of an output string.

    Detects whether a boolean verdict and/or reasoning text is present,
    measures reasoning length, and labels the overall output format.

    Args:
        output: Output string to analyze.

    Returns:
        Dict with keys:
            has_boolean: bool
            has_reasoning: bool
            reasoning_length: int
            reasoning_quality: "missing" | "minimal" | "adequate" | "detailed"
            format: "boolean_only" | "boolean_with_reasoning" |
                "reasoning_only" | "unknown" | "empty"
    """
    if not output:
        return {
            "has_boolean": False,
            "has_reasoning": False,
            "reasoning_length": 0,
            "reasoning_quality": "missing",
            "format": "empty"
        }

    text = output.strip()

    bool_present = self._normalize_to_bool(text) is not None
    why = self._extract_reasoning(text)
    why_len = len(why)
    why_present = why_len > 15  # minimum 15 chars to count as reasoning

    # Bucket reasoning quality by length.
    if why_len == 0:
        quality = "missing"
    elif why_len < 30:
        quality = "minimal"  # just a few words
    elif why_len < 100:
        quality = "adequate"  # brief explanation
    else:
        quality = "detailed"  # full explanation

    # Label the overall shape of the output.
    if bool_present:
        shape = "boolean_with_reasoning" if why_present else "boolean_only"
    else:
        shape = "reasoning_only" if why_present else "unknown"

    return {
        "has_boolean": bool_present,
        "has_reasoning": why_present,
        "reasoning_length": why_len,
        "reasoning_quality": quality,
        "format": shape
    }
344
+
345
+ def _extract_reasoning(self, output: str) -> str:
346
+ """
347
+ Extract reasoning/explanation from output string.
348
+
349
+ This is REQUIRED for LLM-as-judge feedback. The reasoning helps
350
+ the judge understand why the result was true/false and compare
351
+ predicted vs expected reasoning.
352
+
353
+ Args:
354
+ output: Full output string that may contain reasoning
355
+
356
+ Returns:
357
+ Extracted reasoning text, or empty string if not found
358
+ """
359
+ if not output:
360
+ return ""
361
+
362
+ # Patterns to find reasoning sections
363
+ reasoning_patterns = [
364
+ r'[Rr]eason[:\s]+(.*?)(?:\n\n|\Z)', # "Reason: ..."
365
+ r'[Ee]xplanation[:\s]+(.*?)(?:\n\n|\Z)', # "Explanation: ..."
366
+ r'[Bb]ecause[:\s]+(.*?)(?:\n\n|\Z)', # "Because: ..."
367
+ r'[Ww]hy[:\s]+(.*?)(?:\n\n|\Z)', # "Why: ..."
368
+ r'[Dd]etails[:\s]+(.*?)(?:\n\n|\Z)', # "Details: ..."
369
+ ]
370
+
371
+ # Try each pattern
372
+ for pattern in reasoning_patterns:
373
+ match = re.search(pattern, output, re.DOTALL | re.IGNORECASE)
374
+ if match:
375
+ reasoning = match.group(1).strip()
376
+ if len(reasoning) > 20: # Only return if substantial
377
+ return reasoning
378
+
379
+ # If no explicit reasoning section, check if output has substantial text
380
+ # after boolean (likely contains reasoning)
381
+ bool_match = re.search(r'\b(true|false)\b', output.lower())
382
+ if bool_match:
383
+ # Get text after the boolean
384
+ bool_pos = bool_match.end()
385
+ remaining = output[bool_pos:].strip()
386
+
387
+ # If remaining text is substantial (more than just punctuation), use it
388
+ if len(remaining) > 30:
389
+ # Clean up common prefixes
390
+ remaining = re.sub(r'^[:\s.,;!?-]+', '', remaining)
391
+ if remaining:
392
+ return remaining
393
+
394
+ # If output is long and doesn't start with boolean, might be all reasoning
395
+ if len(output) > 100 and not re.match(r'^\s*(true|false)\s*$', output, re.IGNORECASE):
396
+ # Return first 500 chars as reasoning
397
+ return output[:500].strip()
398
+
399
+ # No reasoning found
400
+ return ""
401
+
402
def get_evaluation_summary(self, results: list) -> Dict[str, Any]:
    """Summarize a batch of validation evaluations.

    Args:
        results: List of evaluation result dictionaries (reads
            "output_match" and "predicted_boolean").

    Returns:
        Summary statistics: accuracy, correct/incorrect counts, and the
        distribution of True/False predictions. An empty batch yields an
        all-zero summary.
    """
    if not results:
        return {
            "total_samples": 0,
            "accuracy": 0.0,
            "correct_predictions": 0,
            "incorrect_predictions": 0,
            "true_predictions": 0,
            "false_predictions": 0
        }

    total = len(results)
    hits = sum(1 for item in results if item.get("output_match", 0.0) == 1.0)

    return {
        "total_samples": total,
        "accuracy": hits / total if total > 0 else 0.0,
        "correct_predictions": hits,
        "incorrect_predictions": total - hits,
        "true_predictions": sum(1 for item in results if item.get("predicted_boolean") is True),
        "false_predictions": sum(1 for item in results if item.get("predicted_boolean") is False),
    }
438
+
439
+
440
+ # Example usage and testing
441
if __name__ == "__main__":
    # Manual smoke test: exercises ValidationEvaluator against a table of
    # (predicted, expected, should_match) triples and prints a summary.
    print("🚀 Testing Validation Evaluator...")

    evaluator = ValidationEvaluator()

    # Test cases
    test_cases = [
        # (predicted, expected, should_match)
        ("true", "true", True),
        ("false", "false", True),
        ("True", "true", True),
        ("FALSE", "false", True),
        ("1", "true", True),
        ("0", "false", True),
        ("true", "false", False),
        ("false", "true", False),
        ("The result is true because the button is visible", "true", True),
        ("The result is false because the element is not found", "false", True),
        ('{"result": true, "reasoning": "Button is visible"}', "true", True),
        ("Result: true\n\nReasoning: The submit button is clearly visible at the bottom of the screen.", "true", True),
        ("", "true", False),
        ("invalid", "true", False),
    ]

    print("\n📝 Running test cases:")
    print("-" * 80)

    results = []
    for predicted, expected, should_match in test_cases:
        result = evaluator.evaluate(predicted, expected)
        # A sample "matches" when binary scoring awarded full credit.
        match = result["composite_score"] == 1.0

        # ✅ when the observed match agrees with the expectation, ❌ otherwise.
        status = "✅" if match == should_match else "❌"
        pred_bool = result.get("predicted_boolean", "?")
        exp_bool = result.get("expected_boolean", "?")
        pred_reason = result.get("predicted_reasoning", "")[:50]

        print(f"{status} Predicted: '{predicted[:40]}...' → {pred_bool}")
        print(f" Expected: '{expected}' → {exp_bool}")
        print(f" Match: {match} (should be {should_match})")
        if pred_reason:
            print(f" Reasoning: {pred_reason}...")
        print()

        results.append(result)

    # Summary
    print("\n📊 Summary:")
    summary = evaluator.get_evaluation_summary(results)
    print(f" Total: {summary['total_samples']}")
    print(f" Correct: {summary['correct_predictions']}")
    print(f" Accuracy: {summary['accuracy']:.1%}")
    print(f" True predictions: {summary['true_predictions']}")
    print(f" False predictions: {summary['false_predictions']}")
495
+
src/gepa_optimizer/infrastructure/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Infrastructure module for cross-cutting concerns.
3
+
4
+ This module contains infrastructure components that are used across
5
+ the entire application, including logging, metrics, and configuration.
6
+ """
7
+
8
+ from .logging import get_logger, configure_logging, LogContext
9
+
10
+ __all__ = [
11
+ "get_logger",
12
+ "configure_logging",
13
+ "LogContext",
14
+ ]
15
+
src/gepa_optimizer/infrastructure/logging/__init__.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Centralized Logging Infrastructure for GEPA Optimizer.
3
+
4
+ This module provides a unified logging system with:
5
+ - Structured logging with context
6
+ - Consistent formatting across all modules
7
+ - Log level configuration
8
+ - Operation tracking with timing
9
+ - Contextual logging for debugging
10
+
11
+ Usage:
12
+ from gepa_optimizer.infrastructure.logging import get_logger, LogContext
13
+
14
+ logger = get_logger(__name__)
15
+ logger.info("Starting optimization", extra={"iteration": 1})
16
+
17
+ with LogContext(logger, "evaluation", sample_id=123):
18
+ logger.info("Evaluating sample")
19
+ """
20
+
21
+ from .logger import (
22
+ get_logger,
23
+ configure_logging,
24
+ LogLevel,
25
+ GEPA_LOGGER_NAME,
26
+ )
27
+ from .context import LogContext, log_operation
28
+ from .formatters import GepaFormatter, JsonFormatter
29
+
30
+ __all__ = [
31
+ # Core logging
32
+ "get_logger",
33
+ "configure_logging",
34
+ "LogLevel",
35
+ "GEPA_LOGGER_NAME",
36
+ # Context management
37
+ "LogContext",
38
+ "log_operation",
39
+ # Formatters
40
+ "GepaFormatter",
41
+ "JsonFormatter",
42
+ ]
43
+
src/gepa_optimizer/infrastructure/logging/context.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Logging Context Management.
3
+
4
+ Provides context managers and decorators for:
5
+ - Operation tracking with timing
6
+ - Contextual logging with nested contexts
7
+ - Automatic exception logging
8
+ """
9
+
10
+ import logging
11
+ import time
12
+ import functools
13
+ from contextlib import contextmanager
14
+ from typing import Any, Callable, Dict, Optional, TypeVar, ParamSpec
15
+
16
+ P = ParamSpec('P')
17
+ R = TypeVar('R')
18
+
19
+
20
class LogContext:
    """
    Context manager that logs an operation with timing and shared context.

    On entry it optionally logs "Starting <operation>"; on exit it logs
    either "Completed <operation>" (with duration_ms) or, when an
    exception escaped the body, "Failed <operation>: ..." with traceback.
    Every message carries the keyword context fields supplied at
    construction, and the convenience log methods inherit them too.
    Exceptions are never suppressed.

    Example:
        logger = get_logger(__name__)

        with LogContext(logger, "optimization", iteration=5):
            logger.info("Processing sample")
    """

    def __init__(
        self,
        logger: logging.Logger,
        operation: str,
        log_start: bool = True,
        log_end: bool = True,
        log_level: int = logging.INFO,
        **context_fields: Any
    ):
        """
        Initialize log context.

        Args:
            logger: Logger instance to use.
            operation: Name of the operation being performed.
            log_start: Whether to log when entering the context.
            log_end: Whether to log when exiting the context.
            log_level: Log level for the start/end messages.
            **context_fields: Extra fields attached to every message.
        """
        self.logger = logger
        self.operation = operation
        self.log_start = log_start
        self.log_end = log_end
        self.log_level = log_level
        self.context_fields = context_fields
        self.start_time: Optional[float] = None  # set on __enter__
        self.exception: Optional[Exception] = None  # captured on failure

    def __enter__(self) -> "LogContext":
        """Start the timer and optionally announce the operation."""
        self.start_time = time.perf_counter()
        if self.log_start:
            self.logger.log(
                self.log_level,
                f"Starting {self.operation}",
                extra=self.context_fields
            )
        return self

    def __exit__(self, exc_type, exc_val, exc_tb) -> bool:
        """Log completion or failure (with traceback); never suppress."""
        duration_ms = (time.perf_counter() - self.start_time) * 1000
        extra = {**self.context_fields, "duration_ms": round(duration_ms, 2)}

        if exc_type is not None:
            self.exception = exc_val
            self.logger.error(
                f"Failed {self.operation}: {exc_type.__name__}: {exc_val}",
                extra=extra,
                exc_info=True
            )
            return False  # propagate the exception

        if self.log_end:
            self.logger.log(
                self.log_level,
                f"Completed {self.operation}",
                extra=extra
            )
        return False

    def log(self, level: int, message: str, **extra_fields: Any) -> None:
        """Log a message that inherits this context's fields."""
        merged = {**self.context_fields, **extra_fields}
        self.logger.log(level, message, extra=merged)

    def info(self, message: str, **extra_fields: Any) -> None:
        """Log an INFO message within this context."""
        self.log(logging.INFO, message, **extra_fields)

    def debug(self, message: str, **extra_fields: Any) -> None:
        """Log a DEBUG message within this context."""
        self.log(logging.DEBUG, message, **extra_fields)

    def warning(self, message: str, **extra_fields: Any) -> None:
        """Log a WARNING message within this context."""
        self.log(logging.WARNING, message, **extra_fields)

    def error(self, message: str, **extra_fields: Any) -> None:
        """Log an ERROR message within this context."""
        self.log(logging.ERROR, message, **extra_fields)
137
+
138
+
139
def log_operation(
    logger: Optional[logging.Logger] = None,
    operation: Optional[str] = None,
    log_args: bool = False,
    log_result: bool = False,
    log_level: int = logging.INFO,
) -> "Callable[[Callable[P, R]], Callable[P, R]]":
    """
    Decorator factory for logging function execution.

    Automatically logs function entry (with arguments if configured),
    exit (with result if configured), execution duration, and exceptions.

    Args:
        logger: Logger to use (defaults to a logger named after the
            decorated function's module).
        operation: Operation name (defaults to the function name).
        log_args: Whether to log function arguments.
        log_result: Whether to log the function result.
        log_level: Log level for entry/exit messages.

    Example:
        @log_operation(log_args=True)
        def process_batch(batch_id: int, items: List[str]) -> int:
            return len(items)

        # Output:
        # INFO | Starting process_batch | batch_id=123 items=['a', 'b']
        # INFO | Completed process_batch | duration_ms=45.2 result=2
    """
    def decorator(func: "Callable[P, R]") -> "Callable[P, R]":
        # BUG FIX: previously `nonlocal logger, operation` overwrote the
        # factory's variables on first application, so reusing one
        # decorator instance on a second function logged it under the
        # first function's name/logger. Resolve per-function defaults
        # into locals instead.
        op_logger = logger if logger is not None else logging.getLogger(func.__module__)
        op_name = operation if operation is not None else func.__name__

        @functools.wraps(func)
        def wrapper(*args: "P.args", **kwargs: "P.kwargs") -> "R":
            start_time = time.perf_counter()

            # Build context fields for the entry message.
            extra: Dict[str, Any] = {}
            if log_args:
                # Positional args by parameter name (skip self for methods).
                arg_names = func.__code__.co_varnames[:func.__code__.co_argcount]
                for name, value in zip(arg_names, args):
                    if name != 'self':
                        extra[name] = _safe_repr(value)
                # Keyword args as-is.
                for key, value in kwargs.items():
                    extra[key] = _safe_repr(value)

            op_logger.log(log_level, f"Starting {op_name}", extra=extra)

            try:
                result = func(*args, **kwargs)
            except Exception as e:
                duration_ms = (time.perf_counter() - start_time) * 1000
                op_logger.error(
                    f"Failed {op_name}: {type(e).__name__}: {e}",
                    extra={"duration_ms": round(duration_ms, 2)},
                    exc_info=True
                )
                raise

            duration_ms = (time.perf_counter() - start_time) * 1000
            result_extra: Dict[str, Any] = {"duration_ms": round(duration_ms, 2)}
            if log_result:
                result_extra["result"] = _safe_repr(result)
            op_logger.log(log_level, f"Completed {op_name}", extra=result_extra)

            return result

        return wrapper

    return decorator
222
+
223
+
224
@contextmanager
def timed_block(logger: logging.Logger, description: str, log_level: int = logging.DEBUG):
    """
    Lightweight context manager that logs how long a block took.

    Less verbose than LogContext; intended for quick timing measurements.
    The duration is always logged, even when the block raises.

    Example:
        with timed_block(logger, "data processing"):
            process_data()
        # Output: DEBUG | data processing completed in 123.45ms
    """
    began = time.perf_counter()
    try:
        yield
    finally:
        elapsed_ms = (time.perf_counter() - began) * 1000
        logger.log(log_level, f"{description} completed in {elapsed_ms:.2f}ms")
242
+
243
+
244
+ def _safe_repr(value: Any, max_length: int = 100) -> str:
245
+ """
246
+ Create a safe string representation of a value for logging.
247
+
248
+ Truncates long strings and handles non-serializable objects.
249
+ """
250
+ try:
251
+ repr_str = repr(value)
252
+ if len(repr_str) > max_length:
253
+ return repr_str[:max_length] + "..."
254
+ return repr_str
255
+ except Exception:
256
+ return f"<{type(value).__name__}>"
257
+
src/gepa_optimizer/infrastructure/logging/formatters.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Custom Log Formatters for GEPA Optimizer.
3
+
4
+ Provides formatters for:
5
+ - Console output with colors and emoji
6
+ - JSON structured logging for production
7
+ - Plain text for file logging
8
+ """
9
+
10
import json
import logging
from datetime import datetime, timezone
from typing import Any, Dict, Optional
14
+
15
+
16
+ # ANSI color codes for terminal output
17
class Colors:
    """ANSI escape sequences used for terminal coloring."""

    # Text attributes
    RESET = "\x1b[0m"
    BOLD = "\x1b[1m"
    DIM = "\x1b[2m"

    # Per-level foreground colors
    DEBUG = "\x1b[36m"     # cyan
    INFO = "\x1b[32m"      # green
    WARNING = "\x1b[33m"   # yellow
    ERROR = "\x1b[31m"     # red
    CRITICAL = "\x1b[35m"  # magenta

    # Colors with semantic roles in the formatted output
    TIMESTAMP = "\x1b[90m"  # gray
    MODULE = "\x1b[34m"     # blue
    MESSAGE = "\x1b[0m"     # terminal default
34
+
35
+
36
+ # Emoji prefixes for visual log scanning
37
LEVEL_EMOJI = {
    level: icon
    for level, icon in (
        (logging.DEBUG, "🔍"),
        (logging.INFO, "ℹ️ "),
        (logging.WARNING, "⚠️ "),
        (logging.ERROR, "❌"),
        (logging.CRITICAL, "🚨"),
    )
}
44
+
45
+ # Level colors mapping
46
LEVEL_COLORS = {
    level: color
    for level, color in (
        (logging.DEBUG, Colors.DEBUG),
        (logging.INFO, Colors.INFO),
        (logging.WARNING, Colors.WARNING),
        (logging.ERROR, Colors.ERROR),
        (logging.CRITICAL, Colors.CRITICAL),
    )
}
53
+
54
+
55
class GepaFormatter(logging.Formatter):
    """
    Custom formatter for GEPA Optimizer logs.

    Features:
    - Optional color output for console
    - Optional emoji prefixes for visual scanning
    - Structured extra fields rendered as trailing key=value pairs
    - Clean, readable format

    Example output:
        2024-01-15 10:30:45 | INFO | ℹ️ gepa_optimizer.core.optimizer | Starting optimization iteration=5
    """

    # Attributes present on every LogRecord (logging machinery); everything
    # else on the record is treated as a user-supplied "extra" field.
    # 'message' and 'asctime' are set lazily by Formatter.format(), so they
    # must be excluded too or they would leak into the extras of a record
    # that has already been formatted once.
    _STANDARD_ATTRS = frozenset({
        'name', 'msg', 'args', 'created', 'filename', 'funcName',
        'levelname', 'levelno', 'lineno', 'module', 'msecs',
        'pathname', 'process', 'processName', 'relativeCreated',
        'stack_info', 'exc_info', 'exc_text', 'thread', 'threadName',
        'taskName', 'message', 'asctime',
    })

    def __init__(
        self,
        fmt: Optional[str] = None,
        datefmt: Optional[str] = None,
        use_colors: bool = True,
        include_emoji: bool = True,
    ):
        """
        Initialize the formatter.

        Args:
            fmt: Format string (uses logging's default if not provided)
            datefmt: Date format string
            use_colors: Whether to wrap level/logger names in ANSI colors
            include_emoji: Whether to prefix the level name with an emoji
        """
        super().__init__(fmt=fmt, datefmt=datefmt)
        self.use_colors = use_colors
        self.include_emoji = include_emoji

    def format(self, record: logging.LogRecord) -> str:
        """Format a log record, optionally adding colors, emoji and extras.

        The record is mutated while formatting and fully restored afterwards
        so other handlers sharing the same record see the original values.
        """
        # Snapshot every field we mutate below.
        original_msg = record.msg
        original_levelname = record.levelname
        original_name = record.name

        try:
            # Emoji prefix for quick visual scanning.
            if self.include_emoji:
                emoji = LEVEL_EMOJI.get(record.levelno, "")
                record.levelname = f"{emoji} {record.levelname}"

            # ANSI colors for level and logger name.
            if self.use_colors:
                color = LEVEL_COLORS.get(record.levelno, Colors.RESET)
                record.levelname = f"{color}{record.levelname}{Colors.RESET}"
                record.name = f"{Colors.MODULE}{record.name}{Colors.RESET}"

            # Append structured extra fields to the message, if any.
            extra_str = self._format_extra(record)
            if extra_str:
                record.msg = f"{record.msg} | {extra_str}"

            return super().format(record)

        finally:
            # Restore all mutated fields.  The original implementation forgot
            # record.name, letting ANSI codes accumulate on the record and
            # leak into other (e.g. color-free file) handlers.
            record.msg = original_msg
            record.levelname = original_levelname
            record.name = original_name

    def _format_extra(self, record: logging.LogRecord) -> str:
        """
        Render user-supplied extra fields as space-separated key=value pairs.

        Extra fields are passed via the 'extra' parameter to logging calls:
            logger.info("Message", extra={"key": "value"})
        """
        extra_fields = {
            k: v for k, v in record.__dict__.items()
            if k not in self._STANDARD_ATTRS and not k.startswith('_')
        }

        if not extra_fields:
            return ""

        parts = []
        for key, value in extra_fields.items():
            if isinstance(value, bool):
                # bool must be tested before int/float: bool is an int
                # subclass, so the numeric branch would otherwise swallow it
                # and print "True" instead of the intended "true".
                parts.append(f"{key}={str(value).lower()}")
            elif isinstance(value, (str, int, float)):
                parts.append(f"{key}={value}")
            else:
                parts.append(f"{key}={repr(value)}")

        return " ".join(parts)
160
+
161
+
162
class JsonFormatter(logging.Formatter):
    """
    JSON formatter for structured logging.

    Outputs each log record as a single JSON line, suitable for:
    - Log aggregation systems (ELK, Splunk)
    - Cloud logging (CloudWatch, Stackdriver)
    - Log parsing and analysis

    Example output:
        {"timestamp": "2024-01-15T10:30:45.123Z", "level": "INFO", "logger": "gepa_optimizer.core", "message": "Starting optimization", "iteration": 5}
    """

    # LogRecord attributes belonging to the logging machinery; everything
    # else on the record is emitted as a user-supplied extra field.
    # 'asctime' can be set lazily by other formatters sharing the record,
    # so it is excluded as well.
    _STANDARD_ATTRS = frozenset({
        'name', 'msg', 'args', 'created', 'filename', 'funcName',
        'levelname', 'levelno', 'lineno', 'module', 'msecs',
        'pathname', 'process', 'processName', 'relativeCreated',
        'stack_info', 'exc_info', 'exc_text', 'thread', 'threadName',
        'taskName', 'message', 'asctime',
    })

    def __init__(
        self,
        include_timestamp: bool = True,
        include_location: bool = False,
    ):
        """
        Initialize JSON formatter.

        Args:
            include_timestamp: Include ISO UTC timestamp (trailing "Z")
            include_location: Include file/line/function information
        """
        super().__init__()
        self.include_timestamp = include_timestamp
        self.include_location = include_location

    def format(self, record: logging.LogRecord) -> str:
        """Serialize *record* to a single JSON line."""
        log_dict: Dict[str, Any] = {}

        # Timestamp in UTC with a trailing "Z".
        # datetime.utcfromtimestamp() is deprecated since Python 3.12, so
        # build an aware UTC datetime and strip the tzinfo to keep the exact
        # same output format as before.
        if self.include_timestamp:
            utc_dt = datetime.fromtimestamp(record.created, tz=timezone.utc)
            log_dict["timestamp"] = utc_dt.replace(tzinfo=None).isoformat() + "Z"

        # Core fields
        log_dict["level"] = record.levelname
        log_dict["logger"] = record.name
        log_dict["message"] = record.getMessage()

        # Location info
        if self.include_location:
            log_dict["file"] = record.filename
            log_dict["line"] = record.lineno
            log_dict["function"] = record.funcName

        # Exception info
        if record.exc_info:
            log_dict["exception"] = self.formatException(record.exc_info)

        # Extra fields supplied via logging's `extra=` parameter.
        for key, value in record.__dict__.items():
            if key not in self._STANDARD_ATTRS and not key.startswith('_'):
                try:
                    # Probe: keep the raw value only if JSON-serializable.
                    json.dumps(value)
                    log_dict[key] = value
                except (TypeError, ValueError):
                    log_dict[key] = str(value)

        return json.dumps(log_dict, default=str)
235
+
236
+
237
class CompactFormatter(logging.Formatter):
    """
    Compact formatter for minimal log output.

    Useful for:
    - CI/CD pipelines
    - Reduced log verbosity
    - Quick debugging

    Example output:
        10:30:45 INFO  optimizer: Starting optimization
    """

    def format(self, record: logging.LogRecord) -> str:
        """Render the record as "HH:MM:SS LEVEL module: message"."""
        # Time-of-day only, no date.
        clock = datetime.fromtimestamp(record.created).strftime("%H:%M:%S")
        # Keep only the trailing component of the dotted logger name.
        module = record.name.rsplit(".", 1)[-1]
        return f"{clock} {record.levelname:5s} {module}: {record.getMessage()}"
259
+
src/gepa_optimizer/infrastructure/logging/logger.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Core Logger Factory and Configuration.
3
+
4
+ This module provides the centralized logger factory that should be used
5
+ across all GEPA Optimizer modules. It ensures consistent logging behavior
6
+ and formatting throughout the application.
7
+
8
+ Design Principles:
9
+ - Single source of truth for logger configuration
10
+ - Lazy initialization (loggers created on first use)
11
+ - Thread-safe logger access
12
+ - Configurable log levels per module
13
+ """
14
+
15
+ import logging
16
+ import sys
17
+ from enum import Enum
18
+ from typing import Optional, Dict, Any
19
+ from functools import lru_cache
20
+
21
+ from .formatters import GepaFormatter
22
+
23
# Name of the root logger for the package; configure_logging() attaches its
# handlers here, so every "gepa_optimizer.*" child logger inherits them.
GEPA_LOGGER_NAME = "gepa_optimizer"

# Default log line layout and timestamp format, used by configure_logging()
# unless the caller overrides them.
DEFAULT_FORMAT = "%(asctime)s | %(levelname)-8s | %(name)s | %(message)s"
DEFAULT_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
29
+
30
+
31
class LogLevel(str, Enum):
    """Log levels supported by the GEPA logging system (string-valued)."""

    DEBUG = "DEBUG"
    INFO = "INFO"
    WARNING = "WARNING"
    ERROR = "ERROR"
    CRITICAL = "CRITICAL"

    @classmethod
    def from_string(cls, level: str) -> "LogLevel":
        """Parse *level* case-insensitively; unknown names fall back to INFO."""
        name = level.upper()
        if name in cls.__members__:
            return cls[name]
        return cls.INFO
46
+
47
+
48
class LoggerConfig:
    """
    Configuration class for GEPA logging.

    This class holds all logging configuration and can be modified
    before calling configure_logging() to customize behavior.

    All attributes are class-level, so the class behaves as a process-wide
    configuration singleton; configure_logging() reads and mutates it and
    there is no need to instantiate it.
    """

    # Default configuration: global level plus line/timestamp formats.
    level: LogLevel = LogLevel.INFO
    format: str = DEFAULT_FORMAT
    date_format: str = DEFAULT_DATE_FORMAT

    # Module-specific log levels (for fine-grained control).
    # NOTE: class-level mutable dict — deliberately shared by all callers.
    module_levels: Dict[str, LogLevel] = {}

    # Output configuration: console is on by default; file logging is
    # enabled by setting log_to_file to a path.
    log_to_console: bool = True
    log_to_file: Optional[str] = None

    # Formatting options for the console handler.
    use_colors: bool = True
    include_emoji: bool = True  # For visual clarity in development

    @classmethod
    def reset(cls) -> None:
        """Reset every configuration attribute back to its default value."""
        cls.level = LogLevel.INFO
        cls.format = DEFAULT_FORMAT
        cls.date_format = DEFAULT_DATE_FORMAT
        cls.module_levels = {}
        cls.log_to_console = True
        cls.log_to_file = None
        cls.use_colors = True
        cls.include_emoji = True
83
+
84
+
85
# Module-level flag: set True by configure_logging(); get_logger() checks it
# to auto-configure with defaults on first use.
_logging_configured = False
87
+
88
+
89
def configure_logging(
    level: Optional[str] = None,
    log_file: Optional[str] = None,
    use_colors: bool = True,
    include_emoji: bool = True,
    format_string: Optional[str] = None,
    module_levels: Optional[Dict[str, str]] = None,
) -> None:
    """
    Configure the GEPA logging system.

    This should be called once at application startup. Subsequent calls
    will update the configuration.

    Args:
        level: Global log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        log_file: Optional path to log file
        use_colors: Whether to use colored output in console
        include_emoji: Whether to include emoji prefixes for visual clarity
        format_string: Custom format string (optional)
        module_levels: Dict mapping module names to their specific log levels

    Example:
        configure_logging(
            level="DEBUG",
            log_file="optimization.log",
            module_levels={
                "gepa_optimizer.core.optimizer": "INFO",
                "gepa_optimizer.llms": "DEBUG"
            }
        )
    """
    global _logging_configured

    # Update the shared configuration object.
    if level:
        LoggerConfig.level = LogLevel.from_string(level)
    if log_file:
        LoggerConfig.log_to_file = log_file
    LoggerConfig.use_colors = use_colors
    LoggerConfig.include_emoji = include_emoji
    if format_string:
        LoggerConfig.format = format_string
    if module_levels:
        LoggerConfig.module_levels = {
            k: LogLevel.from_string(v) for k, v in module_levels.items()
        }

    # Get or create the root GEPA logger.
    root_logger = logging.getLogger(GEPA_LOGGER_NAME)
    numeric_level = getattr(logging, LoggerConfig.level.value)
    root_logger.setLevel(numeric_level)

    # Remove existing handlers to avoid duplicate output on reconfiguration.
    # Close each handler first: a bare handlers.clear() (as before) leaked
    # the open stream of any previously configured FileHandler.
    for old_handler in list(root_logger.handlers):
        root_logger.removeHandler(old_handler)
        try:
            old_handler.close()
        except Exception:
            # Best-effort cleanup; a broken handler must not stop reconfig.
            pass

    # Console handler
    if LoggerConfig.log_to_console:
        console_handler = logging.StreamHandler(sys.stdout)
        console_handler.setLevel(numeric_level)

        # Use custom formatter
        formatter = GepaFormatter(
            fmt=LoggerConfig.format,
            datefmt=LoggerConfig.date_format,
            use_colors=use_colors,
            include_emoji=include_emoji,
        )
        console_handler.setFormatter(formatter)
        root_logger.addHandler(console_handler)

    # File handler (if configured)
    if LoggerConfig.log_to_file:
        file_handler = logging.FileHandler(LoggerConfig.log_to_file)
        file_handler.setLevel(numeric_level)

        # File logs don't use colors or emoji.
        file_formatter = GepaFormatter(
            fmt=LoggerConfig.format,
            datefmt=LoggerConfig.date_format,
            use_colors=False,
            include_emoji=False,
        )
        file_handler.setFormatter(file_formatter)
        root_logger.addHandler(file_handler)

    # Apply module-specific level overrides.
    for module_name, module_level in LoggerConfig.module_levels.items():
        module_logger = logging.getLogger(module_name)
        module_logger.setLevel(getattr(logging, module_level.value))

    _logging_configured = True

    # Log that configuration is complete
    root_logger.debug(
        f"Logging configured: level={LoggerConfig.level.value}, "
        f"file={LoggerConfig.log_to_file}"
    )
186
+
187
+
188
@lru_cache(maxsize=128)
def get_logger(name: str) -> logging.Logger:
    """
    Get a logger instance for the given module name.

    This is the primary factory function for obtaining loggers.
    All GEPA modules should use this instead of logging.getLogger().

    Results are cached via lru_cache, so repeated calls with the same name
    return the same object without re-checking configuration.
    NOTE(review): because of the cache, overrides added to
    LoggerConfig.module_levels *after* the first call for a given name are
    not re-applied here — use set_log_level() for runtime changes.

    Args:
        name: Module name (typically __name__)

    Returns:
        Configured Logger instance

    Example:
        from gepa_optimizer.infrastructure.logging import get_logger

        logger = get_logger(__name__)
        logger.info("Starting process")
        logger.error("Failed to connect", exc_info=True)
    """
    global _logging_configured

    # First use anywhere in the process: fall back to default configuration.
    if not _logging_configured:
        configure_logging()

    logger = logging.getLogger(name)

    # Apply module-specific level if one was configured for this exact name.
    # (The previous no-op namespace check was dead code and has been removed.)
    if name in LoggerConfig.module_levels:
        logger.setLevel(getattr(logging, LoggerConfig.module_levels[name].value))

    return logger
227
+
228
+
229
def set_log_level(level: str, module: Optional[str] = None) -> None:
    """
    Dynamically change log level at runtime.

    Args:
        level: New log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        module: Optional module name. If None, changes global level.

    Example:
        # Enable debug for specific module
        set_log_level("DEBUG", "gepa_optimizer.core.optimizer")

        # Change global level
        set_log_level("WARNING")
    """
    parsed = LogLevel.from_string(level)
    numeric = getattr(logging, parsed.value)

    if module is None:
        # Global change: update the config, the root GEPA logger, and every
        # handler attached to it.
        LoggerConfig.level = parsed
        root = logging.getLogger(GEPA_LOGGER_NAME)
        root.setLevel(numeric)
        for handler in root.handlers:
            handler.setLevel(numeric)
    else:
        # Targeted change: apply immediately and remember the override so
        # later configuration passes keep it.
        logging.getLogger(module).setLevel(numeric)
        LoggerConfig.module_levels[module] = parsed
260
+
src/gepa_optimizer/llms/__init__.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LLM module for GEPA Optimizer
3
+ """
4
+
5
+ from .base_llm import BaseLLMClient
6
+ from .vision_llm import VisionLLMClient
7
+ from .batch_llm import BatchLLMClient
8
+ from .llego_enhanced_llm import LLEGOEnhancedLLMClient
9
+
10
+ __all__ = ["BaseLLMClient", "VisionLLMClient", "BatchLLMClient", "LLEGOEnhancedLLMClient"]
src/gepa_optimizer/llms/base_llm.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Base LLM client class for all LLM providers.
3
+ """
4
+
5
+ from abc import ABC, abstractmethod
6
+ from typing import Any, Dict, Optional, Union
7
+ import logging
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
class BaseLLMClient(ABC):
    """
    Abstract base class for all LLM clients.

    Provides a consistent interface for different LLM providers and models.
    """

    def __init__(self, provider: str, model_name: str, **kwargs):
        """
        Initialize LLM client.

        Args:
            provider: LLM provider (e.g., 'openai', 'anthropic')
            model_name: Specific model name
            **kwargs: Additional provider-specific parameters
        """
        self.provider = provider
        self.model_name = model_name
        # Per-instance logger namespaced by the concrete subclass name.
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")

        # Keep provider-specific options around for subclasses to consume.
        self.config = kwargs

    @abstractmethod
    def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> Dict[str, Any]:
        """
        Generate response from LLM.

        Args:
            system_prompt: System-level instructions
            user_prompt: User's input prompt
            **kwargs: Additional generation parameters (e.g., image_base64)

        Returns:
            Dictionary with 'content' key containing the generated response
            and additional metadata
        """
        pass

    def get_model_info(self) -> Dict[str, str]:
        """Get model information for logging and debugging"""
        info = {
            'provider': self.provider,
            'model_name': self.model_name,
            'class': type(self).__name__,
        }
        return info
src/gepa_optimizer/llms/batch_llm.py ADDED
@@ -0,0 +1,712 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Batch LLM Client for cost-effective processing using Gemini Batch API.
3
+
4
+ This client provides 50% cost savings by using Google's Gemini Batch API
5
+ instead of real-time API calls. Ideal for large-scale prompt optimization
6
+ where latency is acceptable.
7
+
8
+ Features:
9
+ - 50% cost reduction compared to standard API
10
+ - Automatic batching and job management
11
+ - Built-in retry and polling logic
12
+ - Thread-safe operation
13
+ - Comprehensive error handling
14
+
15
+ Author: GEPA Optimizer Team
16
+ """
17
+
18
+ import os
19
+ import json
20
+ import time
21
+ import logging
22
+ import tempfile
23
+ import io
24
+ from pathlib import Path
25
+ from typing import Dict, List, Any, Optional, Tuple
26
+ from .base_llm import BaseLLMClient
27
+
28
# Optional dependency: Pillow is used only to sniff uploaded image formats
# in _upload_images_for_batch(); when unavailable, images fall back to the
# PNG default path.
try:
    from PIL import Image
    PIL_AVAILABLE = True
except ImportError:
    PIL_AVAILABLE = False
    Image = None

# Optional dependency: the google-genai SDK is required at runtime —
# BatchLLMClient.__init__ raises ImportError when GENAI_AVAILABLE is False.
try:
    from google import genai
    from google.genai import types
    GENAI_AVAILABLE = True
except ImportError:
    GENAI_AVAILABLE = False
    genai = None
    types = None

# Module-level logger for this file.
logger = logging.getLogger(__name__)
45
+
46
+
47
+ class BatchLLMClient(BaseLLMClient):
48
+ """
49
+ Batch LLM client that uses Gemini Batch API for cost-effective processing.
50
+
51
+ This client processes multiple requests together in batch jobs, providing:
52
+ - 50% cost savings vs standard API
53
+ - No rate limit impact
54
+ - Automatic job management and polling
55
+
56
+ Usage:
57
+ >>> from gepa_optimizer.llms import BatchLLMClient
58
+ >>>
59
+ >>> client = BatchLLMClient(
60
+ ... provider="google",
61
+ ... model_name="gemini-2.5-flash",
62
+ ... api_key="your-key",
63
+ ... batch_size=20,
64
+ ... polling_interval=30
65
+ ... )
66
+ >>>
67
+ >>> # Use just like VisionLLMClient - adapter handles the rest!
68
+ >>> result = client.generate(
69
+ ... system_prompt="You are a helpful assistant",
70
+ ... user_prompt="Analyze this image",
71
+ ... image_base64="..."
72
+ ... )
73
+
74
+ Performance Note:
75
+ Batch processing adds latency (30s+ polling time) but reduces costs by 50%.
76
+ Choose this mode for large-scale optimization where cost > speed.
77
+ """
78
+
79
+ def __init__(
80
+ self,
81
+ provider: str,
82
+ model_name: str,
83
+ api_key: Optional[str] = None,
84
+ batch_size: int = 20,
85
+ polling_interval: int = 30,
86
+ max_polling_time: int = 3600,
87
+ temp_dir: str = ".gepa_batch_temp",
88
+ **kwargs
89
+ ):
90
+ """
91
+ Initialize Batch LLM Client.
92
+
93
+ Args:
94
+ provider: Must be "google" or "gemini"
95
+ model_name: Gemini model (e.g., "gemini-2.5-flash", "gemini-1.5-flash")
96
+ api_key: Google API key (defaults to GEMINI_API_KEY env var)
97
+ batch_size: Number of samples to process per batch job (1-100)
98
+ polling_interval: Seconds between job status checks (default: 30)
99
+ max_polling_time: Maximum seconds to wait for job completion (default: 3600)
100
+ temp_dir: Directory for temporary files (default: ".gepa_batch_temp")
101
+ **kwargs: Additional parameters
102
+
103
+ Raises:
104
+ ValueError: If provider is not Google/Gemini
105
+ ImportError: If google-genai is not installed
106
+ """
107
+ super().__init__(provider=provider, model_name=model_name, **kwargs)
108
+
109
+ # Validate provider
110
+ if provider.lower() not in ["google", "gemini"]:
111
+ raise ValueError(
112
+ f"BatchLLMClient only supports Google/Gemini provider. Got: {provider}"
113
+ )
114
+
115
+ # Check dependencies
116
+ if not GENAI_AVAILABLE:
117
+ raise ImportError(
118
+ "google-genai not installed. Install with: pip install google-genai"
119
+ )
120
+
121
+ # Configuration
122
+ self.batch_size = batch_size
123
+ self.polling_interval = polling_interval
124
+ self.max_polling_time = max_polling_time
125
+ self.temp_dir = Path(temp_dir)
126
+ self.temp_dir.mkdir(exist_ok=True)
127
+
128
+ # Initialize Gemini client
129
+ from ..utils.api_keys import APIKeyManager
130
+ self.api_key = api_key or APIKeyManager().get_api_key("google")
131
+
132
+ if not self.api_key:
133
+ raise ValueError(
134
+ "Google API key required. Provide via api_key parameter or "
135
+ "set GEMINI_API_KEY environment variable."
136
+ )
137
+
138
+ self.client = genai.Client(api_key=self.api_key)
139
+
140
+ logger.info(
141
+ f"✓ BatchLLMClient initialized: {model_name} "
142
+ f"(batch_size={batch_size}, polling={polling_interval}s)"
143
+ )
144
+
145
+ def generate(
146
+ self,
147
+ system_prompt: str,
148
+ user_prompt: str,
149
+ image_base64: Optional[str] = None,
150
+ **kwargs
151
+ ) -> Dict[str, Any]:
152
+ """
153
+ Generate response using batch API.
154
+
155
+ Note: This method is primarily for compatibility. For batch optimization,
156
+ the adapter will call generate_batch() directly with multiple requests.
157
+
158
+ Args:
159
+ system_prompt: System-level instructions
160
+ user_prompt: User's input prompt
161
+ image_base64: Optional base64 encoded image
162
+ **kwargs: Additional generation parameters
163
+
164
+ Returns:
165
+ Dict with 'content' key containing generated text
166
+ """
167
+ # Single request - process as a batch of 1
168
+ requests = [{
169
+ 'system_prompt': system_prompt,
170
+ 'user_prompt': user_prompt,
171
+ 'image_base64': image_base64
172
+ }]
173
+
174
+ results = self.generate_batch(requests)
175
+ return results[0] if results else {"content": "", "error": "No results"}
176
+
177
+ def generate_batch(
178
+ self,
179
+ requests: List[Dict[str, Any]],
180
+ timeout_override: Optional[int] = None
181
+ ) -> List[Dict[str, Any]]:
182
+ """
183
+ Process multiple requests in a single batch job.
184
+
185
+ This is the main method called by UniversalGepaAdapter during GEPA optimization.
186
+
187
+ Args:
188
+ requests: List of request dicts with keys:
189
+ - system_prompt: System instructions
190
+ - user_prompt: User input
191
+ - image_base64: Optional base64 image
192
+ timeout_override: Override max_polling_time for this batch
193
+
194
+ Returns:
195
+ List of response dicts with 'content' key
196
+
197
+ Raises:
198
+ RuntimeError: If batch job fails
199
+ TimeoutError: If polling exceeds timeout
200
+ """
201
+ logger.info(f"📦 Processing batch of {len(requests)} requests via Gemini Batch API...")
202
+
203
+ start_time = time.time()
204
+
205
+ try:
206
+ # Step 1: Upload images if needed
207
+ file_uris, mime_types = self._upload_images_for_batch(requests)
208
+
209
+ # Step 2: Create JSONL file
210
+ jsonl_path = self._create_batch_jsonl(requests, file_uris, mime_types)
211
+
212
+ # Step 3: Submit batch job
213
+ batch_job_name = self._submit_batch_job(jsonl_path)
214
+
215
+ # Step 4: Wait for completion
216
+ timeout = timeout_override or self.max_polling_time
217
+ self._wait_for_batch_completion(batch_job_name, timeout)
218
+
219
+ # Step 5: Retrieve results
220
+ results = self._retrieve_batch_results(batch_job_name)
221
+
222
+ # Cleanup
223
+ jsonl_path.unlink(missing_ok=True)
224
+
225
+ elapsed_time = time.time() - start_time
226
+ logger.info(
227
+ f"✓ Batch processing complete: {len(results)} results in {elapsed_time:.1f}s "
228
+ f"(~{elapsed_time/len(results):.1f}s per request)"
229
+ )
230
+
231
+ return results
232
+
233
+ except Exception as e:
234
+ elapsed_time = time.time() - start_time
235
+ logger.error(f"❌ Batch processing failed after {elapsed_time:.1f}s: {e}")
236
+ raise
237
+
238
+ def _upload_images_for_batch(self, requests: List[Dict]) -> Tuple[List[Optional[str]], List[Optional[str]]]:
239
+ """
240
+ Upload images to Gemini and return file URIs and MIME types.
241
+
242
+ Args:
243
+ requests: List of request dicts
244
+
245
+ Returns:
246
+ Tuple of (file_uris, mime_types) - both are lists with None for requests without images
247
+ """
248
+ file_uris = []
249
+ mime_types = []
250
+ images_to_upload = sum(1 for r in requests if r.get('image_base64'))
251
+
252
+ if images_to_upload > 0:
253
+ logger.info(f" ⬆️ Uploading {images_to_upload} images to Gemini...")
254
+
255
+ for i, request in enumerate(requests):
256
+ image_base64 = request.get('image_base64')
257
+
258
+ if not image_base64:
259
+ file_uris.append(None)
260
+ mime_types.append(None)
261
+ continue
262
+
263
+ try:
264
+ # Decode image data
265
+ import base64
266
+ image_data = base64.b64decode(image_base64)
267
+
268
+ # Detect image format using Pillow
269
+ image_format = None
270
+ if PIL_AVAILABLE:
271
+ try:
272
+ img = Image.open(io.BytesIO(image_data))
273
+ image_format = img.format.lower() if img.format else None
274
+ except Exception as e:
275
+ logger.warning(f" ⚠️ Could not detect image format: {e}")
276
+
277
+ # Map format to extension and MIME type
278
+ format_map = {
279
+ 'jpeg': ('.jpg', 'image/jpeg'),
280
+ 'jpg': ('.jpg', 'image/jpeg'),
281
+ 'png': ('.png', 'image/png'),
282
+ 'gif': ('.gif', 'image/gif'),
283
+ 'webp': ('.webp', 'image/webp'),
284
+ 'bmp': ('.bmp', 'image/bmp'),
285
+ 'tiff': ('.tiff', 'image/tiff'),
286
+ 'tif': ('.tiff', 'image/tiff'),
287
+ }
288
+
289
+ # Get extension and MIME type (default to PNG if unknown)
290
+ ext, mime_type = format_map.get(image_format, ('.png', 'image/png'))
291
+
292
+ if image_format and image_format not in format_map:
293
+ logger.warning(f" ⚠️ Unknown image format '{image_format}' for image {i}, defaulting to PNG")
294
+ elif not image_format:
295
+ logger.debug(f" ℹ️ Could not detect format for image {i}, using PNG")
296
+
297
+ # Save to temp file with correct extension
298
+ temp_file = tempfile.NamedTemporaryFile(
299
+ delete=False,
300
+ suffix=ext,
301
+ dir=self.temp_dir
302
+ )
303
+ temp_file.write(image_data)
304
+ temp_file.close()
305
+
306
+ # Upload to Gemini with correct MIME type
307
+ uploaded_file = self.client.files.upload(
308
+ file=temp_file.name,
309
+ config=types.UploadFileConfig(
310
+ display_name=f"batch_image_{i}_{int(time.time())}{ext}",
311
+ mime_type=mime_type
312
+ )
313
+ )
314
+
315
+ logger.debug(f" ✓ Uploaded image {i} as {mime_type}")
316
+
317
+ # Wait for file to be active
318
+ self._wait_for_file_active(uploaded_file)
319
+ file_uris.append(uploaded_file.uri)
320
+ mime_types.append(mime_type)
321
+
322
+ # Cleanup temp file
323
+ Path(temp_file.name).unlink()
324
+
325
+ except Exception as e:
326
+ logger.error(f" ✗ Failed to upload image {i}: {e}")
327
+ file_uris.append(None)
328
+ mime_types.append(None)
329
+
330
+ if images_to_upload > 0:
331
+ successful = sum(1 for uri in file_uris if uri is not None)
332
+ logger.info(f" ✓ Uploaded {successful}/{images_to_upload} images successfully")
333
+
334
+ return file_uris, mime_types
335
+
336
+ def _create_batch_jsonl(
337
+ self,
338
+ requests: List[Dict],
339
+ file_uris: List[Optional[str]],
340
+ mime_types: List[Optional[str]]
341
+ ) -> Path:
342
+ """
343
+ Create JSONL file for batch job.
344
+
345
+ Args:
346
+ requests: List of request dicts
347
+ file_uris: List of uploaded file URIs
348
+ mime_types: List of MIME types for uploaded files
349
+
350
+ Returns:
351
+ Path to created JSONL file
352
+ """
353
+ timestamp = int(time.time())
354
+ jsonl_path = self.temp_dir / f"batch_{timestamp}.jsonl"
355
+
356
+ with open(jsonl_path, 'w', encoding='utf-8') as f:
357
+ for i, (request, file_uri, mime_type) in enumerate(zip(requests, file_uris, mime_types)):
358
+ # Combine system and user prompts
359
+ system_prompt = request.get('system_prompt', '')
360
+ user_prompt = request.get('user_prompt', '')
361
+ full_prompt = f"{system_prompt}\n\n{user_prompt}".strip()
362
+
363
+ # Build request parts
364
+ parts = [{"text": full_prompt}]
365
+
366
+ if file_uri:
367
+ parts.append({
368
+ "file_data": {
369
+ "file_uri": file_uri,
370
+ "mime_type": mime_type or "image/png" # Use actual MIME type
371
+ }
372
+ })
373
+
374
+ # Gemini Batch API format according to official docs
375
+ # Reference: https://ai.google.dev/gemini-api/docs/batch-inference
376
+ # NOTE: The "request" wrapper is REQUIRED for Gemini 2.5 batch API
377
+ batch_request = {
378
+ "custom_id": f"request-{i}",
379
+ "request": {
380
+ "contents": [{
381
+ "role": "user",
382
+ "parts": parts
383
+ }]
384
+ }
385
+ }
386
+
387
+ f.write(json.dumps(batch_request, ensure_ascii=False) + '\n')
388
+
389
+ logger.info(f" 📝 Created JSONL file: {jsonl_path.name} ({len(requests)} requests)")
390
+ return jsonl_path
391
+
392
    def _submit_batch_job(self, jsonl_path: Path) -> str:
        """
        Submit batch job to Gemini.

        Uploads the JSONL request file (with a two-method fallback, since the
        google-genai SDK upload path can be finicky), waits for the file to be
        processed, then creates the batch job against ``self.model_name``.

        Args:
            jsonl_path: Path to JSONL file

        Returns:
            Batch job name

        Raises:
            ValueError: If the JSONL file contains a line that is not valid JSON.
            RuntimeError: If upload or batch-job creation fails.
        """
        # Upload JSONL file
        # Try multiple methods as the google-genai SDK can be finicky
        try:
            logger.info(f" 📤 Uploading JSONL file: {jsonl_path.name}")

            # Read and validate file content
            with open(jsonl_path, 'r', encoding='utf-8') as f:
                content = f.read()
                line_count = len(content.strip().split('\n'))
                logger.debug(f" 📄 JSONL: {len(content)} bytes, {line_count} lines")

            # Validate JSONL format up front so a malformed line fails fast
            # locally instead of as an opaque server-side batch error.
            for line_num, line in enumerate(content.strip().split('\n'), 1):
                try:
                    json.loads(line)
                except json.JSONDecodeError as e:
                    logger.error(f" ❌ Invalid JSON at line {line_num}: {e}")
                    logger.error(f" Content: {line[:100]}...")
                    raise ValueError(f"Invalid JSONL format at line {line_num}") from e

            # Method 1: Try uploading with Path object
            logger.info(f" 🔄 Upload method 1: Using Path object...")
            try:
                jsonl_file = self.client.files.upload(
                    file=jsonl_path,
                    config=types.UploadFileConfig(
                        display_name=f'gepa-batch-{int(time.time())}',
                        mime_type='application/json'  # Try application/json instead of application/jsonl
                    )
                )
                logger.info(f" ✓ JSONL file uploaded: {jsonl_file.name}")

            except Exception as e1:
                logger.warning(f" ⚠️ Method 1 failed: {e1}")
                logger.info(f" 🔄 Upload method 2: Using string path...")

                # Method 2: Fallback to string path
                try:
                    jsonl_file = self.client.files.upload(
                        file=str(jsonl_path.absolute()),
                        config=types.UploadFileConfig(
                            display_name=f'gepa-batch-{int(time.time())}',
                            mime_type='application/json'
                        )
                    )
                    logger.info(f" ✓ JSONL file uploaded (method 2): {jsonl_file.name}")
                except Exception as e2:
                    logger.error(f" ❌ Method 2 also failed: {e2}")
                    raise e2

        except KeyError as e:
            # KeyError here historically indicates the SDK's response schema
            # changed out from under us, not a problem with our payload.
            logger.error(f"❌ KeyError during JSONL upload: {e}")
            logger.error(f" This suggests the Gemini API response format changed")
            logger.error(f" Try updating google-genai: pip install --upgrade google-genai")
            raise RuntimeError(f"Gemini Batch API response format error: {e}") from e
        except Exception as e:
            logger.error(f"❌ Failed to upload JSONL file: {e}")
            logger.error(f" File path: {jsonl_path}")
            logger.error(f" File exists: {jsonl_path.exists()}")
            logger.error(f" File size: {jsonl_path.stat().st_size if jsonl_path.exists() else 'N/A'} bytes")
            raise RuntimeError(f"Gemini Batch API file upload failed: {e}") from e

        # Wait for JSONL to be active
        try:
            logger.info(f" ⏳ Waiting for JSONL file to be processed...")
            self._wait_for_file_active(jsonl_file)
        except Exception as e:
            logger.error(f"❌ JSONL file processing failed: {e}")
            raise

        # Create batch job
        try:
            logger.info(f" 🚀 Creating batch job...")
            batch_job = self.client.batches.create(
                model=self.model_name,
                src=jsonl_file.name,
                config={'display_name': f'gepa-opt-{int(time.time())}'}
            )

            logger.info(f" ✓ Batch job submitted: {batch_job.name}")
            return batch_job.name

        except Exception as e:
            logger.error(f"❌ Failed to create batch job: {e}")
            raise RuntimeError(f"Batch job creation failed: {e}") from e
487
+
488
    def _wait_for_batch_completion(self, job_name: str, timeout: int):
        """
        Poll batch job until completion.

        Polls every ``self.polling_interval`` seconds. Both the bare and
        ``JOB_STATE_``-prefixed state names are accepted because different
        SDK versions report them differently.

        Args:
            job_name: Batch job name
            timeout: Maximum seconds to wait

        Raises:
            TimeoutError: If polling exceeds timeout
            RuntimeError: If batch job fails or is cancelled
        """
        logger.info(f" ⏳ Polling for completion (checking every {self.polling_interval}s)...")

        start_time = time.time()
        poll_count = 0

        while True:
            elapsed = time.time() - start_time

            # Timeout is checked before each poll so a hung API call can't
            # extend the wait indefinitely between checks.
            if elapsed > timeout:
                raise TimeoutError(
                    f"Batch job timeout after {elapsed:.0f}s "
                    f"(max: {timeout}s)"
                )

            try:
                batch_job = self.client.batches.get(name=job_name)
                state = batch_job.state.name

                # Success states
                if state in ['JOB_STATE_SUCCEEDED', 'SUCCEEDED']:
                    logger.info(f" ✓ Batch job completed in {elapsed:.0f}s")
                    return

                # Failure states
                if state in ['JOB_STATE_FAILED', 'FAILED']:
                    raise RuntimeError(f"Batch job failed with state: {state}")

                if state in ['JOB_STATE_CANCELLED', 'CANCELLED']:
                    raise RuntimeError(f"Batch job was cancelled: {state}")

                # Still processing
                poll_count += 1
                if poll_count % 5 == 0:  # Log every 5 polls
                    logger.info(f" ... still processing ({elapsed:.0f}s elapsed, state: {state})")

                time.sleep(self.polling_interval)

            except (TimeoutError, RuntimeError):
                # Our own terminal errors propagate; only transient API
                # errors below are retried.
                raise
            except Exception as e:
                logger.warning(f" ⚠️ Error checking job status: {e}, retrying...")
                time.sleep(5)
542
+
543
+ def _retrieve_batch_results(self, job_name: str) -> List[Dict[str, Any]]:
544
+ """
545
+ Retrieve and parse batch results.
546
+
547
+ Args:
548
+ job_name: Batch job name
549
+
550
+ Returns:
551
+ List of result dicts
552
+ """
553
+ batch_job = self.client.batches.get(name=job_name)
554
+
555
+ # Check for inline responses (preferred)
556
+ if hasattr(batch_job.dest, 'inlined_responses') and batch_job.dest.inlined_responses:
557
+ logger.info(f" 📥 Processing inline responses...")
558
+ return self._parse_inline_results(batch_job.dest.inlined_responses)
559
+
560
+ # Download results file (fallback)
561
+ if hasattr(batch_job.dest, 'file_name') and batch_job.dest.file_name:
562
+ logger.info(f" 📥 Downloading results file: {batch_job.dest.file_name}")
563
+ file_data = self.client.files.download(file=batch_job.dest.file_name)
564
+ return self._parse_file_results(file_data)
565
+
566
+ raise RuntimeError("No results available from batch job")
567
+
568
+ def _parse_inline_results(self, inline_responses) -> List[Dict[str, Any]]:
569
+ """Parse inline batch results."""
570
+ results = []
571
+
572
+ for response_obj in inline_responses:
573
+ if hasattr(response_obj, 'response') and response_obj.response:
574
+ text = self._extract_text_from_response(response_obj.response)
575
+ results.append({
576
+ "content": text,
577
+ "role": "assistant",
578
+ "model": self.model_name,
579
+ "provider": "google"
580
+ })
581
+ else:
582
+ error_msg = str(getattr(response_obj, 'error', 'Unknown error'))
583
+ logger.warning(f" ⚠️ Response error: {error_msg}")
584
+ results.append({
585
+ "content": "",
586
+ "error": error_msg
587
+ })
588
+
589
+ return results
590
+
591
+ def _parse_file_results(self, file_data) -> List[Dict[str, Any]]:
592
+ """Parse JSONL results file."""
593
+ if isinstance(file_data, bytes):
594
+ jsonl_content = file_data.decode('utf-8')
595
+ else:
596
+ jsonl_content = file_data
597
+
598
+ results = []
599
+
600
+ for line_num, line in enumerate(jsonl_content.strip().split('\n'), 1):
601
+ if not line.strip():
602
+ continue
603
+
604
+ try:
605
+ result = json.loads(line)
606
+
607
+ if 'response' in result:
608
+ text = self._extract_text_from_dict(result['response'])
609
+ results.append({
610
+ "content": text,
611
+ "role": "assistant",
612
+ "model": self.model_name,
613
+ "provider": "google"
614
+ })
615
+ else:
616
+ error_msg = result.get('error', 'Unknown error')
617
+ logger.warning(f" ⚠️ Line {line_num} error: {error_msg}")
618
+ results.append({
619
+ "content": "",
620
+ "error": error_msg
621
+ })
622
+
623
+ except json.JSONDecodeError as e:
624
+ logger.error(f" ✗ Line {line_num}: JSON decode error: {e}")
625
+ results.append({"content": "", "error": f"JSON decode error: {e}"})
626
+
627
+ return results
628
+
629
+ def _extract_text_from_response(self, response_obj) -> str:
630
+ """Extract text from response object."""
631
+ try:
632
+ # Direct text attribute
633
+ if hasattr(response_obj, 'text'):
634
+ return response_obj.text
635
+
636
+ # Navigate through candidates
637
+ if hasattr(response_obj, 'candidates') and response_obj.candidates:
638
+ candidate = response_obj.candidates[0]
639
+ if hasattr(candidate, 'content'):
640
+ content = candidate.content
641
+ if hasattr(content, 'parts') and content.parts:
642
+ part = content.parts[0]
643
+ if hasattr(part, 'text'):
644
+ return part.text
645
+
646
+ # Fallback to string representation
647
+ return str(response_obj)
648
+
649
+ except Exception as e:
650
+ logger.error(f"Error extracting text from response: {e}")
651
+ return ""
652
+
653
+ def _extract_text_from_dict(self, response_dict: Dict) -> str:
654
+ """Extract text from response dictionary."""
655
+ try:
656
+ # Direct text key
657
+ if 'text' in response_dict:
658
+ return response_dict['text']
659
+
660
+ # Navigate through candidates
661
+ if 'candidates' in response_dict and response_dict['candidates']:
662
+ candidate = response_dict['candidates'][0]
663
+ if 'content' in candidate and 'parts' in candidate['content']:
664
+ parts = candidate['content']['parts']
665
+ if parts and 'text' in parts[0]:
666
+ return parts[0]['text']
667
+
668
+ # Fallback to JSON string
669
+ return json.dumps(response_dict)
670
+
671
+ except Exception as e:
672
+ logger.error(f"Error extracting text from dict: {e}")
673
+ return ""
674
+
675
+ def _wait_for_file_active(self, uploaded_file, timeout: int = 60):
676
+ """
677
+ Wait for uploaded file to become active.
678
+
679
+ Args:
680
+ uploaded_file: Uploaded file object
681
+ timeout: Maximum seconds to wait
682
+
683
+ Raises:
684
+ TimeoutError: If file processing exceeds timeout
685
+ RuntimeError: If file processing fails
686
+ """
687
+ start_time = time.time()
688
+
689
+ while uploaded_file.state.name == "PROCESSING":
690
+ if time.time() - start_time > timeout:
691
+ raise TimeoutError(f"File processing timeout: {uploaded_file.name}")
692
+
693
+ time.sleep(1)
694
+ uploaded_file = self.client.files.get(name=uploaded_file.name)
695
+
696
+ if uploaded_file.state.name != "ACTIVE":
697
+ raise RuntimeError(
698
+ f"File processing failed: {uploaded_file.name} "
699
+ f"(state: {uploaded_file.state.name})"
700
+ )
701
+
702
+ def get_model_info(self) -> Dict[str, str]:
703
+ """Get model information for logging and debugging."""
704
+ return {
705
+ 'provider': self.provider,
706
+ 'model_name': self.model_name,
707
+ 'class': self.__class__.__name__,
708
+ 'mode': 'batch',
709
+ 'batch_size': str(self.batch_size),
710
+ 'polling_interval': f'{self.polling_interval}s'
711
+ }
712
+
src/gepa_optimizer/llms/llego_enhanced_llm.py ADDED
@@ -0,0 +1,1625 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LLEGO-Enhanced LLM Client Wrapper
3
+
4
+ This wrapper intercepts LLM calls and uses LLEGO genetic operators
5
+ when generating new prompt candidates during GEPA's reflection phase.
6
+ """
7
+
8
+ import logging
9
+ import re
10
+ from typing import Optional, Dict, Any, Callable, List
11
+ from .base_llm import BaseLLMClient
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
# Fallback system prompt for sequential generation (when JSON parsing fails).
# Uses a Linear Command structure for reliability when complex JSON generation
# fails; it instructs the LLM to emit ONLY raw prompt text (no JSON/meta-text),
# which downstream parsing depends on.
_FALLBACK_SYSTEM_PROMPT = """You are a Prompt Optimization Engine operating in **SAFE MODE**.

<task>
Rewrite the prompt based on the feedback provided below.
</task>

<output_rules>
1. Output **ONLY** the new prompt text.
2. No JSON. No Explanations. No "Here is the prompt".
3. The prompt must be fully functional and self-contained.
4. START directly with the prompt content (e.g., "You are a..." or task instructions).
5. Preserve the core task/domain - only improve HOW it's described.
</output_rules>

<quality_standards>
- Be specific and concrete (no vague instructions)
- Use clear, imperative language
- Include edge case handling if feedback identifies confusion
- Ensure the prompt is self-contained and unambiguous
- Add explicit constraints for format/output if needed
</quality_standards>

<forbidden_outputs>
- Analysis of what went wrong
- Explanations of your changes
- Meta-text like "Here's an improved version..."
- Anything other than the raw prompt text
</forbidden_outputs>

Start of New Prompt:"""
47
+
48
+
49
+ class LLEGOEnhancedLLMClient(BaseLLMClient):
50
+ """
51
+ Wrapper around BaseLLMClient that uses LLEGO for candidate generation.
52
+
53
+ This wrapper detects when GEPA is asking for new prompt candidates
54
+ and routes those requests through LLEGO's genetic operators instead
55
+ of standard LLM generation.
56
+ """
57
+
58
    def __init__(
        self,
        base_llm: BaseLLMClient,
        llego_layer,
        config=None,
        verbose: bool = True
    ) -> None:
        """
        Initialize LLEGO-enhanced LLM client.

        Args:
            base_llm: The underlying LLM client (VisionLLMClient, etc.)
            llego_layer: LLEGOIntegrationLayer instance
            config: Optional OptimizationConfig for hybrid mode settings
            verbose: Whether to log LLEGO operations
        """
        self.base_llm = base_llm
        self.llego = llego_layer
        self.config = config
        self.verbose = verbose

        # Get log level from config (default to INFO)
        self.log_level = getattr(config, 'log_level', 'INFO') if config else 'INFO'

        # Track context for detecting reflection calls
        self.reflection_context = {
            'current_prompt': None,
            'feedback': None,
            'in_reflection': False
        }

        # Queue for hybrid mode candidates (GEPA will call generate() multiple times)
        self._candidate_queue = []
        self._hybrid_generation_complete = False

        # 🔥 CRITICAL: Queue for adapter-generated candidates (from make_reflective_dataset)
        # When adapter generates candidates at adapter level, they're stored here
        # GEPA will call generate() for proposals, and we'll return these candidates
        self._adapter_generated_candidates = []

        # 🔥 FORMAT AWARENESS: Store format info from adapter for use in candidate generation
        self._detected_format = None  # Will be set by adapter after format detection

        # FIX #5: Circuit breaker for LLEGO failures — after the threshold of
        # consecutive failures LLEGO is disabled and calls fall through to base_llm.
        self._llego_failures = 0
        self._llego_disabled = False
        self._llego_failure_threshold = 3  # Disable after 3 consecutive failures

        logger.info("🧬 LLEGO-Enhanced LLM Client initialized")
        logger.info(f" Base LLM: {base_llm.__class__.__name__}")
        logger.info(f" LLEGO enabled: {llego_layer is not None}")
        if config and hasattr(config, 'enable_gepa_reflection_with_llego'):
            logger.info(f" Hybrid mode: {config.enable_gepa_reflection_with_llego}")
        logger.debug(f" Log level: {self.log_level}")
113
+
114
+ def _should_log_debug(self) -> bool:
115
+ """
116
+ Check if DEBUG logging is enabled.
117
+
118
+ Returns:
119
+ True if DEBUG level logging is enabled, False otherwise
120
+ """
121
+ return self.log_level == "DEBUG" or (
122
+ hasattr(logging, 'getLogger') and
123
+ logging.getLogger().isEnabledFor(logging.DEBUG)
124
+ )
125
+
126
    def _extract_clean_prompt_from_reflection(self, reflection_output: str) -> str:
        """
        🛡️ DEFENSIVE FALLBACK: Extract clean prompt if LLM adds analysis despite system prompt instructions.

        NOTE: The system prompt now explicitly instructs the LLM to output ONLY the prompt text.
        However, this extraction logic serves as a safety net in case the LLM still adds:
        "Based on the performance analysis...
        ### Recommendations...
        ### Revised Prompt Example:
        [THE ACTUAL PROMPT HERE]
        ### Conclusion..."

        This is now a defensive measure, not the primary mechanism. Extraction
        strategies are tried in order of confidence: explicit "Revised Prompt"
        headers, a "You are..." block, a short analysis-free output, then a
        looser "You are..." match, and finally the original text unchanged.

        Args:
            reflection_output: Full reflection output (should be clean prompt, but may contain analysis)

        Returns:
            str: Clean, extracted prompt (or original if extraction fails or not needed)
        """
        # Non-string / empty inputs are passed through untouched.
        if not reflection_output or not isinstance(reflection_output, str):
            return reflection_output

        # Pattern 1: Look for "Revised Prompt Example:" or "### Revised Prompt Example:"
        patterns = [
            r'(?:###\s*)?Revised\s+Prompt\s+(?:Example|:)?\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
            r'(?:###\s*)?Revised\s+Prompt\s*:\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
            r'(?:###\s*)?Optimized\s+Prompt\s*:\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
            r'(?:###\s*)?New\s+Prompt\s*:\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
            r'(?:Here\s+is|Here\'s)\s+a?\s*refined?\s+(?:version\s+of\s+)?(?:the\s+)?prompt\s*[:\n](.*?)(?:\n###|\n##|\n---|\Z)',
        ]

        for pattern in patterns:
            match = re.search(pattern, reflection_output, re.IGNORECASE | re.DOTALL)
            if match:
                extracted = match.group(1).strip()
                # Clean up common artifacts (surrounding markdown code fences)
                extracted = re.sub(r'^```(?:plaintext|markdown|text)?\s*\n', '', extracted, flags=re.MULTILINE)
                extracted = re.sub(r'\n```\s*$', '', extracted, flags=re.MULTILINE)
                extracted = extracted.strip()

                if len(extracted) > 50:  # Reasonable minimum length for a prompt
                    logger.debug(f"✅ Extracted clean prompt using pattern: {pattern[:50]}...")
                    logger.debug(f" Original length: {len(reflection_output)} chars")
                    logger.debug(f" Extracted length: {len(extracted)} chars")
                    return extracted

        # Pattern 2: If output starts with a quote or prompt-like structure
        # Look for text that starts with "You are..." and is substantial
        if 'You are' in reflection_output:
            # Find the longest continuous block that starts with "You are"
            prompt_match = re.search(r'(You are[^#]*?)(?:\n###|\n##|###|##|Conclusion|\Z)',
                                     reflection_output, re.IGNORECASE | re.DOTALL)
            if prompt_match:
                extracted = prompt_match.group(1).strip()
                if len(extracted) > 50:
                    logger.debug(f"✅ Extracted prompt starting with 'You are...'")
                    return extracted

        # Pattern 3: If the reflection output is actually just a clean prompt (no analysis)
        # Check if it's relatively short and doesn't contain analysis keywords
        analysis_keywords = ['recommendation', 'suggestion', 'improvement', 'conclusion',
                             'optimization', 'analysis', 'feedback']
        if (len(reflection_output) < 2000 and
                not any(keyword in reflection_output.lower() for keyword in analysis_keywords)):
            # Likely a clean prompt, return as-is
            logger.debug(f"✅ Reflection output appears to be a clean prompt (no analysis detected)")
            return reflection_output.strip()

        # Fallback: Try to extract ANY valid prompt-like text
        # Look for text that might be a prompt even if not perfectly formatted
        if 'You are' in reflection_output:
            # Try to find a substantial block starting with "You are"
            potential_prompt = re.search(
                r'(You are(?:[^\.]|\.(?!\s*(?:Here|This|These|The above)))*?)(?:\n\n|\n###|Conclusion|\Z)',
                reflection_output,
                re.IGNORECASE | re.DOTALL
            )
            if potential_prompt and len(potential_prompt.group(1)) > 100:
                extracted = potential_prompt.group(1).strip()
                logger.warning(f"⚠️ Could not extract clean prompt using standard patterns")
                logger.warning(f" Falling back to 'You are...' block (length: {len(extracted)} chars)")
                logger.warning(f" This may still contain some analysis text")
                return extracted

        # Final fallback: If still nothing, return original but log strongly
        logger.warning(f"⚠️ Could not extract clean prompt from reflection output")
        logger.warning(f" Output length: {len(reflection_output)} chars")
        logger.warning(f" Output preview: {reflection_output[:200]}...")
        logger.warning(f" ⚠️ WARNING: Returning original output (may contain analysis text or be invalid)")
        logger.warning(f" This candidate may perform poorly - consider improving extraction logic")
        return reflection_output.strip()
218
+
219
+ def _parse_json_variations(self, response_text: str, num_expected: int) -> List[str]:
220
+ """
221
+ 🔥 OPTIMIZED: Parse N prompt variations from JSON format response.
222
+
223
+ Uses robust JSON parsing with multiple fallback strategies.
224
+
225
+ Handles common LLM output issues:
226
+ - Markdown code blocks (```json ... ```)
227
+ - Extra text before/after JSON
228
+ - Trailing commas
229
+ - Comments in JSON
230
+ - Newlines in strings
231
+ """
232
+ import json
233
+ import re
234
+
235
+ if not response_text or not isinstance(response_text, str):
236
+ raise ValueError("Empty or invalid response text")
237
+
238
+ # 🔥 PREPROCESSING: Clean LLM output
239
+ cleaned = response_text.strip()
240
+
241
+ # Remove BOM and invisible chars
242
+ cleaned = cleaned.lstrip('\ufeff\u200b\u200c\u200d')
243
+
244
+ # Strategy 0: Handle Python dict syntax (single quotes -> double quotes)
245
+ # LLMs sometimes return Python dict syntax {'key': 'value'} instead of JSON {"key": "value"}
246
+ if "'variations'" in cleaned or (cleaned.startswith("{'") or cleaned.startswith("{'variations'")):
247
+ try:
248
+ import ast
249
+ # Try to parse as Python literal (handles single quotes, True/False, None)
250
+ python_dict = ast.literal_eval(cleaned)
251
+ if isinstance(python_dict, dict) and 'variations' in python_dict:
252
+ # Convert to JSON-compatible format
253
+ json_str = json.dumps(python_dict)
254
+ data = json.loads(json_str)
255
+ if 'variations' in data:
256
+ # #region agent log
257
+ import json as _json_debug
258
+ import time as _time_debug
259
+ import os as _os_debug
260
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
261
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
262
+ with open(_debug_log_path, "a") as _f:
263
+ _f.write(_json_debug.dumps({"hypothesisId": "JSON_FIX", "location": "llego_enhanced_llm.py:python_dict_parse", "message": "Successfully parsed Python dict syntax", "data": {"num_expected": num_expected, "parsed_variations": len(data.get('variations', []))}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
264
+ # #endregion
265
+ return self._extract_variations_from_json(data, num_expected)
266
+ except (ValueError, SyntaxError, TypeError) as e:
267
+ # If ast.literal_eval fails, try string replacement as fallback
268
+ try:
269
+ # Simple conversion: replace single quotes with double quotes (with escaping)
270
+ # This is a heuristic and may not work for all cases
271
+ converted = cleaned.replace("'", '"')
272
+ data = json.loads(converted)
273
+ if 'variations' in data:
274
+ # #region agent log
275
+ import json as _json_debug
276
+ import time as _time_debug
277
+ import os as _os_debug
278
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
279
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
280
+ with open(_debug_log_path, "a") as _f:
281
+ _f.write(_json_debug.dumps({"hypothesisId": "JSON_FIX", "location": "llego_enhanced_llm.py:python_dict_string_replace", "message": "Parsed Python dict via string replacement", "data": {"num_expected": num_expected, "parsed_variations": len(data.get('variations', []))}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
282
+ # #endregion
283
+ return self._extract_variations_from_json(data, num_expected)
284
+ except json.JSONDecodeError:
285
+ pass
286
+
287
+ # Strategy 1: Direct JSON parse (cleanest case)
288
+ try:
289
+ data = json.loads(cleaned)
290
+ if 'variations' in data:
291
+ return self._extract_variations_from_json(data, num_expected)
292
+ except json.JSONDecodeError:
293
+ pass
294
+
295
+ # Strategy 2: Extract from markdown code block
296
+ # More permissive regex that handles various formats
297
+ code_block_patterns = [
298
+ r'```(?:json|JSON)?\s*(\{[\s\S]*?\})\s*```', # Standard markdown
299
+ r'```\s*(\{[\s\S]*"variations"[\s\S]*\})\s*```', # With "variations" keyword
300
+ ]
301
+
302
+ for pattern in code_block_patterns:
303
+ json_match = re.search(pattern, cleaned)
304
+ if json_match:
305
+ json_str = json_match.group(1)
306
+ try:
307
+ data = json.loads(json_str)
308
+ if 'variations' in data:
309
+ return self._extract_variations_from_json(data, num_expected)
310
+ except json.JSONDecodeError:
311
+ # Try repair
312
+ repaired = self._repair_json_string(json_str)
313
+ try:
314
+ data = json.loads(repaired)
315
+ if 'variations' in data:
316
+ return self._extract_variations_from_json(data, num_expected)
317
+ except json.JSONDecodeError:
318
+ pass
319
+
320
+ # Strategy 3: Balanced brace extraction (handles nested objects)
321
+ json_str = self._extract_balanced_json(cleaned)
322
+ if json_str:
323
+ try:
324
+ data = json.loads(json_str)
325
+ if 'variations' in data:
326
+ return self._extract_variations_from_json(data, num_expected)
327
+ except json.JSONDecodeError:
328
+ repaired = self._repair_json_string(json_str)
329
+ try:
330
+ data = json.loads(repaired)
331
+ if 'variations' in data:
332
+ return self._extract_variations_from_json(data, num_expected)
333
+ except json.JSONDecodeError:
334
+ pass
335
+
336
+ # Strategy 4: Find JSON object with "variations" keyword
337
+ # Use greedy matching to get the full object
338
+ json_match = re.search(r'(\{[\s\S]*"variations"[\s\S]*\})', cleaned)
339
+ if json_match:
340
+ json_str = json_match.group(1)
341
+ # Find the balanced JSON within
342
+ balanced = self._extract_balanced_json(json_str)
343
+ if balanced:
344
+ try:
345
+ data = json.loads(balanced)
346
+ if 'variations' in data:
347
+ return self._extract_variations_from_json(data, num_expected)
348
+ except json.JSONDecodeError:
349
+ repaired = self._repair_json_string(balanced)
350
+ try:
351
+ data = json.loads(repaired)
352
+ if 'variations' in data:
353
+ return self._extract_variations_from_json(data, num_expected)
354
+ except json.JSONDecodeError:
355
+ pass
356
+
357
+ # Strategy 5: Fallback to numbered sections
358
+ logger.warning(f"JSON parsing failed, trying numbered section fallback...")
359
+ try:
360
+ return self._parse_numbered_section_variations(response_text, num_expected)
361
+ except ValueError:
362
+ pass
363
+
364
+ # #region agent log
365
+ import json as _json_debug
366
+ import time as _time_debug
367
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
368
+ with open(_debug_log_path, "a") as _f:
369
+ _f.write(_json_debug.dumps({"hypothesisId": "D", "location": "llego_enhanced_llm.py:json_parse_fail", "message": "JSON parsing failed completely", "data": {"num_expected": num_expected, "response_preview": response_text[:500] if response_text else "EMPTY", "response_length": len(response_text) if response_text else 0}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
370
+ # #endregion
371
+
372
+ raise ValueError(f"Could not parse {num_expected} variations from response")
373
+
374
+ def _extract_balanced_json(self, text: str) -> Optional[str]:
375
+ """Extract JSON with balanced braces."""
376
+ brace_count = 0
377
+ start_idx = -1
378
+ in_string = False
379
+ escape_next = False
380
+
381
+ for i, char in enumerate(text):
382
+ # Handle string escaping
383
+ if escape_next:
384
+ escape_next = False
385
+ continue
386
+ if char == '\\' and in_string:
387
+ escape_next = True
388
+ continue
389
+ if char == '"' and not escape_next:
390
+ in_string = not in_string
391
+ continue
392
+
393
+ # Skip characters inside strings
394
+ if in_string:
395
+ continue
396
+
397
+ if char == '{':
398
+ if brace_count == 0:
399
+ start_idx = i
400
+ brace_count += 1
401
+ elif char == '}':
402
+ brace_count -= 1
403
+ if brace_count == 0 and start_idx >= 0:
404
+ return text[start_idx:i+1]
405
+
406
+ return None
407
+
408
+ def _repair_json_string(self, json_str: str) -> str:
409
+ """
410
+ Repair common JSON issues from LLM output.
411
+
412
+ Fixes:
413
+ - Trailing commas
414
+ - Comments
415
+ - Unescaped newlines in strings
416
+ """
417
+ repaired = json_str
418
+
419
+ # Remove trailing commas before } or ]
420
+ repaired = re.sub(r',\s*}', '}', repaired)
421
+ repaired = re.sub(r',\s*]', ']', repaired)
422
+
423
+ # Remove single-line comments
424
+ repaired = re.sub(r'//[^\n]*\n', '\n', repaired)
425
+
426
+ # Remove multi-line comments
427
+ repaired = re.sub(r'/\*[\s\S]*?\*/', '', repaired)
428
+
429
+ return repaired
430
+
431
+ def _extract_variations_from_json(self, data: Dict[str, Any], num_expected: int) -> List[str]:
432
+ """Extract and validate variations from parsed JSON data."""
433
+
434
+ if not isinstance(data, dict):
435
+ raise ValueError("JSON data is not a dictionary")
436
+
437
+ variations_list = data.get('variations', [])
438
+ if not isinstance(variations_list, list):
439
+ raise ValueError("'variations' field is not a list")
440
+
441
+ # Extract and sort by index
442
+ variations_with_index = []
443
+ for var in variations_list:
444
+ if not isinstance(var, dict):
445
+ continue
446
+ index = var.get('index', 0)
447
+ prompt = var.get('prompt', '')
448
+ if prompt and isinstance(prompt, str):
449
+ variations_with_index.append((index, prompt.strip()))
450
+
451
+ variations_with_index.sort(key=lambda x: x[0])
452
+ variations = [v[1] for v in variations_with_index]
453
+
454
+ # Validate count
455
+ if len(variations) < num_expected:
456
+ logger.warning(f"Only {len(variations)} valid variations found, expected {num_expected}")
457
+ while len(variations) < num_expected:
458
+ variations.append(variations[-1] if variations else "")
459
+
460
+ variations = variations[:num_expected]
461
+
462
+ if not all(v for v in variations):
463
+ raise ValueError(f"Some variations are empty after parsing")
464
+
465
+ return variations
466
+
467
+ def _parse_numbered_section_variations(self, response_text: str, num_expected: int) -> List[str]:
468
+ """Fallback parser: Extract variations from numbered sections."""
469
+ import re
470
+
471
+ variations = []
472
+
473
+ pattern1 = r'---\s*VARIATION\s+(\d+)\s*---\s*\n(.*?)(?=\n---\s*VARIATION|\Z)'
474
+ matches1 = re.findall(pattern1, response_text, re.DOTALL | re.IGNORECASE)
475
+
476
+ pattern2 = r'Variation\s+(\d+)\s*:?\s*\n(.*?)(?=\nVariation\s+\d+|$)'
477
+ matches2 = re.findall(pattern2, response_text, re.DOTALL | re.IGNORECASE)
478
+
479
+ pattern3 = r'(\d+)\.\s*\n(.*?)(?=\n\d+\.|$)'
480
+ matches3 = re.findall(pattern3, response_text, re.DOTALL)
481
+
482
+ matches = matches1 if len(matches1) >= num_expected else (matches2 if len(matches2) >= num_expected else matches3)
483
+
484
+ if len(matches) >= num_expected:
485
+ matches.sort(key=lambda x: int(x[0]))
486
+ variations = [match[1].strip() for match in matches[:num_expected]]
487
+
488
+ if len(variations) != num_expected:
489
+ raise ValueError(f"Numbered section parsing found {len(variations)} variations, expected {num_expected}")
490
+
491
+ return variations
492
+
493
+ def _is_valid_prompt(self, prompt: str) -> bool:
494
+ """
495
+ Validate that extracted text is actually a valid system prompt.
496
+
497
+ Uses minimal, conservative filtering: only rejects OBVIOUSLY wrong text.
498
+ Let evaluation decide on quality - false negatives (rejecting good prompts)
499
+ are worse than false positives (accepting bad prompts).
500
+
501
+ Args:
502
+ prompt: Extracted text to validate
503
+
504
+ Returns:
505
+ True if appears to be a valid prompt, False if obviously wrong
506
+ """
507
+ if not prompt or not prompt.strip():
508
+ return False
509
+
510
+ prompt_lower = prompt.lower().strip()
511
+
512
+ # STRONG indicators of analysis text (high confidence rejection)
513
+ # These are phrases that almost never appear in actual prompts
514
+ strong_analysis_patterns = [
515
+ 'in conclusion',
516
+ 'to summarize',
517
+ 'based on the analysis',
518
+ 'the analysis shows',
519
+ 'here are some suggestions',
520
+ 'it seems you\'re looking for',
521
+ ]
522
+
523
+ # Check first 200 characters for strong patterns
524
+ first_200 = prompt_lower[:200]
525
+ for pattern in strong_analysis_patterns:
526
+ if pattern in first_200:
527
+ if self._should_log_debug():
528
+ logger.debug(f"Rejected prompt: contains analysis pattern '{pattern}'")
529
+ return False
530
+
531
+ # POSITIVE indicators of valid prompt (high confidence acceptance)
532
+ # These are common prompt starters
533
+ valid_starters = [
534
+ 'you are',
535
+ 'you\'re',
536
+ 'your task',
537
+ 'your role',
538
+ 'analyze',
539
+ 'identify',
540
+ 'select',
541
+ 'determine',
542
+ 'given',
543
+ 'when',
544
+ ]
545
+
546
+ # If starts with valid prompt pattern, accept immediately
547
+ first_100 = prompt_lower[:100]
548
+ if any(first_100.startswith(starter) for starter in valid_starters):
549
+ return True
550
+
551
+ # DEFAULT: Accept everything else and let evaluation decide
552
+ # This is conservative - we'd rather evaluate a bad prompt than reject a good one
553
+ return True
554
+
555
+ def set_reflection_context(
556
+ self,
557
+ current_prompt: Optional[str] = None,
558
+ feedback: Optional[Any] = None,
559
+ in_reflection: bool = False
560
+ ):
561
+ """
562
+ Set context for the next generate() call.
563
+
564
+ Args:
565
+ current_prompt: The prompt being reflected upon
566
+ feedback: Evaluation feedback
567
+ in_reflection: Whether we're in reflection mode
568
+ """
569
+ self.reflection_context = {
570
+ 'current_prompt': current_prompt,
571
+ 'feedback': feedback,
572
+ 'in_reflection': in_reflection
573
+ }
574
+
575
+ # Reset candidate queue when entering new reflection phase
576
+ if in_reflection:
577
+ self._candidate_queue = []
578
+ self._hybrid_generation_complete = False
579
+ if self._should_log_debug():
580
+ logger.debug("🔄 Entering LLEGO reflection mode (queue reset)")
581
+ else:
582
+ logger.info("🔄 Entering LLEGO reflection mode")
583
+
584
+ def generate(
585
+ self,
586
+ system_prompt: str = "",
587
+ user_prompt: str = "",
588
+ image_base64: str = "",
589
+ **kwargs
590
+ ) -> Dict[str, Any]:
591
+ """
592
+ Generate response, using LLEGO for reflection calls.
593
+
594
+ 🔥 CRITICAL: This method intercepts ALL LLM calls. For candidate generation,
595
+ it checks if we have pre-generated candidates from hybrid mode and returns those.
596
+
597
+ Args:
598
+ system_prompt: System prompt
599
+ user_prompt: User prompt
600
+ image_base64: Base64-encoded image (if any)
601
+ **kwargs: Additional arguments
602
+
603
+ Returns:
604
+ Dict with 'content' key containing the generated text
605
+ """
606
+ # 🔍 DEBUG: Log generate calls (full details at DEBUG level)
607
+ if self._should_log_debug():
608
+ logger.debug(f"🔍 LLEGO Wrapper: generate() called")
609
+ logger.debug(f" system_prompt: '{system_prompt[:100]}...' (truncated)")
610
+ logger.debug(f" user_prompt length: {len(user_prompt)} chars")
611
+ logger.debug(f" in_reflection: {self.reflection_context['in_reflection']}")
612
+ logger.debug(f" has_image: {bool(image_base64)}")
613
+
614
+ # #region agent log
615
+ try:
616
+ import json as _json_debug
617
+ import time as _time_debug
618
+ import os as _os_debug
619
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
620
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
621
+ with open(_debug_log_path, "a") as _f:
622
+ _f.write(_json_debug.dumps({"hypothesisId": "INTERCEPTION", "location": "llego_enhanced_llm.py:generate", "message": "Generate called", "data": {"system_prompt_len": len(system_prompt), "user_prompt_len": len(user_prompt), "has_image": bool(image_base64), "has_candidates": len(getattr(self, '_adapter_generated_candidates', [])), "in_reflection": self.reflection_context.get('in_reflection', False)}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
623
+ except Exception:
624
+ pass
625
+ # #endregion
626
+
627
+ # 🔥 CRITICAL: Check if we have pre-generated candidates from adapter-level generation
628
+ # This happens when GEPA calls adapter.llm_client to generate candidates
629
+ # We intercept and return our pre-generated candidates instead
630
+ # 🔥 NEW: Select BEST candidate instead of FIFO
631
+ # 🔥 FIX: DON'T intercept evaluation calls (those have images!)
632
+ # Only intercept proposal calls (no images, just asking for new candidate)
633
+ # 🔥 FIX 2: DON'T intercept TEST EVALUATION calls!
634
+ # Test evaluation has no images but uses the OPTIMIZED prompt to execute tasks
635
+ # We detect test evaluation by checking if this is a TASK EXECUTION call (not reflection)
636
+ is_task_execution = (
637
+ # Task execution prompts contain task instructions, not optimization requests
638
+ not any(kw in system_prompt.lower() for kw in ['evolutionary', 'mutation', 'variation', 'optimize', 'improve prompt', 'rewrite', 'generate variations']) and
639
+ # Short prompts are usually task prompts, not optimization prompts
640
+ len(system_prompt) < 1000 and
641
+ # User prompt is the actual task input (short), not feedback (long)
642
+ len(user_prompt) < 2000
643
+ )
644
+
645
+ # Log task execution detection for debugging
646
+ if is_task_execution and hasattr(self, '_adapter_generated_candidates') and self._adapter_generated_candidates:
647
+ logger.info(f"🔒 NOT intercepting: Task execution detected (not optimization)")
648
+ logger.debug(f" system_prompt_len={len(system_prompt)}, user_prompt_len={len(user_prompt)}")
649
+
650
+ if hasattr(self, '_adapter_generated_candidates') and self._adapter_generated_candidates and not image_base64 and not is_task_execution:
651
+ # 🔥 BEST-CANDIDATE SELECTION: Find candidate with highest Dpareto score
652
+ # This ensures we use the best candidate for the current iteration
653
+ best_candidate = None
654
+ best_score = -float('inf')
655
+ best_idx = -1
656
+
657
+ # Check if candidates have scores stored
658
+ for idx, cand in enumerate(self._adapter_generated_candidates):
659
+ if isinstance(cand, dict):
660
+ # Try to get score from candidate dict
661
+ score = cand.get('score', -float('inf'))
662
+
663
+ # If score not in dict, try to get from Pareto logger
664
+ if score == -float('inf'):
665
+ from ..utils.pareto_logger import get_pareto_logger
666
+ pareto_log = get_pareto_logger()
667
+
668
+ # Look up score in Pareto front or evaluated candidates
669
+ cand_prompt = cand.get('prompt', '')
670
+ if cand_prompt:
671
+ normalized = cand_prompt.strip().strip('"\'')
672
+ # Check in Pareto front
673
+ for front_cand in pareto_log.pareto_front:
674
+ if front_cand.get('prompt', '').strip().strip('"\'') == normalized:
675
+ score = front_cand.get('score', -float('inf'))
676
+ break
677
+
678
+ # If not in front, check evaluated candidates
679
+ if score == -float('inf'):
680
+ for eval_cand in pareto_log.candidates_evaluated:
681
+ if eval_cand.get('prompt', '').strip().strip('"\'') == normalized:
682
+ score = eval_cand.get('score', -float('inf'))
683
+ break
684
+
685
+ if score > best_score:
686
+ best_score = score
687
+ best_candidate = cand
688
+ best_idx = idx
689
+
690
+ # If no scores found, fall back to FIFO (first candidate)
691
+ if best_candidate is None and self._adapter_generated_candidates:
692
+ best_candidate = self._adapter_generated_candidates[0]
693
+ best_idx = 0
694
+ logger.info(f"⚠️ No scores found for candidates - using FIFO selection")
695
+
696
+ # Remove selected candidate from queue
697
+ if best_idx >= 0:
698
+ self._adapter_generated_candidates.pop(best_idx)
699
+
700
+ # Important event - keep at INFO
701
+ if best_score > -float('inf'):
702
+ logger.info(f"🎯 INTERCEPTING GEPA PROPOSAL CALL - Returning BEST candidate (score: {best_score:.4f})!")
703
+ logger.info(f"🎯 Remaining candidates: {len(self._adapter_generated_candidates)}")
704
+ else:
705
+ logger.info(f"🎯 INTERCEPTING GEPA PROPOSAL CALL - Returning pre-generated candidate!")
706
+ logger.info(f"🎯 Remaining candidates: {len(self._adapter_generated_candidates)}")
707
+
708
+ if isinstance(best_candidate, dict) and 'prompt' in best_candidate:
709
+ prompt = best_candidate['prompt']
710
+
711
+ # Detailed logging only in DEBUG mode
712
+ if self._should_log_debug():
713
+ logger.debug(f"✅ Pre-generated candidate details:")
714
+ logger.debug(f"{'▓'*80}")
715
+ logger.debug(f"{prompt}")
716
+ logger.debug(f"{'▓'*80}")
717
+ else:
718
+ source = best_candidate.get('source', 'unknown')
719
+ score_info = f" (score: {best_score:.4f})" if best_score > -float('inf') else ""
720
+ logger.info(f"✅ Candidate length: {len(prompt)} chars, Source: {source}{score_info}")
721
+
722
+ return {'content': prompt, 'source': best_candidate.get('source', 'adapter_generated')}
723
+ elif isinstance(best_candidate, str):
724
+ if self._should_log_debug():
725
+ logger.debug(f"✅ Pre-generated candidate (string format):")
726
+ logger.debug(f"{'▓'*80}")
727
+ logger.debug(f"{best_candidate}")
728
+ logger.debug(f"{'▓'*80}")
729
+ else:
730
+ logger.info(f"✅ Candidate length: {len(best_candidate)} chars")
731
+ return {'content': best_candidate, 'source': 'adapter_generated'}
732
+
733
+ # 🔥 ENHANCED CALL TYPE DETECTION
734
+ # We need to distinguish between 4 types of calls:
735
+ # 1. Evaluation calls: Image + task command → identify element (pass through)
736
+ # 2. Judge calls: Image + "prompt engineer" → analyze feedback (pass through)
737
+ # 3. Proposal calls: No image + feedback → generate candidate (intercept)
738
+ # 4. JSON batch calls: JSON generation request (pass through)
739
+
740
+ # FIX: DON'T intercept JSON batch generation calls
741
+ is_json_batch_request = (
742
+ '"variations"' in system_prompt or
743
+ 'MUST BE VALID JSON' in system_prompt or
744
+ 'Output ONLY the JSON object' in system_prompt or
745
+ '```json' in system_prompt.lower()
746
+ )
747
+
748
+ # FIX: DON'T intercept LLM-as-Judge calls (they analyze feedback with images)
749
+ is_judge_call = (
750
+ 'prompt engineer' in system_prompt.lower() or
751
+ 'analyzing mobile ui automation' in system_prompt.lower() or
752
+ 'expert prompt engineer' in system_prompt.lower() or
753
+ ('analyze' in system_prompt.lower() and 'screenshot with numbered bounding boxes' in system_prompt.lower() and image_base64)
754
+ )
755
+
756
+ # Check if this is a reflection call (GEPA asking for new candidate)
757
+ is_reflection_call = (
758
+ self.reflection_context['in_reflection'] or
759
+ self._detect_reflection_call(system_prompt, user_prompt)
760
+ )
761
+
762
+ # Proposal calls are reflection calls WITHOUT images and NOT judge/JSON calls
763
+ # These are the calls we want to intercept with LLEGO
764
+ is_proposal_call = (
765
+ not is_json_batch_request and # Not a JSON generation request
766
+ not is_judge_call and # Not an LLM-as-Judge analysis
767
+ not image_base64 and # No image = not an evaluation/judge call
768
+ (
769
+ is_reflection_call or
770
+ 'improve' in system_prompt.lower() or
771
+ 'optimize' in system_prompt.lower() or
772
+ 'suggest' in system_prompt.lower() or
773
+ 'feedback' in system_prompt.lower() or
774
+ 'reflection' in system_prompt.lower()
775
+ ) and
776
+ len(user_prompt) > 100 # Proposal calls have substantial feedback
777
+ )
778
+
779
+ # Detailed call detection logging only in DEBUG mode
780
+ if self._should_log_debug():
781
+ logger.debug(f" is_json_batch_request: {is_json_batch_request}")
782
+ logger.debug(f" is_judge_call: {is_judge_call}")
783
+ logger.debug(f" is_reflection_call: {is_reflection_call}")
784
+ logger.debug(f" is_proposal_call: {is_proposal_call}")
785
+ logger.debug(f" has_image: {bool(image_base64)}")
786
+ logger.debug(f" has_llego: {self.llego is not None}")
787
+
788
+ # Only intercept proposal calls (not judge, not evaluation, not JSON)
789
+ if is_proposal_call and self.llego:
790
+ # FIX #5: Check if LLEGO is disabled due to repeated failures
791
+ if self._llego_disabled:
792
+ logger.warning("⚠️ LLEGO is disabled (circuit breaker), using base LLM")
793
+ return self.base_llm.generate(
794
+ system_prompt=system_prompt,
795
+ user_prompt=user_prompt,
796
+ image_base64=image_base64,
797
+ **kwargs
798
+ )
799
+
800
+ # Important event - keep at INFO
801
+ logger.info("🔥 INTERCEPTING REFLECTION/PROPOSAL CALL FOR CANDIDATE GENERATION")
802
+ return self._llego_generate(system_prompt, user_prompt, image_base64=image_base64, **kwargs)
803
+ else:
804
+ # Standard LLM call (for evaluation, not reflection)
805
+ if self._should_log_debug():
806
+ logger.debug(" → Standard LLM call (evaluation, not reflection)")
807
+ return self.base_llm.generate(
808
+ system_prompt=system_prompt,
809
+ user_prompt=user_prompt,
810
+ image_base64=image_base64,
811
+ **kwargs
812
+ )
813
+
814
+ def _clean_reflection_feedback(self, feedback_text: str, max_length: int = 50000) -> str:
815
+ """
816
+ Clean reflection feedback by removing base64 images and truncating.
817
+
818
+ 🔥 CRITICAL: GEPA's feedback can include massive base64 images (7MB+).
819
+ This function removes them and keeps feedback concise.
820
+
821
+ Args:
822
+ feedback_text: Original feedback (may contain base64)
823
+ max_length: Maximum length after cleaning (default: 50K chars)
824
+
825
+ Returns:
826
+ Cleaned feedback without base64, within size limits
827
+ """
828
+ if not feedback_text:
829
+ return feedback_text
830
+
831
+ # Step 1: Remove very long base64-like sequences (50K+ chars of alphanumeric)
832
+ base64_pattern = r'[A-Za-z0-9+/=]{5000,}'
833
+ cleaned = re.sub(base64_pattern, '[IMAGE_DATA_REMOVED]', feedback_text)
834
+
835
+ # Step 2: Remove explicit image_base64 references and their values
836
+ cleaned = re.sub(r'image_base64["\']?\s*[:=]\s*["\']?[A-Za-z0-9+/=]+["\']?',
837
+ 'image_base64: [REMOVED]', cleaned, flags=re.IGNORECASE)
838
+
839
+ # Step 3: Remove detailed_scores sections that might contain base64
840
+ cleaned = re.sub(r'##\s+detailed_scores[^\n]*\n[^#]*(?:image_base64|base64)[^\n]*(?:\n[^#]*)*',
841
+ '## detailed_scores: [REMOVED_FOR_BREVITY]', cleaned, flags=re.IGNORECASE | re.MULTILINE)
842
+
843
+ # Step 4: Remove any remaining very long strings (likely base64)
844
+ cleaned = re.sub(r'"[A-Za-z0-9+/=]{10000,}"', '[LARGE_DATA_STRING_REMOVED]', cleaned)
845
+
846
+ # Step 5: Truncate if still too long (keep beginning which has most important info)
847
+ if len(cleaned) > max_length:
848
+ truncated_size = len(cleaned) - max_length
849
+ cleaned = cleaned[:max_length] + f"\n\n[TRUNCATED {truncated_size} characters - keeping essential feedback only]"
850
+ logger.warning(f"⚠️ Reflection feedback truncated: {len(feedback_text)} → {len(cleaned)} chars")
851
+
852
+ return cleaned
853
+
854
+ def _detect_reflection_call(self, system_prompt: str, user_prompt: str) -> bool:
855
+ """
856
+ Heuristic to detect if this is a reflection call from GEPA.
857
+
858
+ GEPA's reflection calls typically contain feedback/error analysis.
859
+ """
860
+ reflection_keywords = [
861
+ 'improve', 'feedback', 'error', 'failure', 'reflection',
862
+ 'better prompt', 'modify', 'enhance', 'optimize'
863
+ ]
864
+
865
+ combined = (system_prompt + " " + user_prompt).lower()
866
+ return any(keyword in combined for keyword in reflection_keywords)
867
+
868
    def _llego_generate(
        self,
        system_prompt: str,
        user_prompt: str,
        image_base64: str = "",
        **kwargs
    ) -> Dict[str, Any]:
        """
        Use LLEGO (or Hybrid mode) to generate new prompt candidates.

        Dispatches to hybrid generation when the config enables
        ``enable_gepa_reflection_with_llego``, otherwise to LLEGO-only
        generation. Any exception from either path falls back to the base
        LLM and feeds a circuit breaker that can disable LLEGO entirely.

        Args:
            system_prompt: System prompt
            user_prompt: User prompt (contains reflection feedback)
            image_base64: Image data (for reflection, always empty)
            **kwargs: Additional arguments (may contain image_base64, will be removed)

        Returns:
            Dict with 'content' key containing a new prompt candidate
        """
        try:
            # 🔥 CRITICAL: Remove image_base64 from kwargs to avoid duplicate argument error
            # (it is also passed explicitly below as a keyword).
            kwargs.pop('image_base64', None)  # Remove if present to avoid conflict

            # 🔥 HYBRID MODE: Generate from BOTH GEPA reflection AND LLEGO
            if (self.config and
                hasattr(self.config, 'enable_gepa_reflection_with_llego') and
                self.config.enable_gepa_reflection_with_llego):

                return self._hybrid_generate(system_prompt, user_prompt, image_base64=image_base64, **kwargs)

            # STANDARD LLEGO MODE (LLEGO only)
            return self._llego_only_generate(system_prompt, user_prompt, image_base64=image_base64, **kwargs)

        except Exception as e:
            # FIX #5: Circuit breaker - track failures and disable LLEGO if needed.
            # NOTE(review): the counter is only ever incremented here; no reset on
            # success is visible in this method, so the "consecutive failures"
            # wording in the log below is effectively cumulative — confirm whether
            # a reset happens elsewhere.
            self._llego_failures += 1

            logger.error(f"❌ LLEGO generation failed ({self._llego_failures}/{self._llego_failure_threshold}): {e}")
            logger.error("⚠️ Falling back to base LLM")

            # Once the threshold is crossed, all future calls short-circuit to
            # the base LLM (checked by the caller via self._llego_disabled).
            if self._llego_failures >= self._llego_failure_threshold:
                self._llego_disabled = True
                logger.error(f"🚫 LLEGO DISABLED - {self._llego_failures} consecutive failures detected")
                logger.error(" All future requests will use base LLM only")

            import traceback
            logger.debug(traceback.format_exc())

            # Fallback to base LLM - ensure image_base64 is not in kwargs
            # (it may have been re-added by the failed call path).
            kwargs.pop('image_base64', None)
            return self.base_llm.generate(
                system_prompt=system_prompt,
                user_prompt=user_prompt,
                image_base64=image_base64,
                **kwargs
            )
924
+
925
+ def _hybrid_generate(
926
+ self,
927
+ system_prompt: str,
928
+ user_prompt: str,
929
+ image_base64: str = "",
930
+ **kwargs
931
+ ) -> Dict[str, Any]:
932
+ """
933
+ 🔥 HYBRID MODE: Generate candidates from BOTH GEPA reflection AND LLEGO operators.
934
+
935
+ Smart Compensation Strategy:
936
+ - When crossover can't run (< 2 parents), compensates with extra GEPA reflection
937
+ - GEPA is smarter than mutation (uses semantic understanding of feedback)
938
+ - Crossover only runs when we have 2+ scored parents to combine
939
+
940
+ GEPA will call generate() multiple times. On first call, we generate all candidates
941
+ and queue them. Subsequent calls return from the queue.
942
+ """
943
+ # If we already generated candidates, return next from queue
944
+ if self._hybrid_generation_complete and self._candidate_queue:
945
+ candidate = self._candidate_queue.pop(0)
946
+ source = candidate.get('source', 'unknown')
947
+ logger.info(f"📦 Returning queued candidate (source: {source}, {len(self._candidate_queue)} remaining)")
948
+ return {'content': candidate['prompt'], 'source': source}
949
+
950
+ # First call: Generate ALL candidates
951
+ from ..utils.clean_logger import get_clean_logger
952
+ clean_log = get_clean_logger()
953
+
954
+ all_candidates = []
955
+
956
+ # ─────────────────────────────────────────────────────
957
+ # PHASE 0: Check if crossover will be possible
958
+ # ─────────────────────────────────────────────────────
959
+ from ..utils.pareto_logger import get_pareto_logger
960
+ pareto_log = get_pareto_logger()
961
+ gepa_pareto_front = pareto_log.pareto_front
962
+
963
+ # Determine if we need to compensate for crossover
964
+ crossover_possible = len(gepa_pareto_front) >= 2
965
+ n_crossover_config = self.config.n_crossover if hasattr(self.config, 'n_crossover') else 2
966
+ crossover_compensation = 0 if crossover_possible else n_crossover_config
967
+
968
+ if not crossover_possible:
969
+ logger.info(f"⚠️ Crossover NOT possible (have {len(gepa_pareto_front)} parents, need 2+)")
970
+ logger.info(f" → Smart compensation: +{crossover_compensation} extra GEPA reflection candidates")
971
+
972
+ # ─────────────────────────────────────────────────────
973
+ # PHASE 1: GEPA REFLECTION (Semantic Understanding)
974
+ # More GEPA = better, it understands WHY things fail
975
+ # ─────────────────────────────────────────────────────
976
+ base_gepa_count = self.config.num_gepa_reflection_candidates if hasattr(self.config, 'num_gepa_reflection_candidates') else 3
977
+
978
+ # 🔥 SMART COMPENSATION: More GEPA when crossover can't run
979
+ num_gepa = base_gepa_count + crossover_compensation
980
+
981
+ logger.info("─" * 80)
982
+ logger.info("PHASE 1: GEPA REFLECTION (Semantic Understanding)")
983
+ if crossover_compensation > 0:
984
+ logger.info(f"Generating {num_gepa} candidates ({base_gepa_count} base + {crossover_compensation} compensation for skipped crossover)")
985
+ else:
986
+ logger.info(f"Generating {num_gepa} candidates")
987
+ logger.info("─" * 80)
988
+
989
+ # 🔥 OPTIMIZED: Single call with JSON format for multiple variations
990
+ try:
991
+ # Clean user_prompt before sending to LLM
992
+ cleaned_user_prompt = self._clean_reflection_feedback(user_prompt)
993
+
994
+ # Build diversity requirements based on num_gepa
995
+ diversity_requirements = self._build_diversity_requirements(num_gepa)
996
+
997
+ # 🔥 FORMAT AWARENESS: Get format constraint if available
998
+ format_constraint = ""
999
+ if self._detected_format and self._detected_format.get('format_constraint'):
1000
+ format_constraint = self._detected_format['format_constraint']
1001
+ logger.info(f"📐 Injecting format constraint into candidate generation")
1002
+ # #region agent log
1003
+ import json as _json_debug
1004
+ import time as _time_debug
1005
+ import os as _os_debug
1006
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
1007
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
1008
+ with open(_debug_log_path, "a") as _f:
1009
+ _f.write(_json_debug.dumps({"hypothesisId": "FORMAT_CONSTRAINT", "location": "llego_enhanced_llm.py:format_injection", "message": "Format constraint injected", "data": {"format_type": self._detected_format.get('format_type', 'unknown'), "constraint_length": len(format_constraint), "avg_length": self._detected_format.get('avg_length', 0)}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
1010
+ # #endregion
1011
+ else:
1012
+ format_constraint = "No specific format detected - ensure output is CONCISE and matches expected examples."
1013
+ # #region agent log
1014
+ import json as _json_debug
1015
+ import time as _time_debug
1016
+ import os as _os_debug
1017
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
1018
+ _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
1019
+ with open(_debug_log_path, "a") as _f:
1020
+ _f.write(_json_debug.dumps({"hypothesisId": "FORMAT_CONSTRAINT", "location": "llego_enhanced_llm.py:format_injection", "message": "No format constraint available", "data": {"has_detected_format": bool(self._detected_format)}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
1021
+ # #endregion
1022
+
1023
+ # 🔥 EVOLUTIONARY PROMPT ENGINEER: Forces radically different mutations
1024
+ # Each variation MUST use a distinct genetic strategy to maximize search space
1025
+ optimization_system_prompt = f"""<system_core>
1026
+ You are an **Evolutionary Prompt Engineer**. Your task is to mutate a [FAILING_PROMPT] into a high-performance instruction set using genetic strategies.
1027
+ You must generate {num_gepa} radically different prompt variations based on the [FAILURE_FEEDBACK].
1028
+ </system_core>
1029
+
1030
+ <input_data>
1031
+ <failure_feedback_log>
1032
+ {cleaned_user_prompt}
1033
+ </failure_feedback_log>
1034
+ </input_data>
1035
+
1036
+ <mutation_strategies>
1037
+ You MUST use a different strategy for each variation. Assign strategies in order:
1038
+
1039
+ 1. **STRATEGY A: The Strict Auditor (Constraints)**
1040
+ - Focus: Add "Negative Constraints" (e.g., "Do NOT...", "NEVER...", "FORBIDDEN:").
1041
+ - Use strict XML tagging for the output schema.
1042
+ - Goal: Fix hallucinations and formatting errors.
1043
+
1044
+ 2. **STRATEGY B: The Reasoning Expert (Chain of Thought)**
1045
+ - Focus: Add a "Reasoning Steps" section.
1046
+ - Instruct the model to "Think step-by-step" before generating the final output.
1047
+ - Goal: Fix logic errors and complex multi-step reasoning failures.
1048
+
1049
+ 3. **STRATEGY C: The Few-Shot Teacher (Examples)**
1050
+ - Focus: Generate a *synthetic* example of Input -> Correct Output within the prompt.
1051
+ - Goal: Fix understanding of abstract concepts or strict schema requirements.
1052
+
1053
+ 4. **STRATEGY D: The Role-Player (Persona)**
1054
+ - Focus: Change the persona to a hyper-specific expert (e.g., "Senior Data Engineer at Fortune 500" vs "Coder").
1055
+ - Add domain-specific vocabulary and expertise markers.
1056
+ - Goal: Fix domain-specific terminology errors.
1057
+
1058
+ 5. **STRATEGY E: The Structure Architect (Format)**
1059
+ - Focus: Add explicit output schema with field-by-field instructions.
1060
+ - Use markdown or XML headers to organize the prompt.
1061
+ - Goal: Fix output structure and field naming errors.
1062
+ </mutation_strategies>
1063
+
1064
+ <output_constraints>
1065
+ 1. **Self-Contained**: Each variation must be the FULL prompt text (100-500 words), ready to run.
1066
+ 2. **No Meta-Talk**: Do not explain your strategy inside the prompt. Just output the optimized prompt.
1067
+ 3. **Preserve Core Task**: Keep the original task/domain - only improve HOW it's described.
1068
+ 4. **JSON Output**: Follow the schema below exactly.
1069
+ 5. **ENFORCE OUTPUT FORMAT**: The generated prompt MUST instruct the model to output in the EXACT format shown in examples.
1070
+ </output_constraints>
1071
+
1072
+ <critical_output_format_requirement>
1073
+ 🚨 THE GENERATED PROMPTS MUST INCLUDE EXPLICIT OUTPUT FORMAT INSTRUCTIONS!
1074
+ Common failure: The model generates explanations/prose instead of the required concise format.
1075
+
1076
+ {format_constraint}
1077
+
1078
+ Your generated prompts MUST include:
1079
+ - Explicit instruction to output ONLY in the required format
1080
+ - "Do NOT explain", "No reasoning", "Output ONLY [format]" constraints
1081
+ - Length constraint to prevent verbose responses
1082
+ </critical_output_format_requirement>
1083
+
1084
+ <response_format>
1085
+ You MUST output ONLY valid JSON. No comments, no explanations, no markdown code blocks.
1086
+
1087
+ Generate exactly {num_gepa} variations in this exact format:
1088
+
1089
+ {{
1090
+ "variations": [
1091
+ {{
1092
+ "index": 1,
1093
+ "strategy": "Strict Auditor",
1094
+ "prompt": "[FULL PROMPT TEXT - Complete, self-contained, ready to use]"
1095
+ }},
1096
+ {{
1097
+ "index": 2,
1098
+ "strategy": "Reasoning Expert",
1099
+ "prompt": "[FULL PROMPT TEXT - Complete, self-contained, ready to use]"
1100
+ }}
1101
+ ]
1102
+ }}
1103
+
1104
+ CRITICAL RULES:
1105
+ 1. Output ONLY the JSON object - no text before or after
1106
+ 2. Do NOT use markdown code blocks (no ```json)
1107
+ 3. Do NOT include comments (no // or /* */)
1108
+ 4. Ensure all strings are properly escaped
1109
+ 5. Generate exactly {num_gepa} variations
1110
+ 6. Each variation must have: index (number), strategy (string), prompt (string)
1111
+ </response_format>
1112
+ """
1113
+
1114
+ # Standard GEPA reflection call
1115
+ call_kwargs = {k: v for k, v in kwargs.items() if k != 'image_base64'}
1116
+ result = self.base_llm.generate(
1117
+ system_prompt=optimization_system_prompt,
1118
+ user_prompt=cleaned_user_prompt,
1119
+ image_base64=image_base64,
1120
+ **call_kwargs
1121
+ )
1122
+
1123
+ if isinstance(result, dict):
1124
+ response_text = result.get("content", str(result))
1125
+ else:
1126
+ response_text = str(result)
1127
+
1128
+ # Parse JSON variations
1129
+ gepa_variations = self._parse_json_variations(response_text, num_gepa)
1130
+
1131
+ # Add all variations to candidates
1132
+ for idx, variation_prompt in enumerate(gepa_variations, 1):
1133
+ # 🛡️ DEFENSIVE FALLBACK: Extract clean prompt if LLM adds analysis
1134
+ gepa_candidate = self._extract_clean_prompt_from_reflection(variation_prompt)
1135
+
1136
+ # Validate extracted prompt before adding
1137
+ if not self._is_valid_prompt(gepa_candidate):
1138
+ logger.warning(f" ⚠️ Variation {idx} appears invalid, skipping")
1139
+ continue
1140
+
1141
+ # 🔍 DIAGNOSTIC: Log candidate length to help diagnose scoring issues
1142
+ if self._should_log_debug():
1143
+ logger.debug(f" Candidate {idx} length: {len(gepa_candidate)} chars")
1144
+ logger.debug(f" Candidate {idx} preview: {gepa_candidate[:100]}...")
1145
+
1146
+ all_candidates.append({
1147
+ 'prompt': gepa_candidate,
1148
+ 'source': 'gepa_reflection',
1149
+ 'index': idx
1150
+ })
1151
+
1152
+ clean_log.log_gepa_reflection_candidate(idx, gepa_candidate)
1153
+
1154
+ gepa_count = len(all_candidates)
1155
+ logger.info(f"✅ GEPA Reflection: {gepa_count} candidates generated in single optimized call")
1156
+
1157
+ except Exception as e:
1158
+ logger.error(f"❌ Error generating GEPA reflection candidates: {e}")
1159
+ logger.warning(f" Falling back to sequential generation...")
1160
+ import traceback
1161
+ logger.debug(traceback.format_exc())
1162
+
1163
+ # Fallback: Sequential generation (when JSON parsing fails)
1164
+ gepa_count = self._fallback_sequential_gepa_generation(
1165
+ num_gepa, user_prompt, image_base64, kwargs, all_candidates, clean_log
1166
+ )
1167
+
1168
+ if gepa_count > 0:
1169
+ logger.info(f"GEPA Reflection Complete: {gepa_count} candidates")
1170
+
1171
+ # ─────────────────────────────────────────────────────
1172
+ # PHASE 2: LLEGO GENETIC OPERATORS
1173
+ # ─────────────────────────────────────────────────────
1174
+ logger.info("─" * 80)
1175
+ logger.info("PHASE 2: LLEGO GENETIC OPERATORS")
1176
+ logger.info("─" * 80)
1177
+
1178
+ # Extract current prompt from context
1179
+ current_prompt = self.reflection_context.get('current_prompt', '')
1180
+ if not current_prompt:
1181
+ current_prompt = self._extract_prompt_from_feedback(user_prompt)
1182
+
1183
+ if not current_prompt and self.llego.population:
1184
+ current_prompt = self.llego.population[0].prompt
1185
+ logger.info(f" Using population prompt (length: {len(current_prompt)})")
1186
+
1187
+ # Convert GEPA Pareto front to PromptCandidate format (already fetched in Phase 0)
1188
+ pareto_candidates = self.llego._convert_gepa_pareto_to_candidates(gepa_pareto_front)
1189
+ pareto_front = pareto_candidates
1190
+
1191
+ logger.info(f" Pareto front: {len(pareto_front)} candidates with scores")
1192
+ for idx, p in enumerate(pareto_front, 1):
1193
+ notation = p.metadata.get('notation', 'S') if p.metadata else 'S'
1194
+ logger.info(f" {notation}: fitness={p.fitness:.3f}")
1195
+
1196
+ # Create LLM callable for LLEGO genetic operations (crossover/mutation)
1197
+ call_kwargs = {k: v for k, v in kwargs.items() if k != 'image_base64'}
1198
+
1199
+ # LLEGO genetic prompt with SAFETY LOCKS to prevent task drift
1200
+ # Directed mutations ensure prompts improve without losing core functionality
1201
+ genetic_operator_system_prompt = """<system_role>
1202
+ You are a **Prompt Mutation Engine**. Your input is a [PARENT_PROMPT]. Your output is a [MUTATED_CHILD].
1203
+ </system_role>
1204
+
1205
+ <mutation_directives>
1206
+ Apply ONE of the following micro-mutations to improve the prompt:
1207
+
1208
+ 1. **COMPRESS**: Remove fluff words ("please", "ensure to", "kindly"). Make it telegraphic and efficient.
1209
+ 2. **INTENSIFY**: Capitalize key constraints (e.g., "must return JSON" -> "**MUST** return **VALID JSON**").
1210
+ 3. **STRUCTURIZE**: Add markdown headers or XML tags to organize a messy prompt.
1211
+ 4. **CLARIFY**: Expand vague nouns (e.g., "code" -> "production-ready Python code with type hints").
1212
+ 5. **CONSTRAIN**: Add negative constraints ("Do NOT include explanations", "NEVER output markdown").
1213
+ </mutation_directives>
1214
+
1215
+ <safety_locks>
1216
+ 1. **IMMUTABLE CORE**: You MUST NOT change the core task (e.g., do not change "Extract JSON" to "Write a Summary").
1217
+ 2. **NO EXPLANATION**: Output ONLY the new prompt string. No meta-commentary.
1218
+ 3. **VALIDITY**: The output must remain a functional system prompt.
1219
+ 4. **LENGTH LIMIT**: Keep mutations within 20% of original length (no excessive expansion).
1220
+ </safety_locks>"""
1221
+
1222
+ def llm_callable(genetic_prompt: str) -> str:
1223
+ result = self.base_llm.generate(
1224
+ system_prompt=genetic_operator_system_prompt,
1225
+ user_prompt=genetic_prompt,
1226
+ image_base64="",
1227
+ **call_kwargs
1228
+ )
1229
+ if isinstance(result, dict):
1230
+ return result.get('content', str(result))
1231
+ return str(result)
1232
+
1233
+ # Generate LLEGO offspring (crossover will be skipped if < 2 parents)
1234
+ llego_prompts = self.llego.evolve_generation(
1235
+ llm=llm_callable,
1236
+ pareto_front=pareto_front
1237
+ )
1238
+
1239
+ # Track actual crossover count from LLEGO (it tracks internally now)
1240
+ actual_crossover = getattr(self.llego, '_actual_crossover_count', 0)
1241
+ crossover_skipped = getattr(self.llego, '_crossover_skipped', False)
1242
+
1243
+ crossover_idx = 1
1244
+ mutation_idx = 1
1245
+
1246
+ for i, prompt in enumerate(llego_prompts):
1247
+ if i < actual_crossover:
1248
+ source = 'llego_crossover'
1249
+ clean_log.log_llego_crossover_candidate(crossover_idx, prompt)
1250
+ crossover_idx += 1
1251
+ else:
1252
+ source = 'llego_mutation'
1253
+ clean_log.log_llego_mutation_candidate(mutation_idx, prompt)
1254
+ mutation_idx += 1
1255
+
1256
+ all_candidates.append({
1257
+ 'prompt': prompt,
1258
+ 'source': source,
1259
+ 'index': i + 1
1260
+ })
1261
+
1262
+ mutation_count = len(llego_prompts) - actual_crossover
1263
+ logger.info(f"🧬 LLEGO: {actual_crossover} crossover + {mutation_count} mutation = {len(llego_prompts)} candidates")
1264
+ if crossover_skipped:
1265
+ logger.info(f" (Crossover was skipped - compensated with extra GEPA reflection)")
1266
+
1267
+ # ─────────────────────────────────────────────────────
1268
+ # SUMMARY
1269
+ # ─────────────────────────────────────────────────────
1270
+ total_gepa = len([c for c in all_candidates if c.get('source') == 'gepa_reflection'])
1271
+ total_crossover = len([c for c in all_candidates if c.get('source') == 'llego_crossover'])
1272
+ total_mutation = len([c for c in all_candidates if c.get('source') == 'llego_mutation'])
1273
+
1274
+ logger.info("─" * 80)
1275
+ logger.info("CANDIDATE GENERATION SUMMARY")
1276
+ logger.info("─" * 80)
1277
+ logger.info(f" GEPA Reflection: {total_gepa} candidates (semantic understanding)")
1278
+ logger.info(f" LLEGO Crossover: {total_crossover} candidates (combine best)")
1279
+ logger.info(f" LLEGO Mutation: {total_mutation} candidates (exploration)")
1280
+ logger.info(f" TOTAL: {len(all_candidates)} candidates")
1281
+ if crossover_skipped:
1282
+ logger.info(f" 📝 Note: Crossover skipped (waiting for 2+ scored parents)")
1283
+ logger.info("─" * 80)
1284
+
1285
+ clean_log.log_candidate_generation_summary()
1286
+
1287
+ # Store in queue (skip first one - return it now)
1288
+ self._candidate_queue = all_candidates[1:] if len(all_candidates) > 1 else []
1289
+ self._hybrid_generation_complete = True
1290
+
1291
+ # Return first candidate
1292
+ if all_candidates:
1293
+ first = all_candidates[0]
1294
+ logger.info(f"📤 Returning FIRST candidate (source: {first['source']})")
1295
+ return {'content': first['prompt'], 'source': first['source']}
1296
+ else:
1297
+ logger.error("❌ No candidates generated!")
1298
+ return {'content': '', 'source': 'error'}
1299
+
1300
def _llego_only_generate(
    self,
    system_prompt: str,
    user_prompt: str,
    image_base64: str = "",
    **kwargs
) -> Dict[str, Any]:
    """
    STANDARD LLEGO MODE: Generate candidates using only LLEGO operators.

    Args:
        system_prompt: Used only on the fallback path when LLEGO yields nothing.
        user_prompt: GEPA reflection feedback; may embed the current prompt.
        image_base64: Accepted for signature compatibility; genetic operations
            always send an empty image to the base LLM.
        **kwargs: Extra generation parameters forwarded to the base LLM
            (any 'image_base64' entry is stripped first to avoid a duplicate
            keyword argument).

    Returns:
        Dict with 'content' (the evolved prompt), 'source' ('llego') and
        'num_candidates'; or the raw base-LLM response on fallback.
    """
    # 🔥 CRITICAL: Remove image_base64 from kwargs to avoid duplicate argument error
    kwargs.pop('image_base64', None)

    # 🔥 FIX: Clean user_prompt if it contains feedback (might have base64)
    cleaned_user_prompt = self._clean_reflection_feedback(user_prompt)

    # Extract current prompt from context or user_prompt
    current_prompt = self.reflection_context.get('current_prompt', '')

    if not current_prompt:
        # Try to extract from cleaned user_prompt
        current_prompt = self._extract_prompt_from_feedback(cleaned_user_prompt)

    logger.info(f"🧬 LLEGO: Evolving prompt...")
    if self._should_log_debug():
        logger.debug(f" Current prompt: '{current_prompt[:100]}...' (length: {len(current_prompt)} chars)")
    else:
        logger.info(f" Prompt length: {len(current_prompt)} chars")

    # 🔥 FIX 2: Get Pareto front from GEPA (not LLEGO population)
    # This ensures LLEGO operators use true non-dominated solutions
    from ..utils.pareto_logger import get_pareto_logger
    pareto_log = get_pareto_logger()
    gepa_pareto_front = pareto_log.pareto_front

    # Convert GEPA Pareto front to PromptCandidate format
    pareto_candidates = self.llego._convert_gepa_pareto_to_candidates(gepa_pareto_front)
    pareto_front = pareto_candidates

    logger.info(f" Using GEPA Pareto front (size: {len(gepa_pareto_front)})")
    logger.info(f" Converted to {len(pareto_front)} PromptCandidate objects")

    # Create LLM callable for LLEGO genetic operations
    # Uses Genetic Mutation Engine prompt for micro-mutations
    call_kwargs = {k: v for k, v in kwargs.items() if k != 'image_base64'}

    genetic_system_prompt = """You are a **Genetic Mutation Engine** for Text Prompts.

<task>
Apply a specific micro-mutation to the provided prompt to increase its clarity, strictness, or effectiveness.
</task>

<mutation_types>
1. **Compress**: Shorten verbose instructions without losing meaning.
2. **Expand**: Add detail to vague nouns (e.g., "code" -> "production-ready Python 3.10 code").
3. **Emphasize**: Highlight CRITICAL constraints using caps, bold, or explicit markers.
4. **Constrain**: Add explicit boundaries (what NOT to do, format rules, length limits).
5. **Exemplify**: Add a brief example if the task is ambiguous.
</mutation_types>

<output_rules>
1. Output ONLY the mutated prompt text.
2. Do NOT change the core intent or task domain.
3. Do NOT add explanations or meta-commentary.
4. Apply ONE primary mutation type while preserving all existing strengths.
</output_rules>"""

    def llm_callable(prompt: str) -> str:
        # Bridge given to LLEGO: sends its genetic-operation prompt through
        # the base LLM and unwraps the text content from the response dict.
        # Clean prompt before sending (might contain base64 if from feedback)
        cleaned_prompt = self._clean_reflection_feedback(prompt)
        result = self.base_llm.generate(
            system_prompt=genetic_system_prompt,
            user_prompt=cleaned_prompt,
            image_base64="",  # Always empty for LLEGO genetic operations
            **call_kwargs
        )
        if isinstance(result, dict):
            return result.get('content', str(result))
        return str(result)

    # Generate offspring using LLEGO
    new_prompts = self.llego.evolve_generation(
        llm=llm_callable,
        pareto_front=pareto_front
    )

    if new_prompts:
        # Only the first offspring is surfaced to GEPA here (single-candidate
        # contract of this mode); num_candidates reports how many were bred.
        new_prompt = new_prompts[0]
        logger.info(f"✅ LLEGO generated new candidate (length: {len(new_prompt)} chars)")

        if self._should_log_debug():
            logger.debug(f" Full prompt:")
            logger.debug(f" '{new_prompt}'")

        return {
            'content': new_prompt,
            'source': 'llego',
            'num_candidates': len(new_prompts)
        }
    else:
        logger.warning("⚠️ LLEGO returned no candidates, falling back to base LLM")
        # NOTE: fallback forwards the ORIGINAL (uncleaned) user_prompt and the
        # full kwargs; image_base64 is still sent empty.
        return self.base_llm.generate(
            system_prompt=system_prompt,
            user_prompt=user_prompt,
            image_base64="",
            **kwargs
        )
1407
+
1408
+ def _build_diversity_requirements(self, num_gepa: int) -> str:
1409
+ """
1410
+ Build diversity requirements using research-backed Prompt Design Patterns.
1411
+
1412
+ These are proven strategies from prompt engineering literature:
1413
+ - Chain-of-Thought (CoT)
1414
+ - Few-Shot Learning
1415
+ - Negative Constraints
1416
+ - Persona Pattern
1417
+
1418
+ Args:
1419
+ num_gepa: Number of GEPA variations to generate
1420
+
1421
+ Returns:
1422
+ String with diversity requirements for the optimization prompt
1423
+ """
1424
+ # Research-backed Prompt Design Patterns that solve specific classes of problems
1425
+ strategies = [
1426
+ """
1427
+ <variation_1>
1428
+ **STRATEGY: COGNITIVE DECOMPOSITION (Chain-of-Thought)**
1429
+ - **Goal**: Fixes logic/reasoning errors.
1430
+ - **Action**: Add a thinking process section that forces step-by-step reasoning.
1431
+ - **Implementation**: Include instructions like "First analyze..., then identify..., finally conclude..."
1432
+ - **Pattern**: Force the model to "Plan before executing".
1433
+ </variation_1>
1434
+ """,
1435
+
1436
+ """
1437
+ <variation_2>
1438
+ **STRATEGY: FEW-SHOT SIMULATION (In-Context Learning)**
1439
+ - **Goal**: Fixes formatting/syntax errors and output structure issues.
1440
+ - **Action**: Invent 1-2 realistic "Input -> Output" examples that mirror the expected format.
1441
+ - **Implementation**: Add "Example: Given [input], respond with: [expected output format]"
1442
+ - **Pattern**: Show, don't just tell. Demonstrate the gold standard.
1443
+ </variation_2>
1444
+ """,
1445
+
1446
+ """
1447
+ <variation_3>
1448
+ **STRATEGY: SEMANTIC CONSTRAINING (Negative Constraints)**
1449
+ - **Goal**: Fixes hallucinations, verbosity, and off-topic responses.
1450
+ - **Action**: Add explicit forbidden actions and boundaries.
1451
+ - **Implementation**: Include "Do NOT explain your reasoning", "Do NOT add preambles", "Do NOT include information not asked for"
1452
+ - **Pattern**: Define the walls, not just the path.
1453
+ </variation_3>
1454
+ """,
1455
+
1456
+ """
1457
+ <variation_4>
1458
+ **STRATEGY: PERSONA & ROLE HARDENING**
1459
+ - **Goal**: Fixes tone, domain knowledge gaps, and inconsistent behavior.
1460
+ - **Action**: Define a hyper-specific expert role with clear responsibilities.
1461
+ - **Implementation**: Instead of "You are a helpful assistant", use "You are a Senior Data Analyst with 10 years of experience in [domain]"
1462
+ - **Pattern**: Adopt the mental model and rigorous standards of a real expert.
1463
+ </variation_4>
1464
+ """,
1465
+
1466
+ """
1467
+ <variation_5>
1468
+ **STRATEGY: OUTPUT SCHEMA ENFORCEMENT**
1469
+ - **Goal**: Fixes structural and format compliance issues.
1470
+ - **Action**: Define an explicit output schema with field names and types.
1471
+ - **Implementation**: Include "Your response MUST follow this exact format: {field1: type, field2: type}"
1472
+ - **Pattern**: Leave no ambiguity about what the output should look like.
1473
+ </variation_5>
1474
+ """,
1475
+
1476
+ """
1477
+ <variation_6>
1478
+ **STRATEGY: SELF-VERIFICATION LOOP**
1479
+ - **Goal**: Fixes errors that could be caught by double-checking.
1480
+ - **Action**: Add instructions for the model to verify its own output.
1481
+ - **Implementation**: Include "Before responding, verify: 1) Does this match the required format? 2) Did I include all requested information?"
1482
+ - **Pattern**: Build in quality control before submission.
1483
+ </variation_6>
1484
+ """,
1485
+
1486
+ """
1487
+ <variation_7>
1488
+ **STRATEGY: TASK DECOMPOSITION**
1489
+ - **Goal**: Fixes complex tasks that overwhelm the model.
1490
+ - **Action**: Break the task into numbered sub-tasks.
1491
+ - **Implementation**: "Step 1: [subtask]. Step 2: [subtask]. Step 3: Combine results."
1492
+ - **Pattern**: Divide and conquer complexity.
1493
+ </variation_7>
1494
+ """
1495
+ ]
1496
+
1497
+ # Select strategies based on num_gepa
1498
+ selected = strategies[:min(num_gepa, len(strategies))]
1499
+
1500
+ requirements = "<required_strategies>\n"
1501
+ requirements += "Each variation MUST use a DIFFERENT strategy from the list below:\n"
1502
+ requirements += "\n".join(selected)
1503
+ requirements += "\n</required_strategies>"
1504
+
1505
+ requirements += """
1506
+
1507
+ <strategy_application_rules>
1508
+ 1. Each variation must apply its assigned strategy comprehensively.
1509
+ 2. Each variation must ALSO address ALL issues mentioned in the feedback.
1510
+ 3. The strategies are not mutually exclusive - but the PRIMARY focus of each variation should be its assigned strategy.
1511
+ 4. Do not just add a single line - transform the prompt structure according to the strategy.
1512
+ </strategy_application_rules>
1513
+ """
1514
+
1515
+ return requirements
1516
+
1517
def _fallback_sequential_gepa_generation(
    self,
    num_gepa: int,
    user_prompt: str,
    image_base64: str,
    kwargs: dict,
    all_candidates: list,
    clean_log
) -> int:
    """
    Sequentially produce GEPA reflection candidates when the batched
    JSON-variations call could not be parsed.

    Each attempt cycles through a fixed set of strategy directives, asks the
    base LLM for one rewritten prompt in SAFE MODE, sanitizes the reply,
    validates it, and appends it to ``all_candidates``. A failing attempt is
    logged and skipped; it never aborts the remaining attempts.

    Args:
        num_gepa: Number of candidates to generate.
        user_prompt: The feedback/context from GEPA.
        image_base64: Image data (if any) forwarded to the base LLM.
        kwargs: Additional generation kwargs ('image_base64' is stripped).
        all_candidates: Shared list that accepted candidates are appended to.
        clean_log: Logger for clean progress output.

    Returns:
        Number of valid candidates actually produced.
    """
    produced = 0

    for attempt in range(1, num_gepa + 1):
        logger.debug(f"Generating Reflection Candidate #{attempt}/{num_gepa} (fallback mode)...")
        try:
            cleaned_user_prompt = self._clean_reflection_feedback(user_prompt)

            # Research-backed strategy directives, cycled across attempts.
            directives = [
                "<optimization_rule>\nApply CHAIN-OF-THOUGHT: Add step-by-step reasoning instructions. Force the model to 'think before answering'.\n</optimization_rule>",
                "<optimization_rule>\nApply FEW-SHOT LEARNING: Add 1-2 concrete input/output examples within the prompt. Show, don't just tell.\n</optimization_rule>",
                "<optimization_rule>\nApply NEGATIVE CONSTRAINTS: Add explicit 'Do NOT' rules. Define what the model must avoid.\n</optimization_rule>",
                "<optimization_rule>\nApply PERSONA HARDENING: Define a specific expert role with clear responsibilities and standards.\n</optimization_rule>",
                "<optimization_rule>\nApply OUTPUT SCHEMA: Define the exact output format with field names and types. Leave no ambiguity.\n</optimization_rule>",
            ]
            directive = directives[(attempt - 1) % len(directives)]

            fallback_prompt = f"""You are a Prompt Optimization Engine in **SAFE MODE**.

{directive}

{_FALLBACK_SYSTEM_PROMPT}"""

            call_kwargs = {k: v for k, v in kwargs.items() if k != 'image_base64'}
            result = self.base_llm.generate(
                system_prompt=fallback_prompt,
                user_prompt=cleaned_user_prompt,
                image_base64=image_base64,
                **call_kwargs
            )

            raw_reply = result.get("content", str(result)) if isinstance(result, dict) else str(result)
            candidate = self._extract_clean_prompt_from_reflection(raw_reply)

            if not self._is_valid_prompt(candidate):
                logger.warning(f" ⚠️ Fallback candidate #{attempt} appears invalid, skipping")
                continue

            all_candidates.append({
                'prompt': candidate,
                'source': 'gepa_reflection',
                'index': attempt
            })
            clean_log.log_gepa_reflection_candidate(attempt, candidate)
            produced += 1

        except Exception as fallback_error:
            # Per-attempt isolation: log and move on to the next attempt.
            logger.error(f"❌ Error in fallback generation #{attempt}: {fallback_error}")

    return produced
1596
+
1597
+ def _extract_prompt_from_feedback(self, user_prompt: str) -> str:
1598
+ """
1599
+ Try to extract the current prompt from GEPA's reflection feedback.
1600
+
1601
+ Args:
1602
+ user_prompt: The feedback text from GEPA
1603
+
1604
+ Returns:
1605
+ Extracted prompt or empty string
1606
+ """
1607
+ # Look for common patterns in GEPA's feedback
1608
+ if "current prompt:" in user_prompt.lower():
1609
+ lines = user_prompt.split('\n')
1610
+ for i, line in enumerate(lines):
1611
+ if "current prompt:" in line.lower():
1612
+ # Return the next line(s) as the prompt
1613
+ return '\n'.join(lines[i+1:i+10])
1614
+
1615
+ return ""
1616
+
1617
+ # Forward other methods to base LLM
1618
def get_model_info(self) -> str:
    """Describe this wrapper: the base model's info tagged as LLEGO."""
    inner = self.base_llm.get_model_info()
    return "LLEGO({})".format(inner)
1621
+
1622
+ def __getattr__(self, name):
1623
+ """Forward unknown attributes to base LLM."""
1624
+ return getattr(self.base_llm, name)
1625
+
src/gepa_optimizer/llms/vision_llm.py ADDED
@@ -0,0 +1,813 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Vision LLM Client for GEPA Optimizer
3
+ """
4
+
5
+ import json
6
+ import logging
7
+ import time
8
+ from enum import Enum
9
+ import requests
10
+ from typing import Dict, Optional, Any, TYPE_CHECKING, Union
11
+
12
+ # Assuming APIKeyManager is available from utils
13
+ from ..utils.api_keys import APIKeyManager
14
+
15
+ # Import ModelConfig only for type checking to avoid circular imports
16
+ if TYPE_CHECKING:
17
+ from ..models.config import ModelConfig
18
+
19
+ from .base_llm import BaseLLMClient
20
+
21
class ProviderType(str, Enum):
    """Supported LLM API providers.

    Mixes in ``str`` so members compare equal to their raw values
    (e.g. ``ProviderType.OPENAI == "openai"``), which lookup code relies on.
    """

    OPENAI = "openai"
    ANTHROPIC = "anthropic"
    HUGGINGFACE = "huggingface"
    VLLM = "vllm"
    GOOGLE = "google"
    GEMINI = "gemini"  # alias for Google; from_model_string normalizes gemini -> google
28
+
29
class ErrorType(str, Enum):
    """Categories of failure the LLM client can report."""

    API_ERROR = "api_error"
    VALIDATION_ERROR = "validation_error"
    NETWORK_ERROR = "network_error"
    RATE_LIMIT = "rate_limit"
    TIMEOUT = "timeout"

class GepaLLMError(Exception):
    """Base exception for GEPA LLM related errors"""

    def __init__(self, message: str, error_type: ErrorType, status_code: Optional[int] = None):
        # Keep the raw pieces on the instance so callers can branch on them.
        self.message = message
        self.error_type = error_type
        self.status_code = status_code
        super().__init__(self.message)

    def __str__(self):
        # Render as "<type>: <msg>" or "<type> (HTTP <code>): <msg>" when a
        # truthy status code is present.
        label = self.error_type.value
        if not self.status_code:
            return f"{label}: {self.message}"
        return f"{label} (HTTP {self.status_code}): {self.message}"
48
+
49
# Module-level logger for this client.
logger = logging.getLogger(__name__)

# Default OpenAI chat-completions endpoint; used when no base_url is supplied.
OPENAI_API_URL = "https://api.openai.com/v1/chat/completions"
52
+
53
+ class VisionLLMClient(BaseLLMClient):
54
+ """
55
+ A client for interacting with multi-modal Vision LLMs (e.g., OpenAI GPT-4 Vision).
56
+
57
+ Example:
58
+ ```python
59
+ # Basic usage
60
+ client = VisionLLMClient(
61
+ provider="openai",
62
+ model_name="gpt-4-vision-preview",
63
+ temperature=0.7,
64
+ max_tokens=2048
65
+ )
66
+
67
+ # With custom configuration
68
+ config = ModelConfig(
69
+ provider="openai",
70
+ model_name="gpt-4-vision-preview",
71
+ temperature=0.5,
72
+ max_tokens=1024
73
+ )
74
+ client = VisionLLMClient.from_config(config)
75
+ ```
76
+ """
77
+
78
def __init__(
    self,
    provider: Union[str, ProviderType],
    model_name: str,
    api_key: Optional[str] = None,
    base_url: Optional[str] = None,
    temperature: float = 0.7,
    max_tokens: int = 2048,
    top_p: float = 1.0,
    frequency_penalty: float = 0.0,
    presence_penalty: float = 0.0,
    timeout: int = 120,  # 2-minute default so large prompts don't time out
    max_retries: int = 3
):
    """
    Initializes the VisionLLMClient with model configuration.

    Args:
        provider: The provider of the model (e.g., 'openai', 'anthropic').
        model_name: The multi-modal LLM model to use (e.g., "gpt-4-vision-preview").
        api_key: Optional API key. If not provided, it is fetched from APIKeyManager.
        base_url: Optional base URL for the API endpoint.
        temperature: Controls randomness in the response generation.
        max_tokens: Maximum number of tokens to generate.
        top_p: Controls diversity via nucleus sampling.
        frequency_penalty: Penalizes repeated tokens.
        presence_penalty: Penalizes new tokens based on their presence so far.
        timeout: Per-request timeout in seconds.
        max_retries: Retry budget for transient HTTP failures.
    """
    # Register the configuration with the shared base class.
    super().__init__(
        provider=str(provider),
        model_name=model_name,
        api_key=api_key,
        base_url=base_url,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
        frequency_penalty=frequency_penalty,
        presence_penalty=presence_penalty,
        timeout=timeout,
        max_retries=max_retries
    )

    # Validate inputs and build the HTTP session / provider state.
    self._initialize_client(
        provider, model_name, api_key, base_url, temperature,
        max_tokens, top_p, frequency_penalty, presence_penalty,
        timeout, max_retries
    )
123
+
124
def _initialize_client(self, provider, model_name, api_key, base_url, temperature,
                       max_tokens, top_p, frequency_penalty, presence_penalty,
                       timeout, max_retries):
    """Validate inputs, resolve the API key, and build the retrying HTTP session.

    Raises:
        GepaLLMError: VALIDATION_ERROR for an unsupported provider, empty
            model name, or out-of-range temperature/max_tokens; API_ERROR
            when API-key resolution fails (see note below).
    """
    # Input validation — provider must match a ProviderType member value.
    try:
        self.provider = ProviderType(provider.lower())
    except ValueError:
        raise GepaLLMError(
            f"Unsupported provider: {provider}. "
            f"Supported providers: {[p.value for p in ProviderType]}",
            ErrorType.VALIDATION_ERROR
        )

    if not model_name:
        raise GepaLLMError("model_name cannot be empty", ErrorType.VALIDATION_ERROR)

    if not isinstance(temperature, (int, float)) or not 0 <= temperature <= 2:
        raise GepaLLMError(
            f"temperature must be between 0 and 2, got {temperature}",
            ErrorType.VALIDATION_ERROR
        )

    if not isinstance(max_tokens, int) or max_tokens <= 0:
        raise GepaLLMError(
            f"max_tokens must be a positive integer, got {max_tokens}",
            ErrorType.VALIDATION_ERROR
        )

    # Initialize API key (explicit argument wins over APIKeyManager lookup).
    # NOTE(review): the VALIDATION_ERROR raised for a missing key is itself
    # caught by the blanket `except Exception` below and re-wrapped as
    # API_ERROR — callers therefore see API_ERROR for a missing key. Confirm
    # this is intended.
    try:
        self.api_key = api_key or APIKeyManager().get_api_key(self.provider.value)
        if not self.api_key:
            raise GepaLLMError(
                f"No API key found for provider: {self.provider}",
                ErrorType.VALIDATION_ERROR
            )
    except Exception as e:
        raise GepaLLMError(
            f"Failed to initialize API key: {str(e)}",
            ErrorType.API_ERROR
        ) from e

    # Store generation settings on the instance.
    self.model_name = model_name
    self.base_url = base_url or OPENAI_API_URL
    self.temperature = temperature
    self.max_tokens = max_tokens
    self.top_p = top_p
    self.frequency_penalty = frequency_penalty
    self.presence_penalty = presence_penalty
    self.timeout = timeout
    self.max_retries = max_retries
    self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")

    # Configure session with retry on transient server/rate-limit statuses.
    # NOTE: requests re-exports urllib3's Retry; `allowed_methods` requires
    # urllib3 >= 1.26 — confirm the pinned dependency version.
    self.session = requests.Session()
    retry_strategy = requests.adapters.Retry(
        total=max_retries,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["POST"]
    )
    adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy)
    self.session.mount("https://", adapter)
    self.session.mount("http://", adapter)

    # No hardcoded model restrictions - user can specify any model name
    # The API provider will validate if the model exists and supports vision
192
+
193
def _get_api_key(self) -> Optional[str]:
    """Resolve the stored credential for the active provider.

    Relies on ProviderType mixing in ``str``, so membership tests against
    plain strings work whether ``self.provider`` is an enum member or a str.
    Returns None for providers without a key-manager slot.
    """
    key_slots = (
        (('openai',), 'openai'),
        (('anthropic',), 'anthropic'),
        (('google', 'gemini'), 'google'),  # both names share the Google key
    )
    for aliases, slot in key_slots:
        if self.provider in aliases:
            return APIKeyManager().get_api_key(slot)
    # Add other providers as needed
    return None
203
+
204
@classmethod
def from_config(cls, config: 'ModelConfig') -> 'VisionLLMClient':
    """Create a VisionLLMClient from a ModelConfig object.

    Args:
        config: ModelConfig instance with provider and model settings.

    Returns:
        Configured VisionLLMClient instance.

    Example:
        ```python
        config = ModelConfig(
            provider="openai",
            model_name="gpt-4-vision-preview",
            temperature=0.7
        )
        client = VisionLLMClient.from_config(config)
        ```
    """
    # Mirror the config fields onto constructor keywords one-for-one.
    settings = {
        'provider': config.provider,
        'model_name': config.model_name,
        'api_key': config.api_key,
        'base_url': config.base_url,
        'temperature': config.temperature,
        'max_tokens': config.max_tokens,
        'top_p': config.top_p,
        'frequency_penalty': config.frequency_penalty,
        'presence_penalty': config.presence_penalty,
    }
    return cls(**settings)
235
+
236
@classmethod
def from_model_string(cls, model_string: str, **kwargs) -> 'VisionLLMClient':
    """Create a VisionLLMClient from a model string like "provider/model-name".

    Args:
        model_string: Model identifier in the form "provider/model-name" or
            just "model-name" (provider defaults to "openai").
            Examples: "google/gemini-2.0-flash", "openai/gpt-4o", "gemini-1.5-pro"
        **kwargs: Additional configuration options (temperature, max_tokens,
            api_key, etc.).

    Returns:
        Configured VisionLLMClient instance.

    Example:
        ```python
        client = VisionLLMClient.from_model_string("google/gemini-2.0-flash")
        client = VisionLLMClient.from_model_string("gpt-4o")
        client = VisionLLMClient.from_model_string(
            "google/gemini-2.0-flash", temperature=0.5, max_tokens=4096
        )
        ```
    """
    import os

    # Split "provider/model-name"; without a slash the whole string is the
    # model and the provider defaults to openai.
    head, slash, tail = model_string.partition("/")
    if slash:
        provider, model_name = head, tail
    else:
        provider, model_name = "openai", model_string

    # Normalize provider names ("gemini" is an alias for "google").
    provider = provider.lower()
    if provider == "gemini":
        provider = "google"

    # Fall back to the provider's conventional environment variable when no
    # explicit api_key was passed.
    api_key = kwargs.pop('api_key', None)
    if not api_key:
        known_env_vars = {
            "openai": "OPENAI_API_KEY",
            "anthropic": "ANTHROPIC_API_KEY",
            "google": "GOOGLE_API_KEY",
        }
        env_var = known_env_vars.get(provider, f"{provider.upper()}_API_KEY")
        api_key = os.getenv(env_var)

    return cls(
        provider=provider,
        model_name=model_name,
        api_key=api_key,
        **kwargs
    )
296
+
297
+ def generate(
298
+ self,
299
+ system_prompt: str,
300
+ user_prompt: str,
301
+ image_base64: Optional[str] = None,
302
+ **generation_kwargs
303
+ ) -> Dict[str, Any]:
304
+ """
305
+ Generates a response from the Vision LLM.
306
+
307
+ Args:
308
+ system_prompt: The system-level instructions for the LLM.
309
+ user_prompt: The user's query or task.
310
+ image_base64: Optional Base64 encoded image string.
311
+ **generation_kwargs: Additional model-specific generation parameters
312
+
313
+ Returns:
314
+ A dictionary containing the generated response and metadata.
315
+
316
+ Raises:
317
+ GepaLLMError: If there's an error during generation
318
+
319
+ Example:
320
+ ```python
321
+ response = client.generate(
322
+ system_prompt="You are a helpful assistant.",
323
+ user_prompt="What's in this image?",
324
+ image_base64="base64_encoded_image"
325
+ )
326
+ ```
327
+ """
328
+ if not system_prompt or not user_prompt:
329
+ raise GepaLLMError(
330
+ "system_prompt and user_prompt are required",
331
+ ErrorType.VALIDATION_ERROR
332
+ )
333
+
334
+ try:
335
+ if self.provider == ProviderType.OPENAI:
336
+ return self._generate_openai(system_prompt, user_prompt, image_base64, **generation_kwargs)
337
+ elif self.provider in [ProviderType.GOOGLE, ProviderType.GEMINI]:
338
+ return self._generate_google(system_prompt, user_prompt, image_base64, **generation_kwargs)
339
+ else:
340
+ raise GepaLLMError(
341
+ f"Provider {self.provider} is not yet supported",
342
+ ErrorType.VALIDATION_ERROR
343
+ )
344
+ except requests.exceptions.RequestException as e:
345
+ self.logger.error(f"Network error during generation: {str(e)}")
346
+ raise GepaLLMError(
347
+ f"Network error: {str(e)}",
348
+ ErrorType.NETWORK_ERROR,
349
+ getattr(e.response, 'status_code', None) if hasattr(e, 'response') else None
350
+ ) from e
351
+ except GepaLLMError:
352
+ raise
353
+ except Exception as e:
354
+ self.logger.error(f"Unexpected error during generation: {str(e)}")
355
+ raise GepaLLMError(
356
+ f"Generation failed: {str(e)}",
357
+ ErrorType.API_ERROR
358
+ ) from e
359
+
360
+ def _generate_openai(
361
+ self,
362
+ system_prompt: str,
363
+ user_prompt: str,
364
+ image_base64: Optional[str] = None,
365
+ **generation_kwargs
366
+ ) -> Dict[str, Any]:
367
+ """
368
+ Generate response using OpenAI's API with configured parameters.
369
+
370
+ Args:
371
+ system_prompt: System instructions for the model
372
+ user_prompt: User's input prompt
373
+ image_base64: Optional base64 encoded image
374
+
375
+ Returns:
376
+ Dictionary containing the API response
377
+
378
+ Raises:
379
+ GepaDependencyError: If API call fails
380
+ """
381
+ headers = {
382
+ "Content-Type": "application/json",
383
+ "Authorization": f"Bearer {self.api_key}",
384
+ "User-Agent": "GepaOptimizer/1.0 (Python)"
385
+ }
386
+
387
+ messages = [
388
+ {"role": "system", "content": system_prompt},
389
+ {
390
+ "role": "user",
391
+ "content": [
392
+ {"type": "text", "text": user_prompt}
393
+ ]
394
+ }
395
+ ]
396
+
397
+ if image_base64:
398
+ # #region agent log
399
+ import json as _json_debug
400
+ import time as _time_debug
401
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
402
+ try:
403
+ with open(_debug_log_path, "a") as _f:
404
+ _f.write(_json_debug.dumps({
405
+ "id": f"log_{int(_time_debug.time() * 1000)}",
406
+ "timestamp": int(_time_debug.time() * 1000),
407
+ "location": "vision_llm.py:_generate_openai",
408
+ "message": "Image base64 BEFORE processing",
409
+ "data": {
410
+ "image_base64_length": len(image_base64) if image_base64 else 0,
411
+ "has_data_uri_prefix": image_base64.startswith("data:image") if image_base64 else False,
412
+ "prefix": image_base64[:50] if image_base64 and len(image_base64) > 50 else image_base64,
413
+ "is_none": image_base64 is None,
414
+ "is_empty": image_base64 == "" if image_base64 else True
415
+ },
416
+ "sessionId": "debug-session",
417
+ "runId": "run1",
418
+ "hypothesisId": "A,C,D"
419
+ }) + "\n")
420
+ except Exception:
421
+ pass
422
+ # #endregion
423
+
424
+ # Detect and extract image format
425
+ detected_format = "jpeg" # Default fallback
426
+ clean_base64 = image_base64
427
+
428
+ # Extract format from data URI prefix if present
429
+ if image_base64.startswith("data:image"):
430
+ # Parse format from prefix: data:image/png;base64,...
431
+ if "," in image_base64:
432
+ prefix_part = image_base64.split(",", 1)[0]
433
+ clean_base64 = image_base64.split(",", 1)[1]
434
+ # Extract format from "data:image/PNG;base64" or "data:image/png"
435
+ if "/" in prefix_part and ";" in prefix_part:
436
+ detected_format = prefix_part.split("/")[1].split(";")[0].lower()
437
+ elif "/" in prefix_part:
438
+ detected_format = prefix_part.split("/")[1].lower()
439
+ else:
440
+ # Fallback: try to extract format
441
+ if "/" in image_base64:
442
+ detected_format = image_base64.split("/")[1].split(";")[0].lower() if ";" in image_base64 else "jpeg"
443
+ clean_base64 = image_base64.replace("data:image/", "").replace(";base64", "")
444
+
445
+ # If no format detected from prefix, try to detect from image data
446
+ if detected_format == "jpeg" or not detected_format:
447
+ try:
448
+ import base64 as b64
449
+ from PIL import Image
450
+ import io
451
+ image_data = b64.b64decode(clean_base64)
452
+ img = Image.open(io.BytesIO(image_data))
453
+ if img.format:
454
+ detected_format = img.format.lower()
455
+ # Normalize format names
456
+ if detected_format in ["jpg", "jpeg"]:
457
+ detected_format = "jpeg"
458
+ except Exception:
459
+ # If detection fails, keep default
460
+ pass
461
+
462
+ # Normalize format for data URI (OpenAI accepts: jpeg, png, gif, webp)
463
+ format_map = {
464
+ "jpg": "jpeg",
465
+ "jpeg": "jpeg",
466
+ "png": "png",
467
+ "gif": "gif",
468
+ "webp": "webp",
469
+ "bmp": "png", # Convert BMP to PNG (OpenAI doesn't support BMP)
470
+ "tiff": "png", # Convert TIFF to PNG
471
+ "tif": "png"
472
+ }
473
+ final_format = format_map.get(detected_format, "jpeg")
474
+
475
+ final_url = f"data:image/{final_format};base64,{clean_base64}"
476
+
477
+ # #region agent log
478
+ try:
479
+ with open(_debug_log_path, "a") as _f:
480
+ _f.write(_json_debug.dumps({
481
+ "id": f"log_{int(_time_debug.time() * 1000)}",
482
+ "timestamp": int(_time_debug.time() * 1000),
483
+ "location": "vision_llm.py:_generate_openai",
484
+ "message": "Image URL AFTER processing",
485
+ "data": {
486
+ "detected_format": detected_format,
487
+ "final_format": final_format,
488
+ "clean_base64_length": len(clean_base64),
489
+ "final_url_length": len(final_url),
490
+ "final_url_prefix": final_url[:60]
491
+ },
492
+ "sessionId": "debug-session",
493
+ "runId": "run1",
494
+ "hypothesisId": "A,B"
495
+ }) + "\n")
496
+ except Exception:
497
+ pass
498
+ # #endregion
499
+
500
+ messages[1]["content"].append({
501
+ "type": "image_url",
502
+ "image_url": {
503
+ "url": final_url
504
+ }
505
+ })
506
+
507
+ payload = {
508
+ "model": self.model_name,
509
+ "messages": messages,
510
+ # "temperature": self.temperature,
511
+ # "max_tokens": self.max_tokens,
512
+ "top_p": self.top_p,
513
+ "frequency_penalty": self.frequency_penalty,
514
+ "presence_penalty": self.presence_penalty
515
+ }
516
+
517
+ self.logger.debug(f"Sending request to {self.base_url} with model {self.model_name}")
518
+
519
+ try:
520
+ self.logger.debug(f"Sending request to {self.model_name}")
521
+
522
+ # Make the API request with retry
523
+ response = self.session.post(
524
+ self.base_url,
525
+ headers=headers,
526
+ json=payload,
527
+ timeout=300
528
+ )
529
+
530
+ # Handle rate limiting
531
+ if response.status_code == 429:
532
+ retry_after = int(response.headers.get('Retry-After', 5))
533
+ self.logger.warning(f"Rate limited. Retrying after {retry_after} seconds...")
534
+ time.sleep(retry_after)
535
+ return self._generate_openai(system_prompt, user_prompt, image_base64, **generation_kwargs)
536
+
537
+ response.raise_for_status()
538
+
539
+ result = response.json()
540
+ self.logger.debug(f"Received response from {self.model_name}")
541
+
542
+ # Extract and validate the response
543
+ try:
544
+ message = result["choices"][0]["message"]
545
+ llm_response_content = message["content"]
546
+
547
+ # Log token usage if available
548
+ if "usage" in result:
549
+ usage = result["usage"]
550
+ self.logger.info(
551
+ f"Tokens used - Prompt: {usage.get('prompt_tokens', 'N/A')}, "
552
+ f"Completion: {usage.get('completion_tokens', 'N/A')}, "
553
+ f"Total: {usage.get('total_tokens', 'N/A')}"
554
+ )
555
+
556
+ # Try to parse JSON if the response looks like JSON
557
+ if isinstance(llm_response_content, str) and (
558
+ llm_response_content.startswith('{') or
559
+ llm_response_content.startswith('[')
560
+ ):
561
+ try:
562
+ return json.loads(llm_response_content)
563
+ except json.JSONDecodeError:
564
+ pass
565
+
566
+ # Default response format
567
+ return {
568
+ "content": llm_response_content,
569
+ "role": message.get("role", "assistant"),
570
+ "model": self.model_name,
571
+ "provider": self.provider.value
572
+ }
573
+
574
+ except (KeyError, IndexError) as e:
575
+ self.logger.error(f"Unexpected response format: {result}")
576
+ raise GepaLLMError(
577
+ f"Unexpected response format from {self.provider} API",
578
+ ErrorType.API_ERROR,
579
+ response.status_code
580
+ ) from e
581
+
582
+ except requests.exceptions.HTTPError as e:
583
+ status_code = e.response.status_code if hasattr(e, 'response') else None
584
+ error_msg = f"HTTP error {status_code} from {self.provider} API"
585
+
586
+ try:
587
+ error_data = e.response.json()
588
+ error_msg = error_data.get('error', {}).get('message', error_msg)
589
+ except Exception:
590
+ error_msg = str(e)
591
+
592
+ self.logger.error(f"{error_msg}: {error_data if 'error_data' in locals() else str(e)}")
593
+ raise GepaLLMError(
594
+ error_msg,
595
+ ErrorType.RATE_LIMIT if status_code == 429 else ErrorType.API_ERROR,
596
+ status_code
597
+ ) from e
598
+
599
+ except requests.exceptions.Timeout:
600
+ self.logger.error(f"Request to {self.provider} API timed out after {self.timeout} seconds")
601
+ raise GepaLLMError(
602
+ f"Request timed out after {self.timeout} seconds",
603
+ ErrorType.TIMEOUT
604
+ )
605
+
606
+ except requests.exceptions.RequestException as e:
607
+ self.logger.error(f"Network error: {str(e)}")
608
+ raise GepaLLMError(
609
+ f"Network error: {str(e)}",
610
+ ErrorType.NETWORK_ERROR
611
+ ) from e
612
+
613
+ except Exception as e:
614
+ self.logger.error(f"Unexpected error: {str(e)}", exc_info=True)
615
+ raise GepaLLMError(
616
+ f"Unexpected error: {str(e)}",
617
+ ErrorType.API_ERROR
618
+ ) from e
619
+
620
+ def _generate_google(
621
+ self,
622
+ system_prompt: str,
623
+ user_prompt: str,
624
+ image_base64: Optional[str] = None,
625
+ **generation_kwargs
626
+ ) -> Dict[str, Any]:
627
+ """
628
+ Generate response using Google Gemini API with configured parameters.
629
+
630
+ Args:
631
+ system_prompt: System instructions for the model
632
+ user_prompt: User's input prompt
633
+ image_base64: Optional base64 encoded image
634
+
635
+ Returns:
636
+ Dictionary containing the API response
637
+
638
+ Raises:
639
+ GepaLLMError: If API call fails
640
+ """
641
+ try:
642
+ import google.generativeai as genai
643
+ import base64
644
+ from PIL import Image
645
+ import io
646
+ except ImportError as e:
647
+ raise GepaLLMError(
648
+ f"Required dependencies for Google Gemini not installed: {str(e)}. "
649
+ f"Please install: pip install google-generativeai Pillow",
650
+ ErrorType.VALIDATION_ERROR
651
+ ) from e
652
+
653
+ # Configure Gemini
654
+ genai.configure(api_key=self.api_key)
655
+
656
+ # Use the model name directly as specified by the user
657
+ # No hardcoded mappings or restrictions - fully configurable
658
+ # The Gemini API will validate if the model exists
659
+ gemini_model_name = self.model_name
660
+
661
+ try:
662
+ model = genai.GenerativeModel(gemini_model_name)
663
+ except Exception as e:
664
+ raise GepaLLMError(
665
+ f"Failed to initialize Gemini model {gemini_model_name}: {str(e)}",
666
+ ErrorType.API_ERROR
667
+ ) from e
668
+
669
+ # Prepare content
670
+ content_parts = []
671
+
672
+ # Add system prompt and user prompt
673
+ full_prompt = f"{system_prompt}\n\n{user_prompt}"
674
+ content_parts.append(full_prompt)
675
+
676
+ # Add image if provided
677
+ if image_base64:
678
+ # #region agent log
679
+ import json as _json_debug
680
+ import time as _time_debug
681
+ _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
682
+ try:
683
+ with open(_debug_log_path, "a") as _f:
684
+ _f.write(_json_debug.dumps({
685
+ "id": f"log_{int(_time_debug.time() * 1000)}",
686
+ "timestamp": int(_time_debug.time() * 1000),
687
+ "location": "vision_llm.py:_generate_google",
688
+ "message": "Image base64 BEFORE processing (Google)",
689
+ "data": {
690
+ "image_base64_length": len(image_base64) if image_base64 else 0,
691
+ "has_data_uri_prefix": image_base64.startswith("data:image") if image_base64 else False,
692
+ "prefix": image_base64[:50] if image_base64 and len(image_base64) > 50 else image_base64,
693
+ "is_none": image_base64 is None,
694
+ "is_empty": image_base64 == "" if image_base64 else True
695
+ },
696
+ "sessionId": "debug-session",
697
+ "runId": "run1",
698
+ "hypothesisId": "A,C,D"
699
+ }) + "\n")
700
+ except Exception:
701
+ pass
702
+ # #endregion
703
+
704
+ try:
705
+ # Strip data URI prefix if present (hypothesis A fix)
706
+ clean_base64 = image_base64
707
+ if image_base64.startswith("data:image"):
708
+ # Extract just the base64 part after the comma
709
+ if "," in image_base64:
710
+ clean_base64 = image_base64.split(",", 1)[1]
711
+ else:
712
+ clean_base64 = image_base64.replace("data:image/", "").replace(";base64", "")
713
+
714
+ # Decode base64 image
715
+ image_data = base64.b64decode(clean_base64)
716
+ image = Image.open(io.BytesIO(image_data))
717
+ content_parts.append(image)
718
+ self.logger.debug(f"Added image to Gemini request")
719
+ except Exception as e:
720
+ self.logger.warning(f"Failed to process image for Gemini: {str(e)}")
721
+ # Continue without image rather than failing
722
+
723
+ self.logger.debug(f"Sending request to Gemini model {gemini_model_name}")
724
+
725
+ try:
726
+ # Generate response with retry logic
727
+ max_retries = 3
728
+ for attempt in range(max_retries):
729
+ try:
730
+ # Configure generation parameters
731
+ generation_config = genai.types.GenerationConfig(
732
+ temperature=self.temperature,
733
+ max_output_tokens=self.max_tokens,
734
+ top_p=self.top_p,
735
+ )
736
+
737
+ response = model.generate_content(
738
+ content_parts,
739
+ generation_config=generation_config
740
+ )
741
+
742
+ # Check if response was blocked
743
+ if response.prompt_feedback and response.prompt_feedback.block_reason:
744
+ raise GepaLLMError(
745
+ f"Gemini blocked the prompt: {response.prompt_feedback.block_reason}",
746
+ ErrorType.VALIDATION_ERROR
747
+ )
748
+
749
+ # Check if response was blocked
750
+ if not response.text:
751
+ if response.candidates and response.candidates[0].finish_reason:
752
+ finish_reason = response.candidates[0].finish_reason
753
+ if finish_reason == genai.types.FinishReason.SAFETY:
754
+ raise GepaLLMError(
755
+ "Gemini response blocked due to safety concerns",
756
+ ErrorType.VALIDATION_ERROR
757
+ )
758
+ elif finish_reason == genai.types.FinishReason.RECITATION:
759
+ raise GepaLLMError(
760
+ "Gemini response blocked due to recitation concerns",
761
+ ErrorType.VALIDATION_ERROR
762
+ )
763
+ raise GepaLLMError(
764
+ "Gemini returned empty response",
765
+ ErrorType.API_ERROR
766
+ )
767
+
768
+ self.logger.debug(f"Received response from Gemini model {gemini_model_name}")
769
+
770
+ # Log usage information if available
771
+ if hasattr(response, 'usage_metadata') and response.usage_metadata:
772
+ usage = response.usage_metadata
773
+ self.logger.info(
774
+ f"Tokens used - Prompt: {usage.prompt_token_count}, "
775
+ f"Completion: {usage.candidates_token_count}, "
776
+ f"Total: {usage.total_token_count}"
777
+ )
778
+
779
+ # Try to parse JSON if the response looks like JSON
780
+ response_text = response.text
781
+ if isinstance(response_text, str) and (
782
+ response_text.startswith('{') or
783
+ response_text.startswith('[')
784
+ ):
785
+ try:
786
+ return json.loads(response_text)
787
+ except json.JSONDecodeError:
788
+ pass
789
+
790
+ # Default response format
791
+ return {
792
+ "content": response_text,
793
+ "role": "assistant",
794
+ "model": gemini_model_name,
795
+ "provider": "google"
796
+ }
797
+
798
+ except Exception as e:
799
+ if attempt < max_retries - 1:
800
+ self.logger.warning(f"Gemini API attempt {attempt + 1} failed: {str(e)}. Retrying...")
801
+ time.sleep(2 ** attempt) # Exponential backoff
802
+ continue
803
+ else:
804
+ raise
805
+
806
+ except GepaLLMError:
807
+ raise
808
+ except Exception as e:
809
+ self.logger.error(f"Unexpected error with Gemini API: {str(e)}")
810
+ raise GepaLLMError(
811
+ f"Gemini API error: {str(e)}",
812
+ ErrorType.API_ERROR
813
+ ) from e
src/gepa_optimizer/models/__init__.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Models module for GEPA Optimizer
"""

# Re-export the public data models at package level so callers can write
# `from gepa_optimizer.models import ModelConfig` etc.
from .config import ModelConfig, OptimizationConfig
from .dataset import DatasetItem
from .result import OptimizationResult, OptimizedResult

# Explicit public API of the models package.
__all__ = [
    "ModelConfig",
    "OptimizationConfig",
    "DatasetItem",
    "OptimizationResult",
    "OptimizedResult"
]
src/gepa_optimizer/models/config.py ADDED
@@ -0,0 +1,488 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Configuration models for GEPA Optimizer
3
+ """
4
+
5
+ import os
6
+ from dataclasses import dataclass, field
7
+ from typing import List, Optional, Dict, Any, Union, Tuple
8
+
9
@dataclass
class ModelConfig:
    """Provider-agnostic configuration for a single LLM endpoint."""

    provider: str                    # e.g. "openai", "anthropic", "huggingface", "vllm"
    model_name: str                  # concrete model identifier for the provider
    api_key: str                     # credential used to authenticate requests
    base_url: Optional[str] = None   # custom endpoint URL, when not the provider default
    temperature: float = 0.7
    max_tokens: int = 2048
    top_p: float = 1.0
    frequency_penalty: float = 0.0
    presence_penalty: float = 0.0

    # Names serialized by to_dict(), in their canonical order.
    _FIELD_ORDER = (
        'provider', 'model_name', 'api_key', 'base_url', 'temperature',
        'max_tokens', 'top_p', 'frequency_penalty', 'presence_penalty',
    )

    def __post_init__(self):
        """Reject configurations missing any of the three mandatory fields."""
        if not self.provider:
            raise ValueError("Provider is required (e.g., 'openai', 'anthropic', 'huggingface')")
        if not self.model_name:
            raise ValueError("Model name is required (e.g., 'gpt-4', 'claude-3-opus')")
        if not self.api_key:
            raise ValueError(f"API key is required for {self.provider} provider")

    @classmethod
    def from_string(cls, model_string: str) -> 'ModelConfig':
        """Build a config from 'provider/model' (provider defaults to openai)."""
        provider, sep, model_name = model_string.partition("/")
        if not sep:
            # Bare model name: assume OpenAI.
            provider, model_name = "openai", model_string

        # Resolve the credential from the environment.
        api_key = cls._get_api_key_for_provider(provider)
        if not api_key:
            raise ValueError(
                f"No API key found for {provider}. Please set {provider.upper()}_API_KEY environment variable"
            )

        return cls(
            provider=provider,
            model_name=model_name,
            api_key=api_key
        )

    @classmethod
    def from_dict(cls, config_dict: dict) -> 'ModelConfig':
        """Build a config from a keyword-style dictionary."""
        return cls(**config_dict)

    def to_dict(self) -> dict:
        """Serialize every field (including the API key) to a plain dict."""
        return {name: getattr(self, name) for name in self._FIELD_ORDER}

    @staticmethod
    def _get_api_key_for_provider(provider: str) -> Optional[str]:
        """Look up the provider's API key from its conventional env variable."""
        env_var_map = {
            "openai": "OPENAI_API_KEY",
            "anthropic": "ANTHROPIC_API_KEY",
            "huggingface": "HUGGINGFACE_API_KEY",
            "cohere": "COHERE_API_KEY",
            "ai21": "AI21_API_KEY",
            "together": "TOGETHER_API_KEY",
            "replicate": "REPLICATE_API_TOKEN",
            "groq": "GROQ_API_KEY",
            "ollama": "OLLAMA_API_KEY"
        }

        env_var = env_var_map.get(provider.lower())
        if env_var:
            return os.getenv(env_var)

        # Unknown provider: fall back to the generic naming pattern.
        return os.getenv(f"{provider.upper()}_API_KEY")
94
+
95
@dataclass
class DataSplitConfig:
    """Configuration for splitting a dataset into train/val/test sets.

    With the default 'adaptive' strategy the configured ratios are overridden
    per dataset size: small datasets (< 15) favor validation (70/25/5) so
    candidate ranking stays reliable, medium ones (15-50) use a balanced
    60/20/20, and large ones (50+) shift toward training (70/15/15).
    """

    # Default ratios; the adaptive strategy overrides these by dataset size.
    train_ratio: float = 0.6   # Dfeedback — reflection examples
    val_ratio: float = 0.2     # Dpareto — Pareto selection
    test_ratio: float = 0.2    # held-out final evaluation

    # Hard floors per split.
    min_train_samples: int = 3
    min_val_samples: int = 3   # validation drives every candidate evaluation
    min_test_samples: int = 1  # test is used only once, so less critical

    # How to behave when the dataset is too small for the configured split.
    small_dataset_strategy: str = 'adaptive'  # or 'duplicate_val', 'no_test', 'error'

    def __post_init__(self):
        """Validate ratio sum, positivity, and the strategy name."""
        total = self.train_ratio + self.val_ratio + self.test_ratio
        # Tolerate tiny floating point drift around 1.0.
        if not (0.99 <= total <= 1.01):
            raise ValueError(
                f"Split ratios must sum to 1.0, got {total:.3f} "
                f"(train={self.train_ratio}, val={self.val_ratio}, test={self.test_ratio})"
            )

        if self.train_ratio <= 0 or self.val_ratio <= 0 or self.test_ratio < 0:
            raise ValueError("Split ratios must be positive (test_ratio can be 0 to disable)")

        if self.small_dataset_strategy not in {'adaptive', 'duplicate_val', 'no_test', 'error'}:
            raise ValueError(
                f"Invalid small_dataset_strategy: {self.small_dataset_strategy}. "
                f"Must be 'adaptive', 'duplicate_val', 'no_test', or 'error'"
            )

    def get_adaptive_ratios(self, dataset_size: int) -> Tuple[float, float, float]:
        """Return (train, val, test) ratios appropriate for *dataset_size*.

        Small (< 15) prioritizes validation for reliable candidate ranking;
        medium (< 50) is balanced; large datasets favor training data.
        """
        if dataset_size < 15:
            return (0.70, 0.25, 0.05)
        if dataset_size < 50:
            return (0.60, 0.20, 0.20)
        return (0.70, 0.15, 0.15)

    def get_split_indices(self, dataset_size: int) -> Tuple[int, int, int, int]:
        """Compute split boundaries for a dataset of *dataset_size* samples.

        Uses adaptive ratios when the strategy is 'adaptive', otherwise the
        configured ratios. Returns (train_end, val_end, test_end,
        dataset_size) indices into a dataset of that length.

        Raises:
            ValueError: If the dataset is too small and the strategy is
                'error'.
        """
        strategy = self.small_dataset_strategy
        if strategy == 'adaptive':
            train_ratio, val_ratio, _ = self.get_adaptive_ratios(dataset_size)
        else:
            train_ratio, val_ratio = self.train_ratio, self.val_ratio

        # Under 'error', refuse datasets below the combined floor outright.
        floor = self.min_train_samples + self.min_val_samples
        if dataset_size < floor and strategy == 'error':
            raise ValueError(
                f"Dataset too small ({dataset_size} samples). "
                f"Need at least {floor} samples."
            )

        # Ideal boundaries, clamped to the per-split minimums.
        train_end = max(self.min_train_samples, int(dataset_size * train_ratio))
        val_end = train_end + max(self.min_val_samples, int(dataset_size * val_ratio))
        test_end = dataset_size

        # Small-dataset adjustment when validation would spill past the end.
        if val_end >= dataset_size:
            if strategy in ('adaptive', 'duplicate_val'):
                # Keep the minimum validation samples; test gets the remainder.
                val_end = min(dataset_size, train_end + self.min_val_samples)
            elif strategy == 'no_test':
                # Give everything after training to validation.
                val_end = dataset_size
            else:
                raise ValueError(
                    f"Dataset too small ({dataset_size} samples) for train/val/test split. "
                    f"Need at least {self.min_train_samples + self.min_val_samples + self.min_test_samples} samples."
                )

        return train_end, val_end, test_end, dataset_size
217
+
218
+ @dataclass
219
+ class OptimizationConfig:
220
+ """Configuration class for GEPA optimization process"""
221
+
222
+ # Core models - REQUIRED by user
223
+ model: Union[str, ModelConfig] # No default - user must specify
224
+ reflection_model: Union[str, ModelConfig] # No default - user must specify
225
+
226
+ # Optimization parameters - REQUIRED by user
227
+ max_iterations: int # No default - user decides their budget
228
+ max_metric_calls: int # No default - user sets their budget
229
+ batch_size: int # No default - user decides based on memory
230
+
231
+ # Dataset splitting configuration
232
+ data_split: DataSplitConfig = field(default_factory=DataSplitConfig)
233
+
234
+ # Reflection settings (separate from evaluation batch_size)
235
+ reflection_examples: int = 3 # Number of examples for each reflection (small!)
236
+
237
+ # Optional optimization settings with sensible fallbacks
238
+ early_stopping: bool = True
239
+ learning_rate: float = 0.01
240
+
241
+ # Multi-objective optimization
242
+ multi_objective: bool = False
243
+ objectives: List[str] = field(default_factory=lambda: ["accuracy"])
244
+
245
+ # Advanced settings
246
+ custom_metrics: Optional[Dict[str, Any]] = None
247
+ use_cache: bool = True
248
+ parallel_evaluation: bool = False
249
+
250
+ # Backwards compatibility (deprecated)
251
+ train_split_ratio: Optional[float] = None # Use data_split instead
252
+ min_dataset_size: int = 2
253
+
254
+ # Cost and budget - user controlled
255
+ max_cost_usd: Optional[float] = None
256
+ timeout_seconds: Optional[int] = None
257
+
258
+ # GEPA-specific optimization parameters (based on actual GEPA library)
259
+ candidate_selection_strategy: str = 'pareto' # Use Pareto selection strategy
260
+ skip_perfect_score: bool = False # Don't skip perfect scores (set to True for early stopping)
261
+ reflection_minibatch_size: Optional[int] = None # Will use reflection_examples if None
262
+ perfect_score: float = 1.0 # Perfect score threshold
263
+ module_selector: str = 'round_robin' # Component selection strategy
264
+ verbose: bool = True # Enable detailed GEPA logging
265
+
266
+ # Test set evaluation
267
+ evaluate_on_test: bool = True # Evaluate final prompt on held-out test set
268
+
269
+ # 🆕 LLEGO Genetic Operator Parameters (Optional - for faster convergence)
270
+ # Based on ICLR 2025 paper: "Decision Tree Induction Through LLMs via Semantically-Aware Evolution"
271
+ # Optimized for small datasets (6-10 samples)
272
+ use_llego_operators: bool = False # Enable LLEGO genetic operators
273
+
274
+ # 🔥 HYBRID MODE: Combine GEPA Reflection + LLEGO Operators
275
+ # When both enabled, candidates are generated from BOTH sources for maximum diversity
276
+ enable_gepa_reflection_with_llego: bool = False # Enable hybrid GEPA+LLEGO mode
277
+ num_gepa_reflection_candidates: int = 3 # Number of GEPA reflection candidates per iteration (default: 3 for better exploration, range: 2-5)
278
+
279
+ # Fitness-guided crossover parameters (FIX #3: Conservative alpha)
280
+ alpha: float = 0.05 # FIX #3: Fitness extrapolation (0.05 = 5% above best parent, realistic for prompt optimization)
281
+ n_crossover: int = 2 # Number of offspring from crossover per iteration
282
+
283
+ # Diversity-guided mutation parameters
284
+ tau: float = 8.0 # Diversity temperature (8.0 = moderate diversity, balanced exploration/exploitation)
285
+ nu: int = 3 # Parent arity (3 parents optimal for small populations ~6 samples)
286
+ n_mutation: int = 2 # Number of offspring from mutation per iteration (total 4 offspring with crossover)
287
+
288
+ # Population management (for genetic operators)
289
+ population_size: int = 8 # Size of prompt population (small but diverse for 6-sample dataset)
290
+
291
+ # 🆕 LLM-as-Judge configuration (Phase 2)
292
+ use_llm_as_judge: bool = True # Enable LLM-as-Judge feedback for detailed, actionable analysis
293
+ llm_as_judge_threshold: float = 0.8 # Use LLM-as-Judge for scores below this threshold
294
+ llm_as_judge_model: Optional[ModelConfig] = None # Optional: use different model (defaults to reflection_model)
295
+
296
+ # 🆕 Logging configuration (Phase 3)
297
+ log_level: str = "INFO" # Logging level: "DEBUG", "INFO", "WARNING", "ERROR"
298
+
299
    def __post_init__(self):
        """Validate and normalize the configuration after dataclass initialization.

        Order matters here: the deprecated ``train_split_ratio`` is first
        converted into a 3-way ``DataSplitConfig``, string model specs are
        parsed into ``ModelConfig`` objects, derived defaults are filled in,
        and finally required fields and numeric ranges are validated.
        """
        # Backwards compatibility: a non-default train_split_ratio predates the
        # 3-way data_split API, so warn the caller and convert it.
        # (0.8 is treated as "left at default" and therefore not converted.)
        if self.train_split_ratio is not None and self.train_split_ratio != 0.8:
            import warnings
            warnings.warn(
                "train_split_ratio is deprecated. Use data_split=DataSplitConfig(...) instead. "
                "Converting to 3-way split with your ratio.",
                DeprecationWarning,
                stacklevel=2
            )
            # Convert 2-way split to 3-way: keep the train ratio and split the
            # remainder evenly between validation and test.
            remainder = 1.0 - self.train_split_ratio
            self.data_split = DataSplitConfig(
                train_ratio=self.train_split_ratio,
                val_ratio=remainder * 0.5,
                test_ratio=remainder * 0.5
            )

        # Accept both "provider/model-name" strings and ModelConfig objects.
        self.model = self._parse_model_config(self.model, "model")
        self.reflection_model = self._parse_model_config(self.reflection_model, "reflection_model")

        # Default the reflection minibatch size to the reflection example count.
        if self.reflection_minibatch_size is None:
            self.reflection_minibatch_size = self.reflection_examples

        # Fail fast on missing required parameters...
        self._validate_required_params()

        # ...and on out-of-range values.
        self._validate_ranges()
331
+
332
+ def _parse_model_config(self, model: Union[str, ModelConfig], field_name: str) -> ModelConfig:
333
+ """Parse string model specification into ModelConfig"""
334
+ if isinstance(model, ModelConfig):
335
+ return model
336
+
337
+ if isinstance(model, str):
338
+ # Parse "provider/model-name" format
339
+ if "/" in model:
340
+ provider, model_name = model.split("/", 1)
341
+ else:
342
+ # Default to openai if no provider specified
343
+ provider = "openai"
344
+ model_name = model
345
+
346
+ # Try to get API key from environment
347
+ api_key = self._get_api_key_for_provider(provider)
348
+ if not api_key:
349
+ raise ValueError(
350
+ f"No API key found for {provider}. Please set environment variable "
351
+ f"or provide ModelConfig with api_key for {field_name}"
352
+ )
353
+
354
+ return ModelConfig(
355
+ provider=provider,
356
+ model_name=model_name,
357
+ api_key=api_key
358
+ )
359
+
360
+ raise ValueError(f"{field_name} must be either a string or ModelConfig object")
361
+
362
    def _get_api_key_for_provider(self, provider: str) -> Optional[str]:
        """Return the API key for *provider* from environment variables.

        Delegates to ``ModelConfig._get_api_key_for_provider`` so the
        env-var lookup logic lives in a single place.
        """
        return ModelConfig._get_api_key_for_provider(provider)
365
+
366
+ def _validate_required_params(self):
367
+ """Validate that all required parameters are provided"""
368
+ required_fields = {
369
+ "max_iterations": self.max_iterations,
370
+ "max_metric_calls": self.max_metric_calls,
371
+ "batch_size": self.batch_size,
372
+ }
373
+
374
+ for field_name, value in required_fields.items():
375
+ if value is None:
376
+ raise ValueError(f"{field_name} is required and must be specified by user")
377
+
378
+ def _validate_ranges(self):
379
+ """Validate parameter ranges"""
380
+ if self.max_iterations <= 0:
381
+ raise ValueError("max_iterations must be positive")
382
+
383
+ if self.max_metric_calls <= 0:
384
+ raise ValueError("max_metric_calls must be positive")
385
+
386
+ if self.batch_size <= 0:
387
+ raise ValueError("batch_size must be positive")
388
+
389
+ if self.reflection_examples <= 0 or self.reflection_examples > 10:
390
+ raise ValueError("reflection_examples must be between 1 and 10 (recommended: 2-5)")
391
+
392
+ if self.reflection_minibatch_size <= 0:
393
+ raise ValueError("reflection_minibatch_size must be positive")
394
+
395
+ if hasattr(self.model, 'max_tokens') and self.model.max_tokens <= 0:
396
+ raise ValueError("model.max_tokens must be a positive integer")
397
+
398
+ # Validate hybrid mode parameters
399
+ if self.enable_gepa_reflection_with_llego and not self.use_llego_operators:
400
+ raise ValueError("enable_gepa_reflection_with_llego requires use_llego_operators=True")
401
+
402
+ if self.num_gepa_reflection_candidates <= 0 or self.num_gepa_reflection_candidates > 5:
403
+ raise ValueError("num_gepa_reflection_candidates must be between 1 and 5 (recommended: 3 for balanced exploration)")
404
+
405
+ # Validate log_level
406
+ valid_log_levels = ["DEBUG", "INFO", "WARNING", "ERROR"]
407
+ if self.log_level.upper() not in valid_log_levels:
408
+ raise ValueError(f"log_level must be one of {valid_log_levels}, got: {self.log_level}")
409
+
410
+ def validate_api_connectivity(self) -> Dict[str, bool]:
411
+ """Test API connectivity for both models"""
412
+ results = {}
413
+
414
+ for model_name, model_config in [("model", self.model), ("reflection_model", self.reflection_model)]:
415
+ try:
416
+ # This would be implemented to actually test the API
417
+ # For now, just check if we have the required info
418
+ if model_config.api_key and model_config.provider and model_config.model_name:
419
+ results[model_name] = True
420
+ else:
421
+ results[model_name] = False
422
+ except Exception:
423
+ results[model_name] = False
424
+
425
+ return results
426
+
427
+ def get_estimated_cost(self) -> Dict[str, Any]:
428
+ """Estimate cost based on configuration"""
429
+ # This would calculate estimated costs based on:
430
+ # - max_metric_calls
431
+ # - model pricing
432
+ # - expected tokens per call
433
+ return {
434
+ "max_calls": self.max_metric_calls,
435
+ "estimated_cost_range": "To be calculated based on provider pricing",
436
+ "cost_factors": {
437
+ "model_calls": self.max_metric_calls,
438
+ "reflection_calls": self.max_iterations,
439
+ "batch_size": self.batch_size
440
+ }
441
+ }
442
+
443
    @classmethod
    def create_example_config(cls, provider: str = "openai") -> str:
        """Return example configuration code for *provider* as a string.

        Supported keys: ``"openai"``, ``"anthropic"``, ``"mixed"``.
        Any other value falls back to the OpenAI example.
        """
        examples = {
            "openai": '''
# Example OpenAI Configuration
config = OptimizationConfig(
    model="openai/gpt-4-turbo", # or ModelConfig(...)
    reflection_model="openai/gpt-4-turbo",
    max_iterations=50, # Your choice based on budget
    max_metric_calls=300, # Your choice based on budget
    batch_size=8, # Your choice based on memory
    early_stopping=True,
    learning_rate=0.01
)
''',
            "anthropic": '''
# Example Anthropic Configuration
config = OptimizationConfig(
    model=ModelConfig(
        provider="anthropic",
        model_name="claude-3-opus-20240229",
        api_key="your-anthropic-key",
        temperature=0.7
    ),
    reflection_model="anthropic/claude-3-sonnet-20240229",
    max_iterations=30,
    max_metric_calls=200,
    batch_size=4
)
''',
            "mixed": '''
# Example Mixed Providers Configuration
config = OptimizationConfig(
    model="openai/gpt-4-turbo", # Main model
    reflection_model="anthropic/claude-3-opus", # Reflection model
    max_iterations=25,
    max_metric_calls=250,
    batch_size=6,
    max_cost_usd=100.0, # Budget limit
    timeout_seconds=3600 # 1 hour limit
)
'''
        }

        # Unknown providers get the OpenAI sample rather than raising.
        return examples.get(provider, examples["openai"])
src/gepa_optimizer/models/dataset.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Dataset models for GEPA Optimizer
3
+ """
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any, Dict, List, Optional
7
+ import uuid
8
+
9
@dataclass
class DatasetItem:
    """A single dataset example with provenance and quality metadata."""

    # Identifiers: a random UUID string per item unless the caller supplies one.
    item_id: str = field(default_factory=lambda: str(uuid.uuid4()))

    # Core data: the raw input, the optional gold answer, and an optional image.
    input_data: Any = ""
    expected_output: Optional[str] = None
    image_base64: Optional[str] = None

    # Free-form metadata and tags.
    metadata: Dict[str, Any] = field(default_factory=dict)
    tags: List[str] = field(default_factory=list)

    # Paths of any files this item references.
    file_paths: List[str] = field(default_factory=list)

    # Quality indicators: score in [0, 1], validation flag, and reviewer notes.
    quality_score: float = 1.0
    is_validated: bool = False
    validation_notes: List[str] = field(default_factory=list)

    def __post_init__(self):
        """Reject quality scores outside the [0, 1] range."""
        if self.quality_score < 0 or self.quality_score > 1:
            raise ValueError("quality_score must be between 0 and 1")

    def add_tag(self, tag: str):
        """Attach *tag* to this item; duplicates are silently ignored."""
        if tag not in self.tags:
            self.tags.append(tag)

    def mark_validated(self, notes: Optional[List[str]] = None):
        """Flag this item as validated, optionally appending reviewer *notes*."""
        self.is_validated = True
        if notes:
            self.validation_notes.extend(notes)
48
+
49
@dataclass
class ProcessedDataset:
    """A dataset prepared for GEPA optimization, with splits and quality stats."""

    # Identifiers
    dataset_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    name: str = "Untitled Dataset"

    # Data: all items, plus the train/validation partitions.
    items: List[DatasetItem] = field(default_factory=list)
    train_split: List[DatasetItem] = field(default_factory=list)
    val_split: List[DatasetItem] = field(default_factory=list)

    # Metadata about where the data came from and how it was processed.
    source_info: Dict[str, Any] = field(default_factory=dict)
    processing_stats: Dict[str, Any] = field(default_factory=dict)

    # Quality metrics — derived in __post_init__ from `items`; do not set manually.
    total_items: int = 0
    validated_items: int = 0
    avg_quality_score: float = 0.0

    def __post_init__(self):
        """Derive item counts and the mean quality score from ``items``."""
        self.total_items = len(self.items)
        if self.items:
            self.validated_items = sum(1 for entry in self.items if entry.is_validated)
            self.avg_quality_score = sum(entry.quality_score for entry in self.items) / self.total_items

    def get_stats(self) -> Dict[str, Any]:
        """Return summary statistics as a plain, serializable dict."""
        count = self.total_items
        return {
            'total_items': count,
            'validated_items': self.validated_items,
            # Guard against division by zero for an empty dataset.
            'validation_rate': self.validated_items / count if count > 0 else 0,
            'avg_quality_score': self.avg_quality_score,
            'train_size': len(self.train_split),
            'val_size': len(self.val_split),
            'has_expected_outputs': sum(1 for entry in self.items if entry.expected_output),
        }
src/gepa_optimizer/models/result.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Result models for GEPA Optimizer
3
+ """
4
+
5
+ from dataclasses import dataclass, field
6
+ from datetime import datetime
7
+ from typing import Dict, Any, Optional, List
8
+ import uuid
9
+
10
@dataclass
class OptimizationResult:
    """Full optimization record: prompts, metrics, timing, and status."""

    # Unique identifier for this optimization session.
    session_id: str = field(default_factory=lambda: str(uuid.uuid4()))

    # Core results: the seed prompt and the final optimized prompt.
    original_prompt: str = ""
    optimized_prompt: str = ""

    # Performance metrics before/after, plus improvement summary data.
    improvement_data: Dict[str, Any] = field(default_factory=dict)
    baseline_metrics: Dict[str, float] = field(default_factory=dict)
    final_metrics: Dict[str, float] = field(default_factory=dict)

    # Process metadata.
    optimization_time: float = 0.0
    dataset_size: int = 0
    total_iterations: int = 0

    # Lifecycle state: pending -> running -> completed | failed.
    status: str = "pending"
    error_message: Optional[str] = None

    # Timestamps (completed_at set by mark_completed/mark_failed).
    created_at: datetime = field(default_factory=datetime.now)
    completed_at: Optional[datetime] = None

    # Per-iteration reflection records.
    reflection_history: List[Dict[str, Any]] = field(default_factory=list)

    # Cost and resource usage.
    estimated_cost: Optional[float] = None
    api_calls_made: int = 0

    def mark_completed(self):
        """Record successful completion and stamp the finish time."""
        self.status = "completed"
        self.completed_at = datetime.now()

    def mark_failed(self, error: str):
        """Record failure with *error* and stamp the finish time."""
        self.status = "failed"
        self.error_message = error
        self.completed_at = datetime.now()
56
+
57
class OptimizedResult:
    """
    User-facing result class that provides a clean, read-only interface
    over an internal :class:`OptimizationResult`.
    """

    def __init__(self,
                 original_prompt: str = "",
                 optimized_prompt: str = "",
                 improvement_data: Optional[Dict[str, Any]] = None,
                 optimization_time: float = 0.0,
                 dataset_size: int = 0,
                 total_iterations: int = 0,
                 status: str = "pending",
                 error_message: Optional[str] = None,
                 detailed_result: Optional[OptimizationResult] = None,
                 session_id: Optional[str] = None):
        """
        Initialize OptimizedResult with individual parameters.

        Args:
            original_prompt: Original seed prompt
            optimized_prompt: Optimized prompt
            improvement_data: Performance improvement data (defaults to {})
            optimization_time: Time taken for optimization in seconds
            dataset_size: Size of dataset used
            total_iterations: Number of optimization iterations
            status: Optimization status (pending/running/completed/failed)
            error_message: Error message if failed
            detailed_result: Optional detailed OptimizationResult; when given,
                it is used directly and the other arguments are ignored
            session_id: Optional session ID (a UUID is generated if omitted)
        """
        if detailed_result is not None:
            # FIX: use the supplied result directly instead of first building a
            # throwaway OptimizationResult and immediately discarding it.
            self._result = detailed_result
            return

        self._result = OptimizationResult(
            session_id=session_id or str(uuid.uuid4()),
            original_prompt=original_prompt,
            optimized_prompt=optimized_prompt,
            # Avoid a shared mutable default: each instance gets its own dict.
            improvement_data=improvement_data if improvement_data is not None else {},
            optimization_time=optimization_time,
            dataset_size=dataset_size,
            total_iterations=total_iterations,
            status=status,
            error_message=error_message
        )

    @property
    def prompt(self) -> str:
        """The optimized prompt ready for production use."""
        return self._result.optimized_prompt

    @property
    def original_prompt(self) -> str:
        """The original seed prompt for reference."""
        return self._result.original_prompt

    @property
    def session_id(self) -> str:
        """Unique session identifier."""
        return self._result.session_id

    @property
    def improvement_data(self) -> Dict[str, Any]:
        """Performance improvement data."""
        return self._result.improvement_data

    @property
    def status(self) -> str:
        """Optimization status."""
        return self._result.status

    @property
    def error_message(self) -> Optional[str]:
        """Error message if optimization failed."""
        return self._result.error_message

    @property
    def is_successful(self) -> bool:
        """Whether optimization completed successfully (no error recorded)."""
        return (
            self._result.status == "completed" and
            self._result.error_message is None
        )

    @property
    def optimization_time(self) -> float:
        """Time taken for optimization in seconds."""
        return self._result.optimization_time

    @property
    def dataset_size(self) -> int:
        """Size of dataset used for optimization."""
        return self._result.dataset_size

    @property
    def total_iterations(self) -> int:
        """Total optimization iterations performed."""
        return self._result.total_iterations

    @property
    def estimated_cost(self) -> Optional[float]:
        """Estimated cost in USD (None when unknown)."""
        return self._result.estimated_cost

    def get_improvement_summary(self) -> Dict[str, Any]:
        """Get a summary of improvements made during optimization."""
        summary = {
            'has_improvement': bool(self._result.improvement_data),
            'optimization_time': self.optimization_time,
            'iterations': self.total_iterations,
            'dataset_size': self.dataset_size
        }

        # Surface the headline improvement percentage when the optimizer recorded one.
        if 'improvement_percent' in self._result.improvement_data:
            summary['improvement_percent'] = self._result.improvement_data['improvement_percent']

        return summary

    def get_reflection_summary(self) -> Dict[str, Any]:
        """Get a summary of the reflection process (first 3 reflection points)."""
        if not self._result.reflection_history:
            return {'total_reflections': 0}

        return {
            'total_reflections': len(self._result.reflection_history),
            'reflection_points': [
                r.get('summary', 'No summary')
                for r in self._result.reflection_history[:3]  # First 3
            ]
        }

    def get_detailed_result(self) -> OptimizationResult:
        """Get the full detailed result for advanced users."""
        return self._result

    def __str__(self) -> str:
        """Human-readable one-line summary with a status emoji."""
        status_emoji = "✅" if self.is_successful else "❌" if self.status == "failed" else "⏳"
        return f"OptimizedResult({status_emoji} {self.status}, time={self.optimization_time:.2f}s)"

    def __repr__(self) -> str:
        return self.__str__()
src/gepa_optimizer/operators/__init__.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LLEGO Genetic Operators for GEPA.
3
+
4
+ This module provides genetic operators for prompt optimization:
5
+ - FitnessGuidedCrossover: Combines high-performing prompts
6
+ - DiversityGuidedMutation: Explores diverse variations
7
+ - LLEGOIntegrationLayer: Manages the genetic algorithm workflow
8
+
9
+ Based on: Decision Tree Induction Through LLMs via Semantically-Aware Evolution (ICLR 2025)
10
+ """
11
+
12
+ # Base interfaces (SOLID: Interface Segregation)
13
+ from .base_operator import (
14
+ BaseGeneticOperator,
15
+ BaseCrossoverOperator,
16
+ BaseMutationOperator,
17
+ )
18
+
19
+ # Data models
20
+ from .models import (
21
+ PromptCandidate,
22
+ PromptMetadata,
23
+ )
24
+
25
+ # Concrete operators (SOLID: Single Responsibility)
26
+ from .crossover import FitnessGuidedCrossover
27
+ from .mutation import DiversityGuidedMutation
28
+
29
+ # Integration layer
30
+ from .llego_operators import LLEGOIntegrationLayer
31
+
32
+ __all__ = [
33
+ # Base interfaces
34
+ 'BaseGeneticOperator',
35
+ 'BaseCrossoverOperator',
36
+ 'BaseMutationOperator',
37
+ # Data models
38
+ 'PromptCandidate',
39
+ 'PromptMetadata',
40
+ # Operators
41
+ 'FitnessGuidedCrossover',
42
+ 'DiversityGuidedMutation',
43
+ # Integration
44
+ 'LLEGOIntegrationLayer',
45
+ ]
src/gepa_optimizer/operators/base_operator.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Base Genetic Operator Interface.
3
+
4
+ Defines the abstract interface for all genetic operators following
5
+ the Interface Segregation Principle (ISP) of SOLID.
6
+ """
7
+
8
+ from abc import ABC, abstractmethod
9
+ from typing import List, Callable
10
+ import logging
11
+
12
+ logger = logging.getLogger(__name__)
13
+
14
+
15
class BaseGeneticOperator(ABC):
    """
    Abstract contract shared by every genetic operator.

    Concrete operators (crossover, mutation, ...) subclass this and implement
    both ``__call__`` (execute the operation) and ``_build_prompt`` (compose
    the LLM instruction the operation sends).

    Design Principles:
    - Single Responsibility: each operator does exactly one thing
    - Open/Closed: extend via inheritance rather than modification
    - Liskov Substitution: any operator works where the base is expected
    - Interface Segregation: the required surface is minimal
    - Dependency Inversion: operators depend on an abstract LLM callable
    """

    @abstractmethod
    def __call__(self, *args, **kwargs) -> str:
        """Run the genetic operation and return the newly generated prompt."""
        ...

    @abstractmethod
    def _build_prompt(self, *args, **kwargs) -> str:
        """Compose and return the LLM instruction for this operation."""
        ...
49
+
50
+
51
class BaseCrossoverOperator(BaseGeneticOperator):
    """
    Abstract base class for crossover operators.

    A crossover operator blends two or more parent prompts into offspring
    that inherit desirable traits from each parent.
    """

    @abstractmethod
    def __call__(
        self,
        parents: List,  # List[PromptCandidate]
        target_fitness: float,
        llm: Callable[[str], str]
    ) -> str:
        """
        Combine parent prompts to create offspring.

        Args:
            parents: Parent PromptCandidate objects to recombine
            target_fitness: Fitness level the offspring should aim for
            llm: Language model callable used to perform the blend

        Returns:
            str: Offspring prompt
        """
        ...
78
+
79
+
80
class BaseMutationOperator(BaseGeneticOperator):
    """
    Abstract base class for mutation operators.

    A mutation operator derives a variation of a single parent prompt,
    letting the search explore new regions of the prompt space.
    """

    @abstractmethod
    def __call__(
        self,
        parent,  # PromptCandidate
        population: List,  # List[PromptCandidate]
        llm: Callable[[str], str]
    ) -> str:
        """
        Mutate a parent prompt to create a variation.

        Args:
            parent: Parent PromptCandidate to mutate
            population: Current population, used to steer diversity
            llm: Language model callable used to produce the variation

        Returns:
            str: Mutated prompt
        """
        ...
107
+
src/gepa_optimizer/operators/crossover.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Fitness-Guided Crossover Operator.
3
+
4
+ Adapts LLEGO's fitness-guided crossover for text prompts.
5
+ Based on: Decision Tree Induction Through LLMs via Semantically-Aware Evolution (ICLR 2025)
6
+ """
7
+
8
+ from typing import List, Callable, TYPE_CHECKING
9
+ import logging
10
+
11
+ from .base_operator import BaseCrossoverOperator
12
+
13
+ if TYPE_CHECKING:
14
+ from .models import PromptCandidate
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class FitnessGuidedCrossover(BaseCrossoverOperator):
    """
    Fitness-guided crossover for text prompts.

    Blends high-performing parent prompts into offspring that target a
    specific fitness level, relying on the LLM's semantic understanding.

    From the LLEGO paper:
    "Fitness-guided crossover exploits high-performing regions of the search space
    by combining parent trees targeting a desired fitness level f* = f_max + α(f_max - f_min)"

    Reference: https://github.com/nicolashuynh/LLEGO
    """

    def __init__(self, alpha: float = 0.1):
        """
        Initialize the crossover operator.

        Args:
            alpha: Fitness extrapolation parameter. Larger values push the
                target fitness further above the best parent. The LLEGO
                paper's default is 0.1 (10% above the best parent).
        """
        self.alpha = alpha
        logger.debug(f"FitnessGuidedCrossover initialized with α={alpha}")

    def __call__(
        self,
        parents: List["PromptCandidate"],
        target_fitness: float,
        llm: Callable[[str], str]
    ) -> str:
        """
        Combine parent prompts targeting a specific fitness.

        Args:
            parents: PromptCandidate objects to recombine (at least 2)
            target_fitness: Desired fitness for the offspring
            llm: Language model callable

        Returns:
            str: Offspring prompt produced by the LLM

        Raises:
            ValueError: If fewer than 2 parents are provided
        """
        if len(parents) < 2:
            raise ValueError("Crossover requires at least 2 parents")

        # Rank parents so the strongest candidate leads the LLM prompt.
        ranked = sorted(parents, key=lambda p: p.fitness, reverse=True)

        logger.debug(f"Crossover: {len(parents)} parents, target fitness={target_fitness:.3f}")

        # Delegate the actual blending to the LLM via the crossover instruction.
        return llm(self._build_prompt(ranked, target_fitness))

    def _build_prompt(
        self,
        parents: List["PromptCandidate"],
        target_fitness: float
    ) -> str:
        """
        Build the LLM instruction for the crossover operation.

        Args:
            parents: Parents sorted by fitness, best first
            target_fitness: Target fitness for the offspring

        Returns:
            str: Prompt for the LLM
        """
        # Cap each parent excerpt to keep the prompt short and avoid
        # triggering LLM safety filters on very long inputs.
        MAX_PARENT_LENGTH = 350

        # Describe only the top two parents.
        descriptions = []
        for rank, candidate in enumerate(parents[:2], start=1):
            excerpt = candidate.prompt[:MAX_PARENT_LENGTH]
            if len(candidate.prompt) > MAX_PARENT_LENGTH:
                excerpt += "..."
            descriptions.append(
                f"P{rank} (f={candidate.fitness:.2f}): {excerpt}\n"
            )

        return f"""Combine these prompts into ONE improved version (target fitness: {target_fitness:.2f}).

{' '.join(descriptions)}
Instructions:
1. Merge the best rules/principles from both parents
2. Organize logic clearly (e.g., "For X tasks: do Y", "If Z: then A")
3. Add structure to handle different cases systematically
4. Keep output format (Element: X, Description:, Reason:)
5. Max 600 chars

Output ONLY the combined prompt:"""
120
+