diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000000000000000000000000000000000..a6344aac8c09253b3b630fb776ae94478aa0275b --- /dev/null +++ b/.gitattributes @@ -0,0 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..b87cde4ef37e7285d3c0477b2b76c1909fb790b5 --- /dev/null +++ b/.gitignore @@ -0,0 +1,27 @@ +# Python +__pycache__/ +*.py[cod] 
+*$py.class +*.so +.Python + +# Virtual environments +venv/ +env/ +ENV/ + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# OS +.DS_Store +Thumbs.db + +# Build artifacts +*.egg-info/ +dist/ +build/ + diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..02e8e6352061c1c5eff24631d8dc314c56e599c8 --- /dev/null +++ b/README.md @@ -0,0 +1,44 @@ + + +--- +title: Universal Prompt Optimizer +emoji: ๐Ÿงฌ +colorFrom: blue +colorTo: cyan +sdk: gradio +sdk_version: 4.0.0 +app_file: app.py +pinned: false +license: mit +--- +# Universal Prompt Optimizer + +A powerful genetic evolutionary prompt optimization tool built with GEPA (Genetic Evolutionary Prompt Agent). Optimize your prompts using genetic algorithms with optional LLEGO crossover for faster convergence. + +## Features + +- ๐Ÿงฌ **Genetic Algorithm Optimization**: Evolve prompts through multiple iterations +- ๐ŸŽฏ **Multi-Model Support**: Works with OpenAI, Anthropic, Google, and custom models +- ๐Ÿ“Š **Real-time Metrics**: Track optimization progress and improvements +- ๐Ÿ–ผ๏ธ **Multi-modal Support**: Include images in your training examples +- โšก **LLEGO Crossover**: Advanced genetic operations for faster convergence + +## How to Use + +1. **Select Model**: Choose your target LLM (GPT-4, Claude, Gemini, or custom) +2. **Enter Seed Prompt**: Describe your task, constraints, and desired output format +3. **Add Training Examples**: Provide input/output pairs (images optional) +4. **Configure Optimization**: Set evolution rounds, batch size, and enable LLEGO +5. **Start Optimization**: Watch as the genetic algorithm evolves your prompt + +## API Keys + +API keys are stored in-session only and never logged. 
You can provide them in the UI or set them as environment variables: + +- `OPENAI_API_KEY` +- `ANTHROPIC_API_KEY` +- `GOOGLE_API_KEY` + +## License + +MIT License diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..68f431f56a38e1a4767609a3d601b84667596c92 --- /dev/null +++ b/app.py @@ -0,0 +1,1563 @@ +""" +๐Ÿš€ Universal Prompt Optimizer - Enhanced Production UI v8.0 +Principal Engineer Edition: Linear/Vercel-style Dark Mode with Premium UX +""" + +import sys +import os +# Add src directory to Python path for gepa_optimizer imports +sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'src')) + +import gradio as gr +import json +import base64 +import io +import os +import logging +import traceback +import html +import numpy as np +from PIL import Image as PILImage +from typing import List, Dict, Optional, Any, Tuple +import threading +from collections import deque + +# Optional import for URL image downloads +try: + import requests + REQUESTS_AVAILABLE = True +except ImportError: + REQUESTS_AVAILABLE = False + +# ========================================== +# 0. 
# LOGGING & BACKEND UTILS
# ==========================================
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

# Thread-safe, bounded store of candidate prompts produced while the
# optimizer runs. Only the four helpers below touch this state.
_candidates_lock = threading.Lock()
_candidates: deque = deque(maxlen=100)  # keep at most the last 100 candidates
_iteration_count = 0


def add_candidate_to_store(candidate: Dict[str, Any]) -> None:
    """Record one candidate prompt, tagged with the current iteration.

    Missing fields default to 'unknown'/'' exactly as before; 'index' is
    1-based and computed from the store size prior to insertion.
    """
    with _candidates_lock:
        record = {
            'iteration': _iteration_count,
            'source': candidate.get('source', 'unknown'),
            'prompt': candidate.get('prompt', ''),
            'timestamp': candidate.get('timestamp', ''),
            'index': len(_candidates) + 1,
        }
        _candidates.append(record)


def get_candidates_from_store() -> List[Dict[str, Any]]:
    """Return a snapshot (shallow copy) of all stored candidate records."""
    with _candidates_lock:
        return list(_candidates)


def clear_candidates_store() -> None:
    """Drop every stored candidate and reset the iteration counter to 0."""
    global _iteration_count
    with _candidates_lock:
        _candidates.clear()
        _iteration_count = 0


def increment_iteration() -> None:
    """Advance the iteration counter stamped onto newly added candidates."""
    global _iteration_count
    with _candidates_lock:
        _iteration_count += 1

# ==========================================
# 1.
# MOCK BACKEND (Kept as provided)
# ==========================================
try:
    from gepa_optimizer import quick_optimize_sync, OptimizedResult
    BACKEND_AVAILABLE = True
except ImportError:
    BACKEND_AVAILABLE = False
    from dataclasses import dataclass

    @dataclass
    class OptimizedResult:
        # Mirrors the real backend's result object so the UI code is agnostic.
        optimized_prompt: str
        improvement_metrics: dict
        iteration_history: list

    def quick_optimize_sync(seed_prompt, dataset, model, **kwargs):
        """Mock optimizer used when gepa_optimizer is not installed.

        Simulates work time proportional to `max_iterations` and returns a
        plausible-looking OptimizedResult for UI development/demo purposes.
        """
        import time
        iterations = kwargs.get('max_iterations', 5)
        batch_size = kwargs.get('batch_size', 4)
        use_llego = kwargs.get('use_llego', True)

        # Simulate processing time based on iterations
        time.sleep(0.5 * iterations)

        llego_note = "with LLEGO crossover" if use_llego else "standard mutation only"

        return OptimizedResult(
            optimized_prompt=f"""# OPTIMIZED PROMPT FOR {model}
# ----------------------------------------
# Optimization: {iterations} iterations, batch size {batch_size}, {llego_note}

## Task Context
{seed_prompt}

## Refined Instructions
1. Analyse the input constraints strictly.
2. Verify output format against expected schema.
3. Apply chain-of-thought reasoning before answering.
4. Cross-reference with provided examples for consistency.

## Safety & Edge Cases
- If input is ambiguous, ask for clarification.
- Maintain a professional, neutral tone.
- Handle edge cases gracefully with informative responses.""",
            improvement_metrics={
                "baseline_score": 0.45,
                "final_score": 0.92,
                "improvement": "+104.4%",
                "iterations_run": iterations,
                "candidates_evaluated": iterations * batch_size,
            },
            iteration_history=[
                f"Iter 1: Baseline evaluation - Score: 0.45",
                f"Iter 2: Added Chain-of-Thought constraints - Score: 0.62",
                f"Iter 3: Refined output formatting rules - Score: 0.78",
                f"Iter 4: {'LLEGO crossover applied' if use_llego else 'Mutation applied'} - Score: 0.88",
                f"Iter 5: Final refinement - Score: 0.92",
            ][:iterations],
        )

# ==========================================
# 2. HELPER FUNCTIONS
# ==========================================
def gradio_image_to_base64(image_input) -> Optional[str]:
    """Convert a Gradio image input to a base64 PNG data URI.

    Accepts a numpy array, a PIL Image, or a file path. Returns the
    "data:image/png;base64,..." string, or None on any failure (errors are
    logged, never raised — images are optional in this app).
    """
    if image_input is None:
        return None

    try:
        pil_image = None

        if isinstance(image_input, np.ndarray):
            try:
                # Guard against degenerate arrays before handing to PIL.
                if image_input.size == 0:
                    logger.warning("Empty image array provided")
                    return None
                pil_image = PILImage.fromarray(image_input)
            except (ValueError, TypeError) as e:
                logger.error(f"Failed to convert numpy array to PIL Image: {str(e)}")
                return None
        elif isinstance(image_input, PILImage.Image):
            pil_image = image_input
        elif isinstance(image_input, str):
            if not os.path.exists(image_input):
                logger.warning(f"Image file not found: {image_input}")
                return None
            try:
                pil_image = PILImage.open(image_input)
            except (IOError, OSError) as e:
                logger.error(f"Failed to open image file: {str(e)}")
                return None
        else:
            logger.warning(f"Unsupported image input type: {type(image_input)}")
            return None

        if pil_image is None:
            return None

        # BUG FIX: the previous code called pil_image.verify() and then tried
        # PILImage.open(io.BytesIO(pil_image.tobytes())). tobytes() returns
        # raw pixel data with no file header, so open() always failed — and
        # verify() had already invalidated file-backed images. Forcing a full
        # decode with load() performs the intended validation safely.
        try:
            pil_image.load()
        except Exception:
            # Best effort: let save() below surface any real encoding errors.
            pass

        try:
            buffered = io.BytesIO()
            pil_image.save(buffered, format="PNG")
            img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
            return f"data:image/png;base64,{img_str}"
        except (IOError, OSError, ValueError) as e:
            logger.error(f"Failed to encode image to base64: {str(e)}")
            return None
    except Exception as e:
        logger.error(f"Unexpected error in image conversion: {str(e)}\n{traceback.format_exc()}")
        return None

def validate_dataset(dataset: List[Dict]) -> Tuple[bool, str]:
    """Validate dataset structure and content with detailed error messages.

    Returns (True, "") on success, or (False, reason) naming the first
    offending item (1-based) otherwise.
    """
    if not isinstance(dataset, list):
        return False, "Dataset must be a list of examples."

    if len(dataset) == 0:
        return False, "Dataset is empty. Add at least one example."

    # Validate each item in the dataset
    for i, item in enumerate(dataset):
        if not isinstance(item, dict):
            return False, f"Dataset item {i+1} must be a dictionary with 'input' and 'output' keys."

        if "input" not in item or "output" not in item:
            return False, f"Dataset item {i+1} is missing required 'input' or 'output' field."

        if not isinstance(item.get("input"), str) or not isinstance(item.get("output"), str):
            return False, f"Dataset item {i+1} has invalid 'input' or 'output' type (must be strings)."

        if not item.get("input", "").strip() or not item.get("output", "").strip():
            return False, f"Dataset item {i+1} has empty 'input' or 'output' field."

    return True, ""

def validate_model(model: str, custom_model: str) -> Tuple[bool, str]:
    """Validate model selection and custom model format.

    When model == "custom", `custom_model` must be a non-empty
    'provider/model_name' string with both halves non-blank.
    Returns (True, "") or (False, reason).
    """
    if not model:
        return False, "Please select a foundation model."

    if model == "custom":
        if not custom_model or not custom_model.strip():
            return False, "Custom model selected but no model ID provided."

        # Validate custom model format (provider/model_name)
        parts = custom_model.strip().split("/")
        if len(parts) != 2:
            return False, "Custom model ID must be in format 'provider/model_name' (e.g., 'openai/gpt-4')."

        if not parts[0].strip() or not parts[1].strip():
            return False, "Custom model ID provider and model name cannot be empty."

    return True, ""
# Canonical provider -> environment-variable mapping. Shared by key
# validation (validate_api_keys) and key injection (safe_optimize) so the
# two lists can never drift apart (they were previously duplicated).
_PROVIDER_ENV_VARS = {
    "openai": "OPENAI_API_KEY",
    "anthropic": "ANTHROPIC_API_KEY",
    "google": "GOOGLE_API_KEY",
}

def validate_api_keys(model: str, api_keys: Dict[str, str]) -> Tuple[bool, str]:
    """Validate that required API keys are provided for the selected model.

    An empty api_keys dict means "rely on the environment" and passes.
    Otherwise, a model whose provider prefix is known must have a non-blank
    key either in api_keys or in the corresponding environment variable.
    Returns (True, "") or (False, reason).
    """
    if not api_keys:
        return True, ""  # Keys are optional if already set in environment

    # Provider is the part before '/' (e.g. "openai/gpt-4"), else the
    # lowercased model name itself.
    model_provider = model.split("/")[0] if "/" in model else model.lower()

    if model_provider in _PROVIDER_ENV_VARS:
        provider = model_provider
        key_value = api_keys.get(provider, "").strip() if api_keys.get(provider) else ""

        # Check environment variable as fallback
        if not key_value and not os.environ.get(_PROVIDER_ENV_VARS[provider]):
            return False, f"API key for {provider.capitalize()} is required for model '{model}' but not provided."

    return True, ""

def safe_optimize(seed_prompt, dataset, model, custom_model="", max_iterations=5, max_metric_calls=50, batch_size=4, use_llego=True, api_keys=None):
    """Safely run optimization with comprehensive error handling.

    Validates every input, injects API keys into the environment, then runs
    quick_optimize_sync. Never raises: always returns a
    (success: bool, message: str, result: OptimizedResult | None) triple.
    """
    try:
        # Validate seed prompt
        if not seed_prompt or not isinstance(seed_prompt, str):
            return False, "Seed prompt is required and must be a string.", None
        if not seed_prompt.strip():
            return False, "Seed prompt cannot be empty.", None

        # Validate dataset
        is_valid, msg = validate_dataset(dataset)
        if not is_valid:
            return False, msg, None

        # A non-blank custom model overrides the dropdown selection.
        final_model = custom_model.strip() if custom_model and custom_model.strip() else model

        model_valid, model_msg = validate_model(model, custom_model)
        if not model_valid:
            return False, model_msg, None

        api_valid, api_msg = validate_api_keys(final_model, api_keys or {})
        if not api_valid:
            return False, api_msg, None

        # Validate optimization parameters (hard UI-enforced bounds).
        if not isinstance(max_iterations, int) or max_iterations < 1 or max_iterations > 50:
            return False, "Max iterations must be between 1 and 50.", None
        if not isinstance(max_metric_calls, int) or max_metric_calls < 10 or max_metric_calls > 500:
            return False, "Max metric calls must be between 10 and 500.", None
        if not isinstance(batch_size, int) or batch_size < 1 or batch_size > 20:
            return False, "Batch size must be between 1 and 20.", None

        if not BACKEND_AVAILABLE:
            logger.warning("Backend not available, using mock optimizer")

        # Export UI-provided keys so downstream SDKs can pick them up.
        if api_keys:
            try:
                for provider, env_var in _PROVIDER_ENV_VARS.items():
                    if api_keys.get(provider) and api_keys[provider].strip():
                        os.environ[env_var] = api_keys[provider].strip()
                        logger.info(f"Set {provider} API key from UI")
            except Exception as e:
                logger.error(f"Failed to set API keys: {str(e)}")
                return False, f"Failed to configure API keys: {str(e)}", None

        # Run optimization
        try:
            result = quick_optimize_sync(
                seed_prompt=seed_prompt,
                dataset=dataset,
                model=final_model,
                max_iterations=max_iterations,
                max_metric_calls=max_metric_calls,
                batch_size=batch_size,
                use_llego=use_llego,
                verbose=True,
            )

            if not result:
                return False, "Optimization returned no result.", None
            if not hasattr(result, 'optimized_prompt'):
                return False, "Optimization result is missing required fields.", None
            return True, "Success", result

        except KeyboardInterrupt:
            logger.warning("Optimization interrupted by user")
            return False, "Optimization was interrupted.", None
        except TimeoutError:
            logger.error("Optimization timed out")
            return False, "Optimization timed out. Try reducing max_iterations or max_metric_calls.", None
        except ConnectionError as e:
            logger.error(f"Connection error during optimization: {str(e)}")
            return False, f"Connection error: {str(e)}. Check your internet connection and API keys.", None
        except ValueError as e:
            logger.error(f"Invalid parameter in optimization: {str(e)}")
            return False, f"Invalid configuration: {str(e)}", None
        except Exception as e:
            error_msg = str(e)
            logger.error(f"Optimization failed: {error_msg}\n{traceback.format_exc()}")
            # Provide user-friendly error messages for common API failures.
            if "api" in error_msg.lower() or "key" in error_msg.lower():
                return False, f"API error: {error_msg}. Please check your API keys.", None
            elif "rate limit" in error_msg.lower():
                return False, "Rate limit exceeded. Please wait a moment and try again.", None
            elif "quota" in error_msg.lower():
                return False, "API quota exceeded. Please check your account limits.", None
            else:
                return False, f"Optimization failed: {error_msg}", None

    except Exception as e:
        logger.error(f"Unexpected error in safe_optimize: {str(e)}\n{traceback.format_exc()}")
        return False, f"Unexpected error: {str(e)}", None

# ==========================================
# 3. UI LOGIC
# ==========================================
def add_example(input_text, output_text, image_input, current_dataset):
    """Add one input/output example (optional image) to the dataset state.

    Raises gr.Error with a user-facing message on invalid input; on success
    returns (updated_dataset, "", "", None) so the UI fields are cleared.
    """
    try:
        if not input_text:
            raise gr.Error("Input text is required.")
        if not output_text:
            raise gr.Error("Output text is required.")
        if not isinstance(input_text, str) or not isinstance(output_text, str):
            raise gr.Error("Input and Output must be text strings.")

        input_text = input_text.strip()
        output_text = output_text.strip()

        if not input_text:
            raise gr.Error("Input text cannot be empty.")
        if not output_text:
            raise gr.Error("Output text cannot be empty.")

        if not isinstance(current_dataset, list):
            raise gr.Error("Dataset state is invalid. Please refresh the page.")

        # The image is optional — a conversion failure must not block the add.
        img_b64 = None
        try:
            img_b64 = gradio_image_to_base64(image_input)
        except Exception as e:
            logger.warning(f"Image processing failed, continuing without image: {str(e)}")

        try:
            new_item = {
                "input": input_text,
                "output": output_text,
                "image": img_b64,
                "image_preview": "๐Ÿ–ผ๏ธ Image" if img_b64 else "-"
            }

            if not isinstance(new_item["input"], str) or not isinstance(new_item["output"], str):
                raise gr.Error("Failed to create dataset item: invalid data types.")

            current_dataset.append(new_item)
            return current_dataset, "", "", None

        except Exception as e:
            logger.error(f"Failed to add example to dataset: {str(e)}")
            raise gr.Error(f"Failed to add example: {str(e)}")

    except gr.Error:
        # Re-raise Gradio errors as-is so the UI shows the intended message.
        raise
    except Exception as e:
        logger.error(f"Unexpected error in add_example: {str(e)}\n{traceback.format_exc()}")
        raise gr.Error(f"Unexpected error: {str(e)}")

def update_table(dataset):
    """Render the dataset as table rows: [index, input[:50], output[:50], image].

    Malformed items are skipped with a warning; any failure yields [] rather
    than raising, so the table component never crashes the UI.
    """
    try:
        if not dataset:
            return []
        if not isinstance(dataset, list):
            logger.error(f"Invalid dataset type: {type(dataset)}")
            return []

        table_data = []
        for i, item in enumerate(dataset):
            try:
                if not isinstance(item, dict):
                    logger.warning(f"Skipping invalid dataset item {i+1}: not a dictionary")
                    continue

                input_text = str(item.get("input", ""))[:50] if item.get("input") else ""
                output_text = str(item.get("output", ""))[:50] if item.get("output") else ""
                image_preview = str(item.get("image_preview", "-"))

                table_data.append([i+1, input_text, output_text, image_preview])
            except Exception as e:
                logger.warning(f"Error processing dataset item {i+1}: {str(e)}")
                continue

        return table_data

    except Exception as e:
        logger.error(f"Error updating table: {str(e)}\n{traceback.format_exc()}")
        return []
{str(e)}\n{traceback.format_exc()}") + return [] + +def clear_dataset(): + """Clear the dataset with error handling.""" + try: + return [], [] + except Exception as e: + logger.error(f"Error clearing dataset: {str(e)}") + return [], [] + +def get_candidates_display(): + """Generate HTML display for candidates with error handling.""" + try: + candidates = get_candidates_from_store() + + if not candidates: + return "
๐Ÿงฌ

Waiting for optimization to start...

" + + if not isinstance(candidates, list): + logger.error(f"Invalid candidates type: {type(candidates)}") + return "
Error loading candidates.
" + + html_output = "
" + + # Show last 10 candidates + candidates_to_show = list(candidates)[-10:] + for c in reversed(candidates_to_show): + try: + if not isinstance(c, dict): + continue + + iteration = str(c.get('iteration', '?')) + source = str(c.get('source', 'unknown')).upper() + prompt = str(c.get('prompt', ''))[:200] + + # Escape HTML to prevent XSS + iteration = html.escape(iteration) + source = html.escape(source) + prompt = html.escape(prompt) + + html_output += f""" +
+
+
+ ITERATION {iteration} + {source} +
+
{prompt}...
+
+ """ + except Exception as e: + logger.warning(f"Error rendering candidate: {str(e)}") + continue + + html_output += "
" + return html_output + + except Exception as e: + logger.error(f"Error generating candidates display: {str(e)}\n{traceback.format_exc()}") + return "
Error loading candidates display.
" + +def run_optimization_flow(seed, dataset, model, custom_model, iter_count, call_count, batch, llego, k_openai, k_google, k_anthropic, progress=gr.Progress()): + """Run the optimization flow with comprehensive error handling.""" + import time + + try: + # Validate inputs + if not seed: + raise gr.Error("Seed prompt is required.") + + if not dataset: + raise gr.Error("Dataset is required. Add at least one example.") + + if not model: + raise gr.Error("Model selection is required.") + + # Validate numeric parameters + try: + iter_count = int(iter_count) if iter_count else 5 + call_count = int(call_count) if call_count else 50 + batch = int(batch) if batch else 4 + except (ValueError, TypeError) as e: + raise gr.Error(f"Invalid optimization parameters: {str(e)}") + + # Determine final model + try: + final_model = custom_model.strip() if custom_model and custom_model.strip() else model + except Exception as e: + logger.warning(f"Error processing custom model: {str(e)}") + final_model = model + + # Clear candidates store + try: + clear_candidates_store() + except Exception as e: + logger.warning(f"Error clearing candidates store: {str(e)}") + + # Prepare API keys + api_keys = {} + try: + api_keys = { + "openai": k_openai if k_openai else "", + "google": k_google if k_google else "", + "anthropic": k_anthropic if k_anthropic else "" + } + except Exception as e: + logger.warning(f"Error processing API keys: {str(e)}") + + # Initial state + try: + yield ( + gr.update(visible=True), + gr.update(visible=False), + gr.update(visible=False), + "๐Ÿš€ Initializing Genetic Algorithm...", + "", {}, "", "" + ) + time.sleep(0.5) # Brief pause for UI update + except Exception as e: + logger.error(f"Error in initial UI update: {str(e)}") + raise gr.Error(f"Failed to initialize UI: {str(e)}") + + # Evolution loop (visual progress - actual work happens in safe_optimize) + try: + for i in range(1, iter_count + 1): + try: + increment_iteration() + add_candidate_to_store({ + "source": 
"evolution_step", + "prompt": f"Candidate {i}: Optimizing instruction clarity and task alignment...", + "timestamp": "now" + }) + + progress(i/iter_count, desc=f"Evolution Round {i}/{iter_count}") + yield ( + gr.update(), gr.update(), gr.update(), + f"๐Ÿงฌ **Evolution Round {i}/{iter_count}**\n\nโ€ข Generating {batch} prompt mutations\nโ€ข Evaluating fitness scores\nโ€ข Selecting top candidates", + "", {}, "", get_candidates_display() + ) + time.sleep(0.3) # Pause to show progress + except Exception as e: + logger.warning(f"Error in evolution step {i}: {str(e)}") + # Continue with next iteration + continue + except Exception as e: + logger.error(f"Error in evolution loop: {str(e)}") + # Continue to optimization attempt + + # Final optimization + try: + success, msg, result = safe_optimize( + seed_prompt=seed, + dataset=dataset, + model=model, + custom_model=custom_model, + max_iterations=iter_count, + max_metric_calls=call_count, + batch_size=batch, + use_llego=llego, + api_keys=api_keys + ) + + if not success: + # Show error state + yield ( + gr.update(visible=True), + gr.update(visible=False), + gr.update(visible=False), + f"โŒ **Optimization Failed**\n\n{msg}", + "", {}, "", get_candidates_display() + ) + raise gr.Error(msg) + + # Validate result before displaying + if not result: + raise gr.Error("Optimization completed but returned no result.") + + if not hasattr(result, 'optimized_prompt'): + raise gr.Error("Optimization result is missing required fields.") + + # Show results + try: + optimized_prompt = result.optimized_prompt if result.optimized_prompt else "" + improvement_metrics = result.improvement_metrics if hasattr(result, 'improvement_metrics') else {} + iteration_history = result.iteration_history if hasattr(result, 'iteration_history') else [] + + history_text = "\n".join(iteration_history) if isinstance(iteration_history, list) else str(iteration_history) + + yield ( + gr.update(visible=False), + gr.update(visible=False), + 
gr.update(visible=True), + "โœ… Optimization Complete", + optimized_prompt, + improvement_metrics, + history_text, + get_candidates_display() + ) + except Exception as e: + logger.error(f"Error displaying results: {str(e)}") + raise gr.Error(f"Failed to display results: {str(e)}") + + except gr.Error: + # Re-raise Gradio errors + raise + except Exception as e: + logger.error(f"Error in optimization: {str(e)}\n{traceback.format_exc()}") + raise gr.Error(f"Optimization error: {str(e)}") + + except gr.Error: + # Re-raise Gradio errors as-is + raise + except KeyboardInterrupt: + logger.warning("Optimization interrupted by user") + raise gr.Error("Optimization was interrupted.") + except Exception as e: + logger.error(f"Unexpected error in optimization flow: {str(e)}\n{traceback.format_exc()}") + raise gr.Error(f"Unexpected error: {str(e)}") + +# ========================================== +# 4. ENHANCED CSS (Linear/Vercel-style) +# ========================================== +CUSTOM_CSS = """ +@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&family=JetBrains+Mono:wght@400;500;600&display=swap'); + +:root { + --bg0: #070A0F; + --bg1: #0B1020; + --bg2: rgba(255,255,255,0.04); + --bg3: rgba(255,255,255,0.06); + + --stroke0: rgba(148,163,184,0.14); + --stroke1: rgba(148,163,184,0.22); + + --text0: #EAF0FF; + --text1: rgba(234,240,255,0.74); + --text2: rgba(234,240,255,0.56); + + --teal: #06B6D4; + --blue: #3B82F6; + + --ok: #10B981; + --okGlow: rgba(16,185,129,0.18); + + --bad: #EF4444; + + --shadow: 0 12px 40px rgba(0,0,0,0.45); + --shadowSoft: 0 10px 24px rgba(0,0,0,0.32); + + --radius: 14px; + --radiusSm: 10px; +} + +html, body { + background: radial-gradient(1200px 700px at 20% -10%, rgba(6,182,212,0.13), transparent 55%), + radial-gradient(1000px 650px at 90% 0%, rgba(59,130,246,0.10), transparent 60%), + linear-gradient(180deg, var(--bg0) 0%, var(--bg1) 100%); + color: var(--text0); + font-family: Inter, system-ui, -apple-system, 
Segoe UI, Roboto, sans-serif; +} + +.gradio-container { + max-width: 1520px !important; + padding: 12px 18px !important; + margin: 0 auto !important; +} + +/* --- App shell --- */ +.app-shell { min-height: auto !important; } +.topbar { + padding: 12px 14px 12px 14px; + margin-bottom: 4px; + border: 1px solid var(--stroke0); + border-radius: var(--radius); + background: linear-gradient(180deg, rgba(255,255,255,0.04) 0%, rgba(255,255,255,0.02) 100%); + box-shadow: var(--shadowSoft); +} +.topbar-wrap { margin-bottom: 0 !important; } + +.brand-row { display: flex; align-items: center; justify-content: space-between; gap: 16px; } +.brand-left { display: flex; align-items: center; gap: 14px; } +.brand-mark { + width: 44px; height: 44px; border-radius: 12px; + background: linear-gradient(135deg, rgba(6,182,212,0.26), rgba(59,130,246,0.20)); + border: 1px solid rgba(6,182,212,0.30); + box-shadow: 0 0 0 4px rgba(6,182,212,0.10); + display: flex; align-items: center; justify-content: center; + font-weight: 800; +} +.h1 { + font-size: 22px; font-weight: 800; letter-spacing: -0.02em; + margin: 0; line-height: 1.2; +} +.subtitle { margin-top: 4px; color: var(--text1); font-weight: 500; font-size: 13px; } + +.status-pill { + display: inline-flex; align-items: center; gap: 10px; + padding: 10px 12px; border-radius: 999px; + background: rgba(255,255,255,0.03); + border: 1px solid var(--stroke0); + color: var(--text1); + font-size: 12px; font-weight: 700; letter-spacing: 0.08em; + text-transform: uppercase; +} +.dot { + width: 10px; height: 10px; border-radius: 999px; + background: var(--ok); + box-shadow: 0 0 16px rgba(16,185,129,0.40); + animation: pulse 1.8s ease-in-out infinite; +} +@keyframes pulse { 0%, 100% { transform: scale(1); opacity: 0.95; } 50% { transform: scale(1.18); opacity: 0.70; } } + +/* --- Two-column layout helpers --- */ +.left-col, .right-col { min-width: 280px; } + +/* --- Cards / Sections --- */ +.card { + border-radius: var(--radius); + background: 
linear-gradient(180deg, rgba(255,255,255,0.045) 0%, rgba(255,255,255,0.022) 100%); + border: 1px solid var(--stroke0); + box-shadow: var(--shadowSoft); + padding: 16px; +} +.card + .card { margin-top: 14px; } + +.card-head { + display: flex; align-items: center; justify-content: space-between; + gap: 12px; + padding-bottom: 12px; + margin-bottom: 12px; + border-bottom: 1px solid var(--stroke0); +} +.card-title { + display: flex; align-items: center; gap: 10px; + font-size: 13px; font-weight: 800; letter-spacing: 0.12em; + text-transform: uppercase; color: var(--text1); +} +.step { + width: 30px; height: 30px; border-radius: 10px; + background: linear-gradient(135deg, rgba(6,182,212,0.95), rgba(59,130,246,0.95)); + box-shadow: 0 10px 20px rgba(6,182,212,0.18); + display: flex; align-items: center; justify-content: center; + color: white; font-weight: 900; font-size: 13px; +} +.hint { color: var(--text2); font-size: 12px; line-height: 1.4; } + +.ds-count span { + display: inline-flex; + align-items: center; + padding: 7px 10px; + border-radius: 999px; + border: 1px solid var(--stroke0); + background: rgba(255,255,255,0.02); + color: var(--text1) !important; + font-weight: 700; + font-size: 12px; +} + +/* --- Inputs --- */ +label { color: var(--text1) !important; font-weight: 650 !important; font-size: 12px !important; } + +textarea, input, select { + background: rgba(255,255,255,0.03) !important; + border: 1px solid var(--stroke0) !important; + border-radius: 12px !important; + color: var(--text0) !important; + transition: border-color 0.15s ease, box-shadow 0.15s ease, transform 0.15s ease; +} + +textarea:focus, input:focus, select:focus { + outline: none !important; + border-color: rgba(6,182,212,0.55) !important; + box-shadow: 0 0 0 4px rgba(6,182,212,0.14) !important; +} + +.keybox input { font-family: "JetBrains Mono", ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace !important; } + +.seed textarea { min-height: 160px !important; } +.mono 
textarea { font-family: "JetBrains Mono", ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, monospace !important; font-size: 12.5px !important; } + +/* --- Buttons --- */ +.cta button { + width: 100% !important; + border: 0 !important; + border-radius: 14px !important; + padding: 14px 16px !important; + font-size: 13px !important; + font-weight: 900 !important; + letter-spacing: 0.12em !important; + text-transform: uppercase !important; + color: white !important; + background: linear-gradient(135deg, rgba(6,182,212,1) 0%, rgba(59,130,246,1) 100%) !important; + box-shadow: 0 18px 48px rgba(6,182,212,0.22) !important; + position: relative !important; + overflow: hidden !important; +} +.cta button::after { + content: ""; + position: absolute; inset: -120px; + background: radial-gradient(closest-side, rgba(255,255,255,0.18), transparent 60%); + transform: translateX(-40%); + transition: transform 0.45s ease; +} +.cta button:hover { transform: translateY(-1px); } +.cta button:hover::after { transform: translateX(40%); } +.cta button:active { transform: translateY(0px); } + +.btn-secondary button { + border-radius: 12px !important; + border: 1px solid var(--stroke1) !important; + background: rgba(255,255,255,0.03) !important; + color: var(--text0) !important; + font-weight: 800 !important; +} +.btn-secondary button:hover { border-color: rgba(6,182,212,0.55) !important; } + +.btn-danger button { + border-radius: 12px !important; + border: 1px solid rgba(239,68,68,0.55) !important; + background: rgba(239,68,68,0.06) !important; + color: rgba(255,170,170,1) !important; + font-weight: 900 !important; +} + +/* --- Dataframe --- */ +.dataframe { + border-radius: 14px !important; + border: 1px solid var(--stroke0) !important; + background: rgba(255,255,255,0.02) !important; + overflow: hidden !important; +} +.dataframe thead th { + background: rgba(255,255,255,0.04) !important; + color: var(--text1) !important; + font-weight: 900 !important; + font-size: 11px !important; + 
letter-spacing: 0.10em !important; + text-transform: uppercase !important; + border-bottom: 1px solid var(--stroke0) !important; +} +.dataframe tbody td { + color: var(--text0) !important; + font-size: 12px !important; + border-bottom: 1px solid rgba(148,163,184,0.10) !important; +} +.dataframe tbody tr:hover { background: rgba(255,255,255,0.03) !important; } + +/* --- Status / Results --- */ +.panel { + border-radius: var(--radius); + border: 1px solid var(--stroke0); + background: linear-gradient(180deg, rgba(255,255,255,0.045), rgba(255,255,255,0.020)); + box-shadow: var(--shadowSoft); + padding: 16px; +} +.panel-title { + display: flex; align-items: center; justify-content: space-between; + gap: 10px; + padding-bottom: 12px; margin-bottom: 12px; + border-bottom: 1px solid var(--stroke0); +} +.panel-title h3 { margin: 0; font-size: 13px; letter-spacing: 0.12em; text-transform: uppercase; color: var(--text1); } +.running-pill { + display: inline-flex; align-items: center; gap: 10px; + padding: 8px 10px; border-radius: 999px; + border: 1px solid rgba(6,182,212,0.38); + background: rgba(6,182,212,0.08); + color: rgba(153,246,228,0.95); + font-weight: 900; font-size: 11px; letter-spacing: 0.10em; text-transform: uppercase; +} +.running-dot { width: 9px; height: 9px; border-radius: 99px; background: var(--teal); box-shadow: 0 0 18px rgba(6,182,212,0.45); animation: pulse 1.8s ease-in-out infinite; } + +.empty { + border-radius: var(--radius); + border: 1px dashed rgba(148,163,184,0.26); + background: rgba(255,255,255,0.02); + padding: 28px; + text-align: center; + color: var(--text2); +} +.empty .big { font-size: 40px; opacity: 0.22; margin-bottom: 10px; } +.empty .t { color: var(--text1); font-weight: 800; margin-bottom: 6px; } +.empty .s { font-size: 12px; } + +.results { + border-radius: var(--radius); + border: 1px solid rgba(16,185,129,0.55); + background: linear-gradient(180deg, rgba(16,185,129,0.12), rgba(255,255,255,0.02)); + box-shadow: 0 0 0 4px 
rgba(16,185,129,0.10), 0 20px 60px rgba(0,0,0,0.42); + padding: 16px; +} +.results-banner { + display: flex; align-items: center; justify-content: space-between; + gap: 12px; + padding-bottom: 12px; margin-bottom: 12px; + border-bottom: 1px solid rgba(16,185,129,0.28); +} +.results-banner .k { display: flex; align-items: center; gap: 10px; } +.results-banner .k .icon { + width: 36px; height: 36px; border-radius: 12px; + background: rgba(16,185,129,0.18); + border: 1px solid rgba(16,185,129,0.45); + display: flex; align-items: center; justify-content: center; +} +.results-banner .k .title { font-weight: 900; color: rgba(189,255,225,0.98); letter-spacing: 0.06em; text-transform: uppercase; font-size: 12px; } +.results-banner .k .sub { margin-top: 2px; color: rgba(189,255,225,0.70); font-size: 12px; } + +.tabs { background: transparent !important; } +.tab-nav button { + background: transparent !important; + border: 0 !important; + border-bottom: 2px solid transparent !important; + color: var(--text2) !important; + font-weight: 800 !important; + padding: 10px 12px !important; +} +.tab-nav button[aria-selected="true"] { + color: rgba(153,246,228,0.98) !important; + border-bottom-color: rgba(6,182,212,0.75) !important; +} +.tab-nav button:hover { color: var(--text0) !important; } + +.small-note { color: var(--text2); font-size: 12px; } + +/* --- Candidates stream --- */ +.cand-empty { padding: 28px; text-align: center; color: var(--text2); } +.cand-empty-icon { font-size: 40px; opacity: 0.25; margin-bottom: 10px; } +.cand-empty-title { color: var(--text1); font-weight: 900; margin-bottom: 4px; } +.cand-empty-sub { font-size: 12px; } + +.cand-stream { display: flex; flex-direction: column; gap: 10px; } +.cand-card { + border-radius: 14px; + border: 1px solid rgba(148,163,184,0.18); + background: linear-gradient(135deg, rgba(15,23,42,0.85), rgba(2,6,23,0.45)); + overflow: hidden; +} +.cand-topbar { height: 2px; background: linear-gradient(90deg, var(--teal), var(--blue)); 
} +.cand-header { + display: flex; align-items: center; justify-content: space-between; + gap: 10px; + padding: 10px 12px 0 12px; +} +.cand-iter { font-family: "JetBrains Mono", ui-monospace; font-size: 11px; color: rgba(153,246,228,0.92); font-weight: 800; letter-spacing: 0.08em; } +.cand-pill { + font-size: 10px; font-weight: 900; letter-spacing: 0.10em; + padding: 5px 8px; border-radius: 999px; + border: 1px solid rgba(148,163,184,0.20); + background: rgba(255,255,255,0.03); + color: var(--text2); +} +.cand-body { + padding: 10px 12px 12px 12px; + font-family: "JetBrains Mono", ui-monospace; + font-size: 12px; + line-height: 1.6; + color: rgba(234,240,255,0.75); +} + +/* --- Responsive --- */ +@media (max-width: 980px) { + .gradio-container { padding: 16px 12px !important; } + .brand-row { flex-direction: column; align-items: flex-start; } + .status-pill { align-self: stretch; justify-content: center; } +} +""" + +FORCE_DARK_JS = """ +function forceDarkTheme() { + try { + const url = new URL(window.location.href); + if (url.searchParams.get("__theme") !== "dark") { + url.searchParams.set("__theme", "dark"); + window.location.replace(url.toString()); + } + } catch (e) { + // no-op + } +} +forceDarkTheme(); +""" + +# ========================================== +# 5. UI CONSTRUCTION (Redesigned) +# ========================================== +APP_TITLE = "Universal Prompt Optimizer" +APP_SUBTITLE = "Genetic Evolutionary Prompt Agent (GEPA)" +STATUS_READY = "System Ready" + +with gr.Blocks( + title="Universal Prompt Optimizer", + theme=gr.themes.Base() +) as app: + dataset_state = gr.State([]) + + # TOP BAR + gr.HTML( + f""" +
+
+
+
GE
+
+
{APP_TITLE}
+
{APP_SUBTITLE}
+
+
+
{STATUS_READY}
+
+
+ """, + elem_classes=["topbar-wrap"] + ) + + # MAIN LAYOUT + with gr.Row(): + + # LEFT COLUMN: Configuration + with gr.Column(scale=5): + + # Step 1 + with gr.Group(elem_classes=["card"]): + gr.HTML( + """ +
+
1
Model & Credentials
+
Select a target model, then provide keys (stored in-session only).
+
+ """ + ) + + with gr.Row(): + model_select = gr.Dropdown( + label="Foundation Model", + choices=[ + "openai/gpt-4o", + "openai/gpt-4-turbo", + "anthropic/claude-3-5-sonnet", + "google/gemini-1.5-pro", + "custom" + ], + value="openai/gpt-4o", + scale=2 + ) + custom_model_input = gr.Textbox( + label="Custom Model ID", + placeholder="provider/model_name", + scale=1 + ) + + gr.HTML('
API Access Keys
') + gr.Markdown("*Keys are stored in-session only and never logged*", elem_classes=["text-xs"]) + + with gr.Row(): + key_openai = gr.Textbox( + label="OpenAI API Key", + type="password", + placeholder="sk-...", + scale=1 + ) + key_google = gr.Textbox( + label="Google API Key", + type="password", + placeholder="AIza...", + scale=1 + ) + key_anthropic = gr.Textbox( + label="Anthropic API Key", + type="password", + placeholder="sk-ant...", + scale=1 + ) + + # Step 2 + with gr.Group(elem_classes=["card"]): + gr.HTML( + """ +
+
2
Seed Prompt
+
Describe the task, constraints, output format, and tone.
+
+ """ + ) + seed_input = gr.Textbox( + label="Task Description", + placeholder="Example: You are a code reviewer that identifies security vulnerabilities in Python code. Return a JSON report with severity and fixes...", + lines=7, + max_lines=14, + elem_classes=["seed", "mono"] + ) + + # Step 3 + with gr.Group(elem_classes=["card"]): + gr.HTML( + """ +
+
3
Training Examples
+
Add a few high-quality I/O pairs (images optional) to shape the optimizer.
+
+ """ + ) + + with gr.Tabs(): + with gr.Tab("Manual Entry"): + with gr.Row(): + with gr.Column(scale=2): + d_in = gr.Textbox( + label="Input / User Prompt", + placeholder="Example user input...", + lines=3 + ) + d_out = gr.Textbox( + label="Ideal Output", + placeholder="Expected AI response...", + lines=3 + ) + with gr.Column(scale=1): + d_img = gr.Image( + label="Attach Image (Optional)", + type="numpy", + height=170 + ) + + btn_add = gr.Button( + "Add Example", + elem_classes=["btn-secondary"] + ) + + with gr.Tab("Bulk Import (JSON)"): + gr.Markdown( + "Paste a JSON array like: `[{\"input\": \"...\", \"output\": \"...\"}]`", + elem_classes=["small-note"] + ) + bulk_json = gr.Textbox( + show_label=False, + placeholder='[{"input": "...", "output": "..."}]', + lines=6 + ) + btn_import = gr.Button( + "Import JSON", + elem_classes=["btn-secondary"] + ) + + with gr.Row(): + gr.HTML("
Current dataset
") + ds_count = gr.HTML( + "0 examples loaded", + elem_classes=["ds-count"] + ) + + ds_table = gr.Dataframe( + headers=["ID", "Input", "Output", "Media"], + datatype=["number", "str", "str", "str"], + row_count=6, + column_count=(4, "fixed"), + interactive=False + ) + + with gr.Row(): + btn_clear = gr.Button( + "Clear All", + elem_classes=["btn-danger"], + size="sm" + ) + + # Step 4 (Prominent, not buried) + with gr.Group(elem_classes=["card"]): + gr.HTML( + """ +
+
4
Optimization Controls
+
Tune evolution budget. Defaults are safe for quick runs.
+
+ """ + ) + + with gr.Row(): + slider_iter = gr.Slider( + minimum=1, + maximum=20, + value=5, + step=1, + label="Evolution Rounds", + info="Number of genetic iterations" + ) + slider_calls = gr.Slider( + minimum=10, + maximum=200, + value=50, + step=10, + label="Max LLM Calls", + info="Total API call budget" + ) + + with gr.Row(): + slider_batch = gr.Slider( + minimum=1, + maximum=10, + value=4, + step=1, + label="Batch Size", + info="Candidates per iteration" + ) + check_llego = gr.Checkbox( + value=True, + label="Enable LLEGO Crossover", + info="Use advanced genetic operations" + ) + + btn_optimize = gr.Button( + "Start Optimization", + elem_classes=["cta", "mt-6"] + ) + + # RIGHT: STATUS + RESULTS + with gr.Column(scale=5, elem_classes=["right-col"]): + # STATUS PANEL (Hidden by default) + status_panel = gr.Group(visible=False, elem_classes=["panel"]) + with status_panel: + gr.HTML( + """ +
+

Optimization status

+
Running
+
+ """ + ) + txt_status = gr.Markdown("Initializing genetic algorithm...") + + # EMPTY STATE + empty_state = gr.HTML( + """ +
+
๐Ÿงฌ
+
Ready to optimize
+
Fill Steps 1โ€“3, then click Start Optimization to begin prompt evolution.
+
+ """, + visible=True + ) + + # RESULTS PANEL (Hidden by default) + results_panel = gr.Group(visible=False, elem_classes=["results"]) + with results_panel: + gr.HTML( + """ +
+
+
โœ“
+
+
Optimization successful
+
Review the optimized prompt, metrics, and evolution traces.
+
+
+
+ """ + ) + + with gr.Tabs(): + with gr.Tab("Optimized Prompt"): + res_prompt = gr.Textbox( + label="Optimized Prompt", + lines=18, + max_lines=28, + interactive=False, + show_label=True, + elem_classes=["mono"] + ) + + with gr.Tab("Metrics & Log"): + res_metrics = gr.JSON(label="Performance Gains") + res_history = gr.TextArea( + label="Evolution Log", + interactive=False, + lines=10 + ) + + with gr.Tab("๐Ÿงฌ Live Candidates"): + gr.Markdown("Real-time stream of generated prompt candidates during optimization:") + live_candidates = gr.HTML() + btn_refresh_cand = gr.Button( + "๐Ÿ”„ Refresh Stream", + elem_classes=["secondary-btn"], + size="sm" + ) + + # ========================================== + # 6. EVENT HANDLERS + # ========================================== + + # Dataset Management + def update_dataset_count(dataset): + """Update dataset count display with error handling.""" + try: + if not isinstance(dataset, list): + return "0 examples loaded" + count = len(dataset) + return f"{count} example{'s' if count != 1 else ''} loaded" + except Exception as e: + logger.error(f"Error updating dataset count: {str(e)}") + return "Error" + + # Wrap event handlers with error handling + def safe_add_example(*args): + """Wrapper for add_example with error handling.""" + try: + return add_example(*args) + except gr.Error: + raise + except Exception as e: + logger.error(f"Unexpected error in add_example: {str(e)}") + raise gr.Error(f"Failed to add example: {str(e)}") + + def safe_update_table(dataset): + """Wrapper for update_table with error handling.""" + try: + return update_table(dataset) + except Exception as e: + logger.error(f"Error updating table: {str(e)}") + return [] + + def safe_clear_dataset(): + """Wrapper for clear_dataset with error handling.""" + try: + return clear_dataset() + except Exception as e: + logger.error(f"Error clearing dataset: {str(e)}") + return [], [] + + btn_add.click( + safe_add_example, + inputs=[d_in, d_out, d_img, dataset_state], + 
outputs=[dataset_state, d_in, d_out, d_img] + ).then( + safe_update_table, + inputs=[dataset_state], + outputs=[ds_table] + ).then( + update_dataset_count, + inputs=[dataset_state], + outputs=[ds_count] + ) + + btn_clear.click( + safe_clear_dataset, + outputs=[dataset_state, ds_table] + ).then( + lambda: "0 examples loaded", + outputs=[ds_count] + ) + + # Bulk Import + def import_bulk_json(json_text, current_dataset): + """Import examples from JSON with comprehensive error handling.""" + try: + # Validate inputs + if not json_text or not json_text.strip(): + raise gr.Error("JSON input is empty. Please provide a JSON array.") + + if not isinstance(current_dataset, list): + raise gr.Error("Dataset state is invalid. Please refresh the page.") + + # Parse JSON + try: + data = json.loads(json_text.strip()) + except json.JSONDecodeError as e: + raise gr.Error(f"Invalid JSON format: {str(e)}. Please check your JSON syntax.") + + # Validate structure + if not isinstance(data, list): + raise gr.Error("JSON must be an array of objects. Example: [{\"input\": \"...\", \"output\": \"...\"}]") + + if len(data) == 0: + raise gr.Error("JSON array is empty. 
Add at least one example object.") + + # Validate and import items + imported_count = 0 + errors = [] + + for i, item in enumerate(data): + try: + if not isinstance(item, dict): + errors.append(f"Item {i+1}: not a dictionary") + continue + + if "input" not in item or "output" not in item: + errors.append(f"Item {i+1}: missing 'input' or 'output' field") + continue + + input_val = item["input"] + output_val = item["output"] + + if not isinstance(input_val, str) or not isinstance(output_val, str): + errors.append(f"Item {i+1}: 'input' and 'output' must be strings") + continue + + if not input_val.strip() or not output_val.strip(): + errors.append(f"Item {i+1}: 'input' and 'output' cannot be empty") + continue + + # Add valid item + current_dataset.append({ + "input": input_val.strip(), + "output": output_val.strip(), + "image": item.get("image"), # Optional + "image_preview": "๐Ÿ–ผ๏ธ Image" if item.get("image") else "-" + }) + imported_count += 1 + + except Exception as e: + errors.append(f"Item {i+1}: {str(e)}") + logger.warning(f"Error importing item {i+1}: {str(e)}") + continue + + # Report results + if imported_count == 0: + error_msg = "No valid examples imported. " + if errors: + error_msg += "Errors: " + "; ".join(errors[:3]) + if len(errors) > 3: + error_msg += f" (and {len(errors) - 3} more)" + raise gr.Error(error_msg) + + if errors: + warning_msg = f"Imported {imported_count} example(s). " + if len(errors) <= 3: + warning_msg += f"Warnings: {'; '.join(errors)}" + else: + warning_msg += f"{len(errors)} items had errors." 
+ logger.warning(warning_msg) + + return current_dataset, "" + + except gr.Error: + # Re-raise Gradio errors + raise + except Exception as e: + logger.error(f"Unexpected error in import_bulk_json: {str(e)}\n{traceback.format_exc()}") + raise gr.Error(f"Failed to import JSON: {str(e)}") + + btn_import.click( + import_bulk_json, + inputs=[bulk_json, dataset_state], + outputs=[dataset_state, bulk_json] + ).then( + safe_update_table, + inputs=[dataset_state], + outputs=[ds_table] + ).then( + update_dataset_count, + inputs=[dataset_state], + outputs=[ds_count] + ) + + # Main Optimization Flow + btn_optimize.click( + run_optimization_flow, + inputs=[ + seed_input, dataset_state, model_select, custom_model_input, + slider_iter, slider_calls, slider_batch, check_llego, + key_openai, key_google, key_anthropic + ], + outputs=[ + status_panel, empty_state, results_panel, + txt_status, res_prompt, res_metrics, res_history, live_candidates + ] + ) + + # Refresh Candidates + def safe_get_candidates_display(): + """Wrapper for get_candidates_display with error handling.""" + try: + return get_candidates_display() + except Exception as e: + logger.error(f"Error refreshing candidates: {str(e)}") + return "
Error loading candidates.
" + + btn_refresh_cand.click( + safe_get_candidates_display, + outputs=[live_candidates] + ) + +# ========================================== +# 7. LAUNCH +# ========================================== +if __name__ == "__main__": + app.queue().launch( + server_name="0.0.0.0", + server_port=7860, + share=False, # Set to False for HF Spaces + show_error=True, + css=CUSTOM_CSS, + js=FORCE_DARK_JS + ) + \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..b415c2f7705cd0945b80f6960401d2591fe23cf0 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,23 @@ +# Core dependencies - gepa from git +git+https://github.com/gepa-ai/gepa.git +numpy>=1.21.0 +pandas>=1.5.0 +pydantic>=2.0.0 +python-dotenv>=1.0.0 + +# HTTP/API clients +requests>=2.31.0 +aiohttp>=3.8.0 +asyncio-throttle>=1.0.0 + +# LLM Provider SDKs +openai>=1.0.0 +anthropic>=0.18.0 +google-generativeai>=0.3.0 +google-genai>=0.2.0 + +# Image processing +Pillow>=9.0.0 + +# Gradio UI (version will be set by README.md sdk_version) +gradio>=4.0.0 \ No newline at end of file diff --git a/src/gepa_optimizer.egg-info/PKG-INFO b/src/gepa_optimizer.egg-info/PKG-INFO new file mode 100644 index 0000000000000000000000000000000000000000..272d9e1fc41f406056f6ddb09898b32ddd8a6037 --- /dev/null +++ b/src/gepa_optimizer.egg-info/PKG-INFO @@ -0,0 +1,439 @@ +Metadata-Version: 2.4 +Name: gepa-optimizer +Version: 0.1.0 +Summary: Universal prompt optimization framework based on GEPA +Home-page: https://github.com/suhasb-dev/Prompt-Optimizer +Author: Suhas +Author-email: Suhas +License: MIT +Project-URL: Homepage, https://github.com/suhasb-dev/Prompt-Optimizer +Project-URL: Repository, https://github.com/suhasb-dev/Prompt-Optimizer +Project-URL: Documentation, https://suhasb-dev.gitbook.io/gepa-universal-prompt-optimizer/ +Project-URL: Bug Reports, https://github.com/suhasb-dev/Prompt-Optimizer/issues +Keywords: 
prompt-optimization,llm,gepa,ai,machine-learning,ui-tree-extraction +Classifier: Development Status :: 3 - Alpha +Classifier: Intended Audience :: Developers +Classifier: Intended Audience :: Science/Research +Classifier: License :: OSI Approved :: MIT License +Classifier: Programming Language :: Python :: 3 +Classifier: Programming Language :: Python :: 3.8 +Classifier: Programming Language :: Python :: 3.9 +Classifier: Programming Language :: Python :: 3.10 +Classifier: Programming Language :: Python :: 3.11 +Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence +Classifier: Topic :: Software Development :: Libraries :: Python Modules +Requires-Python: >=3.8 +Description-Content-Type: text/markdown +License-File: LICENSE +Requires-Dist: gepa>=0.0.12 +Requires-Dist: pandas>=1.5.0 +Requires-Dist: pydantic>=2.0.0 +Requires-Dist: python-dotenv>=1.0.0 +Requires-Dist: requests>=2.31.0 +Requires-Dist: aiohttp>=3.8.0 +Requires-Dist: asyncio-throttle>=1.0.0 +Requires-Dist: google-generativeai>=0.3.0 +Requires-Dist: Pillow>=9.0.0 +Provides-Extra: dev +Requires-Dist: pytest>=7.0.0; extra == "dev" +Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev" +Requires-Dist: black>=23.0.0; extra == "dev" +Requires-Dist: flake8>=6.0.0; extra == "dev" +Requires-Dist: mypy>=1.0.0; extra == "dev" +Provides-Extra: docs +Requires-Dist: sphinx>=5.0.0; extra == "docs" +Requires-Dist: sphinx-rtd-theme>=1.2.0; extra == "docs" +Provides-Extra: all +Requires-Dist: pytest>=7.0.0; extra == "all" +Requires-Dist: pytest-asyncio>=0.21.0; extra == "all" +Requires-Dist: black>=23.0.0; extra == "all" +Requires-Dist: flake8>=6.0.0; extra == "all" +Requires-Dist: mypy>=1.0.0; extra == "all" +Requires-Dist: sphinx>=5.0.0; extra == "all" +Requires-Dist: sphinx-rtd-theme>=1.2.0; extra == "all" +Dynamic: author +Dynamic: home-page +Dynamic: license-file +Dynamic: requires-python + +# GEPA Optimizer + +[![PyPI 
version](https://badge.fury.io/py/gepa-optimizer.svg)](https://badge.fury.io/py/gepa-optimizer) +[![Python 3.8+](https://img.shields.io/badge/python-3.8+-blue.svg)](https://www.python.org/downloads/) +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) + +A universal prompt optimization framework built on [GEPA](https://arxiv.org/abs/2507.19457) with optional [LLEGO](https://arxiv.org/abs/2503.14217) genetic operators for accelerated convergence. + +## Overview + +GEPA Optimizer provides a modular architecture for optimizing prompts through reflective evolution. It requires custom evaluators and LLM clients, enabling domain-specific optimization for any use case. + +**Key capabilities:** +- Multi-modal support (text + vision models) +- Hybrid GEPA + LLEGO optimization modes +- Configurable train/val/test data splitting +- Batch API support for cost reduction +- Async-first architecture + +## Installation + +```bash +pip install gepa-optimizer +``` + +**From source:** +```bash +git clone https://github.com/suhasb-dev/Prompt-Optimizer.git +cd Prompt-Optimizer +pip install -e . 
+``` + +## Quick Start + +```python +import asyncio +from gepa_optimizer import ( + GepaOptimizer, + OptimizationConfig, + BaseEvaluator, + BaseLLMClient +) + +# Define custom evaluator +class MyEvaluator(BaseEvaluator): + def evaluate(self, predicted: str, expected: str) -> dict: + score = 1.0 if predicted.strip() == expected.strip() else 0.0 + return {"accuracy": score, "composite_score": score} + +# Define custom LLM client +class MyLLMClient(BaseLLMClient): + def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> dict: + # Your LLM integration here + return {"content": "response"} + +async def main(): + config = OptimizationConfig( + model="openai/gpt-4o", + reflection_model="openai/gpt-4o", + max_iterations=5, + max_metric_calls=50, + batch_size=8 + ) + + optimizer = GepaOptimizer( + config=config, + llm_client=MyLLMClient("openai", "gpt-4o"), + evaluator=MyEvaluator() + ) + + result = await optimizer.train( + seed_prompt="Your initial prompt", + dataset=your_dataset + ) + + print(f"Optimized: {result.prompt}") + print(f"Score: {result.improvement_data}") + +asyncio.run(main()) +``` + +## Project Structure + +``` +src/gepa_optimizer/ +โ”œโ”€โ”€ core/ # Core optimization logic +โ”‚ โ”œโ”€โ”€ optimizer.py # GepaOptimizer main class +โ”‚ โ”œโ”€โ”€ base_adapter.py # BaseGepaAdapter interface +โ”‚ โ””โ”€โ”€ universal_adapter.py +โ”œโ”€โ”€ evaluation/ # Evaluator implementations +โ”‚ โ”œโ”€โ”€ base_evaluator.py # BaseEvaluator abstract class +โ”‚ โ”œโ”€โ”€ scroll_evaluator.py +โ”‚ โ”œโ”€โ”€ validation_evaluator.py +โ”‚ โ””โ”€โ”€ index_caching_evaluator.py +โ”œโ”€โ”€ llms/ # LLM client implementations +โ”‚ โ”œโ”€โ”€ base_llm.py # BaseLLMClient abstract class +โ”‚ โ”œโ”€โ”€ vision_llm.py # VisionLLMClient (OpenAI, Google, Anthropic) +โ”‚ โ””โ”€โ”€ batch_llm.py # BatchLLMClient (50% cost savings) +โ”œโ”€โ”€ operators/ # LLEGO genetic operators +โ”‚ โ””โ”€โ”€ llego_operators.py # FitnessGuidedCrossover, DiversityGuidedMutation +โ”œโ”€โ”€ data/ # Dataset 
loaders and converters +โ”œโ”€โ”€ models/ # Configuration and result models +โ””โ”€โ”€ utils/ # Utilities and helpers +``` + +## Configuration + +### Basic Configuration + +```python +from gepa_optimizer import OptimizationConfig, ModelConfig + +config = OptimizationConfig( + # Required parameters + model="openai/gpt-4o", # or ModelConfig instance + reflection_model="openai/gpt-4o", + max_iterations=10, + max_metric_calls=100, + batch_size=8, + + # Data splitting (train/val/test) + data_split=DataSplitConfig( + train_ratio=0.6, + val_ratio=0.2, + test_ratio=0.2 + ), + + # Optional settings + reflection_examples=3, # Examples per reflection (2-5 recommended) + evaluate_on_test=True, # Final evaluation on held-out test set + log_level="INFO" # DEBUG, INFO, WARNING, ERROR +) +``` + +### LLEGO Genetic Operators + +Enable LLEGO for faster convergence through fitness-guided crossover and diversity-guided mutation: + +```python +config = OptimizationConfig( + model="openai/gpt-4o", + reflection_model="openai/gpt-4o", + max_iterations=5, + max_metric_calls=50, + batch_size=8, + + # Enable LLEGO + use_llego_operators=True, + alpha=0.15, # Fitness extrapolation factor + tau=10.0, # Diversity temperature + nu=4, # Parent arity + n_crossover=2, # Crossover offspring per iteration + n_mutation=3, # Mutation offspring per iteration + population_size=15 +) +``` + +### Hybrid Mode (GEPA + LLEGO) + +Combine GEPA's semantic reflection with LLEGO's structural diversity: + +```python +config = OptimizationConfig( + model="openai/gpt-4o", + reflection_model="openai/gpt-4o", + max_iterations=6, + max_metric_calls=200, + batch_size=10, + + # Hybrid mode + use_llego_operators=True, + enable_gepa_reflection_with_llego=True, + num_gepa_reflection_candidates=3, + n_crossover=3, + n_mutation=3 + # Total: 9 candidates per iteration (3 GEPA + 3 crossover + 3 mutation) +) +``` + +### Batch API (Cost Optimization) + +Use batch processing for 50% cost reduction: + +```python +from 
gepa_optimizer.llms import BatchLLMClient + +llm_client = BatchLLMClient( + provider="google", + model_name="gemini-2.5-flash", + batch_size=20, + polling_interval=30 +) + +optimizer = GepaOptimizer( + config=config, + llm_client=llm_client, + evaluator=evaluator +) +``` + +## Built-in Components + +### LLM Clients + +| Client | Description | Use Case | +|--------|-------------|----------| +| `VisionLLMClient` | Multi-modal client for OpenAI, Google, Anthropic | Real-time requests | +| `BatchLLMClient` | Batch processing client | Cost-sensitive workloads | + +### Evaluators + +| Evaluator | Description | +|-----------|-------------| +| `ScrollElementEvaluator` | UI element detection scoring | +| `ValidationEvaluator` | Screen validation tasks | +| `IndexCachingEvaluator` | Index-based element selection | +| `UITreeEvaluator` | UI tree extraction | + +### Dataset Loaders + +| Loader | Description | +|--------|-------------| +| `load_scroll_dataset()` | Load scroll detection datasets | +| `load_validation_split()` | Load validation datasets with splits | +| `load_index_caching_split()` | Load index caching datasets | + +## Creating Custom Components + +### Custom Evaluator + +```python +from gepa_optimizer import BaseEvaluator + +class CustomEvaluator(BaseEvaluator): + def __init__(self): + super().__init__(metric_weights={ + "accuracy": 0.5, + "completeness": 0.3, + "format": 0.2 + }) + + def evaluate(self, predicted: str, expected: str) -> dict: + accuracy = self._compute_accuracy(predicted, expected) + completeness = self._compute_completeness(predicted, expected) + format_score = self._compute_format(predicted) + + composite = ( + accuracy * 0.5 + + completeness * 0.3 + + format_score * 0.2 + ) + + return { + "accuracy": accuracy, + "completeness": completeness, + "format": format_score, + "composite_score": composite # Required key + } +``` + +### Custom LLM Client + +```python +from gepa_optimizer import BaseLLMClient + +class CustomLLMClient(BaseLLMClient): + 
def __init__(self, api_key: str): + super().__init__(provider="custom", model_name="my-model") + self.api_key = api_key + + def generate( + self, + system_prompt: str, + user_prompt: str, + image_base64: str = None, + **kwargs + ) -> dict: + # Your API call here + response = call_your_api(system_prompt, user_prompt, image_base64) + return {"content": response} +``` + +## Examples + +| File | Description | +|------|-------------| +| [`examples/basic_usage.py`](examples/basic_usage.py) | Basic optimization workflow | +| [`examples/advanced_usage.py`](examples/advanced_usage.py) | Advanced configuration | +| [`examples/batch_api_example.py`](examples/batch_api_example.py) | Batch API usage | +| [`examples/gemini_usage.py`](examples/gemini_usage.py) | Google Gemini integration | + +**Run examples:** +```bash +python examples/basic_usage.py +``` + +## Testing + +```bash +# Run all tests +pytest tests/ + +# Run unit tests only +pytest tests/unit/ + +# Run integration tests +pytest tests/integration/ +``` + +## API Reference + +### GepaOptimizer + +```python +class GepaOptimizer: + def __init__( + self, + config: OptimizationConfig, + llm_client: BaseLLMClient, + evaluator: BaseEvaluator, + adapter_type: str = "universal" + ) + + async def train( + self, + seed_prompt: str, + dataset: Union[List, Dict], + **kwargs + ) -> OptimizedResult +``` + +### OptimizationConfig + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `model` | `str \| ModelConfig` | Required | Target model | +| `reflection_model` | `str \| ModelConfig` | Required | Reflection model | +| `max_iterations` | `int` | Required | Maximum optimization iterations | +| `max_metric_calls` | `int` | Required | Maximum evaluation calls | +| `batch_size` | `int` | Required | Samples per evaluation batch | +| `use_llego_operators` | `bool` | `False` | Enable LLEGO genetic operators | +| `enable_gepa_reflection_with_llego` | `bool` | `False` | Enable hybrid mode | +| 
`use_llm_as_judge` | `bool` | `True` | Enable LLM-as-Judge feedback | +| `log_level` | `str` | `"INFO"` | Logging verbosity | + +### OptimizedResult + +| Attribute | Type | Description | +|-----------|------|-------------| +| `prompt` | `str` | Optimized prompt | +| `original_prompt` | `str` | Initial seed prompt | +| `improvement_data` | `dict` | Score improvements | +| `optimization_time` | `float` | Total time in seconds | +| `is_successful` | `bool` | Optimization success status | + +## Environment Variables + +| Variable | Description | +|----------|-------------| +| `OPENAI_API_KEY` | OpenAI API key | +| `ANTHROPIC_API_KEY` | Anthropic API key | +| `GOOGLE_API_KEY` | Google AI API key | + +## References + +- **GEPA Paper:** [Reflective Prompt Evolution Can Outperform Reinforcement Learning](https://arxiv.org/abs/2507.19457) +- **LLEGO Paper:** [Decision Tree Induction Through LLMs via Semantically-Aware Evolution](https://arxiv.org/abs/2503.14217) +- **GEPA Library:** [github.com/gepa-ai/gepa](https://github.com/gepa-ai/gepa) + +## License + +MIT License - see [LICENSE](LICENSE) for details. + +## Contributing + +Contributions welcome. Please open an issue or submit a pull request. 
+ +## Support + +- **Issues:** [GitHub Issues](https://github.com/suhasb-dev/Prompt-Optimizer/issues) +- **Documentation:** [GitBook](https://suhasb-dev.gitbook.io/gepa-universal-prompt-optimizer/) diff --git a/src/gepa_optimizer.egg-info/SOURCES.txt b/src/gepa_optimizer.egg-info/SOURCES.txt new file mode 100644 index 0000000000000000000000000000000000000000..f019258d7e1ba6587c93e9adafd7e606b206503b --- /dev/null +++ b/src/gepa_optimizer.egg-info/SOURCES.txt @@ -0,0 +1,65 @@ +LICENSE +README.md +pyproject.toml +setup.py +src/gepa_optimizer/__init__.py +src/gepa_optimizer/cli.py +src/gepa_optimizer/types.py +src/gepa_optimizer/version.py +src/gepa_optimizer.egg-info/PKG-INFO +src/gepa_optimizer.egg-info/SOURCES.txt +src/gepa_optimizer.egg-info/dependency_links.txt +src/gepa_optimizer.egg-info/entry_points.txt +src/gepa_optimizer.egg-info/requires.txt +src/gepa_optimizer.egg-info/top_level.txt +src/gepa_optimizer/core/__init__.py +src/gepa_optimizer/core/base_adapter.py +src/gepa_optimizer/core/custom_adapter.py +src/gepa_optimizer/core/optimizer.py +src/gepa_optimizer/core/result.py +src/gepa_optimizer/core/universal_adapter.py +src/gepa_optimizer/data/__init__.py +src/gepa_optimizer/data/converters.py +src/gepa_optimizer/data/index_caching_loader.py +src/gepa_optimizer/data/loaders.py +src/gepa_optimizer/data/scroll_dataset_loader.py +src/gepa_optimizer/data/validation_dataset_loader.py +src/gepa_optimizer/data/validators.py +src/gepa_optimizer/evaluation/__init__.py +src/gepa_optimizer/evaluation/base_evaluator.py +src/gepa_optimizer/evaluation/index_caching_evaluator.py +src/gepa_optimizer/evaluation/scroll_evaluator.py +src/gepa_optimizer/evaluation/ui_evaluator.py +src/gepa_optimizer/evaluation/universal_evaluator.py +src/gepa_optimizer/evaluation/validation_evaluator.py +src/gepa_optimizer/infrastructure/__init__.py +src/gepa_optimizer/infrastructure/logging/__init__.py +src/gepa_optimizer/infrastructure/logging/context.py 
+src/gepa_optimizer/infrastructure/logging/formatters.py +src/gepa_optimizer/infrastructure/logging/logger.py +src/gepa_optimizer/llms/__init__.py +src/gepa_optimizer/llms/base_llm.py +src/gepa_optimizer/llms/batch_llm.py +src/gepa_optimizer/llms/llego_enhanced_llm.py +src/gepa_optimizer/llms/vision_llm.py +src/gepa_optimizer/models/__init__.py +src/gepa_optimizer/models/config.py +src/gepa_optimizer/models/dataset.py +src/gepa_optimizer/models/result.py +src/gepa_optimizer/operators/__init__.py +src/gepa_optimizer/operators/base_operator.py +src/gepa_optimizer/operators/crossover.py +src/gepa_optimizer/operators/llego_operators.py +src/gepa_optimizer/operators/models.py +src/gepa_optimizer/operators/mutation.py +src/gepa_optimizer/utils/__init__.py +src/gepa_optimizer/utils/api_keys.py +src/gepa_optimizer/utils/candidate_collector.py +src/gepa_optimizer/utils/clean_logger.py +src/gepa_optimizer/utils/exceptions.py +src/gepa_optimizer/utils/helpers.py +src/gepa_optimizer/utils/llm_judge_prompt.py +src/gepa_optimizer/utils/log_parser.py +src/gepa_optimizer/utils/logging.py +src/gepa_optimizer/utils/metrics.py +src/gepa_optimizer/utils/pareto_logger.py \ No newline at end of file diff --git a/src/gepa_optimizer.egg-info/dependency_links.txt b/src/gepa_optimizer.egg-info/dependency_links.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b137891791fe96927ad78e64b0aad7bded08bdc --- /dev/null +++ b/src/gepa_optimizer.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/src/gepa_optimizer.egg-info/entry_points.txt b/src/gepa_optimizer.egg-info/entry_points.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9b0dbe7680b3733ee391c2c83177f29594117eb --- /dev/null +++ b/src/gepa_optimizer.egg-info/entry_points.txt @@ -0,0 +1,2 @@ +[console_scripts] +gepa-optimize = gepa_optimizer.cli:main diff --git a/src/gepa_optimizer.egg-info/requires.txt b/src/gepa_optimizer.egg-info/requires.txt new file mode 100644 index 
0000000000000000000000000000000000000000..ecfbd2e6e4a4482e0036fa85ac2ca7d695b13be6 --- /dev/null +++ b/src/gepa_optimizer.egg-info/requires.txt @@ -0,0 +1,29 @@ +gepa>=0.0.12 +pandas>=1.5.0 +pydantic>=2.0.0 +python-dotenv>=1.0.0 +requests>=2.31.0 +aiohttp>=3.8.0 +asyncio-throttle>=1.0.0 +google-generativeai>=0.3.0 +Pillow>=9.0.0 + +[all] +pytest>=7.0.0 +pytest-asyncio>=0.21.0 +black>=23.0.0 +flake8>=6.0.0 +mypy>=1.0.0 +sphinx>=5.0.0 +sphinx-rtd-theme>=1.2.0 + +[dev] +pytest>=7.0.0 +pytest-asyncio>=0.21.0 +black>=23.0.0 +flake8>=6.0.0 +mypy>=1.0.0 + +[docs] +sphinx>=5.0.0 +sphinx-rtd-theme>=1.2.0 diff --git a/src/gepa_optimizer.egg-info/top_level.txt b/src/gepa_optimizer.egg-info/top_level.txt new file mode 100644 index 0000000000000000000000000000000000000000..a53df9f6ea55c2b670c462010432f5969311d777 --- /dev/null +++ b/src/gepa_optimizer.egg-info/top_level.txt @@ -0,0 +1 @@ +gepa_optimizer diff --git a/src/gepa_optimizer/__init__.py b/src/gepa_optimizer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a9a4dc05ed44b418078c9404690ba7af8d163d7f --- /dev/null +++ b/src/gepa_optimizer/__init__.py @@ -0,0 +1,295 @@ +""" +GEPA Universal Prompt Optimizer + +A modern, modular Python library for universal prompt optimization powered by GEPA. 
+ +Quick Start (No custom evaluator needed!): + + from gepa_optimizer import quick_optimize + + result = await quick_optimize( + seed_prompt="Your initial prompt", + dataset=[ + {"input": "task1", "output": "expected1"}, + {"input": "task2", "output": "expected2"}, + ], + model="openai/gpt-4o" # or any: "google/gemini-1.5-pro", "anthropic/claude-3-5-sonnet-20241022" + ) + print(result.optimized_prompt) +""" + +# Core functionality +from .core import GepaOptimizer +from .core.base_adapter import BaseGepaAdapter +from .core.universal_adapter import UniversalGepaAdapter + +# Configuration and models +from .models import OptimizationConfig, OptimizationResult, OptimizedResult, ModelConfig + +# Data processing +from .data import UniversalConverter, DataLoader, DataValidator +from .data.scroll_dataset_loader import ScrollDatasetLoader, load_scroll_dataset +from .data.validation_dataset_loader import ValidationDatasetLoader, load_validation_dataset, load_validation_split +from .data.index_caching_loader import IndexCachingDatasetLoader, load_index_caching_dataset, load_index_caching_split + +# LLM clients +from .llms import VisionLLMClient +from .llms.base_llm import BaseLLMClient +from .llms.batch_llm import BatchLLMClient + +# Evaluators - including Universal Semantic Evaluator (works for ANY task!) 
# Evaluators - including Universal Semantic Evaluator (works for ANY task!)
from .evaluation import (
    BaseEvaluator,
    UniversalSemanticEvaluator,
    create_universal_evaluator,
    UITreeEvaluator,
    ScrollElementEvaluator,
    ValidationEvaluator,
    IndexCachingEvaluator
)

# LLEGO Genetic Operators
from .operators import (
    # Base interfaces
    BaseGeneticOperator,
    BaseCrossoverOperator,
    BaseMutationOperator,
    # Concrete operators
    FitnessGuidedCrossover,
    DiversityGuidedMutation,
    LLEGOIntegrationLayer,
    # Data models
    PromptCandidate,
    PromptMetadata
)

# Utilities
from .utils import setup_logging, calculate_metrics, sanitize_prompt, APIKeyManager
from .utils.exceptions import GepaOptimizerError, GepaDependencyError, InvalidInputError, DatasetError

# Logging infrastructure
from .infrastructure.logging import get_logger, configure_logging, LogContext

# Type definitions (for type hints in user code)
from .types import (
    DatasetItem,
    EvaluationResult,
    LLMResponse,
    CandidateDict,
    LLMClientProtocol,
    EvaluatorProtocol,
)

__version__ = "0.1.0"


# ═══════════════════════════════════════════════════════════════════════════
# CONVENIENCE FUNCTION: quick_optimize
# No evaluator needed - uses Universal Semantic Evaluator automatically
# ═══════════════════════════════════════════════════════════════════════════

async def quick_optimize(
    seed_prompt: str,
    dataset: list,
    model: str,
    max_iterations: int = 5,
    max_metric_calls: int = 50,
    batch_size: int = 4,
    use_llego: bool = True,
    verbose: bool = True
) -> OptimizedResult:
    """
    🚀 Quick prompt optimization - no custom evaluator needed!

    Uses Universal Semantic Evaluator that works for ANY task.

    Args:
        seed_prompt: Your initial prompt to optimize
        dataset: List of dicts with 'input' and 'output' (expected) keys
                 Can also include 'image' key for multi-modal tasks
        model: LLM model to use in format "provider/model-name" (REQUIRED)
               Examples:
               - "google/gemini-1.5-pro"
               - "google/gemini-2.5-flash-preview-05-20"
               - "openai/gpt-4o"
               - "openai/gpt-4-turbo"
               - "anthropic/claude-3-5-sonnet-20241022"
        max_iterations: Maximum optimization iterations (default: 5)
        max_metric_calls: Maximum evaluation calls (default: 50)
        batch_size: Samples per evaluation batch (default: 4)
        use_llego: Enable LLEGO genetic operators (default: True)
        verbose: Show progress logs (default: True)

    Returns:
        OptimizedResult with optimized prompt and improvement metrics

    Example:
        >>> result = await quick_optimize(
        ...     seed_prompt="Count the objects in the image",
        ...     dataset=[
        ...         {"input": "image1.jpg", "output": "5 objects", "image": "base64..."},
        ...         {"input": "image2.jpg", "output": "3 objects", "image": "base64..."},
        ...     ],
        ...     model="openai/gpt-4o",  # or "google/gemini-1.5-pro", etc.
        ...     max_iterations=3
        ... )
        >>> print(result.prompt)
    """
    import logging

    if verbose:
        # NOTE: basicConfig is a no-op when the root logger is already
        # configured by the host application; kept for backward compatibility.
        logging.basicConfig(level=logging.INFO)

    # Build the LLM client from the "provider/model-name" string.
    llm_client = VisionLLMClient.from_model_string(model)

    # Universal Semantic Evaluator reuses the same LLM for semantic analysis,
    # so no custom evaluator is required from the caller.
    evaluator = UniversalSemanticEvaluator(
        llm_client=llm_client,
        use_llm_analysis=True
    )

    # Assemble the optimization configuration; the same model is used for
    # generation and reflection in this convenience path.
    config = OptimizationConfig(
        model=model,
        reflection_model=model,
        max_iterations=max_iterations,
        max_metric_calls=max_metric_calls,
        batch_size=batch_size,
        use_llego_operators=use_llego,
        enable_gepa_reflection_with_llego=use_llego,
        num_gepa_reflection_candidates=3,
        n_crossover=2,
        n_mutation=2,
        verbose=verbose
    )

    optimizer = GepaOptimizer(
        config=config,
        llm_client=llm_client,
        evaluator=evaluator
    )

    # Run optimization and return the full result object.
    result = await optimizer.train(
        seed_prompt=seed_prompt,
        dataset=dataset
    )

    return result


def quick_optimize_sync(
    seed_prompt: str,
    dataset: list,
    model: str,
    max_iterations: int = 5,
    max_metric_calls: int = 50,
    batch_size: int = 4,
    use_llego: bool = True,
    verbose: bool = True
) -> OptimizedResult:
    """
    🚀 Synchronous version of quick_optimize.

    Same as quick_optimize but runs synchronously (blocks until complete).

    Args:
        model: LLM model to use in format "provider/model-name" (REQUIRED)
               Examples: "openai/gpt-4o", "google/gemini-1.5-pro", "anthropic/claude-3-5-sonnet-20241022"

    Raises:
        RuntimeError: If called from within a running event loop (e.g. a
            Jupyter notebook); use ``await quick_optimize(...)`` there instead.

    See quick_optimize for full documentation.
    """
    import asyncio

    # asyncio.run() raises a confusing error when an event loop is already
    # running; detect that case up front and fail with actionable guidance.
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        pass  # No running loop: safe to start one below.
    else:
        raise RuntimeError(
            "quick_optimize_sync() cannot be called from a running event loop; "
            "use 'await quick_optimize(...)' instead"
        )

    return asyncio.run(quick_optimize(
        seed_prompt=seed_prompt,
        dataset=dataset,
        model=model,
        max_iterations=max_iterations,
        max_metric_calls=max_metric_calls,
        batch_size=batch_size,
        use_llego=use_llego,
        verbose=verbose
    ))


__all__ = [
    # 🚀 Quick Start (recommended for new users)
    "quick_optimize",
    "quick_optimize_sync",

    # Core functionality
    "GepaOptimizer",
    "BaseGepaAdapter",
    "UniversalGepaAdapter",

    # Configuration
    "OptimizationConfig",
    "OptimizationResult",
    "OptimizedResult",
    "ModelConfig",

    # Data processing
    "UniversalConverter",
    "DataLoader",
    "DataValidator",

    # Dataset loaders
    "ScrollDatasetLoader",
    "load_scroll_dataset",
    "ValidationDatasetLoader",
    "load_validation_dataset",
    "load_validation_split",
    "IndexCachingDatasetLoader",
    "load_index_caching_dataset",
    "load_index_caching_split",

    # LLM clients
    "VisionLLMClient",
    "BaseLLMClient",
    "BatchLLMClient",

    # Evaluators (Universal recommended for general use)
    "UniversalSemanticEvaluator",
    "create_universal_evaluator",
    "BaseEvaluator",
    "UITreeEvaluator",
    "ScrollElementEvaluator",
    "ValidationEvaluator",
    "IndexCachingEvaluator",

    # LLEGO Genetic Operators - Base interfaces
    "BaseGeneticOperator",
    "BaseCrossoverOperator",
    "BaseMutationOperator",
    # LLEGO Genetic Operators - Concrete implementations
    "FitnessGuidedCrossover",
    "DiversityGuidedMutation",
    "LLEGOIntegrationLayer",
    "PromptCandidate",
    "PromptMetadata",

    # Utilities
    "APIKeyManager",
    "GepaOptimizerError",
    "GepaDependencyError",
    "InvalidInputError",
    "DatasetError",
    "setup_logging",
    "calculate_metrics",
    "sanitize_prompt",

    # Logging infrastructure
    "get_logger",
    "configure_logging",
    "LogContext",

    # Type definitions
    "DatasetItem",
    "EvaluationResult",
    "LLMResponse",
    "CandidateDict",
    "LLMClientProtocol",
    "EvaluatorProtocol",
]
"""
Command Line Interface for GEPA Optimizer
"""

import argparse
import sys
import json
import asyncio
from pathlib import Path
from typing import Optional

from .core import GepaOptimizer
from .models import OptimizationConfig, ModelConfig
from .utils import setup_logging, APIKeyManager


def main():
    """Main CLI entry point: parse arguments, build config, run optimization."""
    parser = argparse.ArgumentParser(
        description="GEPA Universal Prompt Optimizer CLI",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  gepa-optimize --model openai/gpt-4-turbo --prompt "Extract UI elements" --dataset data.json
  gepa-optimize --config config.json --prompt "Analyze interface" --dataset images/
        """
    )

    # Required arguments
    parser.add_argument(
        "--prompt",
        required=True,
        help="Initial seed prompt to optimize"
    )
    parser.add_argument(
        "--dataset",
        required=True,
        help="Path to dataset file or directory"
    )

    # Model configuration
    parser.add_argument(
        "--model",
        help="Model specification (e.g., 'openai/gpt-4-turbo')"
    )
    parser.add_argument(
        "--reflection-model",
        help="Reflection model specification"
    )
    parser.add_argument(
        "--config",
        help="Path to configuration JSON file"
    )

    # Optimization parameters
    parser.add_argument(
        "--max-iterations",
        type=int,
        default=10,
        help="Maximum optimization iterations (default: 10)"
    )
    parser.add_argument(
        "--max-metric-calls",
        type=int,
        default=100,
        help="Maximum metric evaluation calls (default: 100)"
    )
    parser.add_argument(
        "--batch-size",
        type=int,
        default=4,
        help="Batch size for evaluation (default: 4)"
    )

    # GEPA-specific parameters
    # NOTE(review): these flags are parsed but not yet forwarded into
    # OptimizationConfig by create_config_from_args — confirm whether they
    # should be wired through or removed from the interface.
    parser.add_argument(
        "--candidate-selection-strategy",
        type=str,
        default="pareto",
        choices=["pareto", "best"],
        help="Strategy for selecting candidates (default: pareto)"
    )
    parser.add_argument(
        "--skip-perfect-score",
        action="store_true",
        help="Skip updating candidates with perfect scores"
    )
    parser.add_argument(
        "--reflection-minibatch-size",
        type=int,
        default=None,
        help="Number of examples to use for reflection (default: use batch_size)"
    )
    parser.add_argument(
        "--perfect-score",
        type=float,
        default=1.0,
        help="Perfect score threshold (default: 1.0)"
    )
    parser.add_argument(
        "--module-selector",
        type=str,
        default="round_robin",
        choices=["round_robin", "all"],
        help="Component selection strategy (default: round_robin)"
    )

    # Output options
    parser.add_argument(
        "--output",
        help="Output file path for results (default: stdout)"
    )
    parser.add_argument(
        "--verbose", "-v",
        action="store_true",
        help="Enable verbose logging"
    )

    args = parser.parse_args()

    # Setup logging
    setup_logging(level="DEBUG" if args.verbose else "INFO")

    try:
        # Load configuration (file takes precedence over individual flags)
        if args.config:
            config = load_config_from_file(args.config)
        else:
            config = create_config_from_args(args)

        # Validate API keys before doing any work
        validate_api_keys(config)

        # Create optimizer
        optimizer = GepaOptimizer(config=config)

        # config.model may be a ModelConfig or a plain string (a JSON config
        # file may store the model as a string); don't crash on .model_name.
        model_name = getattr(config.model, "model_name", config.model)
        print(f"🚀 Starting optimization with model: {model_name}")
        result = asyncio.run(optimizer.train(
            seed_prompt=args.prompt,
            dataset=args.dataset
        ))

        # Output results
        output_results(result, args.output)

        print("✅ Optimization completed successfully!")

    except Exception as e:
        print(f"❌ Error: {str(e)}", file=sys.stderr)
        sys.exit(1)


def load_config_from_file(config_path: str) -> OptimizationConfig:
    """Load an OptimizationConfig from a JSON file.

    Args:
        config_path: Path to the JSON configuration file.

    Returns:
        The parsed OptimizationConfig.

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    path = Path(config_path)
    if not path.exists():
        raise FileNotFoundError(f"Configuration file not found: {config_path}")

    with open(path, 'r', encoding='utf-8') as f:
        config_data = json.load(f)

    # Convert nested dict model specs into ModelConfig objects; plain
    # strings are passed through unchanged.
    if 'model' in config_data and isinstance(config_data['model'], dict):
        config_data['model'] = ModelConfig(**config_data['model'])

    if 'reflection_model' in config_data and isinstance(config_data['reflection_model'], dict):
        config_data['reflection_model'] = ModelConfig(**config_data['reflection_model'])

    return OptimizationConfig(**config_data)


def create_config_from_args(args) -> OptimizationConfig:
    """Create an OptimizationConfig from parsed command line arguments.

    Raises:
        ValueError: If neither --model nor --config was provided.
    """
    if not args.model:
        raise ValueError("Either --model or --config must be specified")

    # Parse "provider/model-name" specifications
    model_config = ModelConfig.from_string(args.model)

    reflection_model_config = None
    if args.reflection_model:
        reflection_model_config = ModelConfig.from_string(args.reflection_model)

    return OptimizationConfig(
        model=model_config,
        reflection_model=reflection_model_config,
        max_iterations=args.max_iterations,
        max_metric_calls=args.max_metric_calls,
        batch_size=args.batch_size
    )


def validate_api_keys(config: OptimizationConfig):
    """Validate that required API keys are available; exit(1) if any are missing."""
    api_manager = APIKeyManager()

    providers = [config.model.provider]
    if config.reflection_model:
        providers.append(config.reflection_model.provider)

    # De-duplicate while preserving order so each provider is reported once
    # even when the same provider backs both the main and reflection models.
    providers = list(dict.fromkeys(providers))

    missing_keys = api_manager.get_missing_keys(providers)

    if missing_keys:
        print("❌ Missing API keys for the following providers:")
        for provider in missing_keys:
            print(f"   - {provider.upper()}_API_KEY")
        print("\nPlease set the required environment variables or use a .env file")
        sys.exit(1)


def output_results(result, output_path: Optional[str]):
    """Write optimization results as JSON to *output_path*, or pretty-print to stdout."""
    # Guard against a missing/None improvement_data attribute.
    improvement = result.improvement_data or {}
    output_data = {
        "optimized_prompt": result.prompt,
        "original_prompt": result.original_prompt,
        "improvement_metrics": improvement,
        "optimization_time": result.optimization_time,
        "status": result.status,
        "session_id": result.session_id
    }

    if output_path:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(output_data, f, indent=2)
        print(f"📄 Results saved to: {output_path}")
    else:
        print("\n📊 Optimization Results:")
        print(f"Session ID: {result.session_id}")
        print(f"Status: {result.status}")
        print(f"Time: {result.optimization_time:.2f}s")
        print(f"\nOriginal Prompt:\n{result.original_prompt}")
        print(f"\nOptimized Prompt:\n{result.prompt}")

        if 'improvement_percent' in improvement:
            print(f"\nImprovement: {improvement['improvement_percent']:.2f}%")


if __name__ == "__main__":
    main()
+ """ + + def __init__(self, llm_client: BaseLLMClient, evaluator: BaseEvaluator): + """ + Initialize adapter with LLM client and evaluator. + + Args: + llm_client: LLM client for generating responses + evaluator: Evaluator for scoring predictions + """ + if not isinstance(llm_client, BaseLLMClient): + raise TypeError("llm_client must be an instance of BaseLLMClient") + if not isinstance(evaluator, BaseEvaluator): + raise TypeError("evaluator must be an instance of BaseEvaluator") + + self.llm_client = llm_client + self.evaluator = evaluator + self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}") + + # Performance tracking + self._evaluation_count = 0 + self._best_score = 0.0 + self._best_candidate = None + + @abstractmethod + def evaluate(self, batch: List[Dict[str, Any]], candidate: Dict[str, str], + capture_traces: bool = False) -> EvaluationBatch: + """ + Evaluate candidate on a batch of data. + + Args: + batch: List of data items to evaluate + candidate: Prompt candidate to evaluate + capture_traces: Whether to capture detailed traces + + Returns: + EvaluationBatch with outputs, scores, and optional trajectories + """ + pass + + @abstractmethod + def make_reflective_dataset(self, candidate: Dict[str, str], + eval_batch: EvaluationBatch, + components_to_update: List[str]) -> Dict[str, List[Dict[str, Any]]]: + """ + Create reflective dataset for GEPA's reflection process. 
+ + Args: + candidate: Current prompt candidate + eval_batch: Results from evaluation + components_to_update: List of components to update + + Returns: + Dictionary mapping components to reflection data + """ + pass + + def get_performance_stats(self) -> Dict[str, Any]: + """Get performance statistics for monitoring""" + return { + 'evaluation_count': self._evaluation_count, + 'best_score': self._best_score, + 'model_info': self.llm_client.get_model_info(), + 'evaluator_class': self.evaluator.__class__.__name__ + } diff --git a/src/gepa_optimizer/core/custom_adapter.py b/src/gepa_optimizer/core/custom_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..2f0d32ddc2a39b866d77fa5693be5e178cf20d09 --- /dev/null +++ b/src/gepa_optimizer/core/custom_adapter.py @@ -0,0 +1,389 @@ +""" +Custom GEPA Adapter for the GEPA Universal Prompt Optimizer +""" + +import json +import logging +import re +from typing import Any, Dict, List, Optional + +# Import ModelConfig +from ..models import ModelConfig + +from gepa.core.adapter import GEPAAdapter, EvaluationBatch +from ..llms.vision_llm import VisionLLMClient +from ..evaluation.ui_evaluator import UITreeEvaluator +from .base_adapter import BaseGepaAdapter + +logger = logging.getLogger(__name__) + +class CustomGepaAdapter(BaseGepaAdapter): + """ + Custom adapter for the GEPA Universal Prompt Optimizer. 
+ """ + + def __init__(self, model_config: 'ModelConfig', metric_weights: Optional[Dict[str, float]] = None): + """Initialize the custom GEPA adapter with model configuration.""" + # Convert string model to ModelConfig if needed + if not isinstance(model_config, ModelConfig): + model_config = ModelConfig( + provider='openai', + model_name=str(model_config), + api_key=None + ) + + # Initialize components + llm_client = VisionLLMClient( + provider=model_config.provider, + model_name=model_config.model_name, + api_key=model_config.api_key, + base_url=model_config.base_url, + temperature=model_config.temperature, + max_tokens=model_config.max_tokens, + top_p=model_config.top_p, + frequency_penalty=model_config.frequency_penalty, + presence_penalty=model_config.presence_penalty + ) + + evaluator = UITreeEvaluator(metric_weights=metric_weights) + + # Initialize parent class + super().__init__(llm_client, evaluator) + + # Track candidates for logging + self._last_candidate = None + self._evaluation_count = 0 + + self.logger.info(f"๐Ÿš€ Initialized UI Tree adapter with {model_config.provider}/{model_config.model_name}") + + def _parse_json_safely(self, json_str: str) -> Dict[str, Any]: + """Safely parse JSON string to dictionary with enhanced parsing and repair.""" + if not json_str or not isinstance(json_str, str): + return {} + + # Try direct parsing first + try: + return json.loads(json_str) + except json.JSONDecodeError: + pass + + # Try to extract JSON from markdown code blocks + json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', json_str, re.DOTALL) + if json_match: + try: + return json.loads(json_match.group(1)) + except json.JSONDecodeError: + pass + + # Try to find JSON object in the string + json_match = re.search(r'\{.*\}', json_str, re.DOTALL) + if json_match: + try: + return json.loads(json_match.group(0)) + except json.JSONDecodeError: + pass + + # Try repair and parse + repaired_json = self._repair_json(json_str) + if repaired_json: + try: + return 
json.loads(repaired_json) + except json.JSONDecodeError: + pass + + self.logger.warning(f"Failed to parse JSON: {json_str[:100]}...") + return {} + + def _repair_json(self, json_str: str) -> str: + """Attempt to repair common JSON issues.""" + try: + # Remove markdown formatting + json_str = re.sub(r'```(?:json)?\s*', '', json_str) + json_str = re.sub(r'```\s*$', '', json_str) + + # Remove extra text before/after JSON + json_match = re.search(r'\{.*\}', json_str, re.DOTALL) + if json_match: + json_str = json_match.group(0) + + # Fix common issues + json_str = re.sub(r',\s*}', '}', json_str) # Remove trailing commas + json_str = re.sub(r',\s*]', ']', json_str) # Remove trailing commas in arrays + json_str = re.sub(r'([{,]\s*)(\w+):', r'\1"\2":', json_str) # Quote unquoted keys + + return json_str + except Exception as e: + self.logger.warning(f"๐Ÿ”ง JSON repair failed: {e}") + return "" + + def evaluate( + self, + batch: List[Dict[str, Any]], + candidate: Dict[str, str], + capture_traces: bool = False, + ) -> EvaluationBatch: + """Evaluate the candidate on a batch of data.""" + outputs = [] + scores = [] + trajectories = [] if capture_traces else None + + system_prompt = candidate.get('system_prompt', '') + + # Check if this is a new candidate (different from last one) + if self._last_candidate != system_prompt: + self._evaluation_count += 1 + self.log_proposed_candidate(candidate, self._evaluation_count) + self._last_candidate = system_prompt + + self.logger.info(f"๐Ÿ“Š Evaluating {len(batch)} samples with prompt: '{system_prompt[:50]}...'") + + for i, item in enumerate(batch): + input_text = item.get('input', '') + image_base64 = item.get('image', '') + ground_truth_json = item.get('output', '') + + # Call the LLM client + llm_response = self.llm_client.generate(system_prompt, input_text, image_base64=image_base64) + + # Extract content from the response dictionary + if isinstance(llm_response, dict): + llm_output_json_str = llm_response.get("content", "") + if 
not llm_output_json_str: + llm_output_json_str = str(llm_response) + else: + llm_output_json_str = str(llm_response) if llm_response else "" + + # ๐Ÿ” DEBUG: Log essential info only (removed verbose JSON content) + self.logger.debug(f"๐Ÿ” Sample {i+1} - LLM Response Type: {type(llm_response)}") + self.logger.debug(f"๐Ÿ” Sample {i+1} - Response Length: {len(llm_output_json_str)} chars") + + outputs.append(llm_output_json_str) + + # Parse JSON strings to dictionaries for evaluation + llm_output_dict = self._parse_json_safely(llm_output_json_str) + ground_truth_dict = self._parse_json_safely(ground_truth_json) + + # Initialize evaluation_results with default values + evaluation_results = { + "composite_score": 0.0, + "element_completeness": 0.0, + "element_type_accuracy": 0.0, + "text_content_accuracy": 0.0, + "hierarchy_accuracy": 0.0, + "style_accuracy": 0.0 + } + + # Calculate composite score and evaluation results + if not llm_output_dict and not ground_truth_dict: + composite_score = 0.1 + evaluation_results = {k: 0.1 for k in evaluation_results.keys()} + self.logger.warning(f"โš ๏ธ Sample {i+1}: Empty results - using default score: {composite_score}") + elif not llm_output_dict or not ground_truth_dict: + composite_score = 0.05 + evaluation_results = {k: 0.05 for k in evaluation_results.keys()} + self.logger.warning(f"โš ๏ธ Sample {i+1}: Incomplete results - using low score: {composite_score}") + else: + # Calculate score using evaluator with parsed dictionaries + evaluation_results = self.evaluator.evaluate(llm_output_dict, ground_truth_dict) + composite_score = evaluation_results["composite_score"] + + # Clean, readable logging (removed verbose JSON dumps) + llm_children = len(llm_output_dict.get('children', [])) + gt_children = len(ground_truth_dict.get('children', [])) + + if composite_score < 0.1: + self.logger.warning(f"โš ๏ธ Sample {i+1}: Low score {composite_score:.4f} - LLM: {llm_children} elements, GT: {gt_children} elements") + 
self.logger.debug(f" Score breakdown: {evaluation_results}") + else: + self.logger.info(f"โœ… Sample {i+1}: Score {composite_score:.4f} - LLM: {llm_children} elements, GT: {gt_children} elements") + + scores.append(composite_score) + + if capture_traces: + trajectories.append({ + 'input_text': input_text, + 'image_base64': image_base64, + 'ground_truth_json': ground_truth_json, + 'llm_output_json': llm_output_json_str, + 'evaluation_results': evaluation_results + }) + + avg_score = sum(scores) / len(scores) if scores else 0.0 + + # Update performance tracking (handled by parent class) + if avg_score > self._best_score: + self._best_score = avg_score + self._best_candidate = candidate.copy() + self.logger.info(f"๐ŸŽฏ New best candidate found with score: {avg_score:.4f}") + + self.logger.info(f"๐Ÿ“ˆ Batch evaluation complete - Average score: {avg_score:.4f}") + + return EvaluationBatch(outputs=outputs, scores=scores, trajectories=trajectories) + + def make_reflective_dataset( + self, + candidate: Dict[str, str], + eval_batch: EvaluationBatch, + components_to_update: List[str], + ) -> Dict[str, List[Dict[str, Any]]]: + """Create a reflective dataset from the evaluation results.""" + reflective_dataset = {} + system_prompt = candidate.get('system_prompt', '') + + # ๐ŸŽฏ NEW: Log the proposed new prompt being evaluated + self.logger.info(f"๐Ÿ“ Creating reflection dataset for prompt: '{system_prompt[:100]}...'") + + # Pretty print reflection dataset creation + self._log_reflection_dataset_creation(candidate, eval_batch, components_to_update) + + for component in components_to_update: + reflective_dataset[component] = [] + for i, trace in enumerate(eval_batch.trajectories): + feedback = self._generate_feedback(trace['evaluation_results']) + reflective_dataset[component].append({ + "current_prompt": system_prompt, + "input_text": trace['input_text'], + "image_base64": trace['image_base64'], + "generated_json": trace['llm_output_json'], + "ground_truth_json": 
trace['ground_truth_json'], + "score": trace['evaluation_results']["composite_score"], + "feedback": feedback, + "detailed_scores": trace['evaluation_results'] + }) + + # ๐ŸŽฏ NEW: Log reflection dataset summary + total_samples = sum(len(data) for data in reflective_dataset.values()) + avg_score = sum(trace['score'] for data in reflective_dataset.values() for trace in data) / total_samples if total_samples > 0 else 0.0 + self.logger.info(f"๐Ÿ“ Reflection dataset created - {total_samples} samples, avg score: {avg_score:.4f}") + + return reflective_dataset + + def _generate_feedback(self, evaluation_results: Dict[str, float]) -> str: + """Generate textual feedback based on evaluation results.""" + composite_score = evaluation_results.get("composite_score", 0.0) + + feedback_parts = [] + + # Overall quality assessment + if composite_score >= 0.8: + feedback_parts.append("The overall quality is good.") + elif composite_score >= 0.5: + feedback_parts.append("The overall quality is moderate.") + else: + feedback_parts.append("The overall quality is low. Focus on fundamental accuracy.") + + # Specific metric feedback + if evaluation_results.get("element_completeness", 0.0) < 0.7: + feedback_parts.append("Element completeness is low. Ensure all UI elements are captured.") + + if evaluation_results.get("element_type_accuracy", 0.0) < 0.7: + feedback_parts.append("Element type accuracy is low. Verify correct UI element identification (Button, Text, Image, etc.).") + + if evaluation_results.get("text_content_accuracy", 0.0) < 0.7: + feedback_parts.append("Text content accuracy is low. Improve text extraction fidelity.") + + if evaluation_results.get("hierarchy_accuracy", 0.0) < 0.7: + feedback_parts.append("Hierarchy accuracy is low. Ensure correct parent-child relationships.") + + if evaluation_results.get("style_accuracy", 0.0) < 0.7: + feedback_parts.append("Style accuracy is low. 
Capture more styling properties (colors, sizes, positioning).") + + return " ".join(feedback_parts) + + def get_best_candidate(self) -> Optional[Dict[str, str]]: + """Get the best candidate found so far.""" + return self._best_candidate + + def get_best_score(self) -> float: + """Get the best score found so far.""" + return self._best_score + + def log_proposed_candidate(self, candidate: Dict[str, str], iteration: int = 0): + """ + Log the new proposed candidate prompt. + + Args: + candidate: The new candidate prompt from GEPA + iteration: Current optimization iteration + """ + system_prompt = candidate.get('system_prompt', '') + + logger.info("="*80) + logger.info(f"NEW PROPOSED CANDIDATE (Iteration {iteration})") + logger.info("="*80) + logger.info(f"PROPOSED PROMPT:") + logger.info("-" * 40) + logger.debug(f'"{system_prompt}"') + logger.info("-" * 40) + logger.info(f"Prompt Length: {len(system_prompt)} characters") + logger.info(f"Word Count: {len(system_prompt.split())} words") + logger.info("="*80) + + def _log_reflection_dataset_creation(self, candidate: Dict[str, str], eval_batch: EvaluationBatch, + components_to_update: List[str]): + """ + Log the reflection dataset creation process. 
+ + Args: + candidate: Current candidate being evaluated + eval_batch: Evaluation results + components_to_update: Components being updated + """ + system_prompt = candidate.get('system_prompt', '') + + logger.info("="*80) + logger.info("REFLECTION DATASET CREATION") + logger.info("="*80) + + logger.info(f"CURRENT PROMPT BEING ANALYZED:") + logger.info("-" * 40) + logger.debug(f'"{system_prompt}"') + logger.info("-" * 40) + + logger.info(f"EVALUATION SUMMARY:") + logger.info("-" * 40) + if eval_batch.scores: + avg_score = sum(eval_batch.scores) / len(eval_batch.scores) + min_score = min(eval_batch.scores) + max_score = max(eval_batch.scores) + logger.info(f" Average Score: {avg_score:.4f}") + logger.info(f" Min Score: {min_score:.4f}") + logger.info(f" Max Score: {max_score:.4f}") + logger.info(f" Total Samples: {len(eval_batch.scores)}") + + logger.info(f"COMPONENTS TO UPDATE:") + logger.info("-" * 40) + for i, component in enumerate(components_to_update, 1): + logger.info(f" {i}. {component}") + + if eval_batch.trajectories: + logger.debug(f"DETAILED ANALYSIS:") + logger.debug("-" * 40) + for i, trace in enumerate(eval_batch.trajectories[:3], 1): # Show first 3 samples + evaluation_results = trace['evaluation_results'] + composite_score = evaluation_results.get("composite_score", 0.0) + + logger.debug(f" Sample {i} (Score: {composite_score:.4f}):") + + # Show input data (truncated) + input_text = trace['input_text'][:100] + "..." if len(trace['input_text']) > 100 else trace['input_text'] + logger.debug(f" Input: \"{input_text}\"") + + # Show predicted output (truncated) + predicted_output = trace['llm_output_json'][:100] + "..." 
if len(trace['llm_output_json']) > 100 else trace['llm_output_json'] + logger.debug(f" Output: \"{predicted_output}\"") + + # Show detailed scores + logger.debug(f" Detailed Scores:") + for metric, score in evaluation_results.items(): + if metric != "composite_score": + logger.debug(f" {metric.replace('_', ' ').title()}: {score:.4f}") + + # Show generated feedback + feedback = self._generate_feedback(evaluation_results) + logger.debug(f" Feedback: \"{feedback}\"") + + if len(eval_batch.trajectories) > 3: + logger.debug(f" ... and {len(eval_batch.trajectories) - 3} more samples") + + logger.info("="*80) diff --git a/src/gepa_optimizer/core/optimizer.py b/src/gepa_optimizer/core/optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..30271dce298b80e1dc6b4478bb69aa8e2766b65c --- /dev/null +++ b/src/gepa_optimizer/core/optimizer.py @@ -0,0 +1,1279 @@ +""" +Main GepaOptimizer class - the heart of the optimization system +""" + +import time +import logging +from typing import Any, Dict, List, Optional, Union +import asyncio +import io +import sys +from contextlib import redirect_stdout, redirect_stderr + +import gepa +from ..utils.api_keys import APIKeyManager +from .result import ResultProcessor +from ..data.converters import UniversalConverter +from ..models.result import OptimizationResult, OptimizedResult +from ..models.config import OptimizationConfig, ModelConfig +from ..utils.helpers import sanitize_prompt +from ..utils.exceptions import GepaDependencyError, InvalidInputError, DatasetError, GepaOptimizerError + +logger = logging.getLogger(__name__) + +class GepaOptimizer: + """ + Main class for prompt optimization using GEPA + + This is the primary interface that users interact with. + Provides both simple and advanced optimization capabilities. 
+ """ + + def __init__(self, config: Optional[OptimizationConfig] = None, + adapter_type: str = "universal", + custom_adapter: Optional[Any] = None, + llm_model_name: Optional[str] = None, + metric_weights: Optional[Dict[str, float]] = None, + **kwargs): + """ + Initialize the optimizer + + Args: + config: Optimization configuration (required) + adapter_type: Type of adapter to use ("universal" only - fully configurable) + custom_adapter: Custom adapter instance (overrides adapter_type) + llm_model_name: [Deprecated] Use config.model instead. Will be removed in future versions. + metric_weights: [Deprecated] Not used - evaluator handles metrics. Will be removed in future versions. + **kwargs: Additional parameters for universal adapter (llm_client, evaluator, etc.) + + Raises: + ValueError: If required configuration is missing + GepaDependencyError: If GEPA library is not available + """ + if config is None: + raise ValueError("config parameter is required. Use OptimizationConfig to configure the optimizer.") + + # Initialize logger first + self.logger = logging.getLogger(__name__) + + self.config = config + self.converter = UniversalConverter(data_split_config=config.data_split) + self.api_manager = APIKeyManager() + self.result_processor = ResultProcessor() + + # Initialize adapter based on configuration + if custom_adapter: + # User provided custom adapter + from .base_adapter import BaseGepaAdapter + if not isinstance(custom_adapter, BaseGepaAdapter): + raise TypeError("custom_adapter must be an instance of BaseGepaAdapter") + self.adapter = custom_adapter + self.logger.info("Using user-provided custom adapter") + elif adapter_type == "universal": + # Universal adapter requires user to provide components + llm_client = kwargs.get('llm_client') + evaluator = kwargs.get('evaluator') + + if not llm_client or not evaluator: + raise ValueError( + "llm_client and evaluator are required for universal adapter. 
" + "Example: GepaOptimizer(config=config, adapter_type='universal', " + "llm_client=llm_client, evaluator=evaluator)" + ) + + from .universal_adapter import UniversalGepaAdapter + self.adapter = UniversalGepaAdapter( + llm_client=llm_client, + evaluator=evaluator, + data_converter=kwargs.get('data_converter') + ) + self.logger.info("Using universal adapter") + else: + raise ValueError( + f"Unknown adapter_type: {adapter_type}. " + f"Only 'universal' is supported. " + f"Provide llm_client and evaluator when using universal adapter." + ) + + # Keep backward compatibility + self.custom_adapter = self.adapter + + # Log model configuration + model_info = self.adapter.get_performance_stats() + self.logger.info(f"Initialized adapter: {model_info}") + + # Set up logging + logging.basicConfig( + level=logging.INFO, + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' + ) + + # Validate GEPA availability + if gepa is None: + raise GepaDependencyError("GEPA library is not available. Please install it with: pip install gepa") + + async def train(self, + seed_prompt: str, + dataset: Union[List[Any], str, Dict, Any], + **kwargs) -> OptimizedResult: + """ + Main training method for prompt optimization + + Args: + seed_prompt: Initial prompt to optimize + dataset: Training data in any format + **kwargs: Additional parameters that can override config + + Returns: + OptimizedResult: Optimization result with improved prompt + + Raises: + InvalidInputError: For invalid input parameters + DatasetError: For issues with dataset processing + GepaOptimizerError: For optimization failures + """ + start_time = time.time() + session_id = f"opt_{int(start_time)}_{id(self)}" + + try: + self.logger.info(f"Starting optimization session: {session_id}") + self.logger.info(f"Using model: {self.config.model.model_name} (provider: {self.config.model.provider})") + + # #region agent log + import json as _json_debug + _debug_log_path = 
"/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log" + with open(_debug_log_path, "a") as _f: + _f.write(_json_debug.dumps({"hypothesisId": "E", "location": "optimizer.py:train_start", "message": "Optimization train() started", "data": {"session_id": session_id, "max_iterations": self.config.max_iterations}, "timestamp": int(time.time() * 1000), "sessionId": "debug-session"}) + "\n") + # #endregion + + # ๐Ÿ”ฅ FIX E: Reset Pareto logger at start of each optimization run + from ..utils.pareto_logger import reset_pareto_logger + reset_pareto_logger() + self.logger.info("โœ… Reset Pareto logger for new optimization run") + + # Update config with any overrides from kwargs + self._update_config_from_kwargs(kwargs) + + # Step 1: Validate inputs + self._validate_inputs(seed_prompt) + + # Step 2: Convert dataset to GEPA format with 3-way split + # ๐Ÿ”ฅ FIX: Support pre-split datasets (user-provided train/val/test) + if isinstance(dataset, dict) and all(k in dataset for k in ['train', 'val', 'test']): + # User provided pre-split dataset - use it directly + self.logger.info("โœ… Detected pre-split dataset - using user's split (no re-splitting)") + trainset_raw = dataset.get('train', []) + valset_raw = dataset.get('val', []) + testset_raw = dataset.get('test', []) + + # Still need to standardize the format (convert to GEPA format) + trainset = self.converter._standardize(trainset_raw) + valset = self.converter._standardize(valset_raw) + testset = self.converter._standardize(testset_raw) if testset_raw else [] + + self.logger.info( + f"Using pre-split dataset: {len(trainset)} train (Dfeedback), " + f"{len(valset)} val (Dpareto), {len(testset)} test (held-out)" + ) + else: + # Standard path: convert and split automatically + self.logger.info("Converting dataset to GEPA format with 3-way split...") + trainset, valset, testset = self.converter.convert( + dataset, + split_config=self.config.data_split + ) + + # Log split with adaptive strategy info + split_strategy = 
self.config.data_split.small_dataset_strategy + strategy_note = "" + if split_strategy == 'adaptive': + total_size = len(trainset) + len(valset) + len(testset) + train_ratio, val_ratio, test_ratio = self.config.data_split.get_adaptive_ratios(total_size) + strategy_note = f" (adaptive: {train_ratio*100:.0f}%/{val_ratio*100:.0f}%/{test_ratio*100:.0f}% ratios)" + self.logger.info( + f"Dataset split{strategy_note}: {len(trainset)} train (Dfeedback), " + f"{len(valset)} val (Dpareto), {len(testset)} test (held-out)" + ) + + if not trainset: + raise DatasetError("Dataset appears to be empty after conversion") + + # Step 3: Create seed candidate + seed_candidate = self._create_seed_candidate(seed_prompt) + + # ๐Ÿ”ฅ CRITICAL: Set valset info in adapter BEFORE baseline evaluation + # This ensures adapter correctly detects 'dpareto' dataset type + # Use direct assignment (don't rely on hasattr) to ensure attributes are set + try: + self.adapter._valset_size = len(valset) if valset else 0 + self.logger.info(f"โœ… Set valset_size in adapter: {len(valset) if valset else 0} for Dpareto detection") + except AttributeError: + self.logger.warning("โš ๏ธ Could not set _valset_size in adapter - attribute not supported") + + try: + self.adapter._valset = valset + self.logger.info(f"โœ… Stored valset in adapter ({len(valset) if valset else 0} samples)") + except AttributeError: + self.logger.warning("โš ๏ธ Could not set _valset in adapter - attribute not supported") + + # Step 3.5: Calculate baseline score on VALIDATION set (not test set) + # This ensures fair comparison since optimization uses validation set for Pareto selection + baseline_val_score = None + if valset: + self.logger.info("๐Ÿ“Š Evaluating seed prompt on validation set for baseline...") + # Set baseline flag so adapter knows this is baseline, not optimization + # Use direct assignment to ensure the flag is set + try: + self.adapter._is_baseline_evaluation = True + self.logger.info("โœ… Set baseline evaluation flag in 
adapter") + except AttributeError: + self.logger.warning("โš ๏ธ Could not set _is_baseline_evaluation in adapter") + + try: + # Evaluate on validation set (same as what GEPA will use for Pareto selection) + eval_result = self.adapter.evaluate( + batch=valset, + candidate=seed_candidate, + capture_traces=False + ) + baseline_val_score = sum(eval_result.scores) / len(eval_result.scores) if eval_result.scores else 0.0 + self.logger.info(f"๐Ÿ“Š Baseline validation score: {baseline_val_score:.4f} (on {len(valset)} samples)") + + # Store baseline in adapter for later use + if hasattr(self.adapter, '_baseline_score'): + self.adapter._baseline_score = baseline_val_score + + # ๐Ÿ”ฅ CRITICAL FIX: Also set baseline in Pareto logger + # This ensures candidates can be properly evaluated against baseline + from ..utils.pareto_logger import get_pareto_logger + pareto_log = get_pareto_logger() + pareto_log.set_baseline(baseline_val_score) + self.logger.info(f"โœ… Baseline set in Pareto logger: {baseline_val_score:.4f}") + + except Exception as e: + self.logger.warning(f"Baseline evaluation failed: {e}") + import traceback + self.logger.debug(f"Baseline evaluation error: {traceback.format_exc()}") + finally: + try: + self.adapter._is_baseline_evaluation = False + self.logger.debug("โœ… Reset baseline evaluation flag - optimization can begin") + except AttributeError: + pass # Ignore if attribute not supported + + # Step 4: Run GEPA optimization + self.logger.info("Starting GEPA optimization...") + gepa_result, actual_iterations = await self._run_gepa_optimization( + adapter=self.adapter, + seed_candidate=seed_candidate, + trainset=trainset, + valset=valset, + **kwargs + ) + + # Step 5: Extract best candidate + best_candidate = self._extract_best_candidate(gepa_result) + + # ๐Ÿ”ฅ CRITICAL: Extract optimized prompt from best_candidate + # This is the actual optimized prompt that GEPA found + self.logger.info(f"\n{'โ•'*80}") + self.logger.info(f"๐Ÿ“ EXTRACTING OPTIMIZED PROMPT FROM 
GEPA RESULT") + self.logger.info(f"{'โ•'*80}") + self.logger.info(f"best_candidate keys: {list(best_candidate.keys()) if isinstance(best_candidate, dict) else 'N/A'}") + + optimized_prompt = best_candidate.get('system_prompt', seed_prompt) + if not optimized_prompt or optimized_prompt.strip() == '': + # Fallback: try other keys or use seed prompt + optimized_prompt = best_candidate.get('prompt', best_candidate.get('text', seed_prompt)) + + # Get fitness score if available + best_fitness = best_candidate.get('fitness') or self.adapter.get_best_score() if hasattr(self.adapter, 'get_best_score') else None + candidate_source = best_candidate.get('source', 'unknown') + + self.logger.info(f"\nโœ… EXTRACTED OPTIMIZED PROMPT:") + self.logger.info(f" Source: {candidate_source}") + if best_fitness is not None: + self.logger.info(f" Fitness: f={best_fitness:.4f}") + self.logger.info(f" Length: {len(optimized_prompt)} characters") + self.logger.info(f" Words: {len(optimized_prompt.split())} words") + self.logger.info(f"\n๐Ÿ“ FULL OPTIMIZED PROMPT TEXT:") + self.logger.info(f"{'โ”€'*80}") + self.logger.info(optimized_prompt) + self.logger.info(f"{'โ”€'*80}") + + if optimized_prompt != seed_prompt: + self.logger.info(f"\nโœ… SUCCESS: Prompt WAS OPTIMIZED!") + self.logger.info(f" Seed length: {len(seed_prompt)} chars") + self.logger.info(f" Optimized length: {len(optimized_prompt)} chars") + self.logger.info(f" Difference: {len(optimized_prompt) - len(seed_prompt):+d} chars") + if best_fitness is not None: + baseline_fitness = 0.5 # Default baseline, could be improved + improvement = best_fitness - baseline_fitness + improvement_pct = (improvement / baseline_fitness * 100) if baseline_fitness > 0 else 0 + self.logger.info(f" Fitness: f={best_fitness:.4f} (improvement: {improvement:+.4f} ({improvement_pct:+.1f}%))") + else: + self.logger.warning(f"\nโš ๏ธ WARNING: Optimized prompt is IDENTICAL to seed prompt") + self.logger.warning(f" This means GEPA didn't modify the prompt 
during optimization") + if best_fitness is not None: + self.logger.warning(f" Best fitness found: f={best_fitness:.4f}") + self.logger.warning(f" ๐Ÿ’ก Check if LLEGO best candidate is being properly extracted") + + self.logger.info(f"{'โ•'*80}\n") + + # Step 5.5: Calculate improvement metrics (validation vs validation) + optimized_test_score = None + improvement_data = {} + + # ๐Ÿ”ฅ FIX: Calculate improvement based on VALIDATION scores (fair comparison) + # Compare optimized VALIDATION score vs validation baseline (both on Dpareto) + # This ensures fair comparison - both evaluated on the same validation set + optimized_val_score = best_fitness # Best candidate's fitness is from validation set (Dpareto) + + if baseline_val_score is not None and optimized_val_score is not None: + absolute_improvement = optimized_val_score - baseline_val_score + relative_improvement = ( + (absolute_improvement / baseline_val_score * 100) + if baseline_val_score > 0 else 0 + ) + + improvement_data = { + 'baseline_val_score': baseline_val_score, + 'optimized_val_score': optimized_val_score, + 'absolute_improvement': absolute_improvement, + 'relative_improvement_percent': relative_improvement + } + + self.logger.info( + f"๐Ÿ“ˆ Validation improvement: {relative_improvement:+.2f}% " + f"(baseline val: {baseline_val_score:.4f} โ†’ optimized val: {optimized_val_score:.4f})" + ) + + # Step 5.6: Evaluate optimized prompt on test set (if available) for final reporting + if testset and self.config.evaluate_on_test: + self.logger.info("๐Ÿ“Š Evaluating optimized prompt on test set...") + + # ๐Ÿ”ฅ CRITICAL FIX: Clear LLEGO candidate queue before test evaluation + # This prevents the LLEGO wrapper from intercepting test evaluation calls + # and returning wrong candidates instead of actually running the optimized prompt + from ..llms.llego_enhanced_llm import LLEGOEnhancedLLMClient + if hasattr(self.adapter, 'llm_client') and isinstance(self.adapter.llm_client, LLEGOEnhancedLLMClient): + if 
hasattr(self.adapter.llm_client, '_adapter_generated_candidates'): + self.adapter.llm_client._adapter_generated_candidates = [] + self.logger.info("โœ… Cleared LLEGO candidate queue for clean test evaluation") + if hasattr(self.adapter.llm_client, '_candidate_queue'): + self.adapter.llm_client._candidate_queue = [] + self.logger.info("โœ… Cleared LLEGO hybrid candidate queue for clean test evaluation") + + # Evaluate on test set for final reporting (but improvement is based on validation) + try: + optimized_test_score = self._evaluate_candidate_on_testset( + best_candidate, + testset + ) + self.logger.info(f"๐Ÿ“Š Optimized test score: {optimized_test_score:.4f}") + + # Add test score to improvement_data for reference (but improvement is based on validation) + improvement_data['optimized_test_score'] = optimized_test_score + + if baseline_val_score is not None: + test_vs_baseline = ( + ((optimized_test_score - baseline_val_score) / baseline_val_score * 100) + if baseline_val_score > 0 else 0 + ) + self.logger.info( + f"๐Ÿ“Š Test set vs validation baseline: {test_vs_baseline:+.2f}% " + f"(baseline val: {baseline_val_score:.4f} โ†’ optimized test: {optimized_test_score:.4f})" + ) + except Exception as e: + self.logger.warning(f"Test evaluation failed: {e}") + + # Step 6: Process results + optimization_time = time.time() - start_time + + processed_result = self.result_processor.process_full_result( + result=gepa_result, + original_prompt=seed_prompt, + optimization_time=optimization_time, + actual_iterations=actual_iterations, + test_metrics=improvement_data # Add test metrics + ) + + # Merge improvement data + final_improvement_data = {**processed_result.get('improvement_data', {}), **improvement_data} + + # Step 7: Create result objects + # ๐Ÿ”ฅ CRITICAL: Use extracted optimized_prompt instead of processed_result + result = OptimizedResult( + original_prompt=seed_prompt, + optimized_prompt=optimized_prompt, # Use extracted prompt, not processed_result! 
                improvement_data=final_improvement_data,
                optimization_time=optimization_time,
                dataset_size=len(trainset) + len(valset) + len(testset),
                total_iterations=processed_result.get('total_iterations', 0),
                status=processed_result.get('status', 'completed'),
                error_message=processed_result.get('error_message'),
                # Nested detailed result carries the same data plus session_id.
                detailed_result=OptimizationResult(
                    session_id=session_id,
                    original_prompt=seed_prompt,
                    optimized_prompt=optimized_prompt,  # Use extracted prompt!
                    improvement_data=final_improvement_data,
                    optimization_time=optimization_time,
                    dataset_size=len(trainset) + len(valset) + len(testset),
                    total_iterations=processed_result.get('total_iterations', 0),
                    status=processed_result.get('status', 'completed'),
                    error_message=processed_result.get('error_message')
                )
            )

            self.logger.info(f"✅ Optimization completed in {optimization_time:.2f}s")
            return result

        # NOTE(review): any exception anywhere in train() is converted into a
        # 'failed' OptimizedResult instead of propagating to the caller.
        except Exception as e:
            optimization_time = time.time() - start_time
            error_msg = f"Optimization failed: {str(e)}"
            self.logger.error(error_msg)

            # Return failed result
            return OptimizedResult(
                original_prompt=seed_prompt,
                optimized_prompt=seed_prompt,  # Return original on failure
                improvement_data={'error': error_msg},
                optimization_time=optimization_time,
                dataset_size=0,
                total_iterations=0,
                status='failed',
                error_message=error_msg
            )

    def _update_config_from_kwargs(self, kwargs: Dict[str, Any]) -> None:
        """Update configuration with runtime overrides from kwargs."""
        updated_params = []

        # Only attributes that already exist on the config are overridable;
        # anything else is ignored with a warning.
        for key, value in kwargs.items():
            if hasattr(self.config, key):
                setattr(self.config, key, value)
                updated_params.append(f"{key}={value}")
            else:
                self.logger.warning(f"Unknown parameter '{key}' ignored")

        if updated_params:
            self.logger.info(f"Updated config parameters: {', '.join(updated_params)}")

    def _validate_inputs(self, seed_prompt: str) -> None:
        """
        Validate input parameters for optimization

        Args:
            seed_prompt: The seed prompt to validate

Raises: + InvalidInputError: If validation fails + """ + if not seed_prompt or not isinstance(seed_prompt, str): + raise InvalidInputError("Seed prompt must be a non-empty string") + + if len(seed_prompt.strip()) < 10: + raise InvalidInputError("Seed prompt is too short (minimum 10 characters)") + + # Validate model configuration + model_config = self.config.model + if not hasattr(model_config, 'model_name') or not model_config.model_name: + raise InvalidInputError("Model name is required") + + reflection_config = self.config.reflection_model + if not hasattr(reflection_config, 'model_name') or not reflection_config.model_name: + raise InvalidInputError("Reflection model name is required") + + def _clean_reflection_prompt(self, prompt: str, max_length: int = 50000) -> str: + """ + Clean reflection prompt by removing base64 images and truncating if too long. + + ๐Ÿ”ฅ CRITICAL: GEPA's reflective dataset includes base64 images which create + massive prompts (7MB+) that exceed token limits. This function: + 1. Strips all base64 image data + 2. Removes excessive detailed_scores entries + 3. Truncates to reasonable size + 4. Preserves essential feedback information + + Args: + prompt: Original prompt from GEPA (may contain base64) + max_length: Maximum length after cleaning (default: 50K chars) + + Returns: + Cleaned prompt without base64, within size limits + """ + import re + + # Step 1: Remove base64 image strings (typically very long alphanumeric strings) + # Base64 images are usually 50K+ characters of A-Za-z0-9+/= pattern + # Look for very long base64-like sequences + base64_pattern = r'[A-Za-z0-9+/=]{5000,}' # Sequences of 5000+ base64 chars + cleaned = re.sub(base64_pattern, '[IMAGE_DATA_REMOVED]', prompt) + + # Step 2: Remove detailed_scores sections that might contain base64 references + # These are usually in markdown format: "### detailed_scores\n...base64..." 
+ detailed_scores_pattern = r'### detailed_scores[^\n]*\n[^#]*(?:image_base64|base64)[^\n]*(?:\n[^#]*)*' + cleaned = re.sub(detailed_scores_pattern, '### detailed_scores: [REMOVED_FOR_BREVITY]', cleaned, flags=re.IGNORECASE | re.MULTILINE) + + # Step 3: Remove any remaining image_base64 references + cleaned = re.sub(r'image_base64[^\n]*', 'image_base64: [REMOVED]', cleaned, flags=re.IGNORECASE) + cleaned = re.sub(r'"[A-Za-z0-9+/=]{10000,}"', '[LARGE_DATA_STRING_REMOVED]', cleaned) # Very long strings likely base64 + + # Step 4: Truncate if still too long (keep the beginning which usually has the most important info) + if len(cleaned) > max_length: + # Keep first part (usually contains prompt and key feedback) + # Add truncation notice + truncated_size = len(cleaned) - max_length + cleaned = cleaned[:max_length] + f"\n\n[TRUNCATED {truncated_size} characters of detailed evaluation data]" + self.logger.warning(f"โš ๏ธ Prompt truncated: {len(prompt)} โ†’ {len(cleaned)} chars") + + return cleaned + + def _validate_models(self, task_lm, reflection_lm): + """ + Validate if specified models are supported. + + Note: No hardcoded restrictions - the API provider will validate model existence. + This method is kept for potential future validation logic but doesn't restrict users. 
+ """ + # No hardcoded model restrictions - users can specify any model + # The API provider will handle validation and return errors if model doesn't exist + self.logger.debug(f"Using task model: {task_lm}, reflection model: {reflection_lm}") + + def _create_seed_candidate(self, seed_prompt: str) -> Dict[str, str]: + """Create a seed candidate from the input prompt.""" + sanitized_prompt = sanitize_prompt(seed_prompt) + return {'system_prompt': sanitized_prompt} + + async def _run_gepa_optimization(self, adapter, seed_candidate: Any, trainset: List[Any], valset: List[Any], **kwargs) -> tuple: # Return tuple + """ + Run GEPA optimization with the given adapter and data + + Args: + adapter: Custom adapter for GEPA + seed_candidate: Initial prompt candidate + trainset: Training dataset + valset: Validation dataset + **kwargs: Additional optimization parameters that can override config + + Returns: + Dict with optimization results + + Raises: + GepaOptimizerError: If optimization fails + + Note: + The following parameters are required in the config: + - max_metric_calls: Maximum number of metric evaluations + - batch_size: Batch size for evaluation + - max_iterations: Maximum number of optimization iterations + """ + try: + # Get optimization parameters from config (these are required fields) + max_metric_calls = self.config.max_metric_calls + batch_size = self.config.batch_size + max_iterations = self.config.max_iterations + + # Create reflection model client + from ..llms.vision_llm import VisionLLMClient + base_reflection_lm_client = VisionLLMClient( + provider=self.config.reflection_model.provider, + model_name=self.config.reflection_model.model_name, + api_key=self.config.reflection_model.api_key, + base_url=self.config.reflection_model.base_url, + temperature=self.config.reflection_model.temperature, + max_tokens=self.config.reflection_model.max_tokens, + top_p=self.config.reflection_model.top_p, + 
frequency_penalty=self.config.reflection_model.frequency_penalty, + presence_penalty=self.config.reflection_model.presence_penalty + ) + # reflection_lm_client will be set below (may be wrapped with LLEGO) + reflection_lm_client = base_reflection_lm_client + + # ๐Ÿ†• LLEGO Integration: Create enhanced reflection callable + if self.config.use_llego_operators: + self.logger.info("๐Ÿงฌ LLEGO genetic operators ENABLED") + self.logger.info(f" ฮฑ={self.config.alpha}, ฯ„={self.config.tau}, ฮฝ={self.config.nu}") + self.logger.info(f" Crossover offspring: {self.config.n_crossover}, Mutation offspring: {self.config.n_mutation}") + + # Import LLEGO operators + from ..operators.llego_operators import LLEGOIntegrationLayer, PromptCandidate + + # Initialize LLEGO integration layer + llego = LLEGOIntegrationLayer( + alpha=self.config.alpha, + tau=self.config.tau, + nu=self.config.nu, + population_size=self.config.population_size, + n_crossover=self.config.n_crossover, + n_mutation=self.config.n_mutation + ) + + # Initialize with seed prompt + llego.initialize_population( + seed_prompt=seed_candidate.get('system_prompt', ''), + initial_fitness=0.5 + ) + + # ๐Ÿ”ฅ HYBRID MODE FIX: Wrap reflection_lm_client with LLEGO for hybrid mode + # This ensures reflection calls go through LLEGO wrapper for candidate generation + if self.config.enable_gepa_reflection_with_llego: + self.logger.info("๐Ÿ”ฅ HYBRID MODE: Wrapping reflection_lm_client with LLEGO") + from ..llms.llego_enhanced_llm import LLEGOEnhancedLLMClient + + # Wrap reflection_lm_client with LLEGO so hybrid generation is triggered + reflection_lm_client = LLEGOEnhancedLLMClient( + base_llm=base_reflection_lm_client, + llego_layer=llego, + config=self.config, # Pass config for hybrid mode! 
+ verbose=True + ) + self.logger.info("โœ… reflection_lm_client wrapped with LLEGO (hybrid mode enabled)") + + # ๐Ÿ”ฅ CRITICAL: Store reflection_lm_client reference in adapter so it can set context + # This allows make_reflective_dataset to set reflection context on BOTH clients + if hasattr(adapter, 'reflection_lm_client'): + adapter.reflection_lm_client = reflection_lm_client + self.logger.info("โœ… Stored reflection_lm_client reference in adapter") + else: + # Add reflection_lm_client attribute to adapter + adapter.reflection_lm_client = reflection_lm_client + self.logger.info("โœ… Added reflection_lm_client attribute to adapter") + + # ๐Ÿ”ฅ NEW: Also store config and reflection_lm_client for adapter-level generation + if hasattr(adapter, '_config'): + adapter._config = self.config + self.logger.info("โœ… Stored config in adapter for hybrid mode") + else: + adapter._config = self.config + self.logger.info("โœ… Added _config attribute to adapter") + + if hasattr(adapter, '_reflection_lm_client'): + adapter._reflection_lm_client = reflection_lm_client + self.logger.info("โœ… Stored _reflection_lm_client in adapter for hybrid mode") + else: + adapter._reflection_lm_client = reflection_lm_client + self.logger.info("โœ… Added _reflection_lm_client attribute to adapter") + + # ๐Ÿ”ฅ CRITICAL FIX: Ensure LLEGO layer is stored in adapter + # Without this, adapter.llego will be None and population updates are skipped! 
+ if hasattr(adapter, 'llego'): + if adapter.llego is None: + adapter.llego = llego + self.logger.info("โœ… CRITICAL: Set LLEGO layer in adapter (was None)") + else: + self.logger.debug("โœ… LLEGO layer already set in adapter") + else: + # Add llego attribute if it doesn't exist + adapter.llego = llego + self.logger.info("โœ… CRITICAL: Added LLEGO layer to adapter") + + # ๐Ÿ”ฅ CRITICAL: Always set _reflection_lm_client in adapter (even without hybrid mode) + # This is required for propose_new_texts() to work + if not hasattr(adapter, '_reflection_lm_client') or adapter._reflection_lm_client is None: + adapter._reflection_lm_client = reflection_lm_client + self.logger.info("โœ… Set _reflection_lm_client in adapter (required for propose_new_texts)") + + # ๐Ÿ”ฅ HYBRID MODE FIX: Inject config into LLEGO wrapper for hybrid mode + # The adapter already has LLEGO wrapper, we just need to update its config + if self.config.enable_gepa_reflection_with_llego: + # HYBRID MODE: Update the LLEGO wrapper's config + self.logger.info("๐Ÿ”ฅ HYBRID MODE: Enabling hybrid candidate generation in LLEGO wrapper") + + # Get the LLM client (may already be wrapped) + llm_client = self.adapter.llm_client + from ..llms.llego_enhanced_llm import LLEGOEnhancedLLMClient + + if isinstance(llm_client, LLEGOEnhancedLLMClient): + # Already wrapped, just update config + llm_client.config = self.config + self.logger.info("โœ… Updated LLEGO wrapper with hybrid mode config") + else: + # Not wrapped yet, wrap it now with config + llego_wrapped_llm = LLEGOEnhancedLLMClient( + base_llm=llm_client, + llego_layer=llego, + config=self.config, # โ† Pass config for hybrid mode! 
+ verbose=True + ) + # Update adapter's LLM client + self.adapter.llm_client = llego_wrapped_llm + self.logger.info("โœ… Wrapped LLM client with LLEGO (hybrid mode enabled)") + + adapter = self.adapter + else: + # LLEGO-ONLY MODE: Wrap adapter with LLEGO layer (no hybrid) + self.logger.info("๐Ÿงฌ LLEGO-ONLY MODE: Recreating adapter with LLEGO integration...") + if hasattr(self, 'adapter') and self.adapter: + from .universal_adapter import UniversalGepaAdapter + + # Get original LLM client and evaluator from current adapter + original_llm = self.adapter.llm_client + # If it's already wrapped, unwrap it + if hasattr(original_llm, 'base_llm'): + original_llm = original_llm.base_llm + + evaluator = self.adapter.evaluator + data_converter = self.adapter.data_converter + + # Recreate adapter with LLEGO (no hybrid mode config) + from ..llms.llego_enhanced_llm import LLEGOEnhancedLLMClient + llego_wrapped_llm = LLEGOEnhancedLLMClient( + base_llm=original_llm, + llego_layer=llego, + config=None, # No hybrid mode + verbose=True + ) + + adapter = UniversalGepaAdapter( + llm_client=llego_wrapped_llm, + evaluator=evaluator, + data_converter=data_converter, + llego_layer=llego + ) + self.logger.info("โœ… Adapter recreated with LLEGO-enhanced LLM client") + else: + adapter = self.adapter + + # Create LLEGO-enhanced reflection callable + # When hybrid mode is enabled, reflection_lm_client is wrapped with LLEGO + # The wrapper will automatically generate hybrid candidates when called + def reflection_lm_callable(prompt: str) -> str: + """ + Reflection callable that delegates to LLEGO-wrapped client. + In hybrid mode, the wrapper generates candidates from both GEPA and LLEGO. + + ๐Ÿ”ฅ CRITICAL: Clean the prompt to remove base64 images and truncate if too long. 
+ """ + # ๐Ÿ”ฅ FIX: Clean prompt to remove base64 images and truncate excessive data + cleaned_prompt = self._clean_reflection_prompt(prompt) + + self.logger.info(f"\n{'๐Ÿ”ฅ'*40}") + self.logger.info(f"๐Ÿ”ฅ reflection_lm_callable CALLED (delegating to LLEGO wrapper)") + self.logger.info(f"๐Ÿ”ฅ Original prompt length: {len(prompt)} chars") + self.logger.info(f"๐Ÿ”ฅ Cleaned prompt length: {len(cleaned_prompt)} chars") + self.logger.info(f"๐Ÿ”ฅ Truncation: {len(prompt) - len(cleaned_prompt)} chars removed") + self.logger.info(f"๐Ÿ”ฅ First 200 chars (cleaned): {cleaned_prompt[:200]}...") + self.logger.info(f"{'๐Ÿ”ฅ'*40}\n") + + try: + # ๐Ÿ”ฅ CRITICAL: Set reflection context BEFORE generating + # This signals to the LLEGO wrapper that we're in reflection mode + if isinstance(reflection_lm_client, LLEGOEnhancedLLMClient): + reflection_lm_client.set_reflection_context( + current_prompt=cleaned_prompt, # Use cleaned prompt + feedback=None, + in_reflection=True # Enable reflection mode + ) + self.logger.info("โœ… Reflection context set on reflection_lm_client") + + # ๐Ÿ”ฅ HYBRID MODE: If reflection_lm_client is wrapped with LLEGO, + # calling generate() will trigger hybrid candidate generation + # The wrapper handles queuing and returns candidates one by one + + # ๐Ÿ”ฅ CRITICAL: System prompt must instruct LLM to generate improved prompt, not feedback + optimization_system_prompt = """You are an expert prompt engineer specializing in iterative prompt optimization. + +Your task: Given the CURRENT PROMPT and its EVALUATION FEEDBACK, generate an IMPROVED version of the prompt that addresses all identified issues. + +Core Requirements: +1. OUTPUT ONLY the improved prompt text (no explanations, no analysis, no meta-commentary) +2. START directly with the prompt (e.g., "You are a mobile GUI agent..." or similar task-appropriate opening) +3. PRESERVE the core task domain and output format requirements +4. INTEGRATE improvements from feedback naturally into the prompt structure +5. 
MAINTAIN clarity, specificity, and actionability + +Quality Standards: +- Be specific and concrete (avoid vague instructions) +- Use clear, imperative language for task instructions +- Include edge case handling if feedback identifies confusion +- Ensure the prompt is self-contained and unambiguous + +DO NOT include: +- Analysis of what went wrong +- Explanations of your changes +- Meta-text like "Here's an improved version..." or "Based on feedback..." +- Recommendations or suggestions (those are already in the feedback) + +Output the improved prompt directly and only the prompt.""" + + result = reflection_lm_client.generate( + system_prompt=optimization_system_prompt, + user_prompt=cleaned_prompt, # Use cleaned prompt (no base64, truncated) + image_base64="" + ) + + # Extract content from result + if isinstance(result, dict): + candidate = result.get("content", str(result)) + source = result.get("source", "unknown") + self.logger.info(f"โœ… Candidate from {source} (FULL TEXT):") + self.logger.info(f" '{candidate}'") + return candidate + else: + candidate = str(result) + self.logger.info(f"โœ… Candidate generated (FULL TEXT):") + self.logger.info(f" '{candidate}'") + return candidate + + except Exception as e: + self.logger.error(f"โŒ Error in reflection_lm_callable: {e}") + import traceback + self.logger.error(traceback.format_exc()) + # Fallback: return prompt as-is + return prompt + + # Set up reflection context for LLEGO wrapper + if self.config.enable_gepa_reflection_with_llego and isinstance(reflection_lm_client, LLEGOEnhancedLLMClient): + # Store current prompt in reflection context for LLEGO operators + reflection_lm_client.set_reflection_context( + current_prompt=seed_candidate.get('system_prompt', ''), + feedback=None, + in_reflection=True + ) + + else: + # Standard GEPA reflection (no LLEGO) + adapter = self.adapter # Use the original adapter + + # ๐Ÿ”ฅ CRITICAL: Always set _reflection_lm_client in adapter (even without LLEGO) + # This is required for 
propose_new_texts() to work + if not hasattr(adapter, '_reflection_lm_client') or adapter._reflection_lm_client is None: + adapter._reflection_lm_client = reflection_lm_client + self.logger.info("โœ… Set _reflection_lm_client in adapter (required for propose_new_texts)") + + # Define standard reflection callable (no LLEGO enhancement) + def reflection_lm_callable(prompt: str) -> str: + """Standard callable wrapper for reflection model that GEPA expects""" + try: + # ๐Ÿ”ฅ CRITICAL: System prompt must instruct LLM to generate improved prompt, not feedback + optimization_system_prompt = """You are an expert prompt engineer specializing in iterative prompt optimization. + +Your task: Given the CURRENT PROMPT and its EVALUATION FEEDBACK, generate an IMPROVED version of the prompt that addresses all identified issues. + +Core Requirements: +1. OUTPUT ONLY the improved prompt text (no explanations, no analysis, no meta-commentary) +2. START directly with the prompt (e.g., "You are a mobile GUI agent..." or similar task-appropriate opening) +3. PRESERVE the core task domain and output format requirements +4. INTEGRATE improvements from feedback naturally into the prompt structure +5. MAINTAIN clarity, specificity, and actionability + +Quality Standards: +- Be specific and concrete (avoid vague instructions) +- Use clear, imperative language for task instructions +- Include edge case handling if feedback identifies confusion +- Ensure the prompt is self-contained and unambiguous + +DO NOT include: +- Analysis of what went wrong +- Explanations of your changes +- Meta-text like "Here's an improved version..." or "Based on feedback..." 
+- Recommendations or suggestions (those are already in the feedback) + +Output the improved prompt directly and only the prompt.""" + + # For reflection, we only need text generation (no images) + result = reflection_lm_client.generate( + system_prompt=optimization_system_prompt, + user_prompt=prompt, + image_base64="" # No image for reflection + ) + + # Extract string content from the result dictionary + if isinstance(result, dict): + return result.get("content", str(result)) + else: + return str(result) + + except Exception as e: + self.logger.error(f"Reflection model error: {e}") + return prompt # Return original prompt on error + self.logger.info( + f"Starting GEPA optimization with {max_iterations} iterations, " + f"batch size {batch_size}, max metric calls: {max_metric_calls}" + ) + self.logger.info( + f"GEPA parameters: candidate_selection_strategy=pareto, " + f"reflection_minibatch_size={batch_size}, " + f"skip_perfect_score=False, " + f"module_selector=round_robin" + ) + + # Prepare optimization parameters with ONLY valid GEPA parameters + # Note: 'adapter' variable is set above (either LLEGO-enhanced or standard) + # ๐Ÿ”ฅ REMOVED: Excessive diagnostic warnings - moved to DEBUG level + reflection_lm_passed = reflection_lm_callable if self.config.use_llego_operators else None + if reflection_lm_passed: + self.logger.debug(f"reflection_lm_callable passed to GEPA (may be ignored in adapter mode)") + + # #region agent log + import json as _json_debug + _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log" + with open(_debug_log_path, "a") as _f: + _f.write(_json_debug.dumps({"hypothesisId": "A", "location": "optimizer.py:gepa_params", "message": "GEPA params construction", "data": {"max_iterations_from_config": max_iterations, "max_metric_calls": max_metric_calls, "batch_size": batch_size}, "timestamp": int(time.time() * 1000), "sessionId": "debug-session"}) + "\n") + # #endregion + + gepa_params = { + 'adapter': adapter, # Use 
the adapter created above (with or without LLEGO) + 'seed_candidate': seed_candidate, + 'trainset': trainset, + 'valset': valset, + 'max_metric_calls': max_metric_calls, + # NOTE: GEPA does NOT have num_iterations - it uses max_metric_calls to control iterations + + # ๐Ÿ”ฅ CRITICAL: When using an adapter, GEPA expects: + # - adapter.make_reflective_dataset() to create feedback data + # - GEPA's internal proposer to generate candidates from that data + # - task_lm and reflection_lm must be None (GEPA will use model from adapter) + 'task_lm': None, # Don't pass - adapter handles this + 'reflection_lm': reflection_lm_passed, # Pass LLEGO-enhanced reflection (may be ignored!) + + # Valid GEPA parameters based on actual library + 'candidate_selection_strategy': 'pareto', # Use Pareto selection + 'skip_perfect_score': False, # Don't skip perfect scores + 'reflection_minibatch_size': batch_size, # Use batch size for reflection + 'perfect_score': 1.0, # Perfect score threshold + 'module_selector': 'round_robin', # Cycle through components + 'display_progress_bar': self.config.verbose, # Show progress if verbose + 'raise_on_exception': True, # Raise exceptions for debugging + } + + # ๐Ÿ”ฅ CRITICAL FIX: Filter kwargs to only include valid GEPA parameters + # GEPA does NOT accept num_iterations, max_iterations, or other non-GEPA params + VALID_GEPA_PARAMS = { + 'seed_candidate', 'trainset', 'valset', 'adapter', 'task_lm', 'reflection_lm', + 'candidate_selection_strategy', 'skip_perfect_score', 'batch_sampler', + 'reflection_minibatch_size', 'perfect_score', 'reflection_prompt_template', + 'module_selector', 'use_merge', 'max_merge_invocations', 'merge_val_overlap_floor', + 'max_metric_calls', 'stop_callbacks', 'logger', 'run_dir', 'use_wandb', + 'wandb_api_key', 'wandb_init_kwargs', 'use_mlflow', 'mlflow_tracking_uri', + 'mlflow_experiment_name', 'track_best_outputs', 'display_progress_bar', + 'use_cloudpickle', 'seed', 'raise_on_exception', 'val_evaluation_policy' + } + + # 
Only add valid kwargs that aren't already in gepa_params + for key, value in kwargs.items(): + if key in VALID_GEPA_PARAMS and key not in gepa_params: + gepa_params[key] = value + elif key not in VALID_GEPA_PARAMS: + self.logger.debug(f"โš ๏ธ Filtering out invalid GEPA parameter: {key}") + + # #region agent log + with open(_debug_log_path, "a") as _f: + _f.write(_json_debug.dumps({"hypothesisId": "A", "location": "optimizer.py:gepa_params_final", "message": "Final GEPA params keys", "data": {"params_keys": list(gepa_params.keys()), "max_metric_calls": gepa_params.get('max_metric_calls', 'NOT_PASSED')}, "timestamp": int(time.time() * 1000), "sessionId": "debug-session"}) + "\n") + # #endregion + + # ๐ŸŽฏ NEW: Capture GEPA's internal logging for pareto front information + gepa_output = io.StringIO() + + # Log iteration start + from ..utils.clean_logger import get_clean_logger + clean_log = get_clean_logger() + clean_log.log_iteration_start(1, seed_prompt=seed_candidate.get('system_prompt', '')) + + # ๐Ÿ”ฅ CRITICAL: Pass valset size to adapter for better dataset type detection + if hasattr(adapter, '_valset_size'): + adapter._valset_size = len(valset) + self.logger.debug(f"โœ… Set valset_size in adapter: {len(valset)} for Dpareto detection") + + # ๐Ÿ”ฅ CRITICAL FIX: Store valset in adapter so we can evaluate generated candidates on it + # This ensures generated candidates are evaluated on Dpareto for Pareto selection + if hasattr(adapter, '_valset'): + adapter._valset = valset + self.logger.debug(f"โœ… Stored valset in adapter ({len(valset)} samples) for Dpareto evaluation of generated candidates") + else: + # Add _valset attribute if it doesn't exist + adapter._valset = valset + self.logger.debug(f"โœ… Added _valset attribute to adapter ({len(valset)} samples)") + + # Run GEPA optimization (synchronous call wrapped in async) + result = await asyncio.get_event_loop().run_in_executor( + None, + lambda: self._run_gepa_with_logging(gepa_params, gepa_output) + ) + + # 
๐ŸŽฏ NEW: Process and log pareto front information, extract iteration count + gepa_logs = gepa_output.getvalue() + actual_iterations = self._log_pareto_front_info(gepa_logs) # Get iteration count + + return result, actual_iterations # Return both result and iteration count + except Exception as e: + # Try to extract partial results before failing + self.logger.warning(f"GEPA optimization failed: {e}") + + # Check if we have any cached results from the adapter + best_candidate = adapter.get_best_candidate() + best_score = adapter.get_best_score() + + if best_candidate and best_score > 0: + self.logger.info(f"๐ŸŽฏ Using cached best result with score: {best_score:.4f}") + + # Create a mock GEPA result with the best candidate found + return { + 'best_candidate': best_candidate, + 'best_score': best_score, + 'partial_result': True, + 'error': f'GEPA failed but returning best result found: {str(e)}' + } + else: + # If no cached results, re-raise the error + raise GepaOptimizerError(f"GEPA optimization failed: {str(e)}") + + def _run_gepa_with_logging(self, gepa_params: Dict[str, Any], output_buffer: io.StringIO) -> Any: + """Run GEPA optimization while capturing its output.""" + # Capture GEPA's print statements and logging + with redirect_stdout(output_buffer), redirect_stderr(output_buffer): + return gepa.optimize(**gepa_params) + + def _log_pareto_front_info(self, gepa_logs: str) -> int: # Return int instead of None + """Extract and log pareto front information from GEPA logs. 
Returns max iteration count.""" + lines = gepa_logs.split('\n') + current_iteration = 0 + max_iteration = 0 # Track max iteration + + for line in lines: + # Look for iteration information + if 'iteration' in line.lower(): + # Try to extract iteration number + import re + iteration_match = re.search(r'iteration\s+(\d+)', line.lower()) + if iteration_match: + current_iteration = int(iteration_match.group(1)) + max_iteration = max(max_iteration, current_iteration) # Track max + # Log iteration change + from ..utils.clean_logger import get_clean_logger + clean_log = get_clean_logger() + if current_iteration > clean_log.current_iteration: + clean_log.current_iteration = current_iteration + + # Look for pareto front information + if 'pareto front' in line.lower() or 'new program' in line.lower(): + self.logger.info(f"GEPA Pareto Update: {line.strip()}") + elif 'iteration' in line.lower() and ('score' in line.lower() or 'program' in line.lower()): + self.logger.debug(f"{line.strip()}") + elif 'best' in line.lower() and 'score' in line.lower(): + self.logger.info(f"{line.strip()}") + + # Look for evaluation information + if 'evaluating' in line.lower() and 'candidate' in line.lower(): + self.logger.debug(f"{line.strip()}") + + self.logger.info(f"GEPA Optimization Complete: {max_iteration} iterations") + + # #region agent log + import json as _json_debug + _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log" + with open(_debug_log_path, "a") as _f: + _f.write(_json_debug.dumps({"hypothesisId": "F", "location": "optimizer.py:gepa_complete", "message": "GEPA optimization complete - iteration count", "data": {"max_iteration_from_logs": max_iteration, "expected_iterations": self.config.max_iterations, "off_by_one": max_iteration != self.config.max_iterations, "gepa_logs_length": len(gepa_logs)}, "timestamp": int(time.time() * 1000), "sessionId": "debug-session"}) + "\n") + # #endregion + + return max_iteration # Return the max iteration count + + 
def _extract_best_candidate(self, gepa_result: Any) -> Dict[str, str]: + """ + Extract the best candidate from GEPA Pareto front (single source of truth). + + GEPA Pareto front is the single source of truth because: + - All candidates (GEPA reflection, LLEGO crossover, LLEGO mutation) are evaluated on Dpareto + - All non-dominated candidates are added to GEPA Pareto front + - Therefore, the best candidate MUST be in GEPA Pareto front + + Args: + gepa_result: Raw result from gepa.optimize() (used only as fallback edge case) + + Returns: + Best candidate dictionary with prompt components from GEPA Pareto front + """ + try: + self.logger.info(f"\n{'โ•'*80}") + self.logger.info(f"๐Ÿ” EXTRACTING BEST CANDIDATE FROM GEPA PARETO FRONT") + self.logger.info(f"{'โ•'*80}") + + # ======================================================================== + # PRIMARY: Get best candidate from GEPA Pareto front (single source of truth) + # ======================================================================== + from ..utils.pareto_logger import get_pareto_logger + pareto_log = get_pareto_logger() + + if pareto_log.pareto_front: + try: + # Get best candidate from GEPA Pareto front (highest score = best) + gepa_pareto_best = max(pareto_log.pareto_front, key=lambda x: x['score']) + gepa_pareto_fitness = gepa_pareto_best['score'] + gepa_pareto_prompt = gepa_pareto_best['prompt'] + gepa_pareto_type = gepa_pareto_best.get('type', 'unknown') + gepa_pareto_notation = gepa_pareto_best.get('notation', 'S') + + best_candidate = { + 'system_prompt': gepa_pareto_prompt, + 'fitness': gepa_pareto_fitness, + 'source': 'gepa_pareto_front', + 'candidate_type': gepa_pareto_type, + 'notation': gepa_pareto_notation + } + + self.logger.info(f"โœ… SELECTED: Best candidate from GEPA Pareto front") + self.logger.info(f" Notation: {gepa_pareto_notation}") + self.logger.info(f" Fitness: f({gepa_pareto_notation})={gepa_pareto_fitness:.4f}") + self.logger.info(f" Type: {gepa_pareto_type}") + 
self.logger.info(f" Prompt length: {len(gepa_pareto_prompt)} chars") + self.logger.info(f" ๐Ÿ’ก GEPA Pareto front is single source of truth (all candidates evaluated on Dpareto)") + + return best_candidate + + except Exception as e: + self.logger.error(f"โŒ Failed to extract from GEPA Pareto front: {e}") + import traceback + self.logger.error(traceback.format_exc()) + + # ======================================================================== + # EDGE CASE FALLBACK: Pareto front empty (shouldn't happen, but handle gracefully) + # ======================================================================== + self.logger.warning(f"โš ๏ธ GEPA Pareto front is empty - using gepa_result as fallback") + self.logger.warning(f" This should not happen if all candidates are evaluated on Dpareto") + + # Try to extract from gepa_result (last resort) + if hasattr(gepa_result, 'best_candidate'): + gepa_candidate = gepa_result.best_candidate + gepa_prompt = gepa_candidate.get('system_prompt') if isinstance(gepa_candidate, dict) else str(gepa_candidate) + gepa_fitness = getattr(gepa_result, 'best_score', None) + + if gepa_prompt: + self.logger.info(f"โœ… Using gepa_result.best_candidate as fallback") + return { + 'system_prompt': gepa_prompt, + 'fitness': float(gepa_fitness) if gepa_fitness is not None else None, + 'source': 'gepa_result_fallback', + 'candidate_type': 'unknown', + 'notation': 'S' + } + + # Last resort: return empty prompt + self.logger.error(f"โŒ No candidates found anywhere - returning empty prompt") + return {'system_prompt': ''} + + except Exception as e: + self.logger.error(f"โŒ Error extracting best candidate: {e}") + import traceback + self.logger.error(traceback.format_exc()) + return {'system_prompt': ''} + + def _evaluate_candidate_on_testset( + self, + candidate: Dict[str, str], + testset: List[Dict] + ) -> float: + """ + Evaluate a candidate prompt on the held-out test set. 
+ + Args: + candidate: Prompt candidate to evaluate + testset: Test dataset (not used during optimization) + + Returns: + Average composite score on test set + + Raises: + TestSetEvaluationError: If evaluation fails + """ + from ..utils.exceptions import TestSetEvaluationError + + try: + # Evaluate using the adapter (same as GEPA does internally) + eval_result = self.adapter.evaluate( + batch=testset, + candidate=candidate, + capture_traces=False # Don't need detailed traces for test + ) + + if not eval_result.scores: + raise TestSetEvaluationError("No scores returned from test evaluation") + + # Calculate average score + avg_score = sum(eval_result.scores) / len(eval_result.scores) + + self.logger.debug( + f"Test set evaluation: {len(eval_result.scores)} samples, " + f"scores: {eval_result.scores}, avg: {avg_score:.4f}" + ) + + return avg_score + + except Exception as e: + raise TestSetEvaluationError(f"Failed to evaluate on test set: {str(e)}") + + def optimize_sync(self, + model: str, + seed_prompt: str, + dataset: Any, + reflection_lm: str, + max_metric_calls: int = 150, + **kwargs) -> OptimizedResult: + """ + Synchronous version of the optimization method + + Args: + model: Target model to optimize for + seed_prompt: Initial prompt to optimize + dataset: Training data in any format + reflection_lm: Model for reflection + max_metric_calls: Budget for optimization attempts + **kwargs: Additional optimization parameters + + Returns: + OptimizedResult: Optimization result + """ + # Run the async method in a new event loop + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + try: + result = loop.run_until_complete( + self.train(model, seed_prompt, dataset, reflection_lm, max_metric_calls, **kwargs) + ) + return result + finally: + loop.close() + + +# Convenience function for quick optimization +def optimize_prompt( + model: Union[str, ModelConfig], + seed_prompt: str, + dataset: Any, + reflection_model: Optional[Union[str, ModelConfig]] = None, + 
**kwargs +) -> OptimizedResult: + """ + Convenience function for quick prompt optimization without creating optimizer instance + + Args: + model: Target model configuration + seed_prompt: Initial prompt to optimize + dataset: Training data + reflection_model: Model for reflection (optional) + **kwargs: Additional optimization parameters + + Returns: + OptimizedResult: Optimization result + """ + # Create default config if not provided + if reflection_model is None: + reflection_model = model + + config = OptimizationConfig( + model=model, + reflection_model=reflection_model, + max_iterations=kwargs.get('max_iterations', 10), + max_metric_calls=kwargs.get('max_metric_calls', 50), + batch_size=kwargs.get('batch_size', 4) + ) + + optimizer = GepaOptimizer(config=config) + return asyncio.run(optimizer.train(seed_prompt, dataset, **kwargs)) + + + + + + diff --git a/src/gepa_optimizer/core/result.py b/src/gepa_optimizer/core/result.py new file mode 100644 index 0000000000000000000000000000000000000000..d23bb98840b4e023873ef435df846afebe748187 --- /dev/null +++ b/src/gepa_optimizer/core/result.py @@ -0,0 +1,180 @@ +""" +Result processing for GEPA Optimizer +Handles extraction and processing of GEPA optimization results +""" + +from typing import Any, Dict, Optional +import logging + +logger = logging.getLogger(__name__) + +class ResultProcessor: + """ + Processes raw GEPA optimization results into clean, usable formats + """ + + @staticmethod + def extract_optimized_prompt(result: Any) -> str: + """ + Extract the optimized prompt from GEPA result object + + Args: + result: Raw GEPA optimization result + + Returns: + str: The optimized prompt text + """ + try: + # Try multiple possible result structures + if hasattr(result, 'best_candidate'): + candidate = result.best_candidate + + if isinstance(candidate, dict): + # Try common prompt keys + for key in ['system_prompt', 'prompt', 'text']: + if key in candidate: + return str(candidate[key]) + + # If no standard key found, 
return string representation + return str(candidate) + else: + return str(candidate) + + # Fallback - convert entire result to string + return str(result) + + except Exception as e: + logger.warning(f"Failed to extract optimized prompt: {e}") + return "Optimization completed (prompt extraction failed)" + + @staticmethod + def extract_metrics(result: Any) -> Dict[str, Any]: + """ + Extract performance metrics from GEPA result + + Args: + result: Raw GEPA optimization result + + Returns: + Dict[str, Any]: Extracted metrics + """ + metrics = {} + + try: + # Extract common metrics + if hasattr(result, 'best_score'): + metrics['best_score'] = float(result.best_score) + + if hasattr(result, 'baseline_score'): + metrics['baseline_score'] = float(result.baseline_score) + + if hasattr(result, 'improvement'): + metrics['improvement'] = float(result.improvement) + + if hasattr(result, 'iterations'): + metrics['iterations'] = int(result.iterations) + + # Calculate improvement percentage if we have both scores + if 'best_score' in metrics and 'baseline_score' in metrics: + baseline = metrics['baseline_score'] + if baseline > 0: + improvement_percent = ((metrics['best_score'] - baseline) / baseline) * 100 + metrics['improvement_percent'] = round(improvement_percent, 2) + + # Extract additional metadata + if hasattr(result, 'metadata'): + metrics['metadata'] = result.metadata + + except Exception as e: + logger.warning(f"Failed to extract metrics: {e}") + + return metrics + + @staticmethod + def extract_reflection_history(result: Any) -> list: + """ + Extract reflection/optimization history from GEPA result + + Args: + result: Raw GEPA optimization result + + Returns: + list: List of reflection iterations + """ + history = [] + + try: + if hasattr(result, 'optimization_history'): + for i, iteration in enumerate(result.optimization_history): + history_item = { + 'iteration': i, + 'score': iteration.get('score', 0.0), + 'candidate': iteration.get('candidate', {}), + 'feedback': 
iteration.get('feedback', ''), + 'improvement': iteration.get('improvement', 0.0) + } + history.append(history_item) + + except Exception as e: + logger.warning(f"Failed to extract reflection history: {e}") + + return history + + @staticmethod + def process_full_result( + result: Any, + original_prompt: str, + optimization_time: float, + actual_iterations: Optional[int] = None, + test_metrics: Optional[Dict[str, Any]] = None + ) -> Dict[str, Any]: + """ + Process complete GEPA result into structured format. + + Args: + result: Raw GEPA optimization result + original_prompt: Original seed prompt + optimization_time: Time taken for optimization + actual_iterations: Actual number of iterations from GEPA logs (optional) + test_metrics: Metrics from test set evaluation (optional) + + Returns: + Dict[str, Any]: Complete processed result + """ + # Extract metrics first + metrics = ResultProcessor.extract_metrics(result) + + # Extract iterations from GEPA result + total_iterations = 0 + try: + # First priority: use actual_iterations if provided (from logs) + if actual_iterations is not None: + total_iterations = actual_iterations + elif hasattr(result, 'iterations'): + total_iterations = int(result.iterations) + elif hasattr(result, 'num_iterations'): + total_iterations = int(result.num_iterations) + elif hasattr(result, 'optimization_history'): + total_iterations = len(result.optimization_history) + # Check if it's in metrics + elif 'iterations' in metrics: + total_iterations = metrics['iterations'] + except Exception as e: + logger.warning(f"Failed to extract iterations: {e}") + + # Merge test metrics into improvement_data + improvement_data = {} + if test_metrics: + improvement_data.update(test_metrics) + + return { + 'original_prompt': original_prompt, + 'optimized_prompt': ResultProcessor.extract_optimized_prompt(result), + 'metrics': metrics, + 'improvement_data': improvement_data, + 'reflection_history': ResultProcessor.extract_reflection_history(result), + 
'optimization_time': optimization_time, + 'total_iterations': total_iterations, + 'status': 'completed', + 'raw_result': result # Keep raw result for advanced users + } diff --git a/src/gepa_optimizer/core/universal_adapter.py b/src/gepa_optimizer/core/universal_adapter.py new file mode 100644 index 0000000000000000000000000000000000000000..539b7413219278c5e28085fb159456aa664c3600 --- /dev/null +++ b/src/gepa_optimizer/core/universal_adapter.py @@ -0,0 +1,2386 @@ +""" +Universal GEPA adapter for user-defined metrics and LLM clients. +""" + +from .base_adapter import BaseGepaAdapter +from ..data.converters import UniversalConverter +from typing import Any, Dict, List, Optional +import logging +import re +from gepa.core.adapter import EvaluationBatch + +logger = logging.getLogger(__name__) + +class UniversalGepaAdapter(BaseGepaAdapter): + """ + Universal GEPA adapter that works with any LLM client and evaluator. + + This adapter uses the existing UniversalConverter for data processing + and delegates LLM generation and evaluation to user-provided components. + + Features: + - Optimized multi-variation JSON generation (66% cost reduction) + - Robust parsing with multiple fallback strategies + - Automatic fallback to sequential generation if JSON parsing fails + """ + + # Fallback system prompt for sequential generation (when JSON parsing fails) + _FALLBACK_SYSTEM_PROMPT = """You are an expert prompt engineer specializing in iterative prompt optimization. + +Your task: Given the CURRENT PROMPT and its EVALUATION FEEDBACK, generate an IMPROVED version of the prompt that addresses all identified issues. + +Core Requirements: +1. OUTPUT ONLY the improved prompt text (no explanations, no analysis, no meta-commentary) +2. START directly with the prompt (e.g., "You are a mobile GUI agent..." or similar task-appropriate opening) +3. PRESERVE the core task domain and output format requirements +4. INTEGRATE improvements from feedback naturally into the prompt structure +5. 
MAINTAIN clarity, specificity, and actionability + +Quality Standards: +- Be specific and concrete (avoid vague instructions) +- Use clear, imperative language for task instructions +- Include edge case handling if feedback identifies confusion +- Ensure the prompt is self-contained and unambiguous + +DO NOT include: +- Analysis of what went wrong +- Explanations of your changes +- Meta-text like "Here's an improved version..." or "Based on feedback..." +- Recommendations or suggestions (those are already in the feedback) + +Output the improved prompt directly and only the prompt.""" + + def __init__(self, llm_client, evaluator, data_converter=None, llego_layer=None): + """ + Initialize universal adapter. + + Args: + llm_client: User-provided LLM client (must inherit from BaseLLMClient) + evaluator: User-provided evaluator (must inherit from BaseEvaluator) + data_converter: Optional custom data converter (uses UniversalConverter by default) + llego_layer: Optional LLEGO integration layer for genetic operations + """ + # Store LLEGO layer first + self.llego = llego_layer + + # If LLEGO is provided, wrap the LLM client + # Note: If config is passed separately, it will be handled by optimizer + if llego_layer is not None: + from ..llms.llego_enhanced_llm import LLEGOEnhancedLLMClient + # Only wrap if not already wrapped (optimizer may have wrapped it with config) + if not isinstance(llm_client, LLEGOEnhancedLLMClient): + # Wrap before calling super().__init__ + # Config will be set later by optimizer if hybrid mode is enabled + llm_client = LLEGOEnhancedLLMClient(llm_client, llego_layer, config=None, verbose=True) + else: + # Already wrapped, but update config if available + if hasattr(llm_client, 'config') and llm_client.config is None: + # Config will be set by optimizer later + pass + + # Initialize parent (this sets up self.logger) + super().__init__(llm_client, evaluator) + + # Use existing UniversalConverter for data processing + self.data_converter = 
data_converter or UniversalConverter() + + # ๐Ÿ”ฅ NEW: Initialize optimization state tracking + self._is_baseline_evaluation = False # Flag to distinguish baseline vs optimization + self._last_candidate = None # Track last candidate to detect changes + self._gepa_iteration = 0 # Track actual GEPA iteration (not evaluation count) + + # Track candidates for logging + self._evaluation_count = 0 + + # Track current evaluation context + self._current_evaluation_type = None # 'seed', 'gepa_reflection', 'llego_crossover', 'llego_mutation' + self._current_dataset_type = None # 'dfeedback' or 'dpareto' + self._baseline_score = None # Store baseline score for comparison + + # Track candidate sources by prompt text (in case GEPA doesn't pass source field) + self._candidate_sources = {} # Maps prompt_text -> source_type + + # Track validation set size for better dataset type detection + self._valset_size = None # Will be set by optimizer + self._valset = None # Will be set by optimizer - stores actual valset for Dpareto evaluation + + # ๐Ÿ”ฅ CRITICAL: Track which candidates have been evaluated on Dpareto to avoid double evaluation + # Key: normalized prompt text, Value: (fitness_score, candidate_type, timestamp) + self._dpareto_evaluated_candidates = {} # Maps prompt -> (score, type) + + # ๐Ÿ”ฅ HYBRID MODE: Storage for generated candidates + self._generated_candidates = [] # Store hybrid mode candidates + self._candidate_generation_active = False # Track if we're generating candidates + self._config = None # Will be set by optimizer if hybrid mode enabled + self._reflection_lm_client = None # Will be set by optimizer + + # ๐Ÿ”ฅ FORMAT AWARENESS: Store detected output format for better prompts + self._detected_format = None # Will be populated from expected outputs + self._format_detection_done = False # Only detect once + + # Log initialization + model_info = llm_client.get_model_info() + if llego_layer is not None: + self.logger.info(f"๐Ÿš€ Initialized Universal adapter with 
{model_info}") + self.logger.info(f"๐Ÿงฌ LLEGO integration ENABLED - LLM client is wrapped for genetic operations") + else: + self.logger.info(f"๐Ÿš€ Initialized Universal adapter with {model_info}") + + def _clean_llm_output(self, output: str) -> str: + """ + ๐Ÿ”ฅ CRITICAL: Clean LLM output before evaluation. + + LLMs often wrap JSON/structured output in markdown code blocks. + This causes evaluation to fail because the evaluator sees: + "```json\n{\"key\": \"value\"}\n```" + Instead of: + "{\"key\": \"value\"}" + + This method extracts the clean content for fair comparison. + """ + if not output or not isinstance(output, str): + return output + + cleaned = output.strip() + + # Remove markdown code blocks (```json ... ``` or ``` ... ```) + code_block_match = re.search(r'```(?:json|JSON)?\s*([\s\S]*?)\s*```', cleaned) + if code_block_match: + extracted = code_block_match.group(1).strip() + # Only use extracted if it looks like valid content + if extracted and (extracted.startswith('{') or extracted.startswith('[') or len(extracted) > 10): + self.logger.debug(f"๐Ÿ“ฆ Cleaned markdown code block from LLM output") + return extracted + + # Remove leading/trailing markdown artifacts + # Handle cases like "Here is the JSON:\n```json\n...\n```" + if '```' in cleaned: + # Try to extract content between first ``` and last ``` + parts = cleaned.split('```') + if len(parts) >= 3: + # Content is in the middle part(s) + middle_content = parts[1] + # Remove language tag if present (e.g., "json\n") + middle_content = re.sub(r'^(?:json|JSON|python|text)\s*\n?', '', middle_content).strip() + if middle_content: + return middle_content + + return cleaned + + def _detect_and_cache_format(self, batch: List[Dict[str, Any]]) -> None: + """ + Detect output format from expected outputs and cache for future use. + + This enables format-aware prompting and feedback generation. 
+ """ + try: + from ..utils.format_detection import detect_output_format + + # Extract expected outputs from batch + expected_outputs = [] + for item in batch: + # Try to extract output directly, or standardize if needed + output = None + if isinstance(item, dict): + # Try common output field names first + output = item.get('output') or item.get('expected_output') or item.get('result') or item.get('answer') + if not output: + # Standardize using converter's private method (same as _evaluate_batch_mode) + try: + standardized = self.data_converter._standardize([item])[0] + output = standardized.get('output') + except Exception: + pass + + if output and isinstance(output, str) and output.strip(): + expected_outputs.append(output) + + if expected_outputs: + self._detected_format = detect_output_format(expected_outputs) + self.logger.info(f"๐Ÿ“ FORMAT DETECTED: {self._detected_format['format_type']}") + self.logger.info(f" Spec: {self._detected_format['format_spec'][:100]}...") + self.logger.info(f" Avg length: {self._detected_format['avg_length']} chars") + # #region agent log + import json as _json_debug + import time as _time_debug + import os as _os_debug + _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log" + _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True) + with open(_debug_log_path, "a") as _f: + _f.write(_json_debug.dumps({"hypothesisId": "FORMAT_DETECT", "location": "universal_adapter.py:format_detected", "message": "Format detection successful", "data": {"format_type": self._detected_format['format_type'], "num_outputs": len(expected_outputs), "avg_length": self._detected_format['avg_length'], "has_constraint": bool(self._detected_format.get('format_constraint'))}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n") + # #endregion + else: + self.logger.warning("โš ๏ธ No expected outputs found for format detection") + self._detected_format = None + # #region agent log 
+ import json as _json_debug + import time as _time_debug + import os as _os_debug + _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log" + _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True) + with open(_debug_log_path, "a") as _f: + _f.write(_json_debug.dumps({"hypothesisId": "FORMAT_DETECT", "location": "universal_adapter.py:format_detected", "message": "Format detection failed - no outputs", "data": {"batch_size": len(batch)}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n") + # #endregion + + except Exception as e: + self.logger.warning(f"โš ๏ธ Format detection failed: {e}") + self._detected_format = None + + def evaluate(self, batch: List[Dict[str, Any]], candidate: Dict[str, str], + capture_traces: bool = False) -> EvaluationBatch: + """ + Evaluate candidates using user-provided LLM client and evaluator. + + This method automatically detects BatchLLMClient and uses batch processing + for cost savings, or falls back to standard individual processing. + + This method works with any data type supported by UniversalConverter. + + ๐Ÿ”ฅ IMPORTANT: We only optimize system_prompt, NOT user_prompt. + The user_prompt varies per tester and is not part of optimization. + + ๐Ÿ”ฅ CACHING: Seed prompt is evaluated ONLY ONCE on Dpareto (validation set). + Subsequent evaluations return cached result to save API calls and ensure consistency. 
+ """ + system_prompt = candidate.get('system_prompt', '') + + # ๐Ÿ”ฅ FORMAT DETECTION: Detect output format from expected outputs (once) + if not self._format_detection_done and batch: + self._detect_and_cache_format(batch) + self._format_detection_done = True + + # Determine dataset type first (needed for cache check) + batch_size_threshold = self._config.batch_size if hasattr(self, '_config') and self._config else 8 + + # ๐Ÿ”ฅ CRITICAL FIX: If _is_baseline_evaluation is True, we KNOW this is the validation set + # This fixes the issue where valset_size might not be set yet when baseline detection happens + if hasattr(self, '_is_baseline_evaluation') and self._is_baseline_evaluation: + dataset_type = 'dpareto' # Baseline is ALWAYS evaluated on validation set + self.logger.debug(f"๐ŸŽฏ Forced dataset_type to 'dpareto' (baseline evaluation flag is True)") + elif hasattr(self, '_valset_size') and self._valset_size is not None and len(batch) >= self._valset_size: + dataset_type = 'dpareto' # Full validation set size = Dpareto + elif len(batch) > batch_size_threshold * 1.5: + dataset_type = 'dpareto' # Much larger than batch = likely full valset + else: + dataset_type = 'dfeedback' # Small batch = training minibatch for reflection + + # ๐Ÿ”ฅ CRITICAL: Check cache to avoid re-evaluating same prompt on Dpareto + # This ensures seed prompt is evaluated ONLY ONCE + if dataset_type == 'dpareto': + normalized_prompt = system_prompt.strip().strip('"\'') + if normalized_prompt in self._dpareto_evaluated_candidates: + existing_score, existing_type, _ = self._dpareto_evaluated_candidates[normalized_prompt] + self.logger.info( + f"โ™ป๏ธ CACHE HIT: Prompt already evaluated on Dpareto " + f"(score={existing_score:.4f}, type={existing_type}) - skipping re-evaluation" + ) + + # Return cached result - create EvaluationBatch with cached score + cached_outputs = [f"[CACHED: {existing_type}]"] * len(batch) + cached_scores = [existing_score] * len(batch) + + # Still update baseline if 
this is seed and baseline not set + from ..utils.pareto_logger import get_pareto_logger + pareto_log = get_pareto_logger() + + if existing_type == 'seed' and self._baseline_score is None: + self._baseline_score = existing_score + pareto_log.set_baseline(existing_score) + self.logger.info(f"๐Ÿ“Š Baseline score set from cache: {existing_score:.4f}") + + # Log to Pareto logger (for tracking, but no re-evaluation) + pareto_log.log_candidate_evaluation( + prompt=system_prompt, + score=existing_score, + candidate_type=existing_type, + dataset_type='dpareto' + ) + + return EvaluationBatch( + outputs=cached_outputs, + scores=cached_scores, + trajectories=None # No traces for cached results + ) + + # Determine candidate type + # Priority order: + # 1. Check candidate dict for 'source' field (from LLM wrapper) + # 2. Check _candidate_sources mapping (from previous evaluations) + # 3. Check _current_evaluation_type (from log_proposed_candidate) + # 4. Infer from context (seed, repeat, etc.) + + candidate_type = candidate.get('source') # First try candidate dict + if not candidate_type or candidate_type == 'unknown': + candidate_type = self._candidate_sources.get(system_prompt) # Check mapping + if not candidate_type or candidate_type == 'unknown': + candidate_type = self._current_evaluation_type # Use stored type + if not candidate_type or candidate_type == 'unknown': + # Try to infer from prompt or metadata + if system_prompt == self._last_candidate: + candidate_type = 'repeat' # Same prompt being re-evaluated + elif self._evaluation_count == 0 or 'seed' in str(candidate.get('source', '')).lower(): + candidate_type = 'seed' # Explicitly mark as seed + self.logger.debug("๐ŸŒฑ Detected seed prompt (Sโ‚€)") + else: + candidate_type = 'unknown' # Truly unknown + + # #region agent log + import json as _json_debug + import time as _time_debug + _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log" + with open(_debug_log_path, "a") as _f: + 
_f.write(_json_debug.dumps({"hypothesisId": "C", "location": "universal_adapter.py:candidate_type_detect", "message": "Candidate type detection", "data": {"candidate_type": candidate_type, "evaluation_count": self._evaluation_count, "from_candidate_dict": candidate.get('source'), "from_sources_mapping": self._candidate_sources.get(system_prompt), "from_current_type": self._current_evaluation_type}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n") + # #endregion + + # Store source for future lookups (always update if we found a valid type) + if candidate_type and candidate_type != 'unknown' and system_prompt not in self._candidate_sources: + self._candidate_sources[system_prompt] = candidate_type + self.logger.debug(f" ๐Ÿ“ Stored candidate type: {candidate_type} for prompt (length: {len(system_prompt)})") + + # Dataset type already determined above for cache check - reuse it + + # #region agent log + try: + import json as _json_debug + import time as _time_debug + import os as _os_debug + _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log" + _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True) + with open(_debug_log_path, "a") as _f: + _f.write(_json_debug.dumps({"hypothesisId": "H", "location": "universal_adapter.py:dataset_type_detect", "message": "Dataset type detection", "data": {"batch_size": len(batch), "valset_size": getattr(self, '_valset_size', None), "batch_size_threshold": batch_size_threshold, "detected_type": dataset_type, "evaluation_count": self._evaluation_count}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n") + except Exception: + pass + # #endregion + + # Check if this is a new candidate (different from last one) + if self._last_candidate != system_prompt: + self._evaluation_count += 1 + # ๐Ÿ”ฅ CRITICAL: If this is baseline evaluation, force candidate_type to 'seed' + if self._is_baseline_evaluation: + candidate_type = 
'seed' + self.logger.debug(f"๐ŸŒฑ Baseline evaluation detected - setting candidate_type to 'seed'") + self._current_evaluation_type = candidate_type + self._current_dataset_type = dataset_type + self._last_candidate = system_prompt + + # Minimal logging - just track what we're evaluating + if self._is_baseline_evaluation: + self.logger.debug(f"Evaluating baseline (Sโ‚€) on {dataset_type}") + else: + self.logger.debug(f"Evaluating candidate #{self._evaluation_count} ({candidate_type}) on {dataset_type}") + + # Detect and use batch mode if available + from ..llms.batch_llm import BatchLLMClient + is_batch_mode = isinstance(self.llm_client, BatchLLMClient) + + if is_batch_mode: + outputs, scores, trajectories = self._evaluate_batch_mode( + batch, system_prompt, capture_traces + ) + else: + outputs, scores, trajectories = self._evaluate_standard_mode( + batch, system_prompt, capture_traces + ) + + avg_score = sum(scores) / len(scores) if scores else 0.0 + + # #region agent log + import json as _json_debug + import time as _time_debug + _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log" + with open(_debug_log_path, "a") as _f: + _f.write(_json_debug.dumps({"hypothesisId": "B,C", "location": "universal_adapter.py:baseline_check", "message": "Baseline check conditions", "data": {"baseline_score_is_none": self._baseline_score is None, "current_dataset_type": self._current_dataset_type, "current_evaluation_type": self._current_evaluation_type, "is_baseline_evaluation": self._is_baseline_evaluation, "batch_size": len(batch), "avg_score": avg_score}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n") + # #endregion + + # ๐Ÿ”ฅ CRITICAL FIX: Baseline MUST be set from seed's first Dpareto evaluation ONLY + # This ensures FAIR comparison: seed and candidates evaluated on SAME dataset (Dpareto) with SAME number of datapoints + # + # Fair evaluation requires: + # - Seed baseline: Dpareto (validation set) - first 
evaluation during optimization + # - Candidates: Dpareto (validation set) - same dataset, same size + # - Same conditions = fair comparison โœ… + # + # We IGNORE test set for baseline - baseline must come from Dpareto to ensure same dataset/size + from ..utils.pareto_logger import get_pareto_logger + pareto_log = get_pareto_logger() + + # ๐Ÿ”ฅ FIX: Check if this is baseline evaluation AND dpareto - set baseline with priority + is_baseline_eval = hasattr(self, '_is_baseline_evaluation') and self._is_baseline_evaluation + + if self._baseline_score is None: + # ๐Ÿ”ฅ FIX B: Set baseline on FIRST Dpareto evaluation, regardless of candidate type + # Also set baseline if this is explicitly marked as baseline evaluation + if self._current_dataset_type == 'dpareto' or is_baseline_eval: + # โœ… PRIMARY: Set baseline from FIRST Dpareto evaluation (seed or first candidate) + self._baseline_score = avg_score + pareto_log.set_baseline(avg_score) + self.logger.info(f"๐Ÿ“Š Baseline score (Dpareto, {len(batch)} samples): {avg_score:.4f}") + self.logger.info(f" โœ… Baseline set from {'baseline evaluation' if is_baseline_eval else 'first Dpareto'} (type: {self._current_evaluation_type})") + # #region agent log + with open(_debug_log_path, "a") as _f: + _f.write(_json_debug.dumps({"hypothesisId": "B", "location": "universal_adapter.py:baseline_set", "message": "Baseline score SET", "data": {"baseline_score": avg_score, "candidate_type": self._current_evaluation_type, "dataset_type": self._current_dataset_type, "is_baseline_eval": is_baseline_eval}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n") + # #endregion + # Note: Test set evaluations are ignored for baseline - baseline comes from Dpareto + else: + # ๐Ÿ”ฅ SAFETY CHECK: Ensure Pareto logger also has baseline if adapter has it + # This handles the case where optimizer set baseline in adapter but Pareto logger wasn't updated + if (self._current_dataset_type == 'dpareto' or is_baseline_eval) and 
pareto_log.baseline_score is None: + pareto_log.set_baseline(self._baseline_score) + self.logger.info(f"โœ… Synchronized baseline in Pareto logger: {self._baseline_score:.4f}") + + # Track Dpareto evaluations for Pareto front + if self._current_dataset_type == 'dpareto': + from ..utils.pareto_logger import get_pareto_logger + pareto_log = get_pareto_logger() + pareto_log.log_candidate_evaluation( + prompt=system_prompt, + score=avg_score, + candidate_type=self._current_evaluation_type or 'unknown', + dataset_type=self._current_dataset_type + ) + + # Track evaluated candidates + normalized_prompt = system_prompt.strip().strip('"\'') + if normalized_prompt not in self._dpareto_evaluated_candidates: + self._dpareto_evaluated_candidates[normalized_prompt] = ( + avg_score, self._current_evaluation_type or 'unknown', 'evaluated_by_gepa' + ) + + self.logger.debug(f"Evaluation complete: score={avg_score:.4f}") + + # ๐Ÿ”ฅ CRITICAL: Update _best_candidate and _best_score with average fitness for Dpareto evaluations + # This ensures the adapter tracks the best average fitness, not just per-sample scores + # Only update if this score is better than current best + if self._current_dataset_type == 'dpareto': + if self._best_score is None or avg_score > self._best_score: + self._best_score = avg_score + self._best_candidate = { + 'system_prompt': system_prompt, + 'fitness': avg_score, + 'source': self._current_evaluation_type or 'unknown' + } + self.logger.info(f"โœ… Updated best candidate from Dpareto evaluation: f={avg_score:.4f} (type: {self._current_evaluation_type})") + + return EvaluationBatch(outputs=outputs, scores=scores, trajectories=trajectories) + + def _evaluate_batch_mode( + self, + batch: List[Dict], + system_prompt: str, + capture_traces: bool + ) -> tuple: + """ + Batch mode evaluation - process all samples in one API call. + + This method prepares all requests, submits them as a batch job to Gemini, + waits for completion, then evaluates all results. 
+ """ + # Prepare all requests + requests = [] + standardized_items = [] + + for item in batch: + standardized_item = self.data_converter._standardize([item])[0] + standardized_items.append(standardized_item) + + request = { + 'system_prompt': system_prompt, + 'user_prompt': standardized_item['input'] + } + + if standardized_item.get('image'): + request['image_base64'] = standardized_item['image'] + + requests.append(request) + + # Submit batch job and get all results at once + batch_results = self.llm_client.generate_batch(requests) + + # Process results + outputs = [] + scores = [] + trajectories = [] if capture_traces else None + + for i, (llm_response, standardized_item) in enumerate(zip(batch_results, standardized_items)): + # Extract content + raw_output = llm_response.get("content", "") + + # ๐Ÿ”ฅ CRITICAL: Clean markdown wrappers before evaluation + predicted_output = self._clean_llm_output(raw_output) + outputs.append(predicted_output) + + # Evaluate with cleaned output + evaluation_results = self.evaluator.evaluate( + predicted_output, + standardized_item['output'] + ) + + composite_score = evaluation_results.get("composite_score", 0.0) + scores.append(composite_score) + + # Update tracking + if composite_score > self._best_score: + self._best_score = composite_score + self._best_candidate = {'system_prompt': system_prompt} + + # Capture traces + if capture_traces: + trajectories.append({ + 'input_data': standardized_item, + 'predicted_output': predicted_output, + 'evaluation_results': evaluation_results + }) + + # Concise logging with element IDs and candidate notation + predicted_element = evaluation_results.get('predicted_element', '?') + expected_element = evaluation_results.get('expected_element', '?') + status = "โœ…" if composite_score == 1.0 else "โŒ" + + # Add notation for candidate type + notation_map = {'seed': 'Sโ‚€', 'gepa_reflection': 'Sแตฃ', 'llego_crossover': 'Oโ‚“โ‚’', 'llego_mutation': 'Oโ‚˜แตคโ‚œ'} + notation = 
notation_map.get(self._current_evaluation_type, 'S') + + self.logger.info(f" [{notation}] Sample {i+1}: Predicted={predicted_element}, Expected={expected_element}, Score={composite_score:.2f} {status}") + + return outputs, scores, trajectories + + def _evaluate_standard_mode( + self, + batch: List[Dict], + system_prompt: str, + capture_traces: bool + ) -> tuple: + """ + Standard mode evaluation - process samples individually (existing logic). + + This is the original implementation, preserved for backward compatibility + and for use with non-batch LLM clients. + """ + outputs = [] + scores = [] + trajectories = [] if capture_traces else None + + for i, item in enumerate(batch): + # Use existing data processing logic + standardized_item = self.data_converter._standardize([item])[0] + + # Prepare generation parameters + generation_params = { + 'system_prompt': system_prompt, + 'user_prompt': standardized_item['input'] + } + + # Add image if present + if standardized_item.get('image'): + generation_params['image_base64'] = standardized_item['image'] + + # Generate response using user's LLM client + llm_response = self.llm_client.generate(**generation_params) + + # Extract content + if isinstance(llm_response, dict): + raw_output = llm_response.get("content", "") + else: + raw_output = str(llm_response) + + # ๐Ÿ”ฅ CRITICAL: Clean markdown wrappers before evaluation + predicted_output = self._clean_llm_output(raw_output) + outputs.append(predicted_output) + + # Evaluate using user's evaluator with cleaned output + evaluation_results = self.evaluator.evaluate( + predicted_output, + standardized_item['output'] + ) + + composite_score = evaluation_results.get("composite_score", 0.0) + scores.append(composite_score) + + # #region agent log + try: + import json as _json_debug + import time as _time_debug + import os as _os_debug + _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log" + _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), 
exist_ok=True) + with open(_debug_log_path, "a") as _f: + _f.write(_json_debug.dumps({"hypothesisId": "G", "location": "universal_adapter.py:evaluation_result", "message": "Individual evaluation result", "data": {"sample_idx": i, "composite_score": composite_score, "semantic_sim": evaluation_results.get("semantic_similarity", -1), "structural_sim": evaluation_results.get("structural_similarity", -1), "format_mismatch": evaluation_results.get("analysis", {}).get("format_mismatch", False), "predicted_len": len(predicted_output) if predicted_output else 0, "expected_len": len(standardized_item.get('output', ''))}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n") + except Exception: + pass + # #endregion + + # Update performance tracking + self._evaluation_count += 1 + if composite_score > self._best_score: + self._best_score = composite_score + self._best_candidate = {'system_prompt': system_prompt} + + # Capture traces if requested + if capture_traces: + trajectories.append({ + 'input_data': standardized_item, + 'predicted_output': predicted_output, + 'evaluation_results': evaluation_results + }) + + # Concise logging with element IDs and candidate notation + predicted_element = evaluation_results.get('predicted_element', '?') + expected_element = evaluation_results.get('expected_element', '?') + status = "โœ…" if composite_score == 1.0 else "โŒ" + + # Add notation for candidate type + notation_map = {'seed': 'Sโ‚€', 'gepa_reflection': 'Sแตฃ', 'llego_crossover': 'Oโ‚“โ‚’', 'llego_mutation': 'Oโ‚˜แตคโ‚œ'} + notation = notation_map.get(self._current_evaluation_type, 'S') + + self.logger.info(f" [{notation}] Sample {i+1}: Predicted={predicted_element}, Expected={expected_element}, Score={composite_score:.2f} {status}") + + return outputs, scores, trajectories + + def make_reflective_dataset(self, candidate: Dict[str, str], eval_batch: EvaluationBatch, + components_to_update: List[str]) -> Dict[str, List[Dict[str, Any]]]: + """ + 
Create reflective dataset using user-provided evaluator. + + This method generates feedback based on the evaluation results + from the user's custom evaluator. + + ๐Ÿ”ฅ NEW: If hybrid mode is enabled, this method ALSO generates hybrid candidates + (GEPA Reflection + LLEGO Operators) and stores them for GEPA to use. + """ + # ๐Ÿ”ฅ REMOVED: Excessive diagnostic logs - moved to DEBUG level if needed + self.logger.debug(f"make_reflective_dataset() called - generating feedback and hybrid candidates") + + reflective_dataset = {} + system_prompt = candidate.get('system_prompt', '') + + # ๐Ÿ”ฅ REMOVED: Verbose diagnostic checks - only log if hybrid mode is actually enabled + hybrid_mode_enabled = (self._config and + hasattr(self._config, 'enable_gepa_reflection_with_llego') and + self._config.enable_gepa_reflection_with_llego and + self._reflection_lm_client) + + if hybrid_mode_enabled: + self.logger.debug(f"โœ… Hybrid mode conditions met - will generate hybrid candidates") + + # ======================================================================== + # ๐Ÿ”ฅ CRITICAL FIX: Update LLEGO population with evaluated candidate + # ======================================================================== + # This is the MISSING LINK! After a candidate is evaluated, we need to add it + # to the LLEGO population so it can be used for crossover/mutation. + # Without this, the population only contains the seed, so Pareto front stays at 1! + # + # This is called for EVERY candidate that GEPA evaluates: + # - Seed prompt (baseline) โ†’ added to population + # - New candidate 1 (from reflection/crossover/mutation) โ†’ added to population + # - New candidate 2 โ†’ added to population + # - etc. 
+ if self.llego: + # Calculate average fitness from evaluation scores + if eval_batch.scores and len(eval_batch.scores) > 0: + avg_fitness = sum(eval_batch.scores) / len(eval_batch.scores) + else: + # Fallback: extract from trajectories if scores not available + scores = [t.get('evaluation_results', {}).get('composite_score', 0.0) + for t in eval_batch.trajectories if 'evaluation_results' in t] + avg_fitness = sum(scores) / len(scores) if scores else 0.0 + + self.logger.debug(f"Updating LLEGO population: fitness={avg_fitness:.4f}") + + # Create PromptCandidate from evaluated prompt + from ..operators.llego_operators import PromptCandidate + + # Check if this candidate already exists in population (avoid duplicates) + # ๐Ÿ”ฅ FIX: Normalize prompts for comparison (strip whitespace, remove quotes) + normalized_new_prompt = system_prompt.strip().strip('"\'') + existing_prompts = {p.prompt.strip().strip('"\'') for p in self.llego.population} + + # Also check normalized versions + if normalized_new_prompt not in existing_prompts: + prompt_candidate = PromptCandidate( + prompt=system_prompt, # Keep original prompt (not normalized) + fitness=avg_fitness, + metadata={ + 'generation': self.llego.current_generation, + 'operator': 'evaluated', + 'prompt_length': len(system_prompt), + 'word_count': len(system_prompt.split()), + 'evaluation_samples': len(eval_batch.scores) if eval_batch.scores else 0, + 'candidate_type': self._current_evaluation_type or 'unknown', # Store type for notation + 'dataset_evaluated': self._current_dataset_type or 'unknown' + } + ) + + # Update population - this will add the candidate and keep top N by fitness + population_before = len(self.llego.population) + self.llego.update_population([prompt_candidate]) + population_after = len(self.llego.population) + + self.logger.debug(f"Added to LLEGO population: fitness={avg_fitness:.4f}, size={population_after}") + else: + # Update fitness if candidate already exists (seed prompt, etc.) 
+ # ๐Ÿ”ฅ FIX: Also normalize for comparison + updated = False + for p in self.llego.population: + normalized_existing = p.prompt.strip().strip('"\'') + if normalized_existing == normalized_new_prompt: + old_fitness = p.fitness + if avg_fitness > p.fitness: + p.fitness = avg_fitness + updated = True + self.logger.debug(f"Updated fitness: {old_fitness:.4f} โ†’ {avg_fitness:.4f}") + # Update candidate type if we have new information + if self._current_evaluation_type and p.metadata: + old_type = p.metadata.get('candidate_type', 'unknown') + if self._current_evaluation_type != old_type: + p.metadata['candidate_type'] = self._current_evaluation_type + else: + self.logger.debug(f"โ„น๏ธ Candidate already exists with better/equal fitness: {p.fitness:.4f} >= {avg_fitness:.4f}") + break + + if not updated: + self.logger.debug(f"Candidate already in population with higher fitness") + else: + self.logger.debug("LLEGO not initialized - skipping population update") + + # ======================================================================== + # ๐Ÿ”ฅ HYBRID MODE: Generate candidates at adapter level + # ======================================================================== + if (self._config and + hasattr(self._config, 'enable_gepa_reflection_with_llego') and + self._config.enable_gepa_reflection_with_llego and + self._reflection_lm_client): + + self.logger.debug("Generating hybrid candidates") + + # Generate hybrid candidates FIRST + generated_candidates = self._generate_hybrid_candidates_adapter_level( + current_prompt=system_prompt, + eval_batch=eval_batch, + candidate=candidate + ) + + # ๐Ÿ”ฅ CRITICAL: Store generated candidates so we can inject them + # _generate_hybrid_candidates_adapter_level now returns list of dicts with metadata + if generated_candidates: + candidate_dicts = [] + for cand in generated_candidates: + if isinstance(cand, dict) and 'prompt' in cand: + # Already a dict with metadata (preferred format) + candidate_dicts.append(cand) + elif 
isinstance(cand, str): + # Just a string - determine source based on position (fallback) + # This shouldn't happen if _generate_hybrid_candidates_adapter_level is fixed + self.logger.warning(f"โš ๏ธ Received string candidate instead of dict - using fallback logic") + if len(candidate_dicts) < self._config.num_gepa_reflection_candidates: + source = 'gepa_reflection' + elif len(candidate_dicts) < self._config.num_gepa_reflection_candidates + self._config.n_crossover: + source = 'llego_crossover' + else: + source = 'llego_mutation' + candidate_dicts.append({ + 'prompt': cand, + 'source': source, + 'index': len(candidate_dicts) + 1 + }) + else: + self.logger.warning(f"โš ๏ธ Unknown candidate format: {type(cand)}") + + self._generated_candidates = candidate_dicts + + # Store candidate sources for tracking + for cand_dict in candidate_dicts: + if 'prompt' in cand_dict and 'source' in cand_dict: + self._candidate_sources[cand_dict['prompt']] = cand_dict['source'] + + # ๐Ÿ”ฅ CRITICAL: Inject into LLM client wrapper so it can return them when GEPA calls + # This is the key mechanism: when GEPA calls adapter.llm_client.generate() for proposals, + # our wrapper will detect it and return our pre-generated candidates + if hasattr(self.llm_client, '_adapter_generated_candidates'): + self.llm_client._adapter_generated_candidates = candidate_dicts.copy() + self.logger.debug(f"Injected {len(candidate_dicts)} candidates") + else: + try: + self.llm_client._adapter_generated_candidates = candidate_dicts.copy() + except Exception as e: + self.logger.error(f"Failed to inject candidates: {e}") + + # Evaluate generated candidates on Dpareto for fair comparison + if hasattr(self, '_evaluating_generated_candidates'): + pass # Skip to prevent recursion + elif self._valset and len(self._valset) > 0: + self._evaluating_generated_candidates = True + self.logger.debug(f"Evaluating {len(candidate_dicts)} candidates on Dpareto ({len(self._valset)} samples)") + + # ๐Ÿ”ฅ NEW: Collect all 
candidates with scores for batch update + candidates_with_scores = [] + + for i, cand_dict in enumerate(candidate_dicts, 1): + cand_prompt = cand_dict.get('prompt', '') + cand_source = cand_dict.get('source', 'unknown') + + if not cand_prompt: + continue + + # Normalize prompt for duplicate detection + normalized_prompt = cand_prompt.strip().strip('"\'') + + # Check if already evaluated on Dpareto (avoid double evaluation) + if normalized_prompt in self._dpareto_evaluated_candidates: + existing_score, existing_type, _ = self._dpareto_evaluated_candidates[normalized_prompt] + + # Still add to batch for Pareto update (with existing score) + notation_map = { + 'seed': 'Sโ‚€', + 'gepa_reflection': 'Sแตฃ', + 'llego_crossover': 'Oโ‚“โ‚’', + 'llego_mutation': 'Oโ‚˜แตคโ‚œ' + } + cand_notation = notation_map.get(cand_source, 'S') + candidates_with_scores.append({ + 'prompt': cand_prompt, + 'score': existing_score, + 'type': cand_source, + 'notation': cand_notation + }) + continue + + # Evaluate this candidate on valset (Dpareto) + try: + # Set candidate type for proper logging + self._current_evaluation_type = cand_source + + # ๐Ÿ”ฅ CRITICAL: Temporarily disable individual Pareto updates + # We'll do batch update after all evaluations + from ..utils.pareto_logger import get_pareto_logger + pareto_log = get_pareto_logger() + original_log_method = pareto_log.log_candidate_evaluation + + # Temporarily replace to prevent individual updates + def noop_log(*args, **kwargs): + pass # Skip individual logging - we'll batch update later + + pareto_log.log_candidate_evaluation = noop_log + + # Evaluate on valset - THIS IS THE FAIR EVALUATION ON SAME DATASET + valset_eval = self.evaluate( + batch=self._valset, # Same valset as seed! 
+ candidate={'system_prompt': cand_prompt, 'source': cand_source}, + capture_traces=True + ) + + # Restore original method + pareto_log.log_candidate_evaluation = original_log_method + + avg_score = sum(valset_eval.scores) / len(valset_eval.scores) if valset_eval.scores else 0.0 + + # Store evaluation result to avoid double evaluation + self._dpareto_evaluated_candidates[normalized_prompt] = ( + avg_score, + cand_source, + 'evaluated_in_make_reflective_dataset' + ) + + self.logger.debug(f"Candidate {i} evaluated: score={avg_score:.4f}") + + # Generate notation + notation_map = { + 'seed': 'Sโ‚€', + 'gepa_reflection': 'Sแตฃ', + 'llego_crossover': 'Oโ‚“โ‚’', + 'llego_mutation': 'Oโ‚˜แตคโ‚œ' + } + cand_notation = notation_map.get(cand_source, 'S') + + # Add to batch for Pareto update + candidates_with_scores.append({ + 'prompt': cand_prompt, + 'score': avg_score, + 'type': cand_source, + 'notation': cand_notation + }) + + # ๐Ÿ”ฅ CRITICAL: Explicitly add this candidate to LLEGO population with Dpareto fitness + if self.llego: + from ..operators.llego_operators import PromptCandidate + + # Check if already in population + existing_in_pop = False + for p in self.llego.population: + if p.prompt.strip().strip('"\'') == normalized_prompt: + # Update fitness if this Dpareto score is better + if avg_score > p.fitness: + old_fitness = p.fitness + p.fitness = avg_score + if p.metadata: + p.metadata['candidate_type'] = cand_source + p.metadata['dataset_evaluated'] = 'dpareto' + self.logger.debug(f"Updated LLEGO fitness: {old_fitness:.4f} โ†’ {avg_score:.4f}") + existing_in_pop = True + break + + if not existing_in_pop: + # Add new candidate to population + prompt_candidate = PromptCandidate( + prompt=cand_prompt, + fitness=avg_score, + metadata={ + 'generation': self.llego.current_generation, + 'operator': 'evaluated_on_dpareto', + 'prompt_length': len(cand_prompt), + 'word_count': len(cand_prompt.split()), + 'evaluation_samples': len(valset_eval.scores) if valset_eval.scores 
else 0, + 'candidate_type': cand_source, + 'dataset_evaluated': 'dpareto' + } + ) + self.llego.update_population([prompt_candidate]) + + except Exception as e: + self.logger.error(f" โŒ Error evaluating candidate #{i} on Dpareto: {e}") + import traceback + self.logger.error(traceback.format_exc()) + + # Batch Pareto front update + if candidates_with_scores: + + from ..utils.pareto_logger import get_pareto_logger + pareto_log = get_pareto_logger() + added_candidates = pareto_log.batch_update_pareto_front(candidates_with_scores) + + # ๐Ÿ”ฅ CRITICAL: Update queue with scores for best-candidate selection + # Create a mapping of prompt -> score for quick lookup + prompt_to_score = {c['prompt'].strip().strip('"\''): c['score'] for c in candidates_with_scores} + + # Update candidates in queue with their scores + if hasattr(self.llm_client, '_adapter_generated_candidates'): + updated_queue = [] + for cand in self.llm_client._adapter_generated_candidates: + if isinstance(cand, dict): + cand_prompt = cand.get('prompt', '') + normalized = cand_prompt.strip().strip('"\'') + if normalized in prompt_to_score: + # Update with score + cand['score'] = prompt_to_score[normalized] + updated_queue.append(cand) + else: + updated_queue.append(cand) + else: + updated_queue.append(cand) + + self.llm_client._adapter_generated_candidates = updated_queue + + self.logger.debug(f"Pareto update: {len(added_candidates)} added, front size={len(pareto_log.pareto_front)}") + + # Clear flag after evaluation complete + self._evaluating_generated_candidates = False + elif not hasattr(self, '_evaluating_generated_candidates'): + self.logger.error("Valset not available - cannot evaluate generated candidates") + + # Signal LLEGO-enhanced client for reflection mode + if self.llego and hasattr(self.llm_client, 'set_reflection_context'): + self.llm_client.set_reflection_context( + current_prompt=system_prompt, + feedback=eval_batch, + in_reflection=True + ) + + # ๐Ÿ”ฅ CRITICAL: Also set reflection context 
on reflection_lm_client if it exists + # This ensures hybrid mode candidate generation is triggered when GEPA calls reflection_lm_callable + if hasattr(self, 'reflection_lm_client') and self.reflection_lm_client: + if hasattr(self.reflection_lm_client, 'set_reflection_context'): + self.logger.info("๐Ÿ”ฅ CRITICAL: Setting reflection context on reflection_lm_client for hybrid mode") + self.reflection_lm_client.set_reflection_context( + current_prompt=system_prompt, + feedback=eval_batch, + in_reflection=True # This enables hybrid candidate generation! + ) + + self._log_reflection_dataset_creation(candidate, eval_batch, components_to_update) + + # Inject generated candidates into reflective dataset + suggested_prompts = [] + if hasattr(self, '_generated_candidates') and self._generated_candidates: + suggested_prompts = [c['prompt'] for c in self._generated_candidates if isinstance(c, dict) and 'prompt' in c] + self.logger.debug(f"Injecting {len(suggested_prompts)} suggested prompts") + + for component in components_to_update: + reflective_dataset[component] = [] + for trace in eval_batch.trajectories: + # Generate feedback based on evaluation results + # ๐Ÿ†• Phase 2: Pass trace and current_prompt for LLM-as-Judge + feedback = self._generate_feedback( + trace['evaluation_results'], + trace=trace, + current_prompt=system_prompt + ) + + # Base reflection data + # ๐Ÿ”ฅ FIX: Strip image_base64 from input_data to prevent massive base64 strings in logs + input_data_clean = trace['input_data'].copy() if isinstance(trace['input_data'], dict) else {} + if 'image_base64' in input_data_clean: + input_data_clean['image_base64'] = f"[IMAGE_DATA_{len(input_data_clean['image_base64'])}_chars]" + + # ๐Ÿ”ฅ FIX: Clean detailed_scores to remove any base64 references or large data + detailed_scores_clean = {} + if isinstance(trace['evaluation_results'], dict): + for key, value in trace['evaluation_results'].items(): + # Skip any values that look like base64 (very long strings) + if 
isinstance(value, str) and len(value) > 1000: + detailed_scores_clean[key] = f"[DATA_{len(value)}_chars]" + else: + detailed_scores_clean[key] = value + else: + detailed_scores_clean = trace['evaluation_results'] + + reflection_entry = { + "current_prompt": system_prompt, + "input_data": input_data_clean, # Use cleaned version without full base64 + "predicted_output": trace['predicted_output'], + "score": trace['evaluation_results'].get("composite_score", 0.0), + "feedback": feedback, + "detailed_scores": detailed_scores_clean # Cleaned scores without large data + } + + # ๐Ÿ”ฅ CRITICAL: Only optimize system_prompt, NOT user_prompt + # The user_prompt contains the task description (command) and should NOT be modified + if component == 'system_prompt' and suggested_prompts: + # Add suggested improved prompts to the reflection entry + # GEPA might use these if the structure supports it + reflection_entry["suggested_improved_prompts"] = suggested_prompts + reflection_entry["num_suggestions"] = len(suggested_prompts) + # Also add the best suggested prompt as a direct suggestion + if suggested_prompts: + reflection_entry["suggested_prompt"] = suggested_prompts[0] # First candidate as primary suggestion + reflection_entry["optimize_component"] = "system_prompt_only" # Mark that we only optimize system_prompt + elif component != 'system_prompt': + # For non-system_prompt components (like user_prompt), do NOT add suggestions + # We only want to optimize system_prompt + reflection_entry["optimize_component"] = "skip" # Mark to skip optimization + self.logger.info(f"โš ๏ธ Skipping optimization for component '{component}' - only optimizing system_prompt") + + reflective_dataset[component].append(reflection_entry) + + total_samples = sum(len(data) for data in reflective_dataset.values()) + avg_score = sum(trace['score'] for data in reflective_dataset.values() for trace in data) / total_samples if total_samples > 0 else 0.0 + self.logger.info(f"๐Ÿ“ Reflection dataset created - 
{total_samples} samples, avg score: {avg_score:.4f}") + + return reflective_dataset + + def _generate_feedback( + self, + evaluation_results: Dict[str, Any], + trace: Optional[Dict[str, Any]] = None, + current_prompt: Optional[str] = None + ) -> str: + """ + Generate feedback using hybrid approach: + - LLM-as-Judge for low/medium scores (detailed, actionable) + - Simple feedback for high scores (efficient) + + Args: + evaluation_results: Evaluation scores and extracted data + trace: Full trace with input_data, predicted_output, etc. (optional) + current_prompt: The current system prompt being optimized (optional) + + Returns: + Feedback string focused on prompt improvement + """ + composite_score = evaluation_results.get("composite_score", 0.0) + + # Check if LLM-as-Judge is enabled + use_llm_judge = getattr(self._config, 'use_llm_as_judge', True) + threshold = getattr(self._config, 'llm_as_judge_threshold', 0.8) + + # ๐Ÿ”ฅ FIX: Check both attribute names (inconsistency in codebase) + reflection_lm = getattr(self, '_reflection_lm_client', None) or getattr(self, 'reflection_lm_client', None) + + # Debug logging - use INFO so we can see what's happening + self.logger.info(f"๐Ÿ” Feedback generation: score={composite_score:.4f}, use_llm_judge={use_llm_judge}, threshold={threshold}, has_trace={trace is not None}, has_reflection_lm={reflection_lm is not None}") + if trace: + input_data = trace.get('input_data', {}) + predicted = trace.get('predicted_output', '')[:100] if trace.get('predicted_output') else 'N/A' + expected = input_data.get('output', '')[:100] if input_data.get('output') else 'N/A' + self.logger.info(f" Predicted preview: {predicted}...") + self.logger.info(f" Expected preview: {expected}...") + + # Use LLM-as-Judge for scores needing improvement + if use_llm_judge and composite_score < threshold and trace: + if not reflection_lm: + self.logger.warning("โš ๏ธ LLM-as-Judge requested but reflection_lm_client not available - using simple feedback") + 
self.logger.warning(f" Checked: _reflection_lm_client={getattr(self, '_reflection_lm_client', None) is not None}, reflection_lm_client={getattr(self, 'reflection_lm_client', None) is not None}") + else: + try: + self.logger.info(f"๐Ÿค– Calling LLM-as-Judge for detailed feedback (score: {composite_score:.4f} < threshold: {threshold})") + feedback = self._llm_as_judge_feedback( + evaluation_results, + trace, + current_prompt + ) + self.logger.info(f"โœ… LLM-as-Judge returned feedback (length: {len(feedback)} chars)") + return feedback + except Exception as e: + self.logger.error(f"โŒ LLM-as-Judge failed: {e}, falling back to simple feedback") + import traceback + self.logger.error(traceback.format_exc()) + # Fall through to simple feedback + + # Simple actionable feedback (for high scores or as fallback) + if composite_score >= threshold: + self.logger.debug(f"โœ… Score {composite_score:.4f} >= threshold {threshold} - using simple feedback") + elif not trace: + self.logger.debug(f"โš ๏ธ No trace provided - using simple feedback") + elif not use_llm_judge: + self.logger.debug(f"โš ๏ธ LLM-as-Judge disabled - using simple feedback") + + feedback = self._simple_actionable_feedback( + evaluation_results, + trace, + current_prompt + ) + + # ๐Ÿ”ฅ ADD FORMAT FEEDBACK: Append format-specific feedback if available + if self._detected_format and trace: + from ..utils.format_detection import generate_format_feedback + input_data = trace.get('input_data', {}) + format_feedback = generate_format_feedback( + predicted_output=trace.get('predicted_output', ''), + expected_output=input_data.get('output', ''), + format_info=self._detected_format + ) + if format_feedback: + feedback += format_feedback + + return feedback + + def _llm_as_judge_feedback( + self, + evaluation_results: Dict[str, Any], + trace: Dict[str, Any], + current_prompt: Optional[str] = None + ) -> str: + """ + Generate detailed, actionable feedback using LLM-as-Judge. 
+ + ๐Ÿ”ฅ UNIVERSAL VERSION: Works for ANY task type (text, JSON, structured outputs). + No UI-specific assumptions. Pure semantic and structural comparison. + + Args: + evaluation_results: Evaluation scores and extracted data + trace: Full trace with input_data, predicted_output, etc. + current_prompt: The current system prompt being optimized + + Returns: + Detailed feedback string focused on prompt improvement + """ + # Import universal judge prompt builder + from ..utils.universal_judge_prompt import ( + build_universal_judge_prompt, + get_universal_judge_system_prompt, + format_universal_judge_feedback, + build_empty_output_feedback + ) + + # Extract data from trace + input_data = trace.get('input_data', {}) + predicted_output = trace.get('predicted_output', '') or '' + expected_output = input_data.get('output', '') or '' + task_input = input_data.get('input', '') or '' + + # Get image if available (for multi-modal tasks) + image_base64 = input_data.get('image', '') or input_data.get('image_base64', '') + + # Log what we're working with + self.logger.info(f"๐Ÿ” LLM-as-Judge input check:") + self.logger.info(f" predicted_output length: {len(predicted_output)} chars") + self.logger.info(f" expected_output length: {len(expected_output)} chars") + self.logger.info(f" image available: {bool(image_base64)} (length: {len(image_base64) if image_base64 else 0} chars)") + self.logger.info(f" predicted_output preview: {predicted_output[:200] if predicted_output else '[EMPTY]'}...") + self.logger.info(f" expected_output preview: {expected_output[:200] if expected_output else '[EMPTY]'}...") + + # Handle empty predicted output specially + if not predicted_output or not predicted_output.strip(): + self.logger.warning(f"โš ๏ธ Predicted output is empty - generating specialized feedback") + return build_empty_output_feedback(task_input, expected_output, current_prompt) + + if not image_base64: + self.logger.debug(f"โ„น๏ธ No image provided - text-only analysis") + + # Get the 
LLM for judging + judge_llm = getattr(self, '_reflection_lm_client', None) or getattr(self, 'reflection_lm_client', None) + + if not judge_llm: + self.logger.error("โŒ CRITICAL: No reflection_lm_client available for LLM-as-Judge!") + raise ValueError("reflection_lm_client not available") + + # Build the universal judge prompt + judge_prompt = build_universal_judge_prompt( + task_input=task_input, + predicted_output=predicted_output, + expected_output=expected_output, + current_prompt=current_prompt, + evaluation_results=evaluation_results, + image_base64=image_base64 + ) + + # Get the universal system prompt + system_prompt = get_universal_judge_system_prompt(has_image=bool(image_base64)) + + # Call LLM-as-Judge + try: + self.logger.info(f"๐Ÿค– Calling Universal LLM-as-Judge for semantic analysis") + result = judge_llm.generate( + system_prompt=system_prompt, + user_prompt=judge_prompt, + image_base64=image_base64 if image_base64 else "" + ) + + if isinstance(result, dict): + judge_output = result.get('content', '') + else: + judge_output = str(result) + + # Format the feedback using the universal formatter + score = evaluation_results.get('composite_score', 0.0) + feedback = format_universal_judge_feedback( + judge_output=judge_output, + task_input=task_input, + predicted_output=predicted_output, + expected_output=expected_output, + score=score + ) + + # ๐Ÿ”ฅ ADD FORMAT FEEDBACK: Append format-specific feedback + if self._detected_format: + from ..utils.format_detection import generate_format_feedback + format_feedback = generate_format_feedback( + predicted_output=predicted_output, + expected_output=expected_output, + format_info=self._detected_format + ) + if format_feedback: + feedback += format_feedback + + # Also add format constraint for next iteration + feedback += f"\n\n{self._detected_format['format_constraint']}" + + self.logger.info(f"โœ… Universal LLM-as-Judge generated feedback") + return feedback + + except Exception as e: + 
self.logger.error(f"LLM-as-Judge failed: {e}") + import traceback + self.logger.error(traceback.format_exc()) + # Fallback to simple feedback + return self._simple_actionable_feedback(evaluation_results, trace, current_prompt) + + def _extract_reasoning_from_expected(self, expected_output: str) -> str: + """Extract reasoning section from expected output.""" + if not expected_output: + return "" + + # Look for "Reason:" or "Reasoning:" section + reason_patterns = [ + r'Reason[:\s]+(.*?)(?:\n\n|\Z)', + r'Reasoning[:\s]+(.*?)(?:\n\n|\Z)', + ] + + for pattern in reason_patterns: + match = re.search(pattern, expected_output, re.IGNORECASE | re.DOTALL) + if match: + return match.group(1).strip()[:500] # Truncate to 500 chars + + return "" + + def _extract_reasoning_from_predicted(self, predicted_output: str) -> str: + """Extract reasoning from predicted output if available.""" + # Similar to _extract_reasoning_from_expected + # Or return first 200 chars if no clear reasoning section + if not predicted_output: + return "" + + # Look for reasoning patterns + reason_patterns = [ + r'Reason[:\s]+(.*?)(?:\n\n|\Z)', + r'Reasoning[:\s]+(.*?)(?:\n\n|\Z)', + ] + + for pattern in reason_patterns: + match = re.search(pattern, predicted_output, re.IGNORECASE | re.DOTALL) + if match: + return match.group(1).strip()[:500] + + # If no reasoning found, return first 200 chars + if len(predicted_output) > 200: + return predicted_output[:200] + "..." + return predicted_output + + def _simple_actionable_feedback( + self, + evaluation_results: Dict[str, Any], + trace: Dict[str, Any] = None, + current_prompt: Optional[str] = None + ) -> str: + """ + Simple feedback without LLM-as-Judge. + + ๐Ÿ”ฅ UNIVERSAL VERSION: Works for any task type. 
+ """ + composite_score = evaluation_results.get("composite_score", 0.0) + semantic_sim = evaluation_results.get("semantic_similarity", 0.0) + structural_sim = evaluation_results.get("structural_similarity", 0.0) + + feedback_parts = [] + + # Extract task context if available + if trace: + input_data = trace.get('input_data', {}) + predicted = trace.get('predicted_output', '') + expected = input_data.get('output', '') + + # Check for empty output + if not predicted or not predicted.strip(): + feedback_parts.append( + "โŒ CRITICAL: No output generated. " + "Add explicit output instructions to the prompt." + ) + # Check for format mismatch + elif structural_sim < 0.5: + feedback_parts.append( + f"โš ๏ธ Format mismatch (structural similarity: {structural_sim:.0%}). " + "Add output format instructions (e.g., 'Return as JSON with fields: ...')." + ) + # Check for semantic mismatch + elif semantic_sim < 0.5: + feedback_parts.append( + f"โš ๏ธ Semantic mismatch (similarity: {semantic_sim:.0%}). " + "The output meaning differs from expected. Add clearer task instructions." + ) + + # Score-based feedback + if composite_score >= 0.9: + feedback_parts.append("โœ… Excellent match - prompt is working well.") + elif composite_score >= 0.8: + feedback_parts.append("โœ… Good match - minor refinements possible.") + elif composite_score >= 0.6: + feedback_parts.append( + f"โš ๏ธ Partial match (score: {composite_score:.0%}). " + "Consider adding examples or more specific field names to the prompt." + ) + elif composite_score >= 0.3: + feedback_parts.append( + f"โš ๏ธ Low match (score: {composite_score:.0%}). " + "The prompt needs clearer instructions about expected output format and content." + ) + else: + feedback_parts.append( + f"โŒ Poor match (score: {composite_score:.0%}). " + "Major revision required - add explicit output format, field names, and examples." 
+ ) + + return "\n".join(feedback_parts) if feedback_parts else f"Score: {composite_score:.0%}" + + def get_best_candidate(self) -> Optional[Dict[str, str]]: + """ + Get the best candidate from GEPA Pareto front. + + GEPA Pareto front is the single source of truth because: + - All candidates (GEPA reflection, LLEGO crossover, LLEGO mutation) are evaluated on Dpareto + - All non-dominated candidates are added to GEPA Pareto front + - Therefore, the best candidate MUST be in GEPA Pareto front + + Returns: + Best candidate dictionary from GEPA Pareto front, or None if empty + """ + # PRIMARY: Get best candidate from GEPA Pareto front (single source of truth) + from ..utils.pareto_logger import get_pareto_logger + pareto_log = get_pareto_logger() + + if pareto_log.pareto_front: + try: + # Get best candidate from GEPA Pareto front (highest score = best) + gepa_best = max(pareto_log.pareto_front, key=lambda x: x['score']) + gepa_fitness = gepa_best['score'] + gepa_prompt = gepa_best['prompt'] + gepa_type = gepa_best.get('type', 'unknown') + gepa_notation = gepa_best.get('notation', 'S') + + self.logger.info(f"โœ… Best candidate from GEPA Pareto front: {gepa_notation} with f({gepa_notation})={gepa_fitness:.4f}") + self.logger.info(f" Type: {gepa_type}, Prompt length: {len(gepa_prompt)} chars") + self.logger.info(f" ๐Ÿ’ก GEPA Pareto front is single source of truth (all candidates evaluated on Dpareto)") + + return { + 'system_prompt': gepa_prompt, + 'fitness': gepa_fitness, + 'source': 'gepa_pareto_front', + 'candidate_type': gepa_type, + 'notation': gepa_notation + } + except Exception as e: + self.logger.error(f"โŒ Failed to get best from GEPA Pareto front: {e}") + import traceback + self.logger.error(traceback.format_exc()) + + # EDGE CASE: Pareto front empty (shouldn't happen, but handle gracefully) + self.logger.warning("โš ๏ธ GEPA Pareto front is empty - no best candidate available") + self.logger.warning(" This should not happen if all candidates are evaluated on 
Dpareto") + return None + + def get_best_score(self) -> float: + """Get the best score from GEPA Pareto front (single source of truth).""" + from ..utils.pareto_logger import get_pareto_logger + pareto_log = get_pareto_logger() + + if pareto_log.pareto_front: + try: + gepa_best_fitness = max(p['score'] for p in pareto_log.pareto_front) + return gepa_best_fitness + except Exception as e: + self.logger.warning(f"โš ๏ธ Failed to get best fitness from GEPA Pareto front: {e}") + + # Edge case: Pareto front empty - fallback to adapter's score + return self._best_score + + def log_proposed_candidate(self, candidate: Dict[str, str], iteration: int = 0): + """ + Pretty print the new proposed candidate prompt. + + Args: + candidate: The new candidate prompt from GEPA + iteration: Current optimization iteration + """ + system_prompt = candidate.get('system_prompt', '') + candidate_source = candidate.get('source', 'unknown') + + # Store source in adapter state so evaluate() can access it + self._current_evaluation_type = candidate_source + + # Also store in mapping by prompt text for lookup + if candidate_source != 'unknown' and system_prompt: + self._candidate_sources[system_prompt] = candidate_source + + # Use clean logger for simpler output + from ..utils.clean_logger import get_clean_logger + clean_log = get_clean_logger() + + # Update iteration if needed + if iteration > clean_log.current_iteration: + clean_log.log_iteration_start(iteration, seed_prompt=None) + + # Don't log here - let evaluate() handle it with full context + + def _log_reflection_dataset_creation(self, candidate: Dict[str, str], eval_batch: EvaluationBatch, + components_to_update: List[str]): + """ + Pretty print the reflection dataset creation process. 
+ + Args: + candidate: Current candidate being evaluated + eval_batch: Evaluation results + components_to_update: Components being updated + """ + system_prompt = candidate.get('system_prompt', '') + + self.logger.info(f"๐Ÿ” DEBUG: Inside _log_reflection_dataset_creation") + self.logger.info(f"๐Ÿ” DEBUG: system_prompt length: {len(system_prompt)}") + self.logger.info(f"๐Ÿ” DEBUG: eval_batch.scores: {eval_batch.scores}") + self.logger.info(f"๐Ÿ” DEBUG: eval_batch.trajectories: {len(eval_batch.trajectories) if eval_batch.trajectories else 0}") + + # Determine candidate notation + notation_map = {'seed': 'Sโ‚€', 'gepa_reflection': 'Sแตฃ', 'llego_crossover': 'Oโ‚“โ‚’', 'llego_mutation': 'Oโ‚˜แตคโ‚œ'} + notation = notation_map.get(self._current_evaluation_type, 'S') + cand_num = self._evaluation_count if hasattr(self, '_evaluation_count') else '?' + cand_label = f"{notation}{cand_num}" + + # Use logger for the main output too + self.logger.info("\n" + "="*80) + self.logger.info("๐Ÿ” REFLECTION DATASET CREATION") + self.logger.info("="*80) + + self.logger.info(f"\n๐Ÿ“‹ CURRENT PROMPT BEING ANALYZED: {cand_label}") + self.logger.info(f" Candidate Type: {self._current_evaluation_type or 'unknown'}") + self.logger.info("-" * 40) + self.logger.info(f'"{system_prompt}"') + self.logger.info("-" * 40) + + self.logger.info(f"\n๐Ÿ“Š EVALUATION SUMMARY:") + self.logger.info("-" * 40) + if eval_batch.scores: + avg_score = sum(eval_batch.scores) / len(eval_batch.scores) + min_score = min(eval_batch.scores) + max_score = max(eval_batch.scores) + self.logger.info(f" โ€ข Average Score: {avg_score:.4f}") + self.logger.info(f" โ€ข Min Score: {min_score:.4f}") + self.logger.info(f" โ€ข Max Score: {max_score:.4f}") + self.logger.info(f" โ€ข Total Samples: {len(eval_batch.scores)}") + + self.logger.info(f"\n๐ŸŽฏ COMPONENTS TO UPDATE:") + self.logger.info("-" * 40) + for i, component in enumerate(components_to_update, 1): + self.logger.info(f" {i}. 
{component}") + + if eval_batch.trajectories: + self.logger.info(f"\n๐Ÿ” DETAILED ANALYSIS (FULL FEEDBACK - NO TRUNCATION):") + self.logger.info("-" * 80) + for i, trace in enumerate(eval_batch.trajectories[:5], 1): # Show first 5 samples with FULL details + evaluation_results = trace['evaluation_results'] + composite_score = evaluation_results.get("composite_score", 0.0) + + # Extract element IDs for concise logging + predicted_element = evaluation_results.get('predicted_element', 'Unknown') + expected_element = evaluation_results.get('expected_element', 'Unknown') + + # Concise, direct logging with candidate notation + status_icon = "โœ…" if composite_score == 1.0 else "โŒ" + + # Add notation for candidate type + notation_map = {'seed': 'Sโ‚€', 'gepa_reflection': 'Sแตฃ', 'llego_crossover': 'Oโ‚“โ‚’', 'llego_mutation': 'Oโ‚˜แตคโ‚œ'} + notation = notation_map.get(self._current_evaluation_type, 'S') + + self.logger.info(f" [{notation}] Sample {i}: Predicted={predicted_element}, Expected={expected_element}, Score={composite_score:.2f} {status_icon}") + + # ๐Ÿ”ฅ FIX: Pass trace and current_prompt to enable LLM-as-Judge! + feedback = self._generate_feedback( + evaluation_results, + trace=trace, # Pass the full trace! + current_prompt=system_prompt # Pass current prompt being analyzed! + ) + self.logger.info(f" ๐Ÿ’ฌ FEEDBACK (FULL):") + self.logger.info(f" \"{feedback}\"") + + if len(eval_batch.trajectories) > 5: + self.logger.info(f"\n ... and {len(eval_batch.trajectories) - 5} more samples (all logged similarly)") + + self.logger.info("="*80) + + def _extract_clean_prompt_from_reflection(self, reflection_output: str) -> str: + """ + ๐Ÿ›ก๏ธ DEFENSIVE FALLBACK: Extract clean prompt if LLM adds analysis despite system prompt instructions. + + NOTE: The system prompt now explicitly instructs the LLM to output ONLY the prompt text. + However, this extraction logic serves as a safety net in case the LLM still adds: + "Based on the performance analysis... 
+ ### Recommendations... + ### Revised Prompt Example: + [THE ACTUAL PROMPT HERE] + ### Conclusion..." + + This is now a defensive measure, not the primary mechanism. + + Args: + reflection_output: Full reflection output (should be clean prompt, but may contain analysis) + + Returns: + str: Clean, extracted prompt (or original if extraction fails or not needed) + """ + if not reflection_output or not isinstance(reflection_output, str): + return reflection_output + + # Pattern 1: Look for "Revised Prompt Example:" or "### Revised Prompt Example:" + patterns = [ + r'(?:###\s*)?Revised\s+Prompt\s+(?:Example|:)?\s*\n(.*?)(?:\n###|\n##|\n---|\Z)', + r'(?:###\s*)?Revised\s+Prompt\s*:\s*\n(.*?)(?:\n###|\n##|\n---|\Z)', + r'(?:###\s*)?Optimized\s+Prompt\s*:\s*\n(.*?)(?:\n###|\n##|\n---|\Z)', + r'(?:###\s*)?New\s+Prompt\s*:\s*\n(.*?)(?:\n###|\n##|\n---|\Z)', + r'(?:Here\s+is|Here\'s)\s+a?\s*refined?\s+(?:version\s+of\s+)?(?:the\s+)?prompt\s*[:\n](.*?)(?:\n###|\n##|\n---|\Z)', + ] + + for pattern in patterns: + match = re.search(pattern, reflection_output, re.IGNORECASE | re.DOTALL) + if match: + extracted = match.group(1).strip() + # Clean up common artifacts + extracted = re.sub(r'^```(?:plaintext|markdown|text)?\s*\n', '', extracted, flags=re.MULTILINE) + extracted = re.sub(r'\n```\s*$', '', extracted, flags=re.MULTILINE) + extracted = extracted.strip() + + if len(extracted) > 50: # Reasonable minimum length for a prompt + self.logger.debug(f"โœ… Extracted clean prompt using pattern: {pattern[:50]}...") + self.logger.debug(f" Original length: {len(reflection_output)} chars") + self.logger.debug(f" Extracted length: {len(extracted)} chars") + return extracted + + # Pattern 2: If output starts with a quote or prompt-like structure + # Look for text that starts with "You are..." 
and is substantial + if 'You are' in reflection_output: + # Find the longest continuous block that starts with "You are" + prompt_match = re.search(r'(You are[^#]*?)(?:\n###|\n##|###|##|Conclusion|\Z)', + reflection_output, re.IGNORECASE | re.DOTALL) + if prompt_match: + extracted = prompt_match.group(1).strip() + if len(extracted) > 50: + self.logger.debug(f"โœ… Extracted prompt starting with 'You are...'") + return extracted + + # Pattern 3: If the reflection output is actually just a clean prompt (no analysis) + # Check if it's relatively short and doesn't contain analysis keywords + analysis_keywords = ['recommendation', 'suggestion', 'improvement', 'conclusion', + 'optimization', 'analysis', 'feedback'] + if (len(reflection_output) < 2000 and + not any(keyword in reflection_output.lower() for keyword in analysis_keywords)): + # Likely a clean prompt, return as-is + self.logger.debug(f"โœ… Reflection output appears to be a clean prompt (no analysis detected)") + return reflection_output.strip() + + # Fallback: Return original (with warning) + self.logger.warning(f"โš ๏ธ Could not extract clean prompt from reflection output") + self.logger.warning(f" Output length: {len(reflection_output)} chars") + self.logger.warning(f" Output preview: {reflection_output[:200]}...") + self.logger.warning(f" Returning original output (may contain analysis text)") + return reflection_output.strip() + + def _parse_json_variations(self, response_text: str, num_expected: int) -> List[str]: + """ + ๐Ÿ”ฅ OPTIMIZED: Parse N prompt variations from JSON format response. + + Uses robust JSON parsing with multiple fallback strategies: + 1. Extract JSON from markdown code blocks (```json ... ```) + 2. Find JSON object directly in text + 3. Attempt JSON repair for common issues + 4. 
Fallback to numbered section parsing if JSON fails + + Args: + response_text: LLM response containing JSON with variations + num_expected: Expected number of variations + + Returns: + List[str]: List of prompt variations (in order by index) + + Raises: + ValueError: If parsing fails and no valid variations found + """ + import json + import re + + if not response_text or not isinstance(response_text, str): + raise ValueError("Empty or invalid response text") + + # Strategy 1: Extract JSON from markdown code block + json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response_text, re.DOTALL) + if json_match: + json_str = json_match.group(1) + try: + data = json.loads(json_str) + return self._extract_variations_from_json(data, num_expected) + except json.JSONDecodeError as e: + self.logger.debug(f"JSON in code block invalid: {e}, trying repair...") + + # Strategy 2: Find JSON object directly in text + json_match = re.search(r'\{[^{}]*"variations"[^{}]*\[.*?\]\s*[^{}]*\}', response_text, re.DOTALL) + if json_match: + json_str = json_match.group(0) + try: + data = json.loads(json_str) + return self._extract_variations_from_json(data, num_expected) + except json.JSONDecodeError: + # Try to find largest JSON object + json_match = re.search(r'\{.*\}', response_text, re.DOTALL) + if json_match: + try: + data = json.loads(json_match.group(0)) + return self._extract_variations_from_json(data, num_expected) + except json.JSONDecodeError: + pass + + # Strategy 3: Attempt JSON repair (common issues: trailing commas, unescaped quotes) + json_match = re.search(r'\{.*\}', response_text, re.DOTALL) + if json_match: + json_str = json_match.group(0) + # Try common repairs + repaired = re.sub(r',\s*}', '}', json_str) # Remove trailing commas before } + repaired = re.sub(r',\s*]', ']', repaired) # Remove trailing commas before ] + try: + data = json.loads(repaired) + return self._extract_variations_from_json(data, num_expected) + except json.JSONDecodeError: + pass + + # 
Strategy 4: Fallback to numbered section parsing + self.logger.warning(f"JSON parsing failed, trying numbered section fallback...") + try: + return self._parse_numbered_section_variations(response_text, num_expected) + except ValueError: + pass + + # All strategies failed + raise ValueError(f"Could not parse {num_expected} variations from response. Response preview: {response_text[:300]}...") + + def _extract_variations_from_json(self, data: Dict[str, Any], num_expected: int) -> List[str]: + """Extract and validate variations from parsed JSON data.""" + if not isinstance(data, dict): + raise ValueError("JSON data is not a dictionary") + + variations_list = data.get('variations', []) + if not isinstance(variations_list, list): + raise ValueError("'variations' field is not a list") + + if len(variations_list) < num_expected: + self.logger.warning(f"Expected {num_expected} variations, found {len(variations_list)} in JSON") + + # Extract and sort by index + variations_with_index = [] + for var in variations_list: + if not isinstance(var, dict): + continue + index = var.get('index', 0) + prompt = var.get('prompt', '') + if prompt and isinstance(prompt, str): + variations_with_index.append((index, prompt.strip())) + + # Sort by index + variations_with_index.sort(key=lambda x: x[0]) + + # Extract just the prompts + variations = [v[1] for v in variations_with_index] + + # Validate count + if len(variations) < num_expected: + self.logger.warning(f"Only {len(variations)} valid variations found, expected {num_expected}") + # Pad with duplicates if needed (not ideal but better than failing) + while len(variations) < num_expected: + variations.append(variations[-1] if variations else "") + + # Take first N if we got more + variations = variations[:num_expected] + + # Validate all variations are non-empty + if not all(v for v in variations): + raise ValueError(f"Some variations are empty after parsing") + + return variations + + def _parse_numbered_section_variations(self, 
response_text: str, num_expected: int) -> List[str]: + """ + Fallback parser: Extract variations from numbered sections. + + Format: --- VARIATION N --- or Variation N: or similar + """ + variations = [] + + # Pattern 1: --- VARIATION N --- + pattern1 = r'---\s*VARIATION\s+(\d+)\s*---\s*\n(.*?)(?=\n---\s*VARIATION|\Z)' + matches1 = re.findall(pattern1, response_text, re.DOTALL | re.IGNORECASE) + + # Pattern 2: Variation N: + pattern2 = r'Variation\s+(\d+)\s*:?\s*\n(.*?)(?=\nVariation\s+\d+|$)' + matches2 = re.findall(pattern2, response_text, re.DOTALL | re.IGNORECASE) + + # Pattern 3: Numbered list (1. 2. 3.) + pattern3 = r'(\d+)\.\s*\n(.*?)(?=\n\d+\.|$)' + matches3 = re.findall(pattern3, response_text, re.DOTALL) + + # Use the pattern with most matches + matches = matches1 if len(matches1) >= num_expected else (matches2 if len(matches2) >= num_expected else matches3) + + if len(matches) >= num_expected: + # Sort by index + matches.sort(key=lambda x: int(x[0])) + # Extract prompts + variations = [match[1].strip() for match in matches[:num_expected]] + + if len(variations) != num_expected: + raise ValueError(f"Numbered section parsing found {len(variations)} variations, expected {num_expected}") + + return variations + + def _generate_hybrid_candidates_adapter_level( + self, + current_prompt: str, + eval_batch: EvaluationBatch, + candidate: Dict[str, str] + ) -> List[str]: + """ + ๐Ÿ”ฅ ADAPTER-LEVEL HYBRID CANDIDATE GENERATION + + Generate candidates from BOTH GEPA reflection AND LLEGO operators + when GEPA's adapter mode ignores the reflection_lm parameter. + + This method: + 1. Builds comprehensive feedback from evaluation results + 2. Generates GEPA reflection candidates + 3. Generates LLEGO crossover/mutation candidates + 4. Logs ALL candidates with FULL prompts (no truncation) + 5. 
Stores candidates for potential use + + Args: + current_prompt: The current prompt being optimized + eval_batch: Evaluation results with trajectories + candidate: Current candidate dict + + Returns: + List of generated candidate prompts + """ + try: + from ..llms.llego_enhanced_llm import LLEGOEnhancedLLMClient + + all_candidates = [] + gepa_count = 0 + + # ๐Ÿ”ฅ CRITICAL: Pass format info to LLM client before generating candidates + if self._detected_format and self._reflection_lm_client: + if isinstance(self._reflection_lm_client, LLEGOEnhancedLLMClient): + self._reflection_lm_client._detected_format = self._detected_format + self.logger.info(f"๐Ÿ“ Passed format info to reflection LLM: {self._detected_format['format_type']}") + + self.logger.info(f"๐Ÿ”ฅ STEP 1: Building comprehensive feedback from evaluation") + + # ๐Ÿ”ฅ REMOVED: Excessive diagnostic logs - moved to DEBUG level + # Build comprehensive feedback text from trajectories + if not hasattr(eval_batch, 'trajectories'): + self.logger.error(f"โŒ eval_batch has no 'trajectories' attribute! 
Type: {type(eval_batch)}") + return [] + + trajectories = eval_batch.trajectories + if not trajectories: + self.logger.warning(f"โš ๏ธ eval_batch.trajectories is empty - no feedback to generate candidates from") + return [] + + self.logger.debug(f"Processing {len(trajectories)} trajectories for feedback generation") + + feedback_lines = [] + feedback_lines.append(f"Current prompt performance analysis:\n") + feedback_lines.append(f"Current prompt:\n{current_prompt}\n") + feedback_lines.append(f"\nEvaluation results:\n") + + for i, trace in enumerate(trajectories[:8], 1): # Use up to 8 samples for feedback + try: + eval_results = trace.get('evaluation_results', {}) + score = eval_results.get("composite_score", 0.0) if isinstance(eval_results, dict) else 0.0 + input_data = trace.get('input_data', {}) + predicted = trace.get('predicted_output', '') + expected = input_data.get('output', '') if isinstance(input_data, dict) else '' + + # ๐Ÿ”ฅ FIX: Clean input_data to remove base64 images before logging + input_data_clean = input_data.copy() if isinstance(input_data, dict) else {} + if 'image_base64' in input_data_clean: + input_data_clean['image_base64'] = f"[IMAGE_DATA_{len(input_data_clean['image_base64'])}_chars]" + + feedback_lines.append(f" Sample {i}:") + feedback_lines.append(f" Input: {input_data_clean.get('input', '') if isinstance(input_data_clean, dict) else ''}") + feedback_lines.append(f" Expected: {expected}") + feedback_lines.append(f" Predicted: {predicted}") + feedback_lines.append(f" Score: {score:.4f}") + + if isinstance(eval_results, dict): + # ๐Ÿ”ฅ FIX: Pass trace and current_prompt to enable LLM-as-Judge! + feedback = self._generate_feedback( + eval_results, + trace=trace, # Pass the full trace! + current_prompt=current_prompt # Pass current prompt! 
+ ) + feedback_lines.append(f" Feedback: {feedback}") + else: + feedback_lines.append(f" Feedback: Evaluation results not in expected format") + feedback_lines.append("") + except Exception as e: + self.logger.error(f"โŒ Error processing trace {i}: {e}") + import traceback + self.logger.error(traceback.format_exc()) + continue + + feedback_text = "\n".join(feedback_lines) + + self.logger.info(f"\n๐Ÿ“‹ FULL FEEDBACK TEXT (NO TRUNCATION):") + self.logger.info(feedback_text) + + # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + # PART 1: GEPA REFLECTION CANDIDATES + # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + self.logger.info(f"๐Ÿ“ PART 2: GEPA REFLECTION - Semantic Understanding") + + num_gepa = self._config.num_gepa_reflection_candidates if hasattr(self._config, 'num_gepa_reflection_candidates') else 3 + + self.logger.info(f"\n๐Ÿ“ Generating {num_gepa} GEPA Reflection candidates in single optimized call...") + + # Set reflection context + if isinstance(self._reflection_lm_client, LLEGOEnhancedLLMClient): + self._reflection_lm_client.set_reflection_context( + current_prompt=current_prompt, + feedback=eval_batch, + in_reflection=True + ) + + # ๐Ÿ”ฅ OPTIMIZED: Single call with JSON format for multiple variations + try: + # Precision-engineered system prompt requesting JSON format + optimization_system_prompt = f"""You are an expert prompt engineer specializing in iterative prompt optimization. + +Your task: Given the CURRENT PROMPT and its EVALUATION FEEDBACK, generate {num_gepa} DISTINCT variations of improved prompts that address the identified issues through DIFFERENT improvement strategies. 
+ +CRITICAL OUTPUT FORMAT - MUST BE VALID JSON: +{{ + "variations": [ + {{ + "index": 1, + "prompt": "[First improved prompt text - complete and self-contained]" + }}, + {{ + "index": 2, + "prompt": "[Second improved prompt text - complete and self-contained]" + }}, + {{ + "index": 3, + "prompt": "[Third improved prompt text - complete and self-contained]" + }} + ] +}} + +DIVERSITY REQUIREMENTS: +- Variation 1: Focus on clarity, specificity, and explicit instructions +- Variation 2: Focus on edge case handling, robustness, and error prevention +- Variation 3: Focus on structural organization, examples, and step-by-step guidance +- Each variation must be MEANINGFULLY DIFFERENT (not just rewordings) +- Each variation must address ALL feedback issues but through different approaches + +QUALITY STANDARDS (apply to all variations): +- Be specific and concrete (avoid vague instructions) +- Use clear, imperative language for task instructions +- Include edge case handling if feedback identifies confusion +- Ensure each prompt is self-contained and unambiguous +- Preserve the core task domain and output format requirements + +OUTPUT FORMAT: +- Output MUST be valid JSON (can be wrapped in ```json ... ``` markdown code block) +- Generate EXACTLY {num_gepa} variations +- Index must be 1, 2, 3, ... (sequential, starting at 1) +- Each "prompt" field must contain the complete, self-contained prompt text +- NO explanations, NO analysis, NO meta-commentary - just the JSON structure + +DO NOT include: +- Analysis of what went wrong +- Explanations of your changes +- Meta-text like "Here's an improved version..." or "Based on feedback..." 
+- Recommendations or suggestions (those are already in the feedback) +- Any text outside the JSON structure + +Output ONLY the JSON object with the variations.""" + + # Construct user prompt with clear structure + optimization_user_prompt = f"""CURRENT PROMPT (to be improved): +{current_prompt} + +{feedback_text} + +TASK: Generate {num_gepa} DISTINCT variations of improved prompts. Each variation should: +- Address ALL feedback issues identified above +- Use a DIFFERENT improvement strategy (clarity, robustness, structure) +- Be meaningfully different from the others (not just rewordings) +- Be complete and self-contained + +Remember: Output ONLY the JSON object with {num_gepa} variations. No explanations.""" + + result = self._reflection_lm_client.generate( + system_prompt=optimization_system_prompt, + user_prompt=optimization_user_prompt, + image_base64="" + ) + + if isinstance(result, dict): + response_text = result.get("content", str(result)) + else: + response_text = str(result) + + # Parse JSON variations + gepa_variations = self._parse_json_variations(response_text, num_gepa) + + # Add all variations to candidates + for idx, variation_prompt in enumerate(gepa_variations, 1): + # ๐Ÿ›ก๏ธ DEFENSIVE FALLBACK: Extract clean prompt if LLM adds analysis despite instructions + gepa_candidate = self._extract_clean_prompt_from_reflection(variation_prompt) + + if gepa_candidate != variation_prompt: + self.logger.debug(f" Variation {idx}: Extracted clean prompt (removed {len(variation_prompt) - len(gepa_candidate)} chars)") + + all_candidates.append({ + 'prompt': gepa_candidate, + 'source': 'gepa_reflection', + 'index': idx + }) + + # ๐Ÿ”ฅ CAPTURE CANDIDATE FOR LIVE UI DISPLAY + try: + import sys + if 'app' in sys.modules: + app_module = sys.modules['app'] + if hasattr(app_module, 'add_candidate_to_store'): + app_module.add_candidate_to_store({ + 'prompt': gepa_candidate, + 'source': 'gepa_reflection', + 'timestamp': f"Candidate #{idx}" + }) + except Exception: + pass 
# Silent fail - UI capture is optional + + self.logger.info(f"\nโœ… GEPA REFLECTION CANDIDATE #{idx}/{num_gepa} (FULL PROMPT - NO TRUNCATION):") + self.logger.info(f"{'โ–“'*80}") + self.logger.info(f"{gepa_candidate}") + self.logger.info(f"{'โ–“'*80}") + self.logger.info(f" Length: {len(gepa_candidate)} chars, Words: {len(gepa_candidate.split())}") + + gepa_count = len(all_candidates) + self.logger.info(f"\nโœ… GEPA Reflection: {gepa_count} candidates generated in single optimized call") + + except Exception as e: + self.logger.error(f"โŒ Error generating GEPA reflection candidates: {e}") + self.logger.warning(f" Falling back to sequential generation...") + import traceback + self.logger.debug(traceback.format_exc()) + + # Fallback: Sequential generation (when JSON parsing fails) + for i in range(num_gepa): + self.logger.info(f"\n๐Ÿ“ Generating GEPA Reflection candidate #{i+1}/{num_gepa} (fallback mode)...") + try: + fallback_user_prompt = f"""CURRENT PROMPT (to be improved): +{current_prompt} + +{feedback_text} + +TASK: Generate an improved version of the CURRENT PROMPT that addresses all issues identified in the evaluation feedback above. + +Remember: Output ONLY the improved prompt text. 
No explanations.""" + + result = self._reflection_lm_client.generate( + system_prompt=self._FALLBACK_SYSTEM_PROMPT, + user_prompt=fallback_user_prompt, + image_base64="" + ) + + if isinstance(result, dict): + gepa_candidate_raw = result.get("content", str(result)) + else: + gepa_candidate_raw = str(result) + + gepa_candidate = self._extract_clean_prompt_from_reflection(gepa_candidate_raw) + + all_candidates.append({ + 'prompt': gepa_candidate, + 'source': 'gepa_reflection', + 'index': i + 1 + }) + + # ๐Ÿ”ฅ CAPTURE CANDIDATE FOR LIVE UI DISPLAY + try: + import sys + if 'app' in sys.modules: + app_module = sys.modules['app'] + if hasattr(app_module, 'add_candidate_to_store'): + app_module.add_candidate_to_store({ + 'prompt': gepa_candidate, + 'source': 'gepa_reflection', + 'timestamp': f"Fallback #{i+1}" + }) + except Exception: + pass # Silent fail - UI capture is optional + except Exception as fallback_error: + self.logger.error(f"โŒ Error in fallback generation #{i+1}: {fallback_error}") + + gepa_count = len(all_candidates) + if gepa_count > 0: + self.logger.info(f"\nโœ… GEPA Reflection: {gepa_count} candidates generated") + + # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + # PART 2: LLEGO GENETIC OPERATORS + # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + self.logger.info(f"๐Ÿงฌ PART 3: LLEGO GENETIC OPERATORS - Structural Diversity") + + if self.llego: + # ๐Ÿ”ฅ FIX 2: Get Pareto front from GEPA (not LLEGO population) + # This ensures LLEGO operators use true non-dominated solutions + from ..utils.pareto_logger import get_pareto_logger + pareto_log = get_pareto_logger() + gepa_pareto_front = pareto_log.pareto_front + + # Convert GEPA Pareto front to PromptCandidate format + pareto_candidates = 
self.llego._convert_gepa_pareto_to_candidates(gepa_pareto_front) + pareto_front = pareto_candidates + + self.logger.info(f" Using GEPA Pareto front (size: {len(gepa_pareto_front)})") + self.logger.info(f" Converted to {len(pareto_front)} PromptCandidate objects") + for idx, p in enumerate(pareto_front, 1): + cand_type = p.metadata.get('candidate_type', 'unknown') if p.metadata else 'unknown' + notation = p.metadata.get('notation', 'S') if p.metadata else 'S' + self.logger.info(f" {notation}: [fitness={p.fitness:.3f}, type={cand_type}, length={len(p.prompt)} chars]") + + # Create LLM callable for LLEGO + def llm_callable(genetic_prompt: str) -> str: + # ๐Ÿ”ฅ LLEGO genetic prompt already contains full instructions + # Use minimal system prompt to avoid instruction conflict + result = self._reflection_lm_client.generate( + system_prompt="You are an expert prompt engineer. Follow the instructions provided in the user message to generate an improved prompt. Output only the prompt text, no explanations.", + user_prompt=genetic_prompt, + image_base64="" + ) + if isinstance(result, dict): + return result.get('content', str(result)) + return str(result) + + # Generate LLEGO offspring + try: + llego_prompts = self.llego.evolve_generation( + llm=llm_callable, + pareto_front=pareto_front + ) + + n_crossover = self._config.n_crossover if hasattr(self._config, 'n_crossover') else 2 + crossover_count = min(n_crossover, len(llego_prompts)) + + for i, prompt in enumerate(llego_prompts): + if i < crossover_count: + source = 'llego_crossover' + else: + source = 'llego_mutation' + + all_candidates.append({ + 'prompt': prompt, + 'source': source, + 'index': i + 1 + }) + + # ๐Ÿ”ฅ CAPTURE CANDIDATE FOR LIVE UI DISPLAY + try: + import sys + if 'app' in sys.modules: + app_module = sys.modules['app'] + if hasattr(app_module, 'add_candidate_to_store'): + app_module.add_candidate_to_store({ + 'prompt': prompt, + 'source': source, + 'timestamp': f"Candidate #{i+1}" + }) + except Exception: + 
pass # Silent fail - UI capture is optional + + border_char = "โ–“" if source == 'llego_crossover' else "โ–’" + self.logger.info(f"\n{border_char*80}") + self.logger.info(f"{border_char} {'๐Ÿ”€ LLEGO CROSSOVER' if source == 'llego_crossover' else '๐ŸŽฒ LLEGO MUTATION'} candidate #{i+1}") + self.logger.info(f"{border_char*80}") + self.logger.info(f"{prompt}") + self.logger.info(f"{border_char*80}") + self.logger.info(f" Length: {len(prompt)} chars, Words: {len(prompt.split())}") + + self.logger.info(f"โœ… LLEGO Genetic Operators: {len(llego_prompts)} candidates generated") + + except Exception as e: + self.logger.error(f"โŒ Error generating LLEGO candidates: {e}") + import traceback + self.logger.error(traceback.format_exc()) + + # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + # SUMMARY + # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + self.logger.info(f"\n{'='*80}") + self.logger.info(f"๐Ÿ“Š ADAPTER-LEVEL HYBRID GENERATION SUMMARY") + self.logger.info(f"{'='*80}") + self.logger.info(f" ๐Ÿ“ GEPA Reflection: {gepa_count} candidates") + self.logger.info(f" ๐Ÿ”€ LLEGO Crossover: {len([c for c in all_candidates if c['source'] == 'llego_crossover'])} candidates") + self.logger.info(f" ๐ŸŽฒ LLEGO Mutation: {len([c for c in all_candidates if c['source'] == 'llego_mutation'])} candidates") + self.logger.info(f" ๐Ÿ“ฆ TOTAL: {len(all_candidates)} diverse candidates") + self.logger.info(f"{'='*80}\n") + + # Store candidates (GEPA might access them through some mechanism) + self._generated_candidates = all_candidates + + # Log each candidate with FULL text + self.logger.info(f"\n{'='*80}") + self.logger.info(f"๐Ÿ“‹ ALL GENERATED CANDIDATES (FULL PROMPTS - NO TRUNCATION)") + self.logger.info(f"{'='*80}") + for i, cand in enumerate(all_candidates, 
1): + source_emoji = "๐Ÿ“" if cand['source'] == 'gepa_reflection' else "๐Ÿ”€" if cand['source'] == 'llego_crossover' else "๐ŸŽฒ" + self.logger.info(f"\n{source_emoji} CANDIDATE #{i} - {cand['source'].upper().replace('_', ' ')}") + self.logger.info(f"{cand['prompt']}") + self.logger.info(f" Length: {len(cand['prompt'])} characters") + self.logger.info(f" Words: {len(cand['prompt'].split())} words") + self.logger.info(f"{'='*80}\n") + + # Return candidates as list of dicts with metadata (not just strings) + # This ensures source information is preserved + return all_candidates # Return full dicts with source info + + except Exception as e: + self.logger.error(f"\n{'โŒ'*80}") + self.logger.error(f"โŒ CRITICAL ERROR in _generate_hybrid_candidates_adapter_level!") + self.logger.error(f"โŒ Error: {str(e)}") + self.logger.error(f"{'โŒ'*80}\n") + import traceback + self.logger.error(traceback.format_exc()) + return [] + + def propose_new_texts( + self, + candidate: Dict[str, str], + reflective_dataset: Dict[str, List[Dict[str, Any]]], + components_to_update: List[str] + ) -> Dict[str, str]: + """ + ๐Ÿ”ฅ CRITICAL: This method is called by GEPA to propose new component texts. + + This is the KEY integration point - GEPA checks if adapter.propose_new_texts exists, + and if it does, uses it instead of the default InstructionProposalSignature. + + This method: + 1. Uses reflective_dataset to generate improved prompts + 2. Optionally uses LLEGO for additional diversity + 3. 
Returns dict mapping component_name -> new component text + + Args: + candidate: Current candidate dict (component_name -> component_text) + reflective_dataset: Feedback data per component (from make_reflective_dataset) + components_to_update: List of component names to update + + Returns: + Dict mapping component_name -> new component text + """ + self.logger.info(f"\n{'='*80}") + self.logger.info(f"๐ŸŽฏ PROPOSE_NEW_TEXTS CALLED BY GEPA") + self.logger.info(f"{'='*80}") + self.logger.info(f" Components to update: {components_to_update}") + self.logger.info(f" Reflective dataset keys: {list(reflective_dataset.keys())}") + + # ๐Ÿ”ฅ FIX: Check if we already generated candidates in hybrid mode + # If yes, return one of them instead of generating a new one (avoids duplicate work and context overflow) + if hasattr(self, '_generated_candidates') and self._generated_candidates: + self.logger.info(f"\nโœ… HYBRID MODE: Using pre-generated candidates from make_reflective_dataset") + self.logger.info(f" Available candidates: {len(self._generated_candidates)}") + self.logger.info(f" Returning first candidate (GEPA will evaluate all of them)") + + # Return the first candidate (GEPA will get others via queue) + first_candidate = self._generated_candidates[0] + new_texts = {} + for component in components_to_update: + if isinstance(first_candidate, dict) and 'prompt' in first_candidate: + new_texts[component] = first_candidate['prompt'] + source = first_candidate.get('source', 'unknown') + self.logger.info(f" Returning {source} candidate (length: {len(first_candidate['prompt'])} chars)") + else: + new_texts[component] = str(first_candidate) + + self.logger.info(f"{'='*80}\n") + return new_texts + + new_texts = {} + + # Check if we have reflection_lm_client (required for proposal) + if not self._reflection_lm_client: + self.logger.error("โŒ reflection_lm_client not available - cannot generate proposals") + # Fallback: return current candidate (no change) + for component in 
components_to_update: + new_texts[component] = candidate.get(component, '') + return new_texts + + # For each component to update + for component_name in components_to_update: + self.logger.info(f"๐Ÿ“ Proposing new text for component: {component_name}") + + current_text = candidate.get(component_name, '') + dataset = reflective_dataset.get(component_name, []) + + if not dataset: + self.logger.warning(f"โš ๏ธ No feedback data for {component_name}, keeping current text") + new_texts[component_name] = current_text + continue + + self.logger.info(f" Current text length: {len(current_text)} chars") + self.logger.info(f" Feedback examples: {len(dataset)}") + + # Generate improved prompt using reflection LM + try: + # ๐Ÿ”ฅ FIX: Clean dataset to remove base64 images (prevents context overflow) + cleaned_dataset = [] + for item in dataset: + cleaned_item = item.copy() + # Remove or truncate base64 image data + if 'image_base64' in cleaned_item: + img_len = len(cleaned_item['image_base64']) + cleaned_item['image_base64'] = f'[IMAGE_DATA_REMOVED_{img_len}_chars]' + if 'image' in cleaned_item and isinstance(cleaned_item['image'], str) and len(cleaned_item['image']) > 1000: + img_len = len(cleaned_item['image']) + cleaned_item['image'] = f'[IMAGE_DATA_REMOVED_{img_len}_chars]' + # Also clean any nested detailed_scores + if 'detailed_scores' in cleaned_item and isinstance(cleaned_item['detailed_scores'], dict): + for key in list(cleaned_item['detailed_scores'].keys()): + val = cleaned_item['detailed_scores'][key] + if isinstance(val, str) and len(val) > 5000: + cleaned_item['detailed_scores'][key] = f'[LARGE_DATA_REMOVED_{len(val)}_chars]' + cleaned_dataset.append(cleaned_item) + + self.logger.info(f" ๐Ÿ“‹ Cleaned dataset: removed base64 images to prevent context overflow") + + # Use GEPA's default instruction proposal format + from gepa.strategies.instruction_proposal import InstructionProposalSignature + + # Build input dict for GEPA's instruction proposal + input_dict = { + 
"current_instruction_doc": current_text, + "dataset_with_feedback": cleaned_dataset # Use cleaned dataset! + } + + # Generate prompt using GEPA's signature + prompt = InstructionProposalSignature.prompt_renderer(input_dict) + + # Call reflection LM to generate new instruction + self.logger.info(f" Generating improved prompt via reflection LM...") + + result = self._reflection_lm_client.generate( + system_prompt="You are an expert prompt engineer. Follow the instructions in the user message to generate an improved prompt.", + user_prompt=prompt, + image_base64="" + ) + + # Extract response + if isinstance(result, dict): + response_text = result.get("content", str(result)) + else: + response_text = str(result) + + # Extract instruction using GEPA's extractor + extracted = InstructionProposalSignature.output_extractor(response_text) + new_instruction = extracted.get("new_instruction", response_text.strip()) + + # Clean up the instruction (remove markdown, quotes, etc.) + new_instruction = self._clean_extracted_prompt(new_instruction) + + self.logger.info(f" โœ… Generated new text (length: {len(new_instruction)} chars)") + self.logger.info(f" Preview: {new_instruction[:150]}...") + + new_texts[component_name] = new_instruction + + except Exception as e: + self.logger.error(f"โŒ Error generating proposal for {component_name}: {e}") + import traceback + self.logger.error(traceback.format_exc()) + # Fallback: return current text + new_texts[component_name] = current_text + + self.logger.info(f"\n{'='*80}") + self.logger.info(f"โœ… PROPOSE_NEW_TEXTS COMPLETE") + self.logger.info(f" Generated {len(new_texts)} new component texts") + self.logger.info(f"{'='*80}\n") + + return new_texts + + def _clean_extracted_prompt(self, prompt: str) -> str: + """ + Clean extracted prompt by removing markdown, quotes, and extra whitespace. 
+ + Args: + prompt: Raw extracted prompt text + + Returns: + Cleaned prompt text + """ + if not prompt: + return prompt + + # Remove markdown code blocks + prompt = re.sub(r'```[\w]*\n?', '', prompt) + prompt = re.sub(r'```', '', prompt) + + # Remove quotes if entire prompt is quoted + prompt = prompt.strip() + if (prompt.startswith('"') and prompt.endswith('"')) or \ + (prompt.startswith("'") and prompt.endswith("'")): + prompt = prompt[1:-1] + + # Remove leading/trailing whitespace + prompt = prompt.strip() + + return prompt \ No newline at end of file diff --git a/src/gepa_optimizer/data/__init__.py b/src/gepa_optimizer/data/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8fbd2af54c03c26e5d54c79d12e6d16140d76190 --- /dev/null +++ b/src/gepa_optimizer/data/__init__.py @@ -0,0 +1,27 @@ +""" +Data module for GEPA Optimizer +""" + +from .converters import UniversalConverter +from .loaders import DataLoader +from .validators import DataValidator +from .scroll_dataset_loader import ScrollDatasetLoader, load_scroll_dataset +from .validation_dataset_loader import ValidationDatasetLoader, load_validation_dataset, load_validation_split +from .index_caching_loader import IndexCachingDatasetLoader, load_index_caching_dataset, load_index_caching_split + +__all__ = [ + "UniversalConverter", + "DataLoader", + "DataValidator", + # Scroll dataset + "ScrollDatasetLoader", + "load_scroll_dataset", + # Validation dataset + "ValidationDatasetLoader", + "load_validation_dataset", + "load_validation_split", + # Index caching dataset + "IndexCachingDatasetLoader", + "load_index_caching_dataset", + "load_index_caching_split", +] diff --git a/src/gepa_optimizer/data/converters.py b/src/gepa_optimizer/data/converters.py new file mode 100644 index 0000000000000000000000000000000000000000..cc46ec186d799e47b92c756e594a8f796b8e8364 --- /dev/null +++ b/src/gepa_optimizer/data/converters.py @@ -0,0 +1,265 @@ +""" +Universal converter for dataset to GEPA format 
with 3-way split (train/val/test) +""" + +import os +import json +from typing import Any, List, Tuple, Union, Dict, Optional +from pathlib import Path +import pandas as pd +import logging + +from .loaders import DataLoader +from ..utils.exceptions import DatasetError +from ..models.config import DataSplitConfig + +logger = logging.getLogger(__name__) + +class UniversalConverter: + """ + Universal converter for datasets to GEPA format. + + Handles 3-way splitting (train/val/test) with configurable ratios and + graceful handling of small datasets. + """ + + def __init__(self, data_split_config: Optional[DataSplitConfig] = None): + """ + Initialize converter with optional split configuration. + + Args: + data_split_config: Configuration for train/val/test splits. + If None, uses default 60/20/20 split. + """ + self.supported_extensions = [ + '.csv', '.json', '.jsonl', '.txt', '.md', + '.png', '.jpg', '.jpeg' + ] + self.loader = DataLoader() + self.data_split_config = data_split_config or DataSplitConfig() + + def convert( + self, + dataset: Union[List[Any], str, Any, Dict[str, Any]], + split_config: Optional[DataSplitConfig] = None + ) -> Tuple[List[dict], List[dict], List[dict]]: + """ + Convert any dataset to GEPA format with 3-way split (train/val/test). 
+ + Args: + dataset: Input dataset in any supported format + split_config: Optional split configuration (overrides instance config) + + Returns: + Tuple of (trainset, valset, testset) where: + - trainset: Used for reflection/feedback (Dfeedback in GEPA paper) + - valset: Used for Pareto selection (Dpareto in GEPA paper) + - testset: Held-out for final evaluation (not passed to GEPA) + + Raises: + DatasetError: If dataset cannot be converted or is too small + """ + try: + # Use provided split config or instance default + config = split_config or self.data_split_config + + # Handle UI tree dataset format + if isinstance(dataset, dict) and 'type' in dataset and dataset['type'] == 'ui_tree_dataset': + return self.convert_ui_tree_dataset( + dataset.get('json_dir', 'json_tree'), + dataset.get('screenshots_dir', 'screenshots'), + split_config=config + ) + elif isinstance(dataset, str): + data = self._load_from_path(dataset) + elif hasattr(dataset, 'to_dict'): # pandas DataFrame + data = dataset.to_dict(orient='records') + elif isinstance(dataset, list): + data = dataset + else: + data = [dataset] + + logger.info(f"Normalized data length: {len(data)}") + standardized = self._standardize(data) + train, val, test = self._split_three_way(standardized, config) + return train, val, test + except (FileNotFoundError, ValueError, TypeError) as e: + raise DatasetError(f"Failed to convert dataset: {str(e)}") + + def _load_from_path(self, path: str) -> List[Any]: + """Load data from file path""" + p = Path(path) + if not p.exists(): + raise FileNotFoundError(f"File not found: {path}") + + ext = p.suffix.lower() + if ext in self.supported_extensions: + return [self.loader.load(p)] + else: + raise DatasetError(f"Unsupported file extension: {ext}") + + def _standardize(self, data: List[Any]) -> List[dict]: + """Standardize data to input/output format + + Handles both UI tree JSON format and simple text inputs. 
+ UI tree format should have: {'screenshot': str, 'ui_tree': dict, 'expected_output': str} + Simple format can be: {'input': str, 'output': str} or {'question': str, 'answer': str} etc. + """ + out = [] + for item in data: + if not isinstance(item, dict): + item = {'input': str(item)} + + # Handle UI tree JSON format + if 'ui_tree' in item and 'screenshot' in item: + ui_tree = item['ui_tree'] + input_text = ui_tree.get('text', '') + output_text = item.get('expected_output', '') + image = item.get('screenshot', '') + out.append({'input': input_text, 'output': output_text, 'image': image}) + # Handle simple text format + else: + inp = self._extract(item, ['input', 'question', 'text', 'prompt']) or '' + outp = self._extract(item, ['output', 'result', 'response', 'answer', 'expected_output']) or '' + image = self._extract(item, ['image', 'image_base64', 'screenshot']) or '' + out.append({'input': inp, 'output': outp, 'image': image}) + + return out + + def _extract(self, d: dict, keys: List[str]) -> Union[str, None]: + """Extract value by trying multiple keys""" + for k in keys: + if k in d: + return d[k] + return None + + def _split_three_way( + self, + data: List[dict], + config: DataSplitConfig + ) -> Tuple[List[dict], List[dict], List[dict]]: + """ + Split data into train, validation, and test sets. 
+ + Args: + data: Standardized dataset + config: Split configuration with ratios and strategies + + Returns: + Tuple of (train, val, test) datasets + + Raises: + ValueError: If dataset is too small for configured splits + """ + dataset_size = len(data) + + # ๐Ÿ”ฅ NEW: Log adaptive strategy if being used + if config.small_dataset_strategy == 'adaptive': + train_ratio, val_ratio, test_ratio = config.get_adaptive_ratios(dataset_size) + logger.info( + f"๐Ÿ“Š Adaptive dataset splitting (strategy: adaptive, size: {dataset_size}): " + f"ratios = {train_ratio*100:.0f}%/{val_ratio*100:.0f}%/{test_ratio*100:.0f}% " + f"(prioritizes validation for reliable candidate ranking)" + ) + + # Get split indices from config + try: + train_end, val_end, test_end, _ = config.get_split_indices(dataset_size) + except ValueError as e: + logger.error(f"Dataset split error: {e}") + raise DatasetError(str(e)) + + # Perform the split + train = data[:train_end] + val = data[train_end:val_end] + test = data[val_end:test_end] + + # Log split information with strategy + strategy_note = "" + if config.small_dataset_strategy == 'adaptive': + strategy_note = " (adaptive)" + logger.info( + f"Dataset split{strategy_note}: {len(train)} train ({len(train)/dataset_size*100:.1f}%), " + f"{len(val)} val ({len(val)/dataset_size*100:.1f}%), " + f"{len(test)} test ({len(test)/dataset_size*100:.1f}%)" + ) + + # Validate splits are not empty + if len(train) == 0: + raise DatasetError("Training set is empty after split") + if len(val) == 0: + logger.warning("Validation set is empty - this may cause issues with Pareto selection") + val = [train[-1]] # Use last training sample as fallback + if len(test) == 0: + logger.warning("Test set is empty - final evaluation will not be performed") + + return train, val, test + + def _split(self, data: List[dict], ratio: float = 0.8) -> Tuple[List[dict], List[dict]]: + """ + DEPRECATED: Legacy 2-way split for backwards compatibility. 
+ + Use _split_three_way() instead for production code. + + Args: + data: Standardized dataset + ratio: Train ratio (0.0-1.0) + + Returns: + Tuple of (train, val) datasets + """ + import warnings + warnings.warn( + "_split() is deprecated. Use _split_three_way() for 3-way splitting.", + DeprecationWarning, + stacklevel=2 + ) + + split = max(1, int(len(data) * ratio)) + train = data[:split] + val = data[split:] or data[-1:] # Ensure val is not empty + return train, val + + def convert_ui_tree_dataset( + self, + json_dir: str, + screenshots_dir: str, + split_config: Optional[DataSplitConfig] = None + ) -> Tuple[List[dict], List[dict], List[dict]]: + """ + Convert UI tree dataset (JSON + screenshots) to GEPA format with 3-way split. + + Args: + json_dir: Directory containing JSON files + screenshots_dir: Directory containing screenshot images + split_config: Optional split configuration (overrides instance config) + + Returns: + Tuple of (train_data, val_data, test_data) in GEPA format + + Raises: + DatasetError: If dataset cannot be loaded or is invalid + """ + try: + # Load paired dataset + dataset = self.loader.load_ui_tree_dataset(json_dir, screenshots_dir) + + if not dataset: + raise DatasetError("No valid image-JSON pairs found") + + logger.info(f"Loaded {len(dataset)} UI tree samples") + + # Use provided config or instance default + config = split_config or self.data_split_config + + # Split into train/val/test + train, val, test = self._split_three_way(dataset, config) + + logger.info( + f"Split UI tree dataset: {len(train)} train, " + f"{len(val)} validation, {len(test)} test" + ) + return train, val, test + + except Exception as e: + raise DatasetError(f"Failed to convert UI tree dataset: {str(e)}") diff --git a/src/gepa_optimizer/data/index_caching_loader.py b/src/gepa_optimizer/data/index_caching_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..a5e38edd78fbd009bca23e28505da62c89841c99 --- /dev/null +++ 
b/src/gepa_optimizer/data/index_caching_loader.py @@ -0,0 +1,278 @@ +""" +Index Caching Dataset Loader + +Loads index caching dataset from JSON file (note2_debug.json format) and converts to GEPA-compatible format. +""" + +import os +import json +import base64 +import logging +from typing import List, Dict, Any, Optional +from pathlib import Path + +logger = logging.getLogger(__name__) + + +class IndexCachingDatasetLoader: + """ + Loads index caching dataset from JSON file. + + Expected JSON format: + [ + { + "command": "Tap on first option from the suggestion", + "image": "element_images/QMxgc_14_0_tap_IkALe_element.png", + "xml": "xml/IkALe__debug.xml", + "expected": { + "is_index_based": true, + "index_value": 1, + "parent_element_id": "aaaabf", + "element_id_of_nth_child_of_parent": "aaaabg", + "selected_element_is_correct": true + } + }, + ... + ] + + Converts to GEPA format: + - input: command text (seed prompt will be provided in test script) + - output: JSON string with expected values + - image_base64: base64 encoded image (TOP LEVEL for UniversalConverter) + - input: Command + XML content (combined in user prompt) + - metadata: All original fields plus converted values + """ + + def __init__(self, json_path: Optional[str] = None, base_dir: Optional[str] = None): + """ + Initialize index caching dataset loader. + + Args: + json_path: Path to JSON file. Default: "./note2_debug.json" or from env var + base_dir: Base directory for resolving relative paths in JSON. 
+ Default: Directory containing JSON file + + Raises: + FileNotFoundError: If JSON file doesn't exist + json.JSONDecodeError: If JSON file is invalid + """ + # Get JSON path from env or use default + if json_path is None: + json_path = os.getenv("INDEX_CACHING_DATASET_PATH", "./note2_debug.json") + + self.json_path = Path(json_path).resolve() + + if not self.json_path.exists(): + raise FileNotFoundError( + f"Dataset file not found: {self.json_path}\n" + f"Make sure note2_debug.json exists in the project root." + ) + + # Base directory for resolving relative paths + if base_dir is None: + base_dir = self.json_path.parent + self.base_dir = Path(base_dir).resolve() + + def load_dataset(self) -> List[Dict[str, Any]]: + """ + Load dataset from JSON file and convert to GEPA format. + + Returns: + List of dataset items in GEPA format: + [ + { + "input": "Tap on first option from the suggestion", # Command only + "output": '{"is_index_based": true, "index_value": 1, ...}', # Expected JSON + "image_base64": "", # TOP LEVEL + "metadata": { + "command": "...", + "image_path": "...", + "xml_path": "...", + "expected": {...} + } + }, + ... 
+ ] + + Raises: + FileNotFoundError: If image or XML file doesn't exist + json.JSONDecodeError: If JSON file is invalid + """ + # Load JSON file + with open(self.json_path, "r", encoding="utf-8") as f: + dataset = json.load(f) + + gepa_dataset = [] + + for idx, entry in enumerate(dataset): + command = entry.get("command", "") + image_path = entry.get("image", "") + xml_path = entry.get("xml", "") + expected = entry.get("expected", {}) + + # Resolve paths relative to base_dir + abs_image_path = (self.base_dir / image_path).resolve() + abs_xml_path = (self.base_dir / xml_path).resolve() + + # Validate paths + if not abs_image_path.exists(): + raise FileNotFoundError( + f"Image file not found: {abs_image_path}\n" + f"Entry {idx + 1}: {command}" + ) + + if not abs_xml_path.exists(): + raise FileNotFoundError( + f"XML file not found: {abs_xml_path}\n" + f"Entry {idx + 1}: {command}" + ) + + # Load and encode image + with open(abs_image_path, "rb") as f: + image_data = f.read() + image_base64 = base64.b64encode(image_data).decode("utf-8") + + # Load XML content + with open(abs_xml_path, "r", encoding="utf-8") as f: + xml_content = f.read() + + # Convert expected to JSON string + expected_json = json.dumps(expected, ensure_ascii=False) + + # Create user prompt with command + XML content + # The XML will be included in the user prompt text (as the agent does) + user_prompt = f"{command}\n\nXML Content:\n\n```xml\n{xml_content}\n```" + + # For reflection, we don't need full XML - just the command is enough + # Reflection is about improving the prompt based on evaluation feedback, + # not analyzing specific XML structures + reflection_input = command # Just the command, no XML + + # Create GEPA format item + gepa_item = { + "input": user_prompt, # Command + XML content (for evaluation) + "reflection_input": reflection_input, # Just command (for reflection) + "output": expected_json, # Expected output as JSON string + "image_base64": image_base64, # TOP LEVEL for 
UniversalConverter + "metadata": { + "command": command, + "image_path": str(image_path), + "xml_path": str(xml_path), + "abs_image_path": str(abs_image_path), + "abs_xml_path": str(abs_xml_path), + "xml_content": xml_content, # Store XML separately in metadata + "expected": expected, + "dataset_index": idx + } + } + + gepa_dataset.append(gepa_item) + + return gepa_dataset + + def load_split( + self, + train_ratio: float = 0.6, + val_ratio: float = 0.4 + ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: + """ + Load dataset and split into train/val sets (no test set). + + Args: + train_ratio: Ratio for training set (default: 0.6) + val_ratio: Ratio for validation set (default: 0.4) + + Returns: + Tuple of (train_set, val_set) + + Raises: + ValueError: If ratios don't sum to 1.0 + """ + if abs(train_ratio + val_ratio - 1.0) > 0.01: + raise ValueError( + f"Split ratios must sum to 1.0, got {train_ratio + val_ratio:.3f}" + ) + + dataset = self.load_dataset() + total = len(dataset) + + train_end = int(total * train_ratio) + + train_set = dataset[:train_end] + val_set = dataset[train_end:] + + return train_set, val_set + + +def load_index_caching_dataset( + json_path: Optional[str] = None, + base_dir: Optional[str] = None +) -> List[Dict[str, Any]]: + """ + Convenience function to load index caching dataset. + + Args: + json_path: Path to JSON file + base_dir: Base directory for resolving relative paths + + Returns: + List of dataset items in GEPA format + """ + loader = IndexCachingDatasetLoader(json_path=json_path, base_dir=base_dir) + return loader.load_dataset() + + +def load_index_caching_split( + json_path: Optional[str] = None, + base_dir: Optional[str] = None, + train_ratio: float = 0.6, + val_ratio: float = 0.4 +) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: + """ + Convenience function to load and split index caching dataset. 
+ + Args: + json_path: Path to JSON file + base_dir: Base directory for resolving relative paths + train_ratio: Ratio for training set + val_ratio: Ratio for validation set + + Returns: + Tuple of (train_set, val_set) - no test set + """ + loader = IndexCachingDatasetLoader(json_path=json_path, base_dir=base_dir) + return loader.load_split(train_ratio=train_ratio, val_ratio=val_ratio) + + +# Example usage +if __name__ == "__main__": + print("๐Ÿš€ Testing Index Caching Dataset Loader...") + + # Test loading + try: + loader = IndexCachingDatasetLoader(json_path="./note2_debug.json") + dataset = loader.load_dataset() + + print(f"\nโœ… Loaded {len(dataset)} items") + + # Show sample + if dataset: + sample = dataset[0] + print(f"\n๐Ÿ“ Sample Item:") + print(f" Command: {sample['input']}") + print(f" Image path: {sample['metadata']['image_path']}") + print(f" XML path: {sample['metadata']['xml_path']}") + print(f" Expected: {sample['output'][:100]}...") + print(f" Image base64 length: {len(sample['image_base64'])}") + print(f" XML content length: {len(sample['metadata'].get('xml_content', ''))}") + + # Test split + train, val = loader.load_split() + print(f"\n๐Ÿ“Š Dataset Split:") + print(f" Training: {len(train)} samples") + print(f" Validation: {len(val)} samples") + print(f" Test: Not used (no test set)") + + except Exception as e: + print(f"โŒ Error: {e}") + diff --git a/src/gepa_optimizer/data/loaders.py b/src/gepa_optimizer/data/loaders.py new file mode 100644 index 0000000000000000000000000000000000000000..2f70b857e9972f5dac3a267ec6f3db9d073ca0b0 --- /dev/null +++ b/src/gepa_optimizer/data/loaders.py @@ -0,0 +1,237 @@ +""" +Data loading utilities for various file formats +""" + +import json +import base64 +import pandas as pd +from typing import Any, Optional, Union, List , Dict +from pathlib import Path +import logging + +logger = logging.getLogger(__name__) + +class DataLoader: + """ + Utility class for loading data from various sources + """ + + def 
__init__(self): + self.supported_formats = [ + '.csv', '.json', '.jsonl', '.txt', '.md', '.xlsx', + '.png', '.jpg', '.jpeg' + ] + + def load(self, source: Union[str, Path], format_hint: Optional[str] = None) -> Optional[Any]: + """ + Load data from any supported source + + Args: + source: File path or data source + format_hint: Optional format hint to override auto-detection + + Returns: + Loaded data or None if failed + """ + try: + path = Path(source) + + if not path.exists(): + logger.error(f"File not found: {source}") + return None + + # Use format hint or detect from extension + file_format = format_hint or path.suffix.lower() + + if file_format == '.csv': + return self.load_csv(path) + elif file_format == '.json': + return self.load_json(path) + elif file_format == '.jsonl': + return self.load_jsonl(path) + elif file_format in ['.txt', '.md']: + return self.load_text(path) + elif file_format == '.xlsx': + return self.load_excel(path) + elif file_format in ['.png', '.jpg', '.jpeg']: + return self.load_image_base64(path) + else: + logger.warning(f"Unsupported format: {file_format}") + return None + + except Exception as e: + logger.error(f"Failed to load data from {source}: {str(e)}") + return None + + def load_csv(self, path: Union[str, Path]) -> Optional[pd.DataFrame]: + """Load CSV file as pandas DataFrame""" + try: + df = pd.read_csv(path) + logger.info(f"Loaded CSV with {len(df)} rows and {len(df.columns)} columns") + return df + except Exception as e: + logger.error(f"Failed to load CSV {path}: {str(e)}") + return None + + def load_json(self, path: Union[str, Path]) -> Optional[Any]: + """Load JSON file""" + try: + with open(path, 'r', encoding='utf-8') as f: + data = json.load(f) + + if isinstance(data, list): + logger.info(f"Loaded JSON with {len(data)} items") + else: + logger.info("Loaded JSON object") + + return data + except Exception as e: + logger.error(f"Failed to load JSON {path}: {str(e)}") + return None + + def load_jsonl(self, path: 
Union[str, Path]) -> Optional[List[Dict]]:
        """Load JSONL (JSON Lines) file.

        Malformed lines are skipped with a warning rather than aborting the
        whole load; blank lines are ignored.
        """
        try:
            data = []
            with open(path, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if line:
                        try:
                            data.append(json.loads(line))
                        except json.JSONDecodeError as e:
                            # Skip bad line but keep loading the rest of the file
                            logger.warning(f"Invalid JSON on line {line_num}: {str(e)}")

            logger.info(f"Loaded JSONL with {len(data)} items")
            return data
        except Exception as e:
            # NOTE(review): like the other loaders, errors are swallowed and
            # None is returned — callers must check for None.
            logger.error(f"Failed to load JSONL {path}: {str(e)}")
            return None

    def load_text(self, path: Union[str, Path]) -> Optional[str]:
        """Load plain text file as a single UTF-8 string."""
        try:
            with open(path, 'r', encoding='utf-8') as f:
                content = f.read()

            logger.info(f"Loaded text file with {len(content)} characters")
            return content
        except Exception as e:
            logger.error(f"Failed to load text {path}: {str(e)}")
            return None

    def load_excel(self, path: Union[str, Path]) -> Optional[pd.DataFrame]:
        """Load Excel file as pandas DataFrame.

        NOTE(review): pd.read_excel needs an Excel engine (e.g. openpyxl)
        installed at runtime — confirm it is a declared dependency.
        """
        try:
            df = pd.read_excel(path)
            logger.info(f"Loaded Excel with {len(df)} rows and {len(df.columns)} columns")
            return df
        except Exception as e:
            logger.error(f"Failed to load Excel {path}: {str(e)}")
            return None

    def load_image_base64(self, path: Union[str, Path]) -> Optional[str]:
        """Load image file and encode as Base64 string (no data-URI prefix)."""
        try:
            with open(path, 'rb') as f:
                encoded_string = base64.b64encode(f.read()).decode('utf-8')
            logger.info(f"Loaded image {path} and encoded to Base64")
            return encoded_string
        except Exception as e:
            logger.error(f"Failed to load image {path}: {str(e)}")
            return None

    def is_supported_format(self, file_path: Union[str, Path]) -> bool:
        """Check if file format is supported (by extension, case-insensitive)."""
        path = Path(file_path)
        return path.suffix.lower() in self.supported_formats

    def get_file_info(self, file_path: Union[str, Path]) -> Dict[str, Any]:
        """Get information about a file.

        Returns {'exists': False} for missing paths; otherwise size, format,
        support flag and path components.
        """
        path = Path(file_path)

        if not path.exists():
            return {'exists': 
False} + + return { + 'exists': True, + 'size': path.stat().st_size, + 'format': path.suffix.lower(), + 'supported': self.is_supported_format(path), + 'name': path.name, + 'stem': path.stem, + 'parent': str(path.parent) + } + + def load_ui_tree_dataset(self, json_dir: str, screenshots_dir: str) -> List[Dict[str, Any]]: + """ + Load UI tree dataset by pairing JSON files with corresponding screenshots + + Args: + json_dir: Directory containing JSON files (e.g., "json_tree") + screenshots_dir: Directory containing screenshot images (e.g., "screenshots") + + Returns: + List of dictionaries with 'input', 'output', and 'image' keys + """ + json_path = Path(json_dir) + screenshots_path = Path(screenshots_dir) + + if not json_path.exists(): + raise FileNotFoundError(f"JSON directory not found: {json_dir}") + if not screenshots_path.exists(): + raise FileNotFoundError(f"Screenshots directory not found: {screenshots_dir}") + + dataset = [] + + # Get all JSON files + json_files = list(json_path.glob("*.json")) + logger.info(f"Found {len(json_files)} JSON files in {json_dir}") + + for json_file in json_files: + # Extract filename without extension (e.g., "2" from "2.json") + file_stem = json_file.stem + + # Look for corresponding image file + image_extensions = ['.jpg', '.jpeg', '.png'] + image_file = None + + for ext in image_extensions: + potential_image = screenshots_path / f"{file_stem}{ext}" + if potential_image.exists(): + image_file = potential_image + break + + if not image_file: + logger.warning(f"No corresponding image found for {json_file.name}") + continue + + try: + # Load JSON content + json_data = self.load_json(json_file) + if not json_data: + logger.warning(f"Failed to load JSON: {json_file}") + continue + + # Load image as base64 + image_base64 = self.load_image_base64(image_file) + if not image_base64: + logger.warning(f"Failed to load image: {image_file}") + continue + + # Create dataset entry + dataset_entry = { + 'input': 'Extract UI elements from this 
screenshot and provide the complete UI tree structure', + 'output': json.dumps(json_data, indent=2), # Convert JSON to string + 'image': image_base64 + } + + dataset.append(dataset_entry) + logger.debug(f"Loaded pair: {json_file.name} + {image_file.name}") + + except Exception as e: + logger.error(f"Error loading {json_file.name}: {str(e)}") + continue + + logger.info(f"Successfully loaded {len(dataset)} image-JSON pairs") + return dataset diff --git a/src/gepa_optimizer/data/scroll_dataset_loader.py b/src/gepa_optimizer/data/scroll_dataset_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..d0c29e9e6119b2aca10de309e709db45374fb95c --- /dev/null +++ b/src/gepa_optimizer/data/scroll_dataset_loader.py @@ -0,0 +1,334 @@ +""" +Scroll Element Dataset Loader for Drizz Mobile App Testing + +Loads screenshots with bounding boxes and commands to identify scroll elements. +Converts to GEPA-compatible format for prompt optimization. +""" + +import base64 +import random +import logging +from typing import List, Dict, Any, Tuple, Optional +from pathlib import Path + +logger = logging.getLogger(__name__) + + +class ScrollDatasetLoader: + """ + GENERIC dataset loader for image-based tasks. + + This is a LIBRARY class - NO hardcoded assumptions about: + - What the task is (OCR, element detection, classification, etc.) + - Input format (questions, commands, descriptions, etc.) + - Output format (IDs, text, JSON, etc.) + + Users define their dataset in the test script and pass it here. 
+ + Dataset format per item: (image_filename, input_text, expected_output) + + Example usage (ANY task): + # Define YOUR dataset in YOUR test script + my_dataset = [ + ("img1.png", "What is the main color?", "blue"), + ("img2.png", "Count the objects", "5"), + ("img3.png", "Describe the scene", "A cat on a sofa"), + ] + + # Pass to loader + loader = ScrollDatasetLoader( + images_dir="images", + dataset_config=my_dataset + ) + data = loader.load_dataset() + """ + + def __init__( + self, + images_dir: str = "images", + dataset_config: Optional[List[Tuple[str, str, str]]] = None + ): + """ + Initialize dataset loader. + + Args: + images_dir: Directory containing images + dataset_config: List of (image_filename, input_text, expected_output) tuples. + REQUIRED - no hardcoded defaults to keep library generic. + + Raises: + FileNotFoundError: If images_dir doesn't exist + ValueError: If dataset_config is None + """ + self.images_dir = Path(images_dir) + + if not self.images_dir.exists(): + raise FileNotFoundError(f"Images directory not found: {images_dir}") + + if dataset_config is None: + raise ValueError( + "dataset_config is required. This is a library class - define your " + "dataset in the test script:\n" + " dataset = [('img1.png', 'your input', 'expected output'), ...]\n" + " loader = ScrollDatasetLoader(images_dir='...', dataset_config=dataset)" + ) + + self.dataset_config = dataset_config + + def load_dataset(self) -> List[Dict[str, Any]]: + """ + Load complete dataset with images. + + Phase 1: Includes element_id extraction from expected output. + + Returns: + List of dataset items in GEPA format: + [ + { + "input": "Command: Scroll down by 70%", + "output": "3", + "image_base64": "", # TOP LEVEL + "metadata": { + "image_path": "images/5.png", + "input_text": "Command: Scroll down by 70%", + "expected_output": "3", + "image_filename": "5.png", + "element_id": 3 # Extracted integer (None if extraction fails) + } + }, + ... 
            ]
        """
        dataset = []

        # Generic variable names - no assumptions about data type
        for image_filename, input_text, expected_output in self.dataset_config:
            image_path = self.images_dir / image_filename

            # Validate image exists; missing files are skipped, not fatal
            if not image_path.exists():
                logger.warning(f"Image not found: {image_path}")
                continue

            # Read and encode image; encoding failures also skip the item
            try:
                image_base64 = self._encode_image(image_path)
            except Exception as e:
                logger.warning(f"Error encoding {image_filename}: {e}")
                continue

            # ๐Ÿ”ฅ Phase 1: Extract element_id from expected_output for robust evaluation
            element_id = self._extract_element_id(expected_output)
            if element_id is None:
                logger.warning(f"Could not extract element_id from '{expected_output}' in {image_filename}")

            # Create dataset item - COMPLETELY GENERIC
            # NO assumptions about output format (element IDs, commands, etc.)
            # Just: image + input text + expected output text
            # Library doesn't know or care what the task is!
            # IMPORTANT: Put image_base64 at TOP LEVEL for UniversalConverter to find it
            dataset_item = {
                "input": input_text,            # Generic input text (ANY format)
                "output": expected_output,      # Generic expected output (ANY format, full reasoning)
                "image_base64": image_base64,   # TOP LEVEL for converter
                "metadata": {
                    "image_path": str(image_path),
                    "input_text": input_text,
                    "expected_output": expected_output,
                    "image_filename": image_filename,
                    "element_id": element_id    # Extracted element ID (int or None)
                }
            }

            dataset.append(dataset_item)

        if not dataset:
            raise ValueError("No valid images found in dataset")

        logger.info(f"Loaded {len(dataset)} scroll element detection samples")
        return dataset

    def _extract_element_id(self, expected_output: str) -> Optional[int]:
        """
        Extract element ID from expected output string.

        Handles multiple formats:
        - "Element: 4"
        - "Element 4"
        - "4" (standalone)
        - "Element: 4, Description: ..." (full reasoning)

        Args:
            expected_output: Full expected output string with reasoning

        Returns:
            Element ID as integer, or None if not found
        """
        # Local import: `re` is not imported at module level in this file
        import re

        if not expected_output:
            return None

        # Pattern 1: "Element: X" or "Element X" (case insensitive)
        patterns = [
            r'element[:\s]+(\d+)',   # "Element: 4" or "Element 4"
            r'\belement\s+(\d+)\b',  # "element 4" (word boundary)
        ]

        for pattern in patterns:
            match = re.search(pattern, expected_output, re.IGNORECASE)
            if match:
                try:
                    element_id = int(match.group(1))
                    # Validate range (reasonable UI element IDs)
                    # NOTE(review): 1..100 is a hard-coded assumption about
                    # valid element IDs — confirm against actual datasets.
                    if 1 <= element_id <= 100:
                        return element_id
                except (ValueError, IndexError):
                    continue

        # Pattern 2: First standalone number (if no "Element:" pattern found)
        # Only use if it's a reasonable element ID (1-100)
        number_match = re.search(r'\b(\d{1,3})\b', expected_output)
        if number_match:
            try:
                element_id = int(number_match.group(1))
                if 1 <= element_id <= 100:  # Reasonable range for UI elements
                    return element_id
            except ValueError:
                pass

        return None

    def _encode_image(self, image_path: Path) -> str:
        """
        Encode image to base64 string.

        Args:
            image_path: Path to image file

        Returns:
            Base64 encoded image string
        """
        with open(image_path, "rb") as image_file:
            encoded = base64.b64encode(image_file.read()).decode('utf-8')
        return encoded

    def split_dataset(
        self,
        dataset: List[Dict[str, Any]],
        train_size: int = 4,
        val_size: int = 1,
        test_size: int = 1,
        shuffle: bool = True,
        seed: Optional[int] = None
    ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
        """
        Split dataset into train, validation, and test sets.

        ๐Ÿ”ฅ NEW: Added shuffling support to ensure different image distribution
        across splits, preventing hard images from always landing in validation set.

        Args:
            dataset: Complete dataset
            train_size: Number of samples for training (default: 4)
            val_size: Number of samples for validation (default: 1)
            test_size: Number of samples for test (default: 1)
            shuffle: Whether to shuffle dataset before splitting (default: True)
            seed: Random seed for reproducible shuffling (default: None = random)

        Returns:
            Tuple of (train_set, val_set, test_set)
        """
        n = len(dataset)

        # Validate split sizes; shrink proportionally if they exceed the data
        total_size = train_size + val_size + test_size
        if total_size > n:
            logger.warning(f"Requested split ({total_size}) exceeds dataset size ({n}). Adjusting split proportionally...")
            ratio = n / total_size
            train_size = int(train_size * ratio)
            val_size = int(val_size * ratio)
            # Test set absorbs the rounding remainder
            test_size = n - train_size - val_size

        # ๐Ÿ”ฅ CRITICAL: Shuffle dataset to ensure different image distribution
        # This prevents the same hard images from always being in validation set
        dataset_copy = dataset.copy()  # Don't modify original
        if shuffle:
            if seed is not None:
                # NOTE(review): random.seed mutates the *global* RNG state,
                # affecting other users of the random module — consider a
                # dedicated random.Random(seed) instance instead.
                random.seed(seed)
                logger.debug(f"Shuffling dataset with seed={seed} for reproducible splits")
            else:
                logger.debug(f"Shuffling dataset randomly (no seed)")
            random.shuffle(dataset_copy)
        else:
            logger.warning(f"Not shuffling dataset - using original order")

        # Split shuffled dataset into three contiguous slices
        train_set = dataset_copy[:train_size]
        val_set = dataset_copy[train_size:train_size + val_size]
        test_set = dataset_copy[train_size + val_size:train_size + val_size + test_size]

        logger.info(f"Dataset split: {len(train_set)} train, {len(val_set)} val, {len(test_set)} test")

        # Log which images are in each split for debugging
        # NOTE(review): these use print() while the rest of the module logs
        # via `logger` — inconsistent; confirm whether stdout output is wanted.
        if shuffle:
            train_images = [item['metadata'].get('image_filename', 'N/A') for item in train_set]
            val_images = [item['metadata'].get('image_filename', 'N/A') for item in val_set]
            test_images = [item['metadata'].get('image_filename', 'N/A') for item in test_set]
            print(f"   Train images: {train_images[:5]}{'...' if len(train_images) > 5 else ''}")
            print(f"   Val images: {val_images}")
            print(f"   Test images: {test_images[:5]}{'...' if len(test_images) > 5 else ''}")

        return train_set, val_set, test_set


def load_scroll_dataset(
    images_dir: str = "images",
    dataset_config: Optional[List[Tuple[str, str, str]]] = None,
    split: bool = True
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Convenience function to load image-based dataset (GENERIC).

    Args:
        images_dir: Directory containing images
        dataset_config: List of (image_filename, input_text, expected_output) tuples
        split: Whether to split into train/val/test

    Returns:
        If split=True: (train_set, val_set, test_set)
        If split=False: (full_dataset, [], [])

    Example (works for ANY task):
        dataset_config = [
            ("img1.png", "What color is the sky?", "blue"),
            ("img2.png", "Count the dogs", "2"),
        ]
        train, val, test = load_scroll_dataset(
            images_dir="images",
            dataset_config=dataset_config
        )
    """
    # Loader raises ValueError if dataset_config is None
    loader = ScrollDatasetLoader(images_dir, dataset_config=dataset_config)
    dataset = loader.load_dataset()

    if split:
        return loader.split_dataset(dataset)
    else:
        return dataset, [], []


# Example usage (for testing the library loader itself)
if __name__ == "__main__":
    print("๐Ÿš€ Testing Scroll Dataset Loader...")
    print("โš ๏ธ NOTE: This is a library class. 
Define your dataset in your test script.") + print("\nExample:") + print(" dataset_config = [") + print(" ('image1.png', 'Scroll down by 50%', '3'),") + print(" ('image2.png', 'Swipe left', '4'),") + print(" ]") + print(" train, val, test = load_scroll_dataset(") + print(" images_dir='images',") + print(" dataset_config=dataset_config") + print(" )") + diff --git a/src/gepa_optimizer/data/validation_dataset_loader.py b/src/gepa_optimizer/data/validation_dataset_loader.py new file mode 100644 index 0000000000000000000000000000000000000000..5c500db6127ae4c136eaf7c17ce0af0b88eca955 --- /dev/null +++ b/src/gepa_optimizer/data/validation_dataset_loader.py @@ -0,0 +1,376 @@ +""" +Validation Dataset Loader for UI Validation Use Case + +Loads validation datapoints from SQLite database and converts to GEPA-compatible format. +Supports filtering by data_type (trainset/valset/testset) and confirmed status. +""" + +import os +import sqlite3 +import base64 +import logging +from typing import List, Dict, Any, Optional, Literal +from pathlib import Path + +logger = logging.getLogger(__name__) + + +class ValidationDatasetLoader: + """ + Loads validation dataset from SQLite database. + + Database schema: + - validation_data: id, image_id, command, result (0/1), reasoning, data_type, confirmed, created_at + - images: image_id, mime, bytes (BLOB), created_at + + Converts to GEPA format: + - input: command text (seed prompt will be provided in test script) + - output: "true" or "false" (converted from 0/1) + - image_base64: base64 encoded image (TOP LEVEL for UniversalConverter) + - metadata: All original fields plus converted values + + Note: The seed prompt is NOT stored in database - it will be provided in the test script. + The input field contains just the command, and the image is at top level. + """ + + def __init__( + self, + db_path: Optional[str] = None, + confirmed_only: bool = True + ): + """ + Initialize validation dataset loader. 
+ + Args: + db_path: Path to SQLite database file. + Default: "./validation_data.db" or from VD_DB_PATH env var + confirmed_only: If True, only load datapoints where confirmed=1. + Default: True (only manually reviewed data) + + Raises: + FileNotFoundError: If database file doesn't exist + sqlite3.Error: If database connection fails + """ + # Get database path from env or use default + if db_path is None: + db_path = os.getenv("VD_DB_PATH", "./validation_data.db") + + self.db_path = Path(db_path).resolve() + + if not self.db_path.exists(): + raise FileNotFoundError( + f"Database file not found: {self.db_path}\n" + f"Make sure validation_data_ui_server_async.py has been run at least once to create the database." + ) + + self.confirmed_only = confirmed_only + + def load_dataset( + self, + data_type: Optional[Literal["trainset", "valset", "testset"]] = None, + confirmed_only: Optional[bool] = None + ) -> List[Dict[str, Any]]: + """ + Load dataset from database and convert to GEPA format. + + Args: + data_type: Filter by data_type. If None, loads all types. + Options: "trainset", "valset", "testset" + confirmed_only: Override instance default. If True, only load confirmed datapoints. + If None, uses instance default (self.confirmed_only) + + Returns: + List of dataset items in GEPA format: + [ + { + "input": "Validate Submit button is visible", # Command only (seed prompt in test script) + "output": "true", # or "false" (converted from 0/1) + "image_base64": "", # TOP LEVEL (image + command together) + "metadata": { + "id": 1, + "image_id": "abc123...", + "command": "Validate Submit button is visible", + "result": True, # Boolean + "result_int": 1, # Original 0/1 + "reasoning": "Detailed explanation...", + "data_type": "trainset", + "confirmed": True, + "created_at": "2024-01-01 12:00:00" + } + }, + ... + ] + + Note: Seed prompt is provided separately in test script, not in database. 
+ + Raises: + sqlite3.Error: If database query fails + ValueError: If no datapoints found matching criteria + """ + # Use provided confirmed_only or instance default + use_confirmed = confirmed_only if confirmed_only is not None else self.confirmed_only + + conn = sqlite3.connect(str(self.db_path)) + conn.row_factory = sqlite3.Row # Access columns by name + dataset = [] + + try: + # Build query with filters + query = """ + SELECT + v.id, + v.image_id, + v.command, + v.result, + v.reasoning, + v.data_type, + v.confirmed, + v.created_at, + i.mime, + i.bytes + FROM validation_data v + INNER JOIN images i ON v.image_id = i.image_id + WHERE 1=1 + """ + params = [] + + # Add filters + if use_confirmed: + query += " AND v.confirmed = 1" + + if data_type: + query += " AND v.data_type = ?" + params.append(data_type) + + query += " ORDER BY v.id ASC" + + # Execute query + cursor = conn.execute(query, params) + rows = cursor.fetchall() + + if not rows: + filter_msg = [] + if use_confirmed: + filter_msg.append("confirmed=1") + if data_type: + filter_msg.append(f"data_type='{data_type}'") + + filter_str = " with filters: " + ", ".join(filter_msg) if filter_msg else "" + raise ValueError( + f"No datapoints found{filter_str} in database: {self.db_path}\n" + f"Make sure you have generated and saved datapoints using the validation UI." 
+ ) + + # Convert rows to GEPA format + for row in rows: + # Convert 0/1 to "true"/"false" string for GEPA + result_str = "true" if row["result"] == 1 else "false" + + # Encode image bytes to base64 + image_base64 = base64.b64encode(row["bytes"]).decode("utf-8") + + # Create GEPA format item + # Input: command (seed prompt will be provided in test script) + # Image: separate at top level (image_base64) + # Output: "true" or "false" (converted from 0/1) + dataset_item = { + "input": row["command"], # Just the command - seed prompt will be in test script + "output": result_str, # "true" or "false" (string) + "image_base64": image_base64, # TOP LEVEL for UniversalConverter (image + command together) + "metadata": { + "id": row["id"], + "image_id": row["image_id"], + "command": row["command"], # Keep original for reference + "result": bool(row["result"]), # Boolean for reference + "result_int": row["result"], # Original 0/1 for reference + "reasoning": row["reasoning"], + "data_type": row["data_type"], + "confirmed": bool(row["confirmed"]), + "created_at": row["created_at"], + "mime": row["mime"], + } + } + + dataset.append(dataset_item) + + # Log summary + data_type_str = f" ({data_type})" if data_type else "" + confirmed_str = " (confirmed only)" if use_confirmed else " (all)" + logger.info(f"Loaded {len(dataset)} validation datapoints{data_type_str}{confirmed_str}") + + return dataset + + finally: + conn.close() + + def load_split_dataset( + self, + confirmed_only: Optional[bool] = None + ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]: + """ + Load dataset split by data_type (trainset/valset/testset). + + Convenience method that loads all three splits at once. + + Args: + confirmed_only: Override instance default. If True, only load confirmed datapoints. 
+ + Returns: + Tuple of (train_set, val_set, test_set) in GEPA format + + Example: + loader = ValidationDatasetLoader(db_path="./validation_data.db") + train, val, test = loader.load_split_dataset() + """ + train_set = self.load_dataset(data_type="trainset", confirmed_only=confirmed_only) + val_set = self.load_dataset(data_type="valset", confirmed_only=confirmed_only) + test_set = self.load_dataset(data_type="testset", confirmed_only=confirmed_only) + + logger.info(f"Dataset Split Summary: Training={len(train_set)}, Validation={len(val_set)}, Test={len(test_set)}, Total={len(train_set) + len(val_set) + len(test_set)}") + + return train_set, val_set, test_set + + def get_dataset_stats(self) -> Dict[str, Any]: + """ + Get statistics about the dataset in the database. + + Returns: + Dictionary with dataset statistics: + { + "total": 100, + "confirmed": 95, + "unconfirmed": 5, + "by_data_type": { + "trainset": 70, + "valset": 15, + "testset": 15 + }, + "by_result": { + "true": 50, + "false": 50 + } + } + """ + conn = sqlite3.connect(str(self.db_path)) + conn.row_factory = sqlite3.Row + + try: + stats = {} + + # Total counts + total = conn.execute("SELECT COUNT(*) FROM validation_data").fetchone()[0] + confirmed = conn.execute("SELECT COUNT(*) FROM validation_data WHERE confirmed = 1").fetchone()[0] + stats["total"] = total + stats["confirmed"] = confirmed + stats["unconfirmed"] = total - confirmed + + # By data_type + data_type_rows = conn.execute(""" + SELECT data_type, COUNT(*) as count + FROM validation_data + GROUP BY data_type + """).fetchall() + stats["by_data_type"] = {row["data_type"]: row["count"] for row in data_type_rows} + + # By result (true/false) + result_rows = conn.execute(""" + SELECT result, COUNT(*) as count + FROM validation_data + GROUP BY result + """).fetchall() + stats["by_result"] = { + "true": sum(row["count"] for row in result_rows if row["result"] == 1), + "false": sum(row["count"] for row in result_rows if row["result"] == 0) + } + + 
return stats + + finally: + conn.close() + + +def load_validation_dataset( + db_path: Optional[str] = None, + data_type: Optional[Literal["trainset", "valset", "testset"]] = None, + confirmed_only: bool = True +) -> List[Dict[str, Any]]: + """ + Convenience function to load validation dataset. + + Args: + db_path: Path to SQLite database file. Default: "./validation_data.db" + data_type: Filter by data_type. If None, loads all types. + confirmed_only: If True, only load confirmed datapoints. + + Returns: + List of dataset items in GEPA format + + Example: + # Load all confirmed training data + train_data = load_validation_dataset(data_type="trainset", confirmed_only=True) + + # Load all confirmed data + all_data = load_validation_dataset(confirmed_only=True) + """ + loader = ValidationDatasetLoader(db_path=db_path, confirmed_only=confirmed_only) + return loader.load_dataset(data_type=data_type, confirmed_only=confirmed_only) + + +def load_validation_split( + db_path: Optional[str] = None, + confirmed_only: bool = True +) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]: + """ + Convenience function to load validation dataset split by data_type. + + Args: + db_path: Path to SQLite database file. Default: "./validation_data.db" + confirmed_only: If True, only load confirmed datapoints. 
+ + Returns: + Tuple of (train_set, val_set, test_set) in GEPA format + + Example: + train, val, test = load_validation_split(confirmed_only=True) + """ + loader = ValidationDatasetLoader(db_path=db_path, confirmed_only=confirmed_only) + return loader.load_split_dataset(confirmed_only=confirmed_only) + + +# Example usage and testing +if __name__ == "__main__": + print("๐Ÿš€ Testing Validation Dataset Loader...") + + try: + loader = ValidationDatasetLoader() + + # Get stats + print("\n๐Ÿ“Š Dataset Statistics:") + stats = loader.get_dataset_stats() + print(f" Total: {stats['total']}") + print(f" Confirmed: {stats['confirmed']}") + print(f" Unconfirmed: {stats['unconfirmed']}") + print(f" By data_type: {stats['by_data_type']}") + print(f" By result: {stats['by_result']}") + + # Load split dataset + print("\n๐Ÿ“ฆ Loading split dataset...") + train, val, test = loader.load_split_dataset() + + # Show sample + if train: + sample = train[0] + print(f"\n๐Ÿ“ Sample Training Item:") + print(f" Input: {sample['input']}") + print(f" Output: {sample['output']}") + print(f" Image ID: {sample['metadata']['image_id'][:8]}...") + print(f" Data Type: {sample['metadata']['data_type']}") + print(f" Result: {sample['metadata']['result']} (int: {sample['metadata']['result_int']})") + + except FileNotFoundError as e: + print(f"โŒ {e}") + print("\n๐Ÿ’ก Make sure validation_data_ui_server_async.py has been run to create the database.") + except ValueError as e: + print(f"โŒ {e}") + print("\n๐Ÿ’ก Generate and save some datapoints using the validation UI first.") + diff --git a/src/gepa_optimizer/data/validators.py b/src/gepa_optimizer/data/validators.py new file mode 100644 index 0000000000000000000000000000000000000000..28fbc6048f91a25c1a4d54befc7f62be4498f898 --- /dev/null +++ b/src/gepa_optimizer/data/validators.py @@ -0,0 +1,207 @@ +""" +Data validation utilities for GEPA optimizer +""" + +from typing import List, Dict, Any, Optional, Tuple +import logging + +logger = 
logging.getLogger(__name__) + +class DataValidator: + """ + Validates datasets for completeness and GEPA compatibility + """ + + def __init__(self): + self.required_fields = ['input', 'output'] + self.optional_fields = ['metadata', 'id', 'tags'] + + def validate_dataset(self, dataset: List[Dict[str, Any]]) -> Tuple[bool, List[str]]: + """ + Validate entire dataset + + Args: + dataset: List of data items to validate + + Returns: + Tuple[bool, List[str]]: (is_valid, list_of_errors) + """ + errors = [] + + # Basic dataset checks + if not dataset: + errors.append("Dataset is empty") + return False, errors + + if not isinstance(dataset, list): + errors.append("Dataset must be a list") + return False, errors + + # Validate each item + for idx, item in enumerate(dataset): + item_errors = self.validate_item(item, idx) + errors.extend(item_errors) + + # Check for minimum dataset size + if len(dataset) < 2: + errors.append("Dataset should have at least 2 items for proper train/val split") + + # Log validation results + if errors: + logger.warning(f"Dataset validation failed with {len(errors)} errors") + else: + logger.info(f"Dataset validation passed for {len(dataset)} items") + + return len(errors) == 0, errors + + def validate_item(self, item: Dict[str, Any], index: Optional[int] = None) -> List[str]: + """ + Validate a single dataset item + + Args: + item: Single data item to validate + index: Optional item index for error reporting + + Returns: + List[str]: List of validation errors + """ + errors = [] + item_ref = f"item {index}" if index is not None else "item" + + # Check if item is a dictionary + if not isinstance(item, dict): + errors.append(f"{item_ref}: Must be a dictionary") + return errors + + # Check for required fields + if 'input' not in item: + errors.append(f"{item_ref}: Missing required 'input' field") + elif not isinstance(item['input'], str): + errors.append(f"{item_ref}: 'input' field must be a string") + elif not item['input'].strip(): + 
errors.append(f"{item_ref}: 'input' field cannot be empty") + + # Check output field (can be empty but should exist for supervised learning) + if 'output' in item: + if not isinstance(item['output'], str): + errors.append(f"{item_ref}: 'output' field must be a string") + + # Validate metadata if present + if 'metadata' in item and not isinstance(item['metadata'], dict): + errors.append(f"{item_ref}: 'metadata' field must be a dictionary") + + return errors + + def validate_gepa_format(self, gepa_data: List[Dict[str, Any]]) -> Tuple[bool, List[str]]: + """ + Validate data in GEPA format + + Args: + gepa_data: Data in GEPA format + + Returns: + Tuple[bool, List[str]]: (is_valid, list_of_errors) + """ + errors = [] + + if not gepa_data: + errors.append("GEPA dataset is empty") + return False, errors + + for idx, item in enumerate(gepa_data): + if 'input' not in item: + errors.append(f"GEPA item {idx}: Missing 'input' field") + + if 'expected_output' not in item: + errors.append(f"GEPA item {idx}: Missing 'expected_output' field") + + if 'metadata' not in item: + errors.append(f"GEPA item {idx}: Missing 'metadata' field") + elif not isinstance(item['metadata'], dict): + errors.append(f"GEPA item {idx}: 'metadata' must be a dictionary") + + return len(errors) == 0, errors + + def validate_split(self, trainset: List[Dict], valset: List[Dict]) -> Tuple[bool, List[str]]: + """ + Validate train/validation split + + Args: + trainset: Training data + valset: Validation data + + Returns: + Tuple[bool, List[str]]: (is_valid, list_of_errors) + """ + errors = [] + + if not trainset: + errors.append("Training set is empty") + + if not valset: + errors.append("Validation set is empty") + + # Check proportions + total_size = len(trainset) + len(valset) + if total_size > 0: + train_ratio = len(trainset) / total_size + if train_ratio < 0.5: + errors.append(f"Training set too small: {train_ratio:.2%} of total data") + elif train_ratio > 0.95: + errors.append(f"Validation set too small: 
{1-train_ratio:.2%} of total data") + + return len(errors) == 0, errors + + def get_dataset_stats(self, dataset: List[Dict[str, Any]]) -> Dict[str, Any]: + """ + Get statistics about the dataset + + Args: + dataset: Dataset to analyze + + Returns: + Dict[str, Any]: Dataset statistics + """ + if not dataset: + return {'total_items': 0, 'valid': False} + + stats = { + 'total_items': len(dataset), + 'has_output': sum(1 for item in dataset if item.get('output')), + 'avg_input_length': 0, + 'avg_output_length': 0, + 'empty_inputs': 0, + 'empty_outputs': 0 + } + + input_lengths = [] + output_lengths = [] + + for item in dataset: + if isinstance(item, dict): + input_text = item.get('input', '') + output_text = item.get('output', '') + + if isinstance(input_text, str): + input_lengths.append(len(input_text)) + if not input_text.strip(): + stats['empty_inputs'] += 1 + + if isinstance(output_text, str): + output_lengths.append(len(output_text)) + if not output_text.strip(): + stats['empty_outputs'] += 1 + + if input_lengths: + stats['avg_input_length'] = sum(input_lengths) / len(input_lengths) + + if output_lengths: + stats['avg_output_length'] = sum(output_lengths) / len(output_lengths) + + # Determine if dataset looks valid + stats['valid'] = ( + stats['total_items'] > 0 and + stats['empty_inputs'] < stats['total_items'] * 0.5 # Less than 50% empty inputs + ) + + return stats diff --git a/src/gepa_optimizer/evaluation/__init__.py b/src/gepa_optimizer/evaluation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e464dcf245f5c1bf1f11eba3eb30d64fe60499be --- /dev/null +++ b/src/gepa_optimizer/evaluation/__init__.py @@ -0,0 +1,28 @@ +""" +Evaluation module for GEPA Optimizer + +Includes: +- UniversalSemanticEvaluator: Works for ANY task (recommended for general use) +- BaseEvaluator: Abstract base class for custom evaluators +- Task-specific evaluators for specialized use cases +""" + +from .base_evaluator import BaseEvaluator +from 
.universal_evaluator import UniversalSemanticEvaluator, create_universal_evaluator +from .ui_evaluator import UITreeEvaluator +from .scroll_evaluator import ScrollElementEvaluator +from .validation_evaluator import ValidationEvaluator +from .index_caching_evaluator import IndexCachingEvaluator + +__all__ = [ + # Universal (recommended) + "UniversalSemanticEvaluator", + "create_universal_evaluator", + # Base class + "BaseEvaluator", + # Task-specific + "UITreeEvaluator", + "ScrollElementEvaluator", + "ValidationEvaluator", + "IndexCachingEvaluator", +] diff --git a/src/gepa_optimizer/evaluation/base_evaluator.py b/src/gepa_optimizer/evaluation/base_evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..c63f322935ceffa0d9fb38a7d1b2049c078bda6c --- /dev/null +++ b/src/gepa_optimizer/evaluation/base_evaluator.py @@ -0,0 +1,51 @@ +""" +Base evaluator class for all evaluation strategies. +""" + +from abc import ABC, abstractmethod +from typing import Any, Dict, Optional +import logging + +logger = logging.getLogger(__name__) + +class BaseEvaluator(ABC): + """ + Abstract base class for all evaluation strategies. + + This enforces a consistent interface while allowing complete customization + of evaluation logic for any use case. + """ + + def __init__(self, metric_weights: Optional[Dict[str, float]] = None): + """ + Initialize evaluator with optional metric weights. + + Args: + metric_weights: Optional weights for different metrics. + If None, subclasses should provide defaults. + """ + self.metric_weights = metric_weights or {} + self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}") + + @abstractmethod + def evaluate(self, predicted: Any, expected: Any) -> Dict[str, float]: + """ + Evaluate predicted output against expected output. + + Args: + predicted: The model's predicted output + expected: The ground truth expected output + + Returns: + Dictionary with metric names as keys and scores as values. 
+ Must include 'composite_score' key for GEPA integration. + """ + pass + + def validate_weights(self) -> bool: + """Validate that metric weights sum to approximately 1.0""" + if not self.metric_weights: + return True + + total = sum(self.metric_weights.values()) + return abs(total - 1.0) < 0.01 # Allow small floating point errors diff --git a/src/gepa_optimizer/evaluation/index_caching_evaluator.py b/src/gepa_optimizer/evaluation/index_caching_evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..6d4ca53c9277645fa8037e27c65aae633fe47ca1 --- /dev/null +++ b/src/gepa_optimizer/evaluation/index_caching_evaluator.py @@ -0,0 +1,357 @@ +""" +Index Caching Evaluator for Index-Based Element Selection Use Case + +Evaluates predicted index caching results against expected results. +Compares all 5 fields with equal weight: +- is_index_based +- index_value +- parent_element_id +- element_id_of_nth_child_of_parent +- selected_element_is_correct +""" + +from typing import Dict, Any, Optional +import json +import re +import logging + +from .base_evaluator import BaseEvaluator + + +class IndexCachingEvaluator(BaseEvaluator): + """ + Evaluator for index caching use case. + + Features: + - Compares all 5 fields with equal weight (20% each) + - Parses JSON from LLM response + - Handles null values correctly + - Returns detailed field-by-field comparison + """ + + def __init__(self, metric_weights: Optional[Dict[str, float]] = None): + """ + Initialize index caching evaluator. 
+ + Args: + metric_weights: Weights for evaluation metrics + Default: Equal weight for all 5 fields (0.2 each) + """ + # Each field gets 20% weight (5 fields * 0.2 = 1.0) + default_weights = { + "is_index_based_match": 0.2, + "index_value_match": 0.2, + "parent_element_id_match": 0.2, + "element_id_of_nth_child_match": 0.2, + "selected_element_correct_match": 0.2, + } + + weights = metric_weights or default_weights + super().__init__(metric_weights=weights) + + def evaluate(self, predicted: str, expected: str) -> Dict[str, float]: + """ + Evaluate predicted index caching result against expected result. + + Args: + predicted: LLM's output (JSON string with all 5 fields) + expected: Expected output (JSON string or dict with all 5 fields) + + Returns: + Dictionary with evaluation metrics: + { + "is_index_based_match": 1.0 or 0.0, + "index_value_match": 1.0 or 0.0, + "parent_element_id_match": 1.0 or 0.0, + "element_id_of_nth_child_match": 1.0 or 0.0, + "selected_element_correct_match": 1.0 or 0.0, + "composite_score": 0.0 to 1.0, + "predicted_output": str, + "expected_output": str, + "field_scores": {...}, + "evaluation_reason": str + } + """ + if not predicted or not expected: + return { + "is_index_based_match": 0.0, + "index_value_match": 0.0, + "parent_element_id_match": 0.0, + "element_id_of_nth_child_match": 0.0, + "selected_element_correct_match": 0.0, + "composite_score": 0.0, + "predicted_output": str(predicted).strip() if predicted else "", + "expected_output": str(expected).strip() if expected else "", + "field_scores": {}, + "evaluation_reason": "โŒ Empty or missing input/output" + } + + # Parse expected (could be JSON string or dict) + try: + if isinstance(expected, str): + expected_dict = json.loads(expected) + else: + expected_dict = expected + except (json.JSONDecodeError, TypeError): + # If expected is already a dict from dataset + expected_dict = expected if isinstance(expected, dict) else {} + + # Parse predicted (must be JSON string) + try: + 
predicted_dict = self._parse_json_response(predicted) + except Exception as e: + # Log the actual response for debugging + response_preview = predicted[:200] if predicted else "(empty)" + self.logger.warning(f"Failed to parse predicted JSON: {e}") + self.logger.warning(f"Response preview: {response_preview}...") + predicted_dict = {} + + # NOTE: "notes" field is present in the output but is NOT used for scoring or reflection + # It's kept for reference but ignored in evaluation + + # Compare each field (only the 5 core fields, ignoring "notes") + field_scores = {} + field_reasons = [] + + # 1. is_index_based (boolean) + pred_is_index = predicted_dict.get("is_index_based") + exp_is_index = expected_dict.get("is_index_based") + is_index_match = (pred_is_index == exp_is_index) if (pred_is_index is not None and exp_is_index is not None) else False + field_scores["is_index_based"] = 1.0 if is_index_match else 0.0 + field_reasons.append(f"is_index_based: {pred_is_index} vs {exp_is_index} โ†’ {'โœ…' if is_index_match else 'โŒ'}") + + # 2. index_value (int or null) + pred_index_val = predicted_dict.get("index_value") + exp_index_val = expected_dict.get("index_value") + # Handle null/None comparison + index_val_match = (pred_index_val == exp_index_val) or (pred_index_val is None and exp_index_val is None) + field_scores["index_value"] = 1.0 if index_val_match else 0.0 + field_reasons.append(f"index_value: {pred_index_val} vs {exp_index_val} โ†’ {'โœ…' if index_val_match else 'โŒ'}") + + # 3. parent_element_id (string or null) + pred_parent = predicted_dict.get("parent_element_id") + exp_parent = expected_dict.get("parent_element_id") + # Handle null/None comparison + parent_match = (pred_parent == exp_parent) or (pred_parent is None and exp_parent is None) + field_scores["parent_element_id"] = 1.0 if parent_match else 0.0 + field_reasons.append(f"parent_element_id: {pred_parent} vs {exp_parent} โ†’ {'โœ…' if parent_match else 'โŒ'}") + + # 4. 
element_id_of_nth_child_of_parent (string or null) + pred_element = predicted_dict.get("element_id_of_nth_child_of_parent") + exp_element = expected_dict.get("element_id_of_nth_child_of_parent") + # Handle null/None comparison + element_match = (pred_element == exp_element) or (pred_element is None and exp_element is None) + field_scores["element_id_of_nth_child_of_parent"] = 1.0 if element_match else 0.0 + field_reasons.append(f"element_id_of_nth_child: {pred_element} vs {exp_element} โ†’ {'โœ…' if element_match else 'โŒ'}") + + # 5. selected_element_is_correct (boolean) + pred_selected = predicted_dict.get("selected_element_is_correct") + exp_selected = expected_dict.get("selected_element_is_correct") + selected_match = (pred_selected == exp_selected) if (pred_selected is not None and exp_selected is not None) else False + field_scores["selected_element_is_correct"] = 1.0 if selected_match else 0.0 + field_reasons.append(f"selected_element_is_correct: {pred_selected} vs {exp_selected} โ†’ {'โœ…' if selected_match else 'โŒ'}") + + # Calculate composite score (weighted average) + composite_score = ( + field_scores["is_index_based"] * 0.2 + + field_scores["index_value"] * 0.2 + + field_scores["parent_element_id"] * 0.2 + + field_scores["element_id_of_nth_child_of_parent"] * 0.2 + + field_scores["selected_element_is_correct"] * 0.2 + ) + + # Build evaluation reason + all_match = composite_score == 1.0 + reason = "โœ… All fields match!" 
if all_match else f"โŒ Partial match ({composite_score:.1%})" + reason += "\n" + "\n".join(f" {r}" for r in field_reasons) + + # Log evaluation details + self.logger.info(f"\n{'โ”€'*70}") + self.logger.info(f"๐Ÿ“Š INDEX CACHING EVALUATION") + self.logger.info(f"{'โ”€'*70}") + self.logger.info(f" ๐ŸŽฏ COMPOSITE SCORE: {composite_score:.2f} ({composite_score:.1%})") + for field, score in field_scores.items(): + status = "โœ…" if score == 1.0 else "โŒ" + self.logger.info(f" {status} {field}: {score:.0f}") + self.logger.info(f"{'โ”€'*70}\n") + + return { + "is_index_based_match": field_scores["is_index_based"], + "index_value_match": field_scores["index_value"], + "parent_element_id_match": field_scores["parent_element_id"], + "element_id_of_nth_child_match": field_scores["element_id_of_nth_child_of_parent"], + "selected_element_correct_match": field_scores["selected_element_is_correct"], + "composite_score": composite_score, + "predicted_output": predicted, + "expected_output": json.dumps(expected_dict) if isinstance(expected_dict, dict) else str(expected), + "predicted_dict": predicted_dict, + "expected_dict": expected_dict, + "field_scores": field_scores, + "evaluation_reason": reason + } + + def _parse_json_response(self, response: str) -> Dict[str, Any]: + """ + Parse JSON from LLM response, handling markdown code blocks and various formats. 
+ + Args: + response: LLM response string (may contain markdown) + + Returns: + Parsed JSON dictionary (empty dict if parsing fails) + """ + if not response or not isinstance(response, str): + return {} + + response = response.strip() + + # If response is empty, return empty dict + if not response: + return {} + + # Strategy 1: Try to extract JSON from markdown code block + json_match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', response, re.DOTALL) + if json_match: + try: + json_str = json_match.group(1).strip() + return json.loads(json_str) + except json.JSONDecodeError: + pass + + # Strategy 2: Find JSON object in response (handle nested braces) + json_start = response.find('{') + if json_start != -1: + # Find matching closing brace + brace_count = 0 + json_end = json_start + for i in range(json_start, len(response)): + if response[i] == '{': + brace_count += 1 + elif response[i] == '}': + brace_count -= 1 + if brace_count == 0: + json_end = i + 1 + break + + if brace_count == 0: + json_str = response[json_start:json_end] + try: + return json.loads(json_str) + except json.JSONDecodeError: + pass + + # Strategy 3: Try to find any JSON-like structure (more lenient) + # Look for patterns like {"key": "value"} even if not perfectly formatted + json_pattern = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', response, re.DOTALL) + if json_pattern: + try: + return json.loads(json_pattern.group(0)) + except json.JSONDecodeError: + pass + + # Strategy 4: Try parsing entire response as JSON + try: + return json.loads(response) + except json.JSONDecodeError: + pass + + # If all strategies fail, return empty dict + self.logger.debug(f"Could not parse JSON from response: {response[:100]}...") + return {} + + def get_evaluation_summary(self, results: list) -> Dict[str, Any]: + """ + Get summary statistics for a batch of evaluations. 
+ + Args: + results: List of evaluation result dictionaries + + Returns: + Summary statistics including accuracy per field and overall + """ + if not results: + return { + "total_samples": 0, + "overall_accuracy": 0.0, + "field_accuracies": {}, + "perfect_matches": 0 + } + + total = len(results) + perfect_matches = sum(1 for r in results if r.get("composite_score", 0.0) == 1.0) + overall_accuracy = perfect_matches / total if total > 0 else 0.0 + + # Calculate accuracy per field + field_accuracies = { + "is_index_based": sum(1 for r in results if r.get("is_index_based_match", 0.0) == 1.0) / total, + "index_value": sum(1 for r in results if r.get("index_value_match", 0.0) == 1.0) / total, + "parent_element_id": sum(1 for r in results if r.get("parent_element_id_match", 0.0) == 1.0) / total, + "element_id_of_nth_child": sum(1 for r in results if r.get("element_id_of_nth_child_match", 0.0) == 1.0) / total, + "selected_element_is_correct": sum(1 for r in results if r.get("selected_element_correct_match", 0.0) == 1.0) / total, + } + + return { + "total_samples": total, + "overall_accuracy": overall_accuracy, + "field_accuracies": field_accuracies, + "perfect_matches": perfect_matches, + "partial_matches": total - perfect_matches + } + + +# Example usage and testing +if __name__ == "__main__": + print("๐Ÿš€ Testing Index Caching Evaluator...") + + evaluator = IndexCachingEvaluator() + + # Test cases + test_cases = [ + # (predicted, expected, should_be_perfect) + ( + '{"is_index_based": true, "index_value": 1, "parent_element_id": "aaaabf", "element_id_of_nth_child_of_parent": "aaaabg", "selected_element_is_correct": true}', + {"is_index_based": True, "index_value": 1, "parent_element_id": "aaaabf", "element_id_of_nth_child_of_parent": "aaaabg", "selected_element_is_correct": True}, + True + ), + ( + '{"is_index_based": false, "index_value": null, "parent_element_id": null, "element_id_of_nth_child_of_parent": null, "selected_element_is_correct": true}', + 
{"is_index_based": False, "index_value": None, "parent_element_id": None, "element_id_of_nth_child_of_parent": None, "selected_element_is_correct": True}, + True + ), + ( + '{"is_index_based": true, "index_value": 3, "parent_element_id": null, "element_id_of_nth_child_of_parent": "aaaaaw", "selected_element_is_correct": true}', + {"is_index_based": True, "index_value": 3, "parent_element_id": None, "element_id_of_nth_child_of_parent": "aaaaaw", "selected_element_is_correct": True}, + True + ), + ( + '{"is_index_based": true, "index_value": 2, "parent_element_id": "aaaabf", "element_id_of_nth_child_of_parent": "aaaabg", "selected_element_is_correct": true}', + {"is_index_based": True, "index_value": 1, "parent_element_id": "aaaabf", "element_id_of_nth_child_of_parent": "aaaabg", "selected_element_is_correct": True}, + False # index_value mismatch + ), + ] + + print("\n๐Ÿ“ Running test cases:") + print("-" * 80) + + results = [] + for predicted, expected, should_be_perfect in test_cases: + result = evaluator.evaluate(predicted, expected) + is_perfect = result["composite_score"] == 1.0 + + status = "โœ…" if is_perfect == should_be_perfect else "โŒ" + print(f"{status} Test: Perfect match = {is_perfect} (expected {should_be_perfect})") + print(f" Score: {result['composite_score']:.2f}") + print() + + results.append(result) + + # Summary + print("\n๐Ÿ“Š Summary:") + summary = evaluator.get_evaluation_summary(results) + print(f" Total: {summary['total_samples']}") + print(f" Perfect matches: {summary['perfect_matches']}") + print(f" Overall accuracy: {summary['overall_accuracy']:.1%}") + print(f" Field accuracies:") + for field, acc in summary['field_accuracies'].items(): + print(f" {field}: {acc:.1%}") + diff --git a/src/gepa_optimizer/evaluation/scroll_evaluator.py b/src/gepa_optimizer/evaluation/scroll_evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..64171b2b9ae384339eec3d846e7d34c90fb36070 --- /dev/null +++ 
b/src/gepa_optimizer/evaluation/scroll_evaluator.py @@ -0,0 +1,251 @@ +""" +GENERIC String Match Evaluator + +Compares predicted output against expected output (simple string comparison). +NO assumptions about what the output represents (IDs, text, JSON, etc.). + +Let GEPA discover the correct output format through evolution and feedback! +""" + +from typing import Dict, Any + +try: + from .base_evaluator import BaseEvaluator +except ImportError: + # For standalone testing + import sys + from pathlib import Path + sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + from gepa_optimizer.evaluation.base_evaluator import BaseEvaluator + + +class ScrollElementEvaluator(BaseEvaluator): + """ + GENERIC evaluator - just compares strings! + + NO assumptions about: + - Output format (element IDs, text, JSON, etc.) + - Output structure + - What the task is + + GEPA will learn the correct format through feedback and evolution. + """ + + def __init__(self, metric_weights: Dict[str, float] = None): + """ + Initialize evaluator. + + Args: + metric_weights: Weights for evaluation metrics + Default: {"output_match": 1.0} + """ + default_weights = { + "output_match": 1.0 # Simple string comparison + } + + weights = metric_weights or default_weights + super().__init__(metric_weights=weights) + + def evaluate(self, predicted: str, expected: str) -> Dict[str, float]: + """ + Binary evaluation with element ID extraction. + + Phase 1 Implementation: + - Extracts element IDs using regex patterns (flexible format support) + - Uses INTEGER comparison for robustness (prevents "4" vs "14" bugs) + - Binary scoring: correct element = 1.0, wrong/missing = 0.0 + + Scoring Strategy: + 1. Extract element ID from both predicted and expected outputs + 2. Compare using integer arithmetic (not string comparison) + 3. 
Return 1.0 if match, 0.0 otherwise (no partial credit) + + Args: + predicted: LLM's output (may include verbose explanation) + expected: Expected output (may include verbose explanation) + + Returns: + Dictionary with evaluation metrics and extracted element IDs + """ + import re + + if not predicted or not expected: + return { + "content_match": 0.0, + "output_match": 0.0, + "composite_score": 0.0, + "predicted_output": str(predicted).strip() if predicted else "", + "expected_output": str(expected).strip() if expected else "", + "predicted_element": "None", + "expected_element": "None", + "evaluation_reason": "โŒ Empty or missing input/output" + } + + predicted_str = str(predicted).strip() + expected_str = str(expected).strip() + + # 1. Extract element numbers using MULTIPLE strategies (flexible!) + # Strategy A: "Element: X" or "Element X" (explicit format) + element_pattern_a = r'element[:\s]+(\d+)' + + # Strategy B: "element X" or "Element X" anywhere in text + element_pattern_b = r'\belement\s+(\d+)\b' + + # Strategy C: Just find ANY number if other strategies fail (last resort) + number_pattern = r'\b(\d+)\b' + + # Try to extract from predicted + pred_match = re.search(element_pattern_a, predicted_str, re.IGNORECASE) + if not pred_match: + pred_match = re.search(element_pattern_b, predicted_str, re.IGNORECASE) + if not pred_match: + # Last resort: find first number in the text + pred_match = re.search(number_pattern, predicted_str) + + # Try to extract from expected + exp_match = re.search(element_pattern_a, expected_str, re.IGNORECASE) + if not exp_match: + exp_match = re.search(element_pattern_b, expected_str, re.IGNORECASE) + if not exp_match: + exp_match = re.search(number_pattern, expected_str) + + # 2. 
Check if we found element numbers in both + if not exp_match: + # Expected doesn't have element pattern - fallback to exact match + content_score = 1.0 if predicted_str.lower() == expected_str.lower() else 0.0 + elif not pred_match: + # Predicted doesn't have element number - WRONG + content_score = 0.0 + else: + # Both have element pattern - compare using INTEGER comparison + pred_element = pred_match.group(1) + exp_element = exp_match.group(1) + + # ๐Ÿ”ฅ Phase 1: Use INTEGER comparison for robustness + # This prevents bugs like "4" != "14" string comparison issues + try: + pred_num = int(pred_element) + exp_num = int(exp_element) + + # Integer comparison (more robust than string) + content_score = 1.0 if pred_num == exp_num else 0.0 + + # Log comparison for debugging + if pred_num != exp_num: + import logging + logger = logging.getLogger(__name__) + logger.debug(f"Element mismatch: predicted={pred_num}, expected={exp_num}") + + except (ValueError, TypeError) as e: + # Fallback to string comparison if conversion fails + import logging + logger = logging.getLogger(__name__) + logger.warning(f"Could not convert elements to integers: {e}, using string comparison") + content_score = 1.0 if pred_element == exp_element else 0.0 + + # 3. Binary score and reason + if content_score == 1.0: + composite_score = 1.0 + reason = "โœ… Correct! 
Element number matches" + else: + composite_score = 0.0 + if pred_match and exp_match: + reason = "โŒ Wrong element number (predicted different element)" + else: + reason = "โŒ Missing or invalid element number" + + pred_element = pred_match.group(1) if pred_match else "None" + exp_element = exp_match.group(1) if exp_match else "None" + + # Detailed logging for transparency + import logging + logger = logging.getLogger(__name__) + logger.info(f"\n{'โ”€'*70}") + logger.info(f"๐Ÿ“Š EVALUATION DETAILS") + logger.info(f"{'โ”€'*70}") + logger.info(f" Expected: '{expected_str}' (Element: {exp_element})") + logger.info(f" Predicted: '{predicted_str}' (Element: {pred_element})") + logger.info(f" {'โ”€'*66}") + logger.info(f" ๐ŸŽฏ SCORE: {composite_score:.2f} - {reason}") + logger.info(f"{'โ”€'*70}\n") + + return { + "content_match": content_score, + "output_match": composite_score, # This is what GEPA uses + "composite_score": composite_score, + "predicted_output": predicted_str, + "expected_output": expected_str, + "predicted_element": pred_element, + "expected_element": exp_element, + "evaluation_reason": reason + } + + def get_evaluation_summary(self, results: list) -> Dict[str, Any]: + """ + Get summary statistics for a batch of evaluations. 
+ + Args: + results: List of evaluation result dictionaries + + Returns: + Summary statistics + """ + if not results: + return { + "total_samples": 0, + "accuracy": 0.0, + "correct_predictions": 0 + } + + total = len(results) + correct = sum(1 for r in results if r.get("output_match", 0.0) == 1.0) + accuracy = correct / total if total > 0 else 0.0 + + return { + "total_samples": total, + "accuracy": accuracy, + "correct_predictions": correct, + "incorrect_predictions": total - correct + } + + +# Example usage and testing +if __name__ == "__main__": + print("๐Ÿš€ Testing Scroll Element Evaluator...") + + evaluator = ScrollElementEvaluator() + + # Test cases + test_cases = [ + ("4", "4", True), + ("Element: 4", "4", True), + ("Element 4", "4", True), + ("The element to interact with is 4", "4", True), + ("Element ID: 4", "4", True), + ("Click on element 4 to scroll", "4", True), + ("5", "4", False), + ("Element: 5", "4", False), + ("No element found", "4", False), + ("", "4", False), + ] + + print("\n๐Ÿ“ Running test cases:") + print("-" * 80) + + results = [] + for predicted, expected, should_match in test_cases: + result = evaluator.evaluate(predicted, expected) + match = result["composite_score"] == 1.0 + + status = "โœ…" if match == should_match else "โŒ" + print(f"{status} Predicted: '{predicted}' | Expected: '{expected}' | Match: {match}") + + results.append(result) + + # Summary + print("\n๐Ÿ“Š Summary:") + summary = evaluator.get_evaluation_summary(results) + print(f" Total: {summary['total_samples']}") + print(f" Correct: {summary['correct_predictions']}") + print(f" Accuracy: {summary['accuracy']:.1%}") + diff --git a/src/gepa_optimizer/evaluation/ui_evaluator.py b/src/gepa_optimizer/evaluation/ui_evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..b4aadf122212ad2d07ae1c7c4f23342eb5be88a3 --- /dev/null +++ b/src/gepa_optimizer/evaluation/ui_evaluator.py @@ -0,0 +1,297 @@ +""" +UI Tree Evaluator for GEPA Optimizer +""" + 
+import json +import logging +import difflib +from typing import Any, Dict, List, Optional + +from .base_evaluator import BaseEvaluator + +logger = logging.getLogger(__name__) + +class UITreeEvaluator(BaseEvaluator): + """ + Comprehensive evaluator for UI tree extraction quality. + """ + + def __init__(self, metric_weights: Optional[Dict[str, float]] = None): + """ + Initializes the UITreeEvaluator with configurable metric weights. + + Args: + metric_weights: A dictionary of weights for different metrics. + If None, default weights will be used. + """ + # Set default weights for UI tree evaluation + default_weights = { + "element_completeness": 0.3, # How many elements are captured + "element_type_accuracy": 0.25, # Correct element types (Button, Text, etc.) + "text_content_accuracy": 0.2, # Text content matches + "hierarchy_accuracy": 0.15, # Parent-child relationships + "style_accuracy": 0.1, # Style properties captured + } + + # Use provided weights or defaults + weights = metric_weights or default_weights + + # Initialize parent class + super().__init__(metric_weights=weights) + + # Normalize weights + self._normalize_weights() + + def _normalize_weights(self): + """Normalize weights to sum to 1.0""" + total_weight = sum(self.metric_weights.values()) + if total_weight > 0: + self.metric_weights = {k: v / total_weight for k, v in self.metric_weights.items()} + else: + self.logger.warning("Total metric weight is zero. Scores will be zero.") + + def evaluate(self, predicted_json: Dict[str, Any], expected_json: Dict[str, Any]) -> Dict[str, float]: + """ + Generates a weighted composite score from individual metrics. + + Args: + predicted_json: The JSON generated by the LLM. + expected_json: The ground truth JSON. + + Returns: + A dictionary of individual metric scores and the composite score. 
+ """ + scores = { + "element_completeness": self.calculate_element_completeness(predicted_json, expected_json), + "element_type_accuracy": self.calculate_element_type_accuracy(predicted_json, expected_json), + "text_content_accuracy": self.calculate_text_content_accuracy(predicted_json, expected_json), + "hierarchy_accuracy": self.calculate_hierarchy_accuracy(predicted_json, expected_json), + "style_accuracy": self.calculate_style_accuracy(predicted_json, expected_json), + } + + composite_score = sum(scores[metric] * self.metric_weights.get(metric, 0) for metric in scores) + scores["composite_score"] = composite_score + + # Add detailed logging for debugging + logger.debug(f"Evaluation scores: {scores}") + logger.debug(f"Composite score: {composite_score:.4f}") + + # Add small improvement bonus for better prompts (encourage GEPA to accept improvements) + # This helps GEPA recognize even tiny improvements + if composite_score > 0.05: # If we have any meaningful content + composite_score = min(composite_score + 0.001, 1.0) # Small bonus to encourage acceptance + + return scores + + def calculate_element_completeness(self, predicted: Dict, expected: Dict) -> float: + """ + Calculates how many UI elements are captured in the predicted JSON. + This is the most important metric for UI tree extraction. 
+ """ + def _count_elements(node): + """Count total elements in the tree""" + if not isinstance(node, dict): + return 0 + count = 1 # Count current node + for child in node.get("children", []): + count += _count_elements(child) + return count + + try: + predicted_count = _count_elements(predicted) + expected_count = _count_elements(expected) + + if expected_count == 0: + return 1.0 if predicted_count == 0 else 0.0 + + # Score based on how many elements are captured + completeness_ratio = predicted_count / expected_count + + # Give bonus for capturing more elements (up to 1.0) + # Penalize heavily for missing elements + if completeness_ratio >= 1.0: + return 1.0 # Perfect or better + elif completeness_ratio >= 0.8: + return completeness_ratio # Good coverage + elif completeness_ratio >= 0.5: + return completeness_ratio * 0.8 # Moderate coverage with penalty + else: + return completeness_ratio * 0.5 # Poor coverage with heavy penalty + + except Exception as e: + logger.warning(f"Error calculating element completeness: {e}") + return 0.0 + + def calculate_element_type_accuracy(self, predicted: Dict, expected: Dict) -> float: + """ + Calculates element type accuracy by comparing the 'type' attribute of corresponding nodes. + Focuses on common UI element types like Button, Text, Image, etc. 
+ """ + def _get_all_types(node): + if not isinstance(node, dict): + return [] + types = [node.get("type")] + for child in node.get("children", []): + types.extend(_get_all_types(child)) + return [t for t in types if t is not None] + + try: + predicted_types = _get_all_types(predicted) + expected_types = _get_all_types(expected) + + if not expected_types: + return 1.0 if not predicted_types else 0.5 + + if not predicted_types: + return 0.0 + + # Count matching types with frequency consideration + expected_type_counts = {} + for t in expected_types: + expected_type_counts[t] = expected_type_counts.get(t, 0) + 1 + + predicted_type_counts = {} + for t in predicted_types: + predicted_type_counts[t] = predicted_type_counts.get(t, 0) + 1 + + # Calculate accuracy based on type matches + total_matches = 0 + for type_name, expected_count in expected_type_counts.items(): + predicted_count = predicted_type_counts.get(type_name, 0) + # Count matches up to the expected count + total_matches += min(predicted_count, expected_count) + + return total_matches / len(expected_types) if expected_types else 0.0 + + except Exception as e: + logger.warning(f"Error calculating element type accuracy: {e}") + return 0.0 + + def calculate_hierarchy_accuracy(self, predicted: Dict, expected: Dict) -> float: + """ + Calculates hierarchy accuracy by comparing parent-child relationships. 
+ """ + def _get_hierarchy_structure(node, parent_type="ROOT"): + """Extract hierarchy structure as (parent_type, child_type) pairs""" + if not isinstance(node, dict): + return [] + + current_type = node.get("type", "unknown") + hierarchy = [(parent_type, current_type)] + + for child in node.get("children", []): + hierarchy.extend(_get_hierarchy_structure(child, current_type)) + + return hierarchy + + try: + predicted_hierarchy = _get_hierarchy_structure(predicted) + expected_hierarchy = _get_hierarchy_structure(expected) + + if not expected_hierarchy: + return 1.0 if not predicted_hierarchy else 0.5 + + if not predicted_hierarchy: + return 0.0 + + # Count matching hierarchy relationships + expected_hierarchy_set = set(expected_hierarchy) + predicted_hierarchy_set = set(predicted_hierarchy) + + matches = len(expected_hierarchy_set.intersection(predicted_hierarchy_set)) + total_expected = len(expected_hierarchy_set) + + return matches / total_expected if total_expected > 0 else 0.0 + + except Exception as e: + logger.warning(f"Error calculating hierarchy accuracy: {e}") + return 0.0 + + def calculate_text_content_accuracy(self, predicted: Dict, expected: Dict) -> float: + """ + Calculates text content accuracy by comparing the 'text' attribute of corresponding nodes. 
+ """ + def _get_all_texts(node): + if not isinstance(node, dict): + return [] + texts = [node.get("text")] + for child in node.get("children", []): + texts.extend(_get_all_texts(child)) + return [t for t in texts if t is not None and str(t).strip()] + + try: + predicted_texts = _get_all_texts(predicted) + expected_texts = _get_all_texts(expected) + + if not expected_texts: + return 1.0 if not predicted_texts else 0.5 # Partial credit if predicted has texts but expected doesn't + + if not predicted_texts: + return 0.0 # No predicted texts, so no match + + total_similarity = 0.0 + for p_text in predicted_texts: + best_similarity = 0.0 + for e_text in expected_texts: + similarity = difflib.SequenceMatcher(None, str(p_text).strip(), str(e_text).strip()).ratio() + best_similarity = max(best_similarity, similarity) + total_similarity += best_similarity + + # Average similarity over all predicted texts + if not predicted_texts and not expected_texts: + return 1.0 + elif not predicted_texts: + return 0.0 + else: + return total_similarity / len(predicted_texts) + except Exception as e: + logger.warning(f"Error calculating text content accuracy: {e}") + return 0.0 + + def calculate_style_accuracy(self, predicted: Dict, expected: Dict) -> float: + """ + Calculates style accuracy by comparing style properties. 
+ """ + def _get_all_styles(node): + """Extract all style properties from the tree""" + if not isinstance(node, dict): + return [] + + styles = [] + if "style" in node and isinstance(node["style"], dict): + styles.append(node["style"]) + + for child in node.get("children", []): + styles.extend(_get_all_styles(child)) + + return styles + + try: + predicted_styles = _get_all_styles(predicted) + expected_styles = _get_all_styles(expected) + + if not expected_styles: + return 1.0 if not predicted_styles else 0.5 + + if not predicted_styles: + return 0.0 + + # Calculate style property overlap + total_style_properties = 0 + matching_properties = 0 + + for exp_style in expected_styles: + for prop_name, prop_value in exp_style.items(): + total_style_properties += 1 + + # Find matching property in predicted styles + for pred_style in predicted_styles: + if prop_name in pred_style and pred_style[prop_name] == prop_value: + matching_properties += 1 + break + + return matching_properties / total_style_properties if total_style_properties > 0 else 0.0 + + except Exception as e: + logger.warning(f"Error calculating style accuracy: {e}") + return 0.0 diff --git a/src/gepa_optimizer/evaluation/universal_evaluator.py b/src/gepa_optimizer/evaluation/universal_evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..16714de4ecad31a7a44a0551c3a7b45918a34b2a --- /dev/null +++ b/src/gepa_optimizer/evaluation/universal_evaluator.py @@ -0,0 +1,911 @@ +""" +Universal Semantic Evaluator for ANY prompt optimization use case. + +This evaluator uses LLM-powered semantic analysis to compare predicted vs expected outputs, +enabling prompt optimization for ANY task without requiring custom evaluator code. 
+ +Key Features: +- Semantic understanding (not just string matching) +- Works with text, JSON, numbers, structured outputs +- Provides rich feedback for GEPA reflection +- No task-specific assumptions +""" + +import json +import re +import logging +from typing import Dict, Any, Optional, List +from difflib import SequenceMatcher + +from .base_evaluator import BaseEvaluator + +logger = logging.getLogger(__name__) + + +class UniversalSemanticEvaluator(BaseEvaluator): + """ + Universal evaluator using LLM for semantic comparison. + + Works for ANY task without hardcoded assumptions: + - Text outputs: "The answer is 42" vs "42" + - JSON outputs: {"count": 23} vs {"count": 22} + - Structured data: Lists, nested objects + - Multi-modal: Image descriptions, analysis results + + Evaluation Strategy: + 1. Quick checks (exact match, empty handling) + 2. Structural comparison (for JSON/structured data) + 3. LLM semantic analysis (for meaning understanding) + 4. Combine into composite score with rich feedback + """ + + def __init__( + self, + llm_client=None, + use_llm_analysis: bool = True, + semantic_weight: float = 0.6, + structural_weight: float = 0.25, + exact_match_bonus: float = 0.15, + metric_weights: Optional[Dict[str, float]] = None + ): + """ + Initialize Universal Semantic Evaluator. 
+ + Args: + llm_client: LLM client for semantic analysis (optional, falls back to heuristics) + use_llm_analysis: Whether to use LLM for semantic comparison + semantic_weight: Weight for semantic similarity (0.0-1.0) + structural_weight: Weight for structural similarity (0.0-1.0) + exact_match_bonus: Bonus weight for exact matches (0.0-1.0) + metric_weights: Optional custom weights (overrides above) + """ + default_weights = metric_weights or { + "semantic_similarity": semantic_weight, + "structural_similarity": structural_weight, + "exact_match": exact_match_bonus + } + super().__init__(metric_weights=default_weights) + + self.llm_client = llm_client + self.use_llm_analysis = use_llm_analysis and llm_client is not None + + # Cache for LLM analysis to reduce API calls + self._analysis_cache: Dict[str, Dict] = {} + + logger.info(f"๐ŸŽฏ Universal Semantic Evaluator initialized") + logger.info(f" LLM analysis: {'enabled' if self.use_llm_analysis else 'disabled (using heuristics)'}") + logger.info(f" Weights: semantic={semantic_weight}, structural={structural_weight}, exact={exact_match_bonus}") + + def evaluate(self, predicted: Any, expected: Any) -> Dict[str, float]: + """ + Evaluate predicted output against expected output using semantic understanding. 
+ + Args: + predicted: The model's predicted output (string, dict, or any serializable type) + expected: The ground truth expected output + + Returns: + Dictionary with metrics including 'composite_score' (required for GEPA) + """ + # Convert to strings for comparison + predicted_str = self._to_string(predicted) + expected_str = self._to_string(expected) + + # Initialize result + result = { + "composite_score": 0.0, + "exact_match": 0.0, + "semantic_similarity": 0.0, + "structural_similarity": 0.0, + "predicted_output": predicted_str[:500], # Truncate for logging + "expected_output": expected_str[:500], + "analysis": {}, + "improvement_feedback": "" + } + + # Handle empty/missing outputs + if not predicted_str or not predicted_str.strip(): + result["improvement_feedback"] = "โŒ Output is EMPTY. The prompt must instruct the model to produce output." + result["analysis"] = {"status": "empty_predicted"} + return result + + if not expected_str or not expected_str.strip(): + result["improvement_feedback"] = "โš ๏ธ Expected output is empty - cannot evaluate." + result["analysis"] = {"status": "empty_expected"} + result["composite_score"] = 0.5 # Neutral score + return result + + # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + # STEP 1: Exact Match Check (Fast Path) + # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + normalized_pred = self._normalize(predicted_str) + normalized_exp = self._normalize(expected_str) + + if normalized_pred == normalized_exp: + result["exact_match"] = 1.0 + result["semantic_similarity"] = 1.0 + result["structural_similarity"] = 1.0 + result["composite_score"] = 1.0 + result["improvement_feedback"] = "โœ… Perfect match! Output exactly matches expected." 
+ result["analysis"] = {"status": "exact_match"} + return result + + # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + # STEP 1.5: FORMAT MISMATCH DETECTION (CRITICAL FIX) + # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + # ๐Ÿ”ฅ CRITICAL: Detect when expected is JSON but predicted is narrative text + # This causes catastrophically low scores and needs explicit handling + expected_is_json = self._try_parse_json(expected_str) is not None + predicted_is_json = self._try_parse_json(predicted_str) is not None + + format_mismatch = expected_is_json and not predicted_is_json + if format_mismatch: + # Expected JSON but got narrative - this is a CRITICAL format error + # Give partial credit for semantic content but penalize heavily for format + result["analysis"]["format_mismatch"] = True + result["improvement_feedback"] = ( + "โŒ FORMAT ERROR: Expected JSON output but received narrative text. " + "The prompt MUST enforce JSON output format. " + "Add explicit instructions like: 'Output ONLY valid JSON, no explanations.' 
" + "Consider adding: 'Do NOT write prose or explanations.'" + ) + # Still evaluate semantic content but cap the score + # This gives feedback for improving the prompt + logger.warning(f"โš ๏ธ Format mismatch: expected JSON ({len(expected_str)} chars), got narrative ({len(predicted_str)} chars)") + + # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + # STEP 2: Structural Comparison (for JSON/structured data) + # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + structural_result = self._compare_structure(predicted_str, expected_str) + result["structural_similarity"] = structural_result["score"] + result["analysis"]["structural"] = structural_result.get("details", {}) + + # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + # STEP 3: Semantic Analysis + # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + if self.use_llm_analysis: + semantic_result = self._llm_semantic_analysis(predicted_str, expected_str) + else: + semantic_result = self._heuristic_semantic_analysis(predicted_str, expected_str) + + result["semantic_similarity"] = semantic_result["score"] + result["analysis"]["semantic"] = semantic_result.get("details", {}) + result["improvement_feedback"] = semantic_result.get("feedback", "") + + # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + # STEP 4: Compute Composite Score + # 
โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ + weights = self.metric_weights + composite = ( + result["semantic_similarity"] * weights.get("semantic_similarity", 0.6) + + result["structural_similarity"] * weights.get("structural_similarity", 0.25) + + result["exact_match"] * weights.get("exact_match", 0.15) + ) + + # ๐Ÿ”ฅ CRITICAL FIX: Apply format mismatch penalty + # If expected JSON but got narrative, cap the score to encourage format compliance + if result.get("analysis", {}).get("format_mismatch"): + # Cap at 0.3 to indicate "partial semantic match but wrong format" + # This ensures format-correct outputs always score higher + composite = min(composite, 0.30) + logger.debug(f"๐Ÿ“Š Format mismatch penalty applied: score capped at {composite:.3f}") + + result["composite_score"] = min(max(composite, 0.0), 1.0) + + # Add score breakdown to feedback + if not result["improvement_feedback"]: + result["improvement_feedback"] = self._generate_default_feedback(result) + + # Log evaluation + logger.debug(f"๐Ÿ“Š Evaluation: composite={result['composite_score']:.3f}, " + f"semantic={result['semantic_similarity']:.3f}, " + f"structural={result['structural_similarity']:.3f}") + + # #region agent log + try: + import json as _json_debug + import time as _time_debug + import os as _os_debug + _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log" + _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True) + with open(_debug_log_path, "a") as _f: + _f.write(_json_debug.dumps({"hypothesisId": "G", "location": "universal_evaluator.py:final_score", "message": "Final evaluation score breakdown", "data": {"composite": result["composite_score"], "semantic": result["semantic_similarity"], "structural": result["structural_similarity"], "exact_match": result["exact_match"], "format_mismatch": result.get("analysis", 
{}).get("format_mismatch", False), "predicted_preview": predicted_str[:150] if predicted_str else "EMPTY", "expected_preview": expected_str[:150] if expected_str else "EMPTY"}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n") + except Exception as _e: + pass # Silent fail for instrumentation + # #endregion + + return result + + def _to_string(self, value: Any) -> str: + """Convert any value to string for comparison.""" + if value is None: + return "" + if isinstance(value, str): + return value.strip() + if isinstance(value, dict): + try: + return json.dumps(value, sort_keys=True, indent=2) + except (TypeError, ValueError): + return str(value) + if isinstance(value, (list, tuple)): + try: + return json.dumps(list(value), sort_keys=True) + except (TypeError, ValueError): + return str(value) + return str(value).strip() + + def _normalize(self, text: str) -> str: + """Normalize text for comparison (lowercase, whitespace).""" + # Lowercase and normalize whitespace + normalized = ' '.join(text.lower().split()) + # Remove common punctuation that doesn't affect meaning + normalized = re.sub(r'[.,;:!?\'"]+$', '', normalized) + return normalized + + def _compare_structure(self, predicted: str, expected: str) -> Dict[str, Any]: + """ + Compare structural similarity (especially for JSON/structured outputs). + + Returns: + Dict with 'score' (0.0-1.0) and 'details' + """ + result = {"score": 0.0, "details": {}} + + # Try to parse as JSON + pred_json = self._try_parse_json(predicted) + exp_json = self._try_parse_json(expected) + + if pred_json is not None and exp_json is not None: + # Both are valid JSON - do structural comparison + return self._compare_json_structures(pred_json, exp_json) + + # Fallback: Compare as text structure + return self._compare_text_structure(predicted, expected) + + def _try_parse_json(self, text: str) -> Optional[Any]: + """ + Try to parse text as JSON with robust extraction. 
+ + ๐Ÿ”ฅ FIX: LLMs often wrap JSON in markdown code blocks or add extra text. + This method now handles multiple formats: + - Direct JSON + - ```json ... ``` blocks + - ``` ... ``` blocks (no language tag) + - JSON embedded in prose + - Escaped newlines and quotes + """ + if not text or not isinstance(text, str): + return None + + # ๐Ÿ”ฅ PREPROCESSING: Clean common LLM output issues + cleaned = text.strip() + + # Remove BOM and other invisible characters + cleaned = cleaned.lstrip('\ufeff\u200b\u200c\u200d') + + # Strategy 1: Try direct parse (cleanest case) + try: + return json.loads(cleaned) + except json.JSONDecodeError: + pass + + # Strategy 2: Extract JSON from markdown code block (```json ... ```) + # More permissive regex that handles optional language tags + json_match = re.search(r'```(?:json|JSON)?\s*([\{|\[].*?[\}|\]])\s*```', cleaned, re.DOTALL) + if json_match: + try: + return json.loads(json_match.group(1)) + except json.JSONDecodeError: + pass + + # Strategy 3: Find JSON using balanced brace matching (handles nested objects) + def extract_balanced_json(s: str, start_char: str, end_char: str) -> Optional[str]: + """Extract JSON with balanced braces/brackets.""" + count = 0 + start_idx = -1 + for i, char in enumerate(s): + if char == start_char: + if count == 0: + start_idx = i + count += 1 + elif char == end_char: + count -= 1 + if count == 0 and start_idx >= 0: + return s[start_idx:i+1] + return None + + # Try to find JSON object + json_obj = extract_balanced_json(cleaned, '{', '}') + if json_obj: + try: + return json.loads(json_obj) + except json.JSONDecodeError: + # Try to repair common issues + repaired = self._repair_json(json_obj) + try: + return json.loads(repaired) + except json.JSONDecodeError: + pass + + # Try to find JSON array + json_arr = extract_balanced_json(cleaned, '[', ']') + if json_arr: + try: + return json.loads(json_arr) + except json.JSONDecodeError: + repaired = self._repair_json(json_arr) + try: + return json.loads(repaired) + 
except json.JSONDecodeError: + pass + + return None + + def _repair_json(self, json_str: str) -> str: + """ + Attempt to repair common JSON issues from LLM output. + + Fixes: + - Trailing commas before } or ] + - Single quotes instead of double quotes + - Unquoted keys + - Comments (// and /* */) + """ + repaired = json_str + + # Remove trailing commas + repaired = re.sub(r',\s*}', '}', repaired) + repaired = re.sub(r',\s*]', ']', repaired) + + # Remove single-line comments + repaired = re.sub(r'//[^\n]*', '', repaired) + + # Remove multi-line comments + repaired = re.sub(r'/\*.*?\*/', '', repaired, flags=re.DOTALL) + + # Replace single quotes with double quotes (but be careful with apostrophes) + # Only replace when it looks like a JSON delimiter + def replace_single_quotes(match): + content = match.group(0) + # Skip if it looks like an apostrophe in a word + if re.match(r"'\w+'\s*:", content) or re.match(r":\s*'[^']*'", content): + return content.replace("'", '"') + return content + + # Basic single quote replacement for keys + repaired = re.sub(r"'([^']+)'\s*:", r'"\1":', repaired) + + return repaired + + def _compare_json_structures(self, pred: Any, exp: Any) -> Dict[str, Any]: + """Compare two JSON structures.""" + result = {"score": 0.0, "details": {"type": "json", "matches": [], "mismatches": []}} + + if type(pred) != type(exp): + result["details"]["mismatches"].append(f"Type mismatch: predicted={type(pred).__name__}, expected={type(exp).__name__}") + result["score"] = 0.2 # Some credit for being JSON + return result + + if isinstance(pred, dict) and isinstance(exp, dict): + return self._compare_dicts(pred, exp) + elif isinstance(pred, list) and isinstance(exp, list): + return self._compare_lists(pred, exp) + else: + # Primitive types + if pred == exp: + result["score"] = 1.0 + result["details"]["matches"].append(f"Values match: {pred}") + else: + result["score"] = self._value_similarity(pred, exp) + result["details"]["mismatches"].append(f"Value mismatch: 
predicted={pred}, expected={exp}") + return result + + def _compare_dicts(self, pred: dict, exp: dict) -> Dict[str, Any]: + """ + Compare two dictionaries with CASE-INSENSITIVE key matching. + + ๐Ÿ”ฅ FIX: LLMs often produce keys like 'Category' when expected is 'category'. + This method now normalizes keys before comparison for fair scoring. + """ + result = {"score": 0.0, "details": {"type": "dict", "matches": [], "mismatches": [], "missing_keys": [], "extra_keys": []}} + + # ๐Ÿ”ฅ NORMALIZE: Convert all keys to lowercase for comparison + # Also handle common variations like underscores vs camelCase + def normalize_key(key: str) -> str: + """Normalize key: lowercase, underscores to nothing, strip spaces.""" + return re.sub(r'[_\s-]', '', str(key).lower()) + + # Build normalized key mappings + pred_normalized = {normalize_key(k): (k, v) for k, v in pred.items()} + exp_normalized = {normalize_key(k): (k, v) for k, v in exp.items()} + + pred_norm_keys = set(pred_normalized.keys()) + exp_norm_keys = set(exp_normalized.keys()) + + # Check for missing/extra keys (using normalized comparison) + missing_norm = exp_norm_keys - pred_norm_keys + extra_norm = pred_norm_keys - exp_norm_keys + common_norm = pred_norm_keys & exp_norm_keys + + # Convert back to original key names for reporting + missing = [exp_normalized[k][0] for k in missing_norm] + extra = [pred_normalized[k][0] for k in extra_norm] + + result["details"]["missing_keys"] = missing + result["details"]["extra_keys"] = extra + + if not exp_norm_keys: + result["score"] = 1.0 if not pred_norm_keys else 0.5 + return result + + # Score based on key overlap (normalized) + key_score = len(common_norm) / len(exp_norm_keys) if exp_norm_keys else 1.0 + + # Score based on value matches + value_scores = [] + for norm_key in common_norm: + pred_orig_key, pred_val = pred_normalized[norm_key] + exp_orig_key, exp_val = exp_normalized[norm_key] + + if pred_val == exp_val: + value_scores.append(1.0) + 
result["details"]["matches"].append(f"{exp_orig_key}: {exp_val}") + else: + sim = self._value_similarity(pred_val, exp_val) + value_scores.append(sim) + if sim < 0.8: + result["details"]["mismatches"].append(f"{exp_orig_key}: predicted={pred_val}, expected={exp_val}") + + value_score = sum(value_scores) / len(value_scores) if value_scores else 0.0 + + # Combine scores + result["score"] = 0.3 * key_score + 0.7 * value_score + + # Penalty for missing keys (reduced from 0.1 to 0.05 per key) + if missing: + result["score"] *= (1 - 0.05 * len(missing)) + + result["score"] = max(0.0, min(1.0, result["score"])) + return result + + def _compare_lists(self, pred: list, exp: list) -> Dict[str, Any]: + """Compare two lists.""" + result = {"score": 0.0, "details": {"type": "list", "length_match": False, "item_matches": 0}} + + if not exp: + result["score"] = 1.0 if not pred else 0.5 + return result + + result["details"]["length_match"] = len(pred) == len(exp) + + # Compare items (order-sensitive) + matches = 0 + for i, exp_item in enumerate(exp): + if i < len(pred): + if pred[i] == exp_item: + matches += 1 + else: + # Check if item exists elsewhere + if exp_item in pred: + matches += 0.5 # Partial credit for wrong position + + result["details"]["item_matches"] = matches + result["score"] = matches / len(exp) + + # Penalty for length mismatch + if len(pred) != len(exp): + len_ratio = min(len(pred), len(exp)) / max(len(pred), len(exp)) + result["score"] *= (0.7 + 0.3 * len_ratio) + + return result + + def _value_similarity(self, pred: Any, exp: Any) -> float: + """ + Calculate similarity between two values. 
+ + ๐Ÿ”ฅ ENHANCED: Now handles: + - Case-insensitive string comparison + - Semantic similarity for common variations + - Underscore/space/dash normalization + - Numeric comparison with tolerance + """ + # Same value (exact match) + if pred == exp: + return 1.0 + + # Numeric comparison + try: + pred_num = float(pred) + exp_num = float(exp) + if exp_num == 0: + return 1.0 if pred_num == 0 else 0.0 + # Relative error with tolerance + error = abs(pred_num - exp_num) / abs(exp_num) + return max(0.0, 1.0 - error) + except (ValueError, TypeError): + pass + + # String comparison with normalization + pred_str = str(pred).strip() + exp_str = str(exp).strip() + + # Case-insensitive exact match + if pred_str.lower() == exp_str.lower(): + return 0.98 # Slight penalty for case mismatch + + # Normalize strings (remove underscores, spaces, dashes for comparison) + def normalize_str(s: str) -> str: + return re.sub(r'[_\s\-]+', '', s.lower()) + + pred_norm = normalize_str(pred_str) + exp_norm = normalize_str(exp_str) + + if pred_norm == exp_norm: + return 0.95 # Good match despite formatting differences + + # Check if one contains the other (partial match) + if pred_norm in exp_norm or exp_norm in pred_norm: + ratio = min(len(pred_norm), len(exp_norm)) / max(len(pred_norm), len(exp_norm)) + return 0.7 + (0.2 * ratio) # 0.7-0.9 for partial matches + + # ๐Ÿ”ฅ SEMANTIC SIMILARITY: Check for common equivalent terms + semantic_equivalents = { + # Priority levels + 'low': ['low', 'minor', 'trivial', 'p3', 'p4'], + 'medium': ['medium', 'normal', 'moderate', 'p2'], + 'high': ['high', 'important', 'major', 'p1', 'critical', 'urgent'], + # Boolean variations + 'true': ['true', 'yes', '1', 'on', 'enabled'], + 'false': ['false', 'no', '0', 'off', 'disabled'], + # Status variations + 'success': ['success', 'succeeded', 'completed', 'done', 'passed'], + 'failure': ['failure', 'failed', 'error', 'crashed'], + 'pending': ['pending', 'waiting', 'queued', 'in_progress', 'processing'], + } + + for 
canonical, equivalents in semantic_equivalents.items(): + pred_match = any(eq in pred_norm for eq in equivalents) + exp_match = any(eq in exp_norm for eq in equivalents) + if pred_match and exp_match: + return 0.85 # Semantic match + + # Sequence matching (character-level similarity) + ratio = SequenceMatcher(None, pred_str.lower(), exp_str.lower()).ratio() + + # ๐Ÿ”ฅ WORD-LEVEL SIMILARITY: Check word overlap + pred_words = set(re.findall(r'\w+', pred_str.lower())) + exp_words = set(re.findall(r'\w+', exp_str.lower())) + + if pred_words and exp_words: + word_overlap = len(pred_words & exp_words) / max(len(pred_words), len(exp_words)) + # Combine character and word similarity + return max(ratio, word_overlap * 0.9) + + def _compare_text_structure(self, predicted: str, expected: str) -> Dict[str, Any]: + """Compare text structure when not JSON.""" + result = {"score": 0.0, "details": {"type": "text"}} + + # Word overlap + pred_words = set(predicted.lower().split()) + exp_words = set(expected.lower().split()) + + if not exp_words: + result["score"] = 1.0 if not pred_words else 0.5 + return result + + overlap = len(pred_words & exp_words) + result["details"]["word_overlap"] = overlap + result["details"]["expected_words"] = len(exp_words) + + # Jaccard similarity + union = len(pred_words | exp_words) + result["score"] = overlap / union if union > 0 else 0.0 + + return result + + def _llm_semantic_analysis(self, predicted: str, expected: str) -> Dict[str, Any]: + """ + Use LLM for semantic analysis of predicted vs expected. + + Uses XML-delimited prompt structure to prevent context bleeding + and Multi-Dimensional Scoring (Semantics vs. Syntax). 
+ + Returns: + Dict with 'score' (0.0-1.0), 'details', and 'feedback' + """ + # Check cache + cache_key = f"{hash(predicted)}:{hash(expected)}" + if cache_key in self._analysis_cache: + return self._analysis_cache[cache_key] + + result = {"score": 0.0, "details": {}, "feedback": ""} + + try: + # Truncate for token limits but preserve enough context + expected_truncated = expected[:10000] + predicted_truncated = predicted[:10000] + + # OPTIMIZED: Penalty-based scoring with self-verification + # Starts at 1.0 and deducts for failures - more consistent than subjective scoring + analysis_prompt = f""" +You are a **Semantic Logic Engine** tasked with grading AI performance. +You must compare a [PREDICTED] output against a [EXPECTED] truth. + + + + +{expected_truncated} + + + +{predicted_truncated} + + + + +Calculate the score based on these STRICT rules. Start with 1.0 and deduct penalties. + +1. **Information Completeness (Max -0.5)**: + - If key facts/fields are missing, deduct proportional to importance. + - If a nested JSON field is missing, deduct 0.1 per field. + +2. **Accuracy & Hallucination (Max -1.0)**: + - If factual numbers/IDs are wrong: Score = 0 immediately. + - If the model invents information NOT in the input: Deduct 0.3. + +3. **Format Compliance (Max -0.3)**: + - If JSON is requested but Markdown is returned: Deduct 0.3. + - If keys are lowercase instead of snake_case: Deduct 0.1. + +4. **Semantic Equivalence (No Penalty)**: + - Synonyms are ACCEPTED (e.g., "Purchase" == "Buy"). + - Formatting differences (whitespace) are IGNORED. + + + +Before finalizing the score, ask: "If I used the predicted output in code expecting the original output, would the code crash?" +- If YES (Crash) -> Score must be < 0.5. +- If NO (Safe) -> Score can be high. 
+ + + +Return JSON ONLY: +{{ + "semantic_similarity": 0.0-1.0, + "structural_similarity": 0.0-1.0, + "verdict": "PERFECT" | "ACCEPTABLE" | "FORMAT_ERROR" | "DATA_CORRUPTION", + "critical_failures": ["List specific failures that caused score < 1.0"], + "penalty_breakdown": {{"completeness": -0.0, "accuracy": -0.0, "format": -0.0}}, + "fix_directive": "Imperative command to fix the prompt" +}} + +""" + + response = self.llm_client.generate( + system_prompt="You are a Semantic Logic Engine. Calculate scores using penalty-based deduction from 1.0. Respond only with valid JSON.", + user_prompt=analysis_prompt, + image_base64="" + ) + + content = response.get("content", str(response)) if isinstance(response, dict) else str(response) + + # Parse JSON response + analysis = self._extract_json_from_response(content) + + if analysis: + # Extract semantic similarity (primary score) + semantic_sim = float(analysis.get("semantic_similarity", 0.5)) + structural_sim = float(analysis.get("structural_similarity", semantic_sim)) + + # Compute weighted score based on verdict (updated for new schema) + verdict = analysis.get("verdict", "ACCEPTABLE") + verdict_multiplier = { + "PERFECT": 1.0, + "ACCEPTABLE": 0.85, + "FORMAT_ERROR": 0.6, # New: was WRONG_FORMAT + "DATA_CORRUPTION": 0.1, # New: replaces WRONG_CONTENT + HALLUCINATION + # Legacy support + "WRONG_FORMAT": 0.6, + "WRONG_CONTENT": 0.3, + "HALLUCINATION": 0.1 + }.get(verdict, 0.5) + + # Final score: weighted combination + result["score"] = min(1.0, semantic_sim * 0.6 + structural_sim * 0.3 + verdict_multiplier * 0.1) + + # Extract penalty breakdown if available + penalty_breakdown = analysis.get("penalty_breakdown", {}) + critical_failures = analysis.get("critical_failures", []) + + result["details"] = { + "verdict": verdict, + "semantic_similarity": semantic_sim, + "structural_similarity": structural_sim, + "critical_failures": critical_failures, + "penalty_breakdown": penalty_breakdown, + # Legacy field support + 
"key_matches": analysis.get("key_matches", []), + "key_differences": analysis.get("key_differences", critical_failures), + "value_errors": analysis.get("value_errors", {}), + "reasoning": analysis.get("reasoning", "") + } + result["feedback"] = analysis.get("fix_directive", "") + else: + # Fallback if JSON parsing fails + result = self._heuristic_semantic_analysis(predicted, expected) + + # Cache result + self._analysis_cache[cache_key] = result + + except Exception as e: + logger.warning(f"LLM semantic analysis failed: {e}, falling back to heuristics") + result = self._heuristic_semantic_analysis(predicted, expected) + + return result + + def _extract_json_from_response(self, content: str) -> Optional[Dict]: + """Extract JSON from LLM response.""" + # Try to find JSON in response + json_match = re.search(r'\{[\s\S]*\}', content) + if json_match: + try: + return json.loads(json_match.group(0)) + except json.JSONDecodeError: + pass + return None + + def _heuristic_semantic_analysis(self, predicted: str, expected: str) -> Dict[str, Any]: + """ + Heuristic-based semantic analysis when LLM is not available. + + Uses multiple signals: + - Word overlap (Jaccard) + - Sequence matching (SequenceMatcher) + - Number extraction and comparison + - Key phrase matching + """ + result = {"score": 0.0, "details": {}, "feedback": ""} + + pred_lower = predicted.lower() + exp_lower = expected.lower() + + # 1. Sequence similarity + seq_sim = SequenceMatcher(None, pred_lower, exp_lower).ratio() + + # 2. Word overlap (Jaccard) + pred_words = set(pred_lower.split()) + exp_words = set(exp_lower.split()) + jaccard = len(pred_words & exp_words) / len(pred_words | exp_words) if (pred_words | exp_words) else 0.0 + + # 3. 
Number comparison + pred_nums = re.findall(r'-?\d+\.?\d*', predicted) + exp_nums = re.findall(r'-?\d+\.?\d*', expected) + + num_score = 1.0 + num_errors = [] + if exp_nums: + matches = 0 + for exp_num in exp_nums: + if exp_num in pred_nums: + matches += 1 + else: + # Check for close matches + try: + exp_val = float(exp_num) + for pred_num in pred_nums: + pred_val = float(pred_num) + if abs(pred_val - exp_val) <= 1: # Off by 1 + matches += 0.9 + num_errors.append(f"Number close: expected {exp_num}, got {pred_num}") + break + else: + num_errors.append(f"Number missing: expected {exp_num}") + except ValueError: + pass + num_score = matches / len(exp_nums) if exp_nums else 1.0 + + # 4. Key entity extraction (simple approach) + # Look for capitalized words, quoted strings, etc. + pred_entities = set(re.findall(r'\b[A-Z][a-z]+\b', predicted)) + exp_entities = set(re.findall(r'\b[A-Z][a-z]+\b', expected)) + entity_overlap = len(pred_entities & exp_entities) / len(exp_entities) if exp_entities else 1.0 + + # Combine scores + result["score"] = ( + 0.3 * seq_sim + + 0.25 * jaccard + + 0.25 * num_score + + 0.2 * entity_overlap + ) + + result["details"] = { + "sequence_similarity": seq_sim, + "word_overlap": jaccard, + "number_accuracy": num_score, + "entity_overlap": entity_overlap, + "number_errors": num_errors + } + + # Generate feedback + feedback_parts = [] + if jaccard < 0.5: + feedback_parts.append("Low word overlap - output may be missing key terms.") + if num_errors: + feedback_parts.append(f"Number issues: {'; '.join(num_errors[:3])}") + if entity_overlap < 0.5 and exp_entities: + missing = exp_entities - pred_entities + feedback_parts.append(f"Missing entities: {', '.join(list(missing)[:3])}") + + if feedback_parts: + result["feedback"] = " | ".join(feedback_parts) + else: + result["feedback"] = "Output is semantically similar but not exact match." 
+ + return result + + def _generate_default_feedback(self, result: Dict) -> str: + """Generate default feedback based on scores.""" + score = result["composite_score"] + semantic = result["semantic_similarity"] + structural = result["structural_similarity"] + + if score >= 0.9: + return "โœ… Excellent match! Minor differences only." + elif score >= 0.7: + return f"โš ๏ธ Good match (semantic={semantic:.0%}, structural={structural:.0%}). Some differences to address." + elif score >= 0.5: + return f"โš ๏ธ Partial match (semantic={semantic:.0%}, structural={structural:.0%}). Significant differences found." + else: + return f"โŒ Poor match (semantic={semantic:.0%}, structural={structural:.0%}). Major issues to fix." + + def get_evaluation_summary(self, results: List[Dict]) -> Dict[str, Any]: + """ + Get summary statistics for a batch of evaluations. + + Args: + results: List of evaluation result dictionaries + + Returns: + Summary statistics + """ + if not results: + return { + "total_samples": 0, + "accuracy": 0.0, + "avg_semantic_similarity": 0.0, + "avg_structural_similarity": 0.0 + } + + total = len(results) + scores = [r.get("composite_score", 0.0) for r in results] + semantic_scores = [r.get("semantic_similarity", 0.0) for r in results] + structural_scores = [r.get("structural_similarity", 0.0) for r in results] + + return { + "total_samples": total, + "accuracy": sum(1 for s in scores if s >= 0.8) / total, + "avg_composite_score": sum(scores) / total, + "avg_semantic_similarity": sum(semantic_scores) / total, + "avg_structural_similarity": sum(structural_scores) / total, + "min_score": min(scores), + "max_score": max(scores) + } + + +# Convenience function to create evaluator +def create_universal_evaluator(llm_client=None) -> UniversalSemanticEvaluator: + """ + Create a Universal Semantic Evaluator. + + Args: + llm_client: Optional LLM client for semantic analysis. + If not provided, uses heuristic-based analysis. 
+ + Returns: + Configured UniversalSemanticEvaluator instance + """ + return UniversalSemanticEvaluator( + llm_client=llm_client, + use_llm_analysis=llm_client is not None + ) + diff --git a/src/gepa_optimizer/evaluation/validation_evaluator.py b/src/gepa_optimizer/evaluation/validation_evaluator.py new file mode 100644 index 0000000000000000000000000000000000000000..6d7f95765f060e9f1cc181550a28e9d48f9b368b --- /dev/null +++ b/src/gepa_optimizer/evaluation/validation_evaluator.py @@ -0,0 +1,495 @@ +""" +Validation Evaluator for UI Validation Use Case + +Evaluates predicted validation results (true/false) against expected results. +Extracts reasoning from both predicted and expected outputs for LLM-as-judge feedback. +""" + +from typing import Dict, Any, Optional +import re +import logging + +try: + from .base_evaluator import BaseEvaluator +except ImportError: + # For standalone testing + import sys + from pathlib import Path + sys.path.insert(0, str(Path(__file__).parent.parent.parent)) + from gepa_optimizer.evaluation.base_evaluator import BaseEvaluator + + +class ValidationEvaluator(BaseEvaluator): + """ + Evaluator for validation use case (true/false results). + + Features: + - Normalizes boolean formats ("true"/"True"/"1" โ†’ True, "false"/"False"/"0" โ†’ False) + - Extracts reasoning from both predicted and expected outputs (REQUIRED for LLM-as-judge) + - Binary scoring: correct boolean = 1.0, wrong = 0.0 + - Returns reasoning in evaluation results for LLM-as-judge feedback + """ + + def __init__(self, metric_weights: Optional[Dict[str, float]] = None): + """ + Initialize validation evaluator. 
+ + Args: + metric_weights: Weights for evaluation metrics + Default: {"output_match": 1.0} + """ + default_weights = { + "output_match": 1.0 # Binary boolean comparison + } + + weights = metric_weights or default_weights + super().__init__(metric_weights=weights) + + def evaluate(self, predicted: str, expected: str) -> Dict[str, float]: + """ + Evaluate predicted validation result against expected result. + + Scoring Strategy: + 1. Normalize both predicted and expected to boolean + 2. Compare booleans (exact match required) + 3. Extract reasoning from both (for LLM-as-judge) + 4. Return 1.0 if match, 0.0 otherwise (binary scoring) + + Args: + predicted: LLM's output (may include "true"/"false" + reasoning) + expected: Expected output (should be "true" or "false", may include reasoning) + + Returns: + Dictionary with evaluation metrics, extracted booleans, and reasoning: + { + "output_match": 1.0 or 0.0, + "composite_score": 1.0 or 0.0, + "predicted_output": str, + "expected_output": str, + "predicted_boolean": True/False, + "expected_boolean": True/False, + "predicted_reasoning": str, # REQUIRED for LLM-as-judge + "expected_reasoning": str, # REQUIRED for LLM-as-judge + "evaluation_reason": str + } + """ + if not predicted or not expected: + return { + "output_match": 0.0, + "composite_score": 0.0, + "predicted_output": str(predicted).strip() if predicted else "", + "expected_output": str(expected).strip() if expected else "", + "predicted_boolean": None, + "expected_boolean": None, + "predicted_reasoning": "", + "expected_reasoning": "", + "evaluation_reason": "โŒ Empty or missing input/output" + } + + predicted_str = str(predicted).strip() + expected_str = str(expected).strip() + + # 1. Extract boolean from predicted output + pred_bool = self._normalize_to_bool(predicted_str) + pred_reasoning = self._extract_reasoning(predicted_str) + + # 2. 
Extract boolean from expected output + exp_bool = self._normalize_to_bool(expected_str) + exp_reasoning = self._extract_reasoning(expected_str) + + # ๐Ÿ”ฅ NEW: Detect output structure for both expected and predicted + expected_structure = self._detect_output_structure(expected_str) + predicted_structure = self._detect_output_structure(predicted_str) + + # Compare structures + structure_match = (expected_structure['format'] == predicted_structure['format']) + + # 3. Compare booleans (binary scoring) + if pred_bool is None or exp_bool is None: + # Could not extract boolean from one or both + score = 0.0 + reason = "โŒ Could not extract boolean value" + if pred_bool is None: + reason += " from predicted output" + if exp_bool is None: + reason += " from expected output" + else: + # Both booleans extracted successfully - compare + score = 1.0 if pred_bool == exp_bool else 0.0 + if score == 1.0: + reason = f"โœ… Correct! Result matches (both are {exp_bool})" + # ๐Ÿ”ฅ NEW: Add note if structure doesn't match + if not structure_match: + reason += f" (but format differs: expected {expected_structure['format']}, got {predicted_structure['format']})" + else: + reason = f"โŒ Wrong result (predicted: {pred_bool}, expected: {exp_bool})" + + # 4. Log evaluation details + self.logger.info(f"\n{'โ”€'*70}") + self.logger.info(f"๐Ÿ“Š VALIDATION EVALUATION") + self.logger.info(f"{'โ”€'*70}") + self.logger.info(f" Expected: '{expected_str[:100]}...' โ†’ {exp_bool}") + self.logger.info(f" Predicted: '{predicted_str[:100]}...' 
โ†’ {pred_bool}") + self.logger.info(f" {'โ”€'*66}") + self.logger.info(f" ๐ŸŽฏ SCORE: {score:.2f} - {reason}") + if pred_reasoning: + self.logger.info(f" ๐Ÿ“ Predicted Reasoning: {pred_reasoning[:150]}...") + if exp_reasoning: + self.logger.info(f" ๐Ÿ“ Expected Reasoning: {exp_reasoning[:150]}...") + # ๐Ÿ”ฅ NEW: Log structure comparison + self.logger.info(f" ๐Ÿ“ Expected Format: {expected_structure['format']} (reasoning: {expected_structure['reasoning_quality']})") + self.logger.info(f" ๐Ÿ“ Predicted Format: {predicted_structure['format']} (reasoning: {predicted_structure['reasoning_quality']})") + if not structure_match: + self.logger.warning(f" โš ๏ธ OUTPUT STRUCTURE MISMATCH!") + self.logger.info(f"{'โ”€'*70}\n") + + return { + "output_match": score, + "composite_score": score, # This is what GEPA uses + "predicted_output": predicted_str, + "expected_output": expected_str, + "predicted_boolean": pred_bool, + "expected_boolean": exp_bool, + "predicted_reasoning": pred_reasoning, # REQUIRED for LLM-as-judge + "expected_reasoning": exp_reasoning, # REQUIRED for LLM-as-judge + "evaluation_reason": reason, + # ๐Ÿ”ฅ NEW: Structure metadata for LLM-as-judge + "expected_structure": expected_structure, + "predicted_structure": predicted_structure, + "output_structure_match": structure_match, + "expected_has_reasoning": expected_structure['has_reasoning'], + "predicted_has_reasoning": predicted_structure['has_reasoning'], + "reasoning_quality_gap": expected_structure['reasoning_quality'] + " โ†’ " + predicted_structure['reasoning_quality'] + } + + def _normalize_to_bool(self, value: str) -> Optional[bool]: + """ + Normalize various formats to boolean. + + Handles: + - "true", "True", "TRUE" โ†’ True + - "false", "False", "FALSE" โ†’ False + - "1", "0" โ†’ True, False + - "yes", "no" โ†’ True, False + - "correct", "incorrect" โ†’ True, False + - JSON: {"result": true} โ†’ True + - Text with boolean: "The result is true because..." 
โ†’ True + + Args: + value: String that may contain a boolean value + + Returns: + Boolean value or None if cannot be determined + """ + if not value: + return None + + value_lower = value.lower().strip() + + # Direct boolean strings + if value_lower in ("true", "1", "yes", "correct", "valid", "pass"): + return True + if value_lower in ("false", "0", "no", "incorrect", "invalid", "fail"): + return False + + # JSON format: {"action": "TRUE"} or {"action": "FALSE"} or {"action": "LOADING"} + # This handles the production prompt's JSON output format + # Match both quoted and unquoted values, case-insensitive + action_match = re.search(r'["\']?action["\']?\s*:\s*["\']?(true|false|loading)["\']?', value_lower) + if action_match: + action_value = action_match.group(1).lower() + if action_value == "true": + return True + elif action_value == "false": + return False + elif action_value == "loading": + # Treat LOADING as False for validation purposes (screen not ready) + return False + + # Also try to parse full JSON structure if present (more robust) + try: + import json + # Try to find and parse JSON object + json_start = value.find('{') + if json_start != -1: + # Try to extract JSON from the response + for end_idx in range(len(value), json_start, -1): + try: + json_str = value[json_start:end_idx] + data = json.loads(json_str) + # Check for "action" field (production prompt format) + if "action" in data: + action_val = str(data["action"]).upper() + if action_val == "TRUE": + return True + elif action_val == "FALSE": + return False + elif action_val == "LOADING": + return False # Treat as False + # Check for "result" field (alternative format) + if "result" in data: + result_val = data["result"] + if isinstance(result_val, bool): + return result_val + elif isinstance(result_val, str): + return result_val.lower() in ("true", "1", "yes") + except (json.JSONDecodeError, KeyError, ValueError): + continue + except Exception: + pass # Fall through to other extraction methods + + 
# JSON format: {"result": true} or {"result": false} + json_match = re.search(r'["\']?result["\']?\s*:\s*(true|false)', value_lower) + if json_match: + return json_match.group(1) == "true" + + # Pattern: "result is true" or "result: true" + pattern_match = re.search(r'result[:\s]+(true|false)', value_lower) + if pattern_match: + return pattern_match.group(1) == "true" + + # Pattern: "is true" or "is false" (standalone) + is_match = re.search(r'\b(is|are)\s+(true|false)\b', value_lower) + if is_match: + return is_match.group(2) == "true" + + # Pattern: "true" or "false" as standalone word (not in other words) + standalone_match = re.search(r'\b(true|false)\b', value_lower) + if standalone_match: + return standalone_match.group(1) == "true" + + # Last resort: check if "true" appears before "false" in text + true_pos = value_lower.find("true") + false_pos = value_lower.find("false") + + if true_pos != -1 and false_pos != -1: + # Both found - use the one that appears first + return true_pos < false_pos + elif true_pos != -1: + return True + elif false_pos != -1: + return False + + # Cannot determine + return None + + def _detect_output_structure(self, output: str) -> Dict[str, Any]: + """ + Dynamically detect the structure/components of the output. + + This detects: + - Boolean result presence + - Reasoning/explanation presence and quality + - Output format (boolean only, boolean+reasoning, etc.) 
+ + Args: + output: Output string to analyze + + Returns: + Dictionary with structure information: + { + "has_boolean": bool, + "has_reasoning": bool, + "reasoning_length": int, + "reasoning_quality": str, # "missing", "minimal", "adequate", "detailed" + "format": str # "boolean_only", "boolean_with_reasoning", "unknown" + } + """ + if not output: + return { + "has_boolean": False, + "has_reasoning": False, + "reasoning_length": 0, + "reasoning_quality": "missing", + "format": "empty" + } + + output_clean = output.strip() + + # Detect boolean + has_boolean = self._normalize_to_bool(output_clean) is not None + + # Extract reasoning + reasoning = self._extract_reasoning(output_clean) + has_reasoning = len(reasoning) > 15 # Minimum 15 chars to count as reasoning + reasoning_length = len(reasoning) + + # Classify reasoning quality + if reasoning_length == 0: + reasoning_quality = "missing" + elif reasoning_length < 30: + reasoning_quality = "minimal" # Just a few words + elif reasoning_length < 100: + reasoning_quality = "adequate" # Brief explanation + else: + reasoning_quality = "detailed" # Full explanation + + # Determine format + if has_boolean and has_reasoning: + output_format = "boolean_with_reasoning" + elif has_boolean and not has_reasoning: + output_format = "boolean_only" + elif not has_boolean and has_reasoning: + output_format = "reasoning_only" + else: + output_format = "unknown" + + return { + "has_boolean": has_boolean, + "has_reasoning": has_reasoning, + "reasoning_length": reasoning_length, + "reasoning_quality": reasoning_quality, + "format": output_format + } + + def _extract_reasoning(self, output: str) -> str: + """ + Extract reasoning/explanation from output string. + + This is REQUIRED for LLM-as-judge feedback. The reasoning helps + the judge understand why the result was true/false and compare + predicted vs expected reasoning. 
+ + Args: + output: Full output string that may contain reasoning + + Returns: + Extracted reasoning text, or empty string if not found + """ + if not output: + return "" + + # Patterns to find reasoning sections + reasoning_patterns = [ + r'[Rr]eason[:\s]+(.*?)(?:\n\n|\Z)', # "Reason: ..." + r'[Ee]xplanation[:\s]+(.*?)(?:\n\n|\Z)', # "Explanation: ..." + r'[Bb]ecause[:\s]+(.*?)(?:\n\n|\Z)', # "Because: ..." + r'[Ww]hy[:\s]+(.*?)(?:\n\n|\Z)', # "Why: ..." + r'[Dd]etails[:\s]+(.*?)(?:\n\n|\Z)', # "Details: ..." + ] + + # Try each pattern + for pattern in reasoning_patterns: + match = re.search(pattern, output, re.DOTALL | re.IGNORECASE) + if match: + reasoning = match.group(1).strip() + if len(reasoning) > 20: # Only return if substantial + return reasoning + + # If no explicit reasoning section, check if output has substantial text + # after boolean (likely contains reasoning) + bool_match = re.search(r'\b(true|false)\b', output.lower()) + if bool_match: + # Get text after the boolean + bool_pos = bool_match.end() + remaining = output[bool_pos:].strip() + + # If remaining text is substantial (more than just punctuation), use it + if len(remaining) > 30: + # Clean up common prefixes + remaining = re.sub(r'^[:\s.,;!?-]+', '', remaining) + if remaining: + return remaining + + # If output is long and doesn't start with boolean, might be all reasoning + if len(output) > 100 and not re.match(r'^\s*(true|false)\s*$', output, re.IGNORECASE): + # Return first 500 chars as reasoning + return output[:500].strip() + + # No reasoning found + return "" + + def get_evaluation_summary(self, results: list) -> Dict[str, Any]: + """ + Get summary statistics for a batch of evaluations. 
def get_evaluation_summary(self, results: list) -> Dict[str, Any]:
    """
    Aggregate a batch of evaluation results into summary statistics.

    Args:
        results: List of evaluation result dictionaries; each may carry
            'output_match' and 'predicted_boolean' keys.

    Returns:
        Summary statistics: totals, accuracy, and the true/false
        prediction distribution.
    """
    if not results:
        return {
            "total_samples": 0,
            "accuracy": 0.0,
            "correct_predictions": 0,
            "incorrect_predictions": 0,
            "true_predictions": 0,
            "false_predictions": 0
        }

    total = len(results)
    correct = 0
    true_count = 0
    false_count = 0

    # Single pass: tally exact matches and the predicted-boolean split.
    for entry in results:
        if entry.get("output_match", 0.0) == 1.0:
            correct += 1
        predicted = entry.get("predicted_boolean")
        if predicted is True:
            true_count += 1
        elif predicted is False:
            false_count += 1

    return {
        "total_samples": total,
        "accuracy": correct / total,
        "correct_predictions": correct,
        "incorrect_predictions": total - correct,
        "true_predictions": true_count,
        "false_predictions": false_count
    }
result.get("predicted_boolean", "?") + exp_bool = result.get("expected_boolean", "?") + pred_reason = result.get("predicted_reasoning", "")[:50] + + print(f"{status} Predicted: '{predicted[:40]}...' โ†’ {pred_bool}") + print(f" Expected: '{expected}' โ†’ {exp_bool}") + print(f" Match: {match} (should be {should_match})") + if pred_reason: + print(f" Reasoning: {pred_reason}...") + print() + + results.append(result) + + # Summary + print("\n๐Ÿ“Š Summary:") + summary = evaluator.get_evaluation_summary(results) + print(f" Total: {summary['total_samples']}") + print(f" Correct: {summary['correct_predictions']}") + print(f" Accuracy: {summary['accuracy']:.1%}") + print(f" True predictions: {summary['true_predictions']}") + print(f" False predictions: {summary['false_predictions']}") + diff --git a/src/gepa_optimizer/infrastructure/__init__.py b/src/gepa_optimizer/infrastructure/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0b3d27aeb3cf77dcd0d6f4cd1b4e5565580b3df9 --- /dev/null +++ b/src/gepa_optimizer/infrastructure/__init__.py @@ -0,0 +1,15 @@ +""" +Infrastructure module for cross-cutting concerns. + +This module contains infrastructure components that are used across +the entire application, including logging, metrics, and configuration. +""" + +from .logging import get_logger, configure_logging, LogContext + +__all__ = [ + "get_logger", + "configure_logging", + "LogContext", +] + diff --git a/src/gepa_optimizer/infrastructure/logging/__init__.py b/src/gepa_optimizer/infrastructure/logging/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6086e4bb54f61fc14cb384dc87db2d639f9dca85 --- /dev/null +++ b/src/gepa_optimizer/infrastructure/logging/__init__.py @@ -0,0 +1,43 @@ +""" +Centralized Logging Infrastructure for GEPA Optimizer. 
+ +This module provides a unified logging system with: +- Structured logging with context +- Consistent formatting across all modules +- Log level configuration +- Operation tracking with timing +- Contextual logging for debugging + +Usage: + from gepa_optimizer.infrastructure.logging import get_logger, LogContext + + logger = get_logger(__name__) + logger.info("Starting optimization", extra={"iteration": 1}) + + with LogContext(logger, "evaluation", sample_id=123): + logger.info("Evaluating sample") +""" + +from .logger import ( + get_logger, + configure_logging, + LogLevel, + GEPA_LOGGER_NAME, +) +from .context import LogContext, log_operation +from .formatters import GepaFormatter, JsonFormatter + +__all__ = [ + # Core logging + "get_logger", + "configure_logging", + "LogLevel", + "GEPA_LOGGER_NAME", + # Context management + "LogContext", + "log_operation", + # Formatters + "GepaFormatter", + "JsonFormatter", +] + diff --git a/src/gepa_optimizer/infrastructure/logging/context.py b/src/gepa_optimizer/infrastructure/logging/context.py new file mode 100644 index 0000000000000000000000000000000000000000..d2a305e5ac78fd50311331a62713ceff09f4315d --- /dev/null +++ b/src/gepa_optimizer/infrastructure/logging/context.py @@ -0,0 +1,257 @@ +""" +Logging Context Management. + +Provides context managers and decorators for: +- Operation tracking with timing +- Contextual logging with nested contexts +- Automatic exception logging +""" + +import logging +import time +import functools +from contextlib import contextmanager +from typing import Any, Callable, Dict, Optional, TypeVar, ParamSpec + +P = ParamSpec('P') +R = TypeVar('R') + + +class LogContext: + """ + Context manager for logging operations with timing and context. + + Features: + - Automatic start/end logging + - Timing measurement + - Exception capture + - Nested context support + + Example: + logger = get_logger(__name__) + + with LogContext(logger, "optimization", iteration=5): + # ... optimization code ... 
+ logger.info("Processing sample") # Inherits context + + # Output: + # INFO | Starting optimization | iteration=5 + # INFO | Processing sample | iteration=5 + # INFO | Completed optimization | iteration=5 duration_ms=1234 + """ + + def __init__( + self, + logger: logging.Logger, + operation: str, + log_start: bool = True, + log_end: bool = True, + log_level: int = logging.INFO, + **context_fields: Any + ): + """ + Initialize log context. + + Args: + logger: Logger instance to use + operation: Name of the operation being performed + log_start: Whether to log when entering context + log_end: Whether to log when exiting context + log_level: Log level for start/end messages + **context_fields: Additional fields to include in all logs + """ + self.logger = logger + self.operation = operation + self.log_start = log_start + self.log_end = log_end + self.log_level = log_level + self.context_fields = context_fields + self.start_time: Optional[float] = None + self.exception: Optional[Exception] = None + + def __enter__(self) -> "LogContext": + """Enter the context, logging start if configured.""" + self.start_time = time.perf_counter() + + if self.log_start: + self.logger.log( + self.log_level, + f"Starting {self.operation}", + extra=self.context_fields + ) + + return self + + def __exit__(self, exc_type, exc_val, exc_tb) -> bool: + """Exit the context, logging completion or error.""" + duration_ms = (time.perf_counter() - self.start_time) * 1000 + + extra = { + **self.context_fields, + "duration_ms": round(duration_ms, 2) + } + + if exc_type is not None: + # Log exception + self.exception = exc_val + self.logger.error( + f"Failed {self.operation}: {exc_type.__name__}: {exc_val}", + extra=extra, + exc_info=True + ) + # Don't suppress the exception + return False + + if self.log_end: + self.logger.log( + self.log_level, + f"Completed {self.operation}", + extra=extra + ) + + return False + + def log(self, level: int, message: str, **extra_fields: Any) -> None: + """Log a 
message within this context, inheriting context fields.""" + self.logger.log( + level, + message, + extra={**self.context_fields, **extra_fields} + ) + + def info(self, message: str, **extra_fields: Any) -> None: + """Log info message within context.""" + self.log(logging.INFO, message, **extra_fields) + + def debug(self, message: str, **extra_fields: Any) -> None: + """Log debug message within context.""" + self.log(logging.DEBUG, message, **extra_fields) + + def warning(self, message: str, **extra_fields: Any) -> None: + """Log warning message within context.""" + self.log(logging.WARNING, message, **extra_fields) + + def error(self, message: str, **extra_fields: Any) -> None: + """Log error message within context.""" + self.log(logging.ERROR, message, **extra_fields) + + +def log_operation( + logger: Optional[logging.Logger] = None, + operation: Optional[str] = None, + log_args: bool = False, + log_result: bool = False, + log_level: int = logging.INFO, +) -> Callable[[Callable[P, R]], Callable[P, R]]: + """ + Decorator for logging function execution. 
+ + Automatically logs: + - Function entry (with arguments if configured) + - Function exit (with result if configured) + - Execution duration + - Exceptions + + Args: + logger: Logger to use (defaults to logger named after module) + operation: Operation name (defaults to function name) + log_args: Whether to log function arguments + log_result: Whether to log function result + log_level: Log level for messages + + Example: + @log_operation(log_args=True) + def process_batch(batch_id: int, items: List[str]) -> int: + return len(items) + + # Output: + # INFO | Starting process_batch | batch_id=123 items=['a', 'b'] + # INFO | Completed process_batch | duration_ms=45.2 result=2 + """ + def decorator(func: Callable[P, R]) -> Callable[P, R]: + nonlocal logger, operation + + if logger is None: + logger = logging.getLogger(func.__module__) + if operation is None: + operation = func.__name__ + + @functools.wraps(func) + def wrapper(*args: P.args, **kwargs: P.kwargs) -> R: + start_time = time.perf_counter() + + # Build context fields + extra: Dict[str, Any] = {} + if log_args: + # Include positional args (skip self for methods) + arg_names = func.__code__.co_varnames[:func.__code__.co_argcount] + for i, (name, value) in enumerate(zip(arg_names, args)): + if name != 'self': + extra[name] = _safe_repr(value) + # Include keyword args + for key, value in kwargs.items(): + extra[key] = _safe_repr(value) + + logger.log(log_level, f"Starting {operation}", extra=extra) + + try: + result = func(*args, **kwargs) + + duration_ms = (time.perf_counter() - start_time) * 1000 + result_extra: Dict[str, Any] = {"duration_ms": round(duration_ms, 2)} + + if log_result: + result_extra["result"] = _safe_repr(result) + + logger.log(log_level, f"Completed {operation}", extra=result_extra) + + return result + + except Exception as e: + duration_ms = (time.perf_counter() - start_time) * 1000 + logger.error( + f"Failed {operation}: {type(e).__name__}: {e}", + extra={"duration_ms": round(duration_ms, 
2)}, + exc_info=True + ) + raise + + return wrapper + + return decorator + + +@contextmanager +def timed_block(logger: logging.Logger, description: str, log_level: int = logging.DEBUG): + """ + Simple context manager for timing a block of code. + + Less verbose than LogContext, suitable for quick timing measurements. + + Example: + with timed_block(logger, "data processing"): + process_data() + # Output: DEBUG | data processing completed in 123.45ms + """ + start = time.perf_counter() + try: + yield + finally: + duration_ms = (time.perf_counter() - start) * 1000 + logger.log(log_level, f"{description} completed in {duration_ms:.2f}ms") + + +def _safe_repr(value: Any, max_length: int = 100) -> str: + """ + Create a safe string representation of a value for logging. + + Truncates long strings and handles non-serializable objects. + """ + try: + repr_str = repr(value) + if len(repr_str) > max_length: + return repr_str[:max_length] + "..." + return repr_str + except Exception: + return f"<{type(value).__name__}>" + diff --git a/src/gepa_optimizer/infrastructure/logging/formatters.py b/src/gepa_optimizer/infrastructure/logging/formatters.py new file mode 100644 index 0000000000000000000000000000000000000000..2387fe8deac3f641e363aab02067a55ed29a4474 --- /dev/null +++ b/src/gepa_optimizer/infrastructure/logging/formatters.py @@ -0,0 +1,259 @@ +""" +Custom Log Formatters for GEPA Optimizer. 
# ANSI escape sequences used for terminal coloring.
class Colors:
    """ANSI color codes for terminal output."""
    RESET = "\033[0m"
    BOLD = "\033[1m"
    DIM = "\033[2m"

    # Per-level colors
    DEBUG = "\033[36m"     # Cyan
    INFO = "\033[32m"      # Green
    WARNING = "\033[33m"   # Yellow
    ERROR = "\033[31m"     # Red
    CRITICAL = "\033[35m"  # Magenta

    # Semantic colors
    TIMESTAMP = "\033[90m"  # Gray
    MODULE = "\033[34m"     # Blue
    MESSAGE = "\033[0m"     # Default


# Emoji prefix per numeric level, for quick visual scanning of console logs.
LEVEL_EMOJI = {
    logging.DEBUG: "🔍",
    logging.INFO: "ℹ️ ",
    logging.WARNING: "⚠️ ",
    logging.ERROR: "❌",
    logging.CRITICAL: "🚨",
}

# ANSI color per numeric level.
LEVEL_COLORS = {
    logging.DEBUG: Colors.DEBUG,
    logging.INFO: Colors.INFO,
    logging.WARNING: Colors.WARNING,
    logging.ERROR: Colors.ERROR,
    logging.CRITICAL: Colors.CRITICAL,
}
class GepaFormatter(logging.Formatter):
    """
    Console/file formatter for GEPA Optimizer logs.

    Features:
    - Optional ANSI color output
    - Optional emoji level prefixes
    - Renders `extra` fields as trailing key=value pairs

    Example output:
        2024-01-15 10:30:45 | INFO | ℹ️ gepa_optimizer.core.optimizer | Starting optimization iteration=5
    """

    def __init__(
        self,
        fmt: Optional[str] = None,
        datefmt: Optional[str] = None,
        use_colors: bool = True,
        include_emoji: bool = True,
    ):
        """
        Initialize the formatter.

        Args:
            fmt: Format string (uses logging's default if not provided)
            datefmt: Date format string
            use_colors: Whether to emit ANSI color codes
            include_emoji: Whether to prefix the level with an emoji
        """
        super().__init__(fmt=fmt, datefmt=datefmt)
        self.use_colors = use_colors
        self.include_emoji = include_emoji

    def format(self, record: logging.LogRecord) -> str:
        """
        Format a record, temporarily decorating it with colors/emoji.

        Mutated attributes (msg, levelname, name) are restored afterwards
        so later handlers in the chain see the original values.
        """
        original_msg = record.msg
        original_levelname = record.levelname
        # BUG FIX: record.name is also mutated below (colored) but was
        # never restored, corrupting the record for subsequent handlers.
        original_name = record.name

        try:
            # Emoji prefix
            if self.include_emoji:
                emoji = LEVEL_EMOJI.get(record.levelno, "")
                record.levelname = f"{emoji} {record.levelname}"

            # ANSI colors
            if self.use_colors:
                color = LEVEL_COLORS.get(record.levelno, Colors.RESET)
                record.levelname = f"{color}{record.levelname}{Colors.RESET}"
                record.name = f"{Colors.MODULE}{record.name}{Colors.RESET}"

            # Append any extra fields to the message
            extra_str = self._format_extra(record)
            if extra_str:
                record.msg = f"{record.msg} | {extra_str}"

            return super().format(record)

        finally:
            # Restore the record so later handlers see clean values
            record.msg = original_msg
            record.levelname = original_levelname
            record.name = original_name

    def _format_extra(self, record: logging.LogRecord) -> str:
        """
        Render non-standard record attributes as "key=value" pairs.

        Extra fields are passed via the 'extra' parameter to logging calls:
            logger.info("Message", extra={"key": "value"})
        """
        # Standard LogRecord attributes to exclude
        standard_attrs = {
            'name', 'msg', 'args', 'created', 'filename', 'funcName',
            'levelname', 'levelno', 'lineno', 'module', 'msecs',
            'pathname', 'process', 'processName', 'relativeCreated',
            'stack_info', 'exc_info', 'exc_text', 'thread', 'threadName',
            'taskName', 'message'
        }

        extra_fields = {
            k: v for k, v in record.__dict__.items()
            if k not in standard_attrs and not k.startswith('_')
        }

        if not extra_fields:
            return ""

        parts = []
        for key, value in extra_fields.items():
            # BUG FIX: bool must be tested before int/float — bool is an
            # int subclass, so the numeric branch used to shadow the
            # intended lowercase rendering ("true"/"false").
            if isinstance(value, bool):
                parts.append(f"{key}={str(value).lower()}")
            elif isinstance(value, (str, int, float)):
                parts.append(f"{key}={value}")
            else:
                parts.append(f"{key}={repr(value)}")

        return " ".join(parts)
class JsonFormatter(logging.Formatter):
    """
    Structured formatter emitting one JSON object per log line.

    Suitable for log aggregation systems (ELK, Splunk) and cloud logging
    backends (CloudWatch, Stackdriver).

    Example output:
        {"timestamp": "2024-01-15T10:30:45.123Z", "level": "INFO", "logger": "gepa_optimizer.core", "message": "Starting optimization", "iteration": 5}
    """

    def __init__(
        self,
        include_timestamp: bool = True,
        include_location: bool = False,
    ):
        """
        Initialize JSON formatter.

        Args:
            include_timestamp: Include an ISO-8601 UTC timestamp
            include_location: Include file/line/function information
        """
        super().__init__()
        self.include_timestamp = include_timestamp
        self.include_location = include_location

    def format(self, record: logging.LogRecord) -> str:
        """Serialize the record (and any extra fields) as a JSON string."""
        # Local import: the module header only imports `datetime` itself.
        from datetime import timezone

        payload: Dict[str, Any] = {}

        if self.include_timestamp:
            # BUG FIX (deprecation): datetime.utcfromtimestamp() is
            # deprecated since Python 3.12. Build an aware UTC datetime and
            # strip tzinfo to keep the exact same "...Z"-suffixed output.
            stamp = datetime.fromtimestamp(record.created, tz=timezone.utc)
            payload["timestamp"] = stamp.replace(tzinfo=None).isoformat() + "Z"

        # Core fields
        payload["level"] = record.levelname
        payload["logger"] = record.name
        payload["message"] = record.getMessage()

        # Source-location info (opt-in)
        if self.include_location:
            payload["file"] = record.filename
            payload["line"] = record.lineno
            payload["function"] = record.funcName

        # Exception traceback, if any
        if record.exc_info:
            payload["exception"] = self.formatException(record.exc_info)

        # Attributes every LogRecord carries; anything else was passed via
        # the `extra` parameter and is forwarded into the JSON object.
        standard_attrs = {
            'name', 'msg', 'args', 'created', 'filename', 'funcName',
            'levelname', 'levelno', 'lineno', 'module', 'msecs',
            'pathname', 'process', 'processName', 'relativeCreated',
            'stack_info', 'exc_info', 'exc_text', 'thread', 'threadName',
            'taskName', 'message'
        }

        for key, value in record.__dict__.items():
            if key not in standard_attrs and not key.startswith('_'):
                try:
                    # Keep natively serializable values as-is
                    json.dumps(value)
                    payload[key] = value
                except (TypeError, ValueError):
                    payload[key] = str(value)

        return json.dumps(payload, default=str)
class CompactFormatter(logging.Formatter):
    """
    Minimal single-line formatter.

    Useful for CI/CD pipelines, reduced log verbosity, and quick debugging
    where full timestamps and dotted logger paths are noise.

    Example output:
        10:30:45 INFO  optimizer: Starting optimization
    """

    def format(self, record: logging.LogRecord) -> str:
        """Format as '<HH:MM:SS> <LEVEL> <module>: <message>'."""
        # Time-of-day only — no date component.
        clock = datetime.fromtimestamp(record.created).strftime("%H:%M:%S")
        # Last dotted segment of the logger name.
        leaf = record.name.rsplit(".", 1)[-1]
        return f"{clock} {record.levelname:5s} {leaf}: {record.getMessage()}"
# Root logger name for the GEPA Optimizer namespace.
GEPA_LOGGER_NAME = "gepa_optimizer"

# Default message and date formats.
DEFAULT_FORMAT = "%(asctime)s | %(levelname)-8s | %(name)s | %(message)s"
DEFAULT_DATE_FORMAT = "%Y-%m-%d %H:%M:%S"


class LogLevel(str, Enum):
    """Supported log levels, stringly-typed for easy config parsing."""
    DEBUG = "DEBUG"
    INFO = "INFO"
    WARNING = "WARNING"
    ERROR = "ERROR"
    CRITICAL = "CRITICAL"

    @classmethod
    def from_string(cls, level: str) -> "LogLevel":
        """Parse a case-insensitive level name; unknown names fall back to INFO."""
        try:
            return cls(level.upper())
        except ValueError:
            return cls.INFO


class LoggerConfig:
    """
    Mutable, class-level holder for all GEPA logging configuration.

    Adjust attributes before calling configure_logging() to customize
    behavior; reset() restores the defaults.
    """

    # Global defaults
    level: LogLevel = LogLevel.INFO
    format: str = DEFAULT_FORMAT
    date_format: str = DEFAULT_DATE_FORMAT

    # Per-module level overrides (module name -> level), for fine-grained control
    module_levels: Dict[str, LogLevel] = {}

    # Output destinations
    log_to_console: bool = True
    log_to_file: Optional[str] = None

    # Console formatting options (emoji aids visual clarity in development)
    use_colors: bool = True
    include_emoji: bool = True

    @classmethod
    def reset(cls) -> None:
        """Restore every configuration attribute to its default value."""
        cls.level = LogLevel.INFO
        cls.format = DEFAULT_FORMAT
        cls.date_format = DEFAULT_DATE_FORMAT
        cls.module_levels = {}
        cls.log_to_console = True
        cls.log_to_file = None
        cls.use_colors = True
        cls.include_emoji = True


# Tracks whether configure_logging() has run at least once.
_logging_configured = False
def configure_logging(
    level: Optional[str] = None,
    log_file: Optional[str] = None,
    use_colors: bool = True,
    include_emoji: bool = True,
    format_string: Optional[str] = None,
    module_levels: Optional[Dict[str, str]] = None,
) -> None:
    """
    Configure the GEPA logging system.

    Call once at application startup; calling again updates the
    configuration in place (handlers are rebuilt, not duplicated).

    Args:
        level: Global log level name (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        log_file: Optional path to a log file
        use_colors: Whether console output uses ANSI colors
        include_emoji: Whether console output includes emoji prefixes
        format_string: Custom format string (optional)
        module_levels: Mapping of module name -> level name overrides

    Example:
        configure_logging(
            level="DEBUG",
            log_file="optimization.log",
            module_levels={"gepa_optimizer.llms": "DEBUG"}
        )
    """
    global _logging_configured

    # Fold the arguments into the shared LoggerConfig state.
    if level:
        LoggerConfig.level = LogLevel.from_string(level)
    if log_file:
        LoggerConfig.log_to_file = log_file
    LoggerConfig.use_colors = use_colors
    LoggerConfig.include_emoji = include_emoji
    if format_string:
        LoggerConfig.format = format_string
    if module_levels:
        LoggerConfig.module_levels = {
            name: LogLevel.from_string(lvl) for name, lvl in module_levels.items()
        }

    effective_level = getattr(logging, LoggerConfig.level.value)

    root_logger = logging.getLogger(GEPA_LOGGER_NAME)
    root_logger.setLevel(effective_level)
    # Rebuild handlers from scratch so repeated calls don't duplicate output.
    root_logger.handlers.clear()

    # Console handler
    if LoggerConfig.log_to_console:
        stream_handler = logging.StreamHandler(sys.stdout)
        stream_handler.setLevel(effective_level)
        stream_handler.setFormatter(GepaFormatter(
            fmt=LoggerConfig.format,
            datefmt=LoggerConfig.date_format,
            use_colors=use_colors,
            include_emoji=include_emoji,
        ))
        root_logger.addHandler(stream_handler)

    # File handler (plain text: no ANSI codes, no emoji)
    if LoggerConfig.log_to_file:
        file_handler = logging.FileHandler(LoggerConfig.log_to_file)
        file_handler.setLevel(effective_level)
        file_handler.setFormatter(GepaFormatter(
            fmt=LoggerConfig.format,
            datefmt=LoggerConfig.date_format,
            use_colors=False,
            include_emoji=False,
        ))
        root_logger.addHandler(file_handler)

    # Apply per-module level overrides.
    for mod_name, mod_level in LoggerConfig.module_levels.items():
        logging.getLogger(mod_name).setLevel(getattr(logging, mod_level.value))

    _logging_configured = True

    root_logger.debug(
        f"Logging configured: level={LoggerConfig.level.value}, "
        f"file={LoggerConfig.log_to_file}"
    )


@lru_cache(maxsize=128)
def get_logger(name: str) -> logging.Logger:
    """
    Factory for GEPA loggers; use instead of logging.getLogger().

    Auto-configures logging with defaults on first use, then returns the
    (cached) logger for *name*, applying any per-module level override.

    Args:
        name: Module name (typically __name__)

    Returns:
        Configured Logger instance

    Example:
        logger = get_logger(__name__)
        logger.info("Starting process")
    """
    global _logging_configured

    # Lazily apply default configuration.
    if not _logging_configured:
        configure_logging()

    # NOTE: loggers outside the gepa_optimizer namespace are returned as-is;
    # the handlers attached to the GEPA root only affect that namespace.
    logger = logging.getLogger(name)

    # Apply a module-specific level if one was configured.
    if name in LoggerConfig.module_levels:
        logger.setLevel(getattr(logging, LoggerConfig.module_levels[name].value))

    return logger
def set_log_level(level: str, module: Optional[str] = None) -> None:
    """
    Change a log level at runtime.

    Args:
        level: New level name (DEBUG, INFO, WARNING, ERROR, CRITICAL)
        module: Optional module name; when None, the global level changes.

    Example:
        # Enable debug for a specific module
        set_log_level("DEBUG", "gepa_optimizer.core.optimizer")

        # Change the global level
        set_log_level("WARNING")
    """
    parsed = LogLevel.from_string(level)
    numeric = getattr(logging, parsed.value)

    if module:
        # Per-module override: set the logger and remember the choice.
        logging.getLogger(module).setLevel(numeric)
        LoggerConfig.module_levels[module] = parsed
        return

    # Global change: update config, the GEPA root logger, and its handlers.
    LoggerConfig.level = parsed
    gepa_root = logging.getLogger(GEPA_LOGGER_NAME)
    gepa_root.setLevel(numeric)
    for handler in gepa_root.handlers:
        handler.setLevel(numeric)
class BaseLLMClient(ABC):
    """
    Abstract base class for all LLM clients.

    Gives every provider-specific client a consistent interface, a
    per-instance logger, and a place to stash extra configuration.
    """

    def __init__(self, provider: str, model_name: str, **kwargs):
        """
        Initialize LLM client.

        Args:
            provider: LLM provider (e.g., 'openai', 'anthropic')
            model_name: Specific model name
            **kwargs: Additional provider-specific parameters
        """
        self.provider = provider
        self.model_name = model_name
        # Per-instance logger named after the concrete subclass.
        self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}")
        # Remaining provider-specific configuration.
        self.config = kwargs

    @abstractmethod
    def generate(self, system_prompt: str, user_prompt: str, **kwargs) -> Dict[str, Any]:
        """
        Generate a response from the LLM.

        Args:
            system_prompt: System-level instructions
            user_prompt: User's input prompt
            **kwargs: Additional generation parameters (e.g., image_base64)

        Returns:
            Dictionary with a 'content' key holding the generated response
            plus any additional metadata.
        """

    def get_model_info(self) -> Dict[str, str]:
        """Return provider/model/class identifiers for logging and debugging."""
        return {
            'provider': self.provider,
            'model_name': self.model_name,
            'class': self.__class__.__name__
        }
+ +Features: +- 50% cost reduction compared to standard API +- Automatic batching and job management +- Built-in retry and polling logic +- Thread-safe operation +- Comprehensive error handling + +Author: GEPA Optimizer Team +""" + +import os +import json +import time +import logging +import tempfile +import io +from pathlib import Path +from typing import Dict, List, Any, Optional, Tuple +from .base_llm import BaseLLMClient + +try: + from PIL import Image + PIL_AVAILABLE = True +except ImportError: + PIL_AVAILABLE = False + Image = None + +try: + from google import genai + from google.genai import types + GENAI_AVAILABLE = True +except ImportError: + GENAI_AVAILABLE = False + genai = None + types = None + +logger = logging.getLogger(__name__) + + +class BatchLLMClient(BaseLLMClient): + """ + Batch LLM client that uses Gemini Batch API for cost-effective processing. + + This client processes multiple requests together in batch jobs, providing: + - 50% cost savings vs standard API + - No rate limit impact + - Automatic job management and polling + + Usage: + >>> from gepa_optimizer.llms import BatchLLMClient + >>> + >>> client = BatchLLMClient( + ... provider="google", + ... model_name="gemini-2.5-flash", + ... api_key="your-key", + ... batch_size=20, + ... polling_interval=30 + ... ) + >>> + >>> # Use just like VisionLLMClient - adapter handles the rest! + >>> result = client.generate( + ... system_prompt="You are a helpful assistant", + ... user_prompt="Analyze this image", + ... image_base64="..." + ... ) + + Performance Note: + Batch processing adds latency (30s+ polling time) but reduces costs by 50%. + Choose this mode for large-scale optimization where cost > speed. + """ + + def __init__( + self, + provider: str, + model_name: str, + api_key: Optional[str] = None, + batch_size: int = 20, + polling_interval: int = 30, + max_polling_time: int = 3600, + temp_dir: str = ".gepa_batch_temp", + **kwargs + ): + """ + Initialize Batch LLM Client. 
def __init__(
    self,
    provider: str,
    model_name: str,
    api_key: Optional[str] = None,
    batch_size: int = 20,
    polling_interval: int = 30,
    max_polling_time: int = 3600,
    temp_dir: str = ".gepa_batch_temp",
    **kwargs
):
    """
    Initialize Batch LLM Client.

    Args:
        provider: Must be "google" or "gemini"
        model_name: Gemini model (e.g., "gemini-2.5-flash")
        api_key: Google API key (falls back to the key manager / env var)
        batch_size: Number of samples per batch job (1-100)
        polling_interval: Seconds between job status checks
        max_polling_time: Maximum seconds to wait for job completion
        temp_dir: Directory for temporary files
        **kwargs: Additional parameters forwarded to BaseLLMClient

    Raises:
        ValueError: If the provider is not Google/Gemini or no key is found
        ImportError: If google-genai is not installed
    """
    super().__init__(provider=provider, model_name=model_name, **kwargs)

    # Only Google's batch API is supported by this client.
    if provider.lower() not in ["google", "gemini"]:
        raise ValueError(
            f"BatchLLMClient only supports Google/Gemini provider. Got: {provider}"
        )

    # The google-genai SDK is an optional dependency.
    if not GENAI_AVAILABLE:
        raise ImportError(
            "google-genai not installed. Install with: pip install google-genai"
        )

    # Batch job configuration
    self.batch_size = batch_size
    self.polling_interval = polling_interval
    self.max_polling_time = max_polling_time
    self.temp_dir = Path(temp_dir)
    self.temp_dir.mkdir(exist_ok=True)

    # Resolve the API key: explicit argument wins, then the key manager.
    from ..utils.api_keys import APIKeyManager
    self.api_key = api_key or APIKeyManager().get_api_key("google")

    if not self.api_key:
        raise ValueError(
            "Google API key required. Provide via api_key parameter or "
            "set GEMINI_API_KEY environment variable."
        )

    self.client = genai.Client(api_key=self.api_key)

    logger.info(
        f"✓ BatchLLMClient initialized: {model_name} "
        f"(batch_size={batch_size}, polling={polling_interval}s)"
    )

def generate(
    self,
    system_prompt: str,
    user_prompt: str,
    image_base64: Optional[str] = None,
    **kwargs
) -> Dict[str, Any]:
    """
    Generate a response for a single request via the batch API.

    Compatibility shim: wraps the request as a batch of one and delegates
    to generate_batch(); the adapter normally calls generate_batch()
    directly with many requests.

    Args:
        system_prompt: System-level instructions
        user_prompt: User's input prompt
        image_base64: Optional base64 encoded image
        **kwargs: Additional generation parameters

    Returns:
        Dict with 'content' key containing generated text
    """
    single_request = [{
        'system_prompt': system_prompt,
        'user_prompt': user_prompt,
        'image_base64': image_base64
    }]

    batch_results = self.generate_batch(single_request)
    if batch_results:
        return batch_results[0]
    return {"content": "", "error": "No results"}

def generate_batch(
    self,
    requests: List[Dict[str, Any]],
    timeout_override: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Process multiple requests in one Gemini batch job.

    Pipeline: upload images -> write JSONL -> submit job -> poll until
    done -> download results -> clean up the temporary JSONL file.

    Args:
        requests: List of dicts with 'system_prompt', 'user_prompt' and
            optional 'image_base64' keys
        timeout_override: Per-call override of max_polling_time

    Returns:
        List of response dicts with a 'content' key

    Raises:
        RuntimeError: If the batch job fails
        TimeoutError: If polling exceeds the timeout
    """
    logger.info(f"📦 Processing batch of {len(requests)} requests via Gemini Batch API...")

    started = time.time()

    try:
        # Upload any images, build the JSONL request file, and submit.
        uploaded_uris, uploaded_mime_types = self._upload_images_for_batch(requests)
        jsonl_path = self._create_batch_jsonl(requests, uploaded_uris, uploaded_mime_types)
        job_name = self._submit_batch_job(jsonl_path)

        # Poll until the job finishes (or the timeout elapses).
        self._wait_for_batch_completion(job_name, timeout_override or self.max_polling_time)
        batch_results = self._retrieve_batch_results(job_name)

        # Remove the temporary JSONL request file.
        jsonl_path.unlink(missing_ok=True)

        took = time.time() - started
        logger.info(
            f"✓ Batch processing complete: {len(batch_results)} results in {took:.1f}s "
            f"(~{took/len(batch_results):.1f}s per request)"
        )

        return batch_results

    except Exception as e:
        took = time.time() - started
        logger.error(f"❌ Batch processing failed after {took:.1f}s: {e}")
        raise
logger.warning(f" โš ๏ธ Unknown image format '{image_format}' for image {i}, defaulting to PNG") + elif not image_format: + logger.debug(f" โ„น๏ธ Could not detect format for image {i}, using PNG") + + # Save to temp file with correct extension + temp_file = tempfile.NamedTemporaryFile( + delete=False, + suffix=ext, + dir=self.temp_dir + ) + temp_file.write(image_data) + temp_file.close() + + # Upload to Gemini with correct MIME type + uploaded_file = self.client.files.upload( + file=temp_file.name, + config=types.UploadFileConfig( + display_name=f"batch_image_{i}_{int(time.time())}{ext}", + mime_type=mime_type + ) + ) + + logger.debug(f" โœ“ Uploaded image {i} as {mime_type}") + + # Wait for file to be active + self._wait_for_file_active(uploaded_file) + file_uris.append(uploaded_file.uri) + mime_types.append(mime_type) + + # Cleanup temp file + Path(temp_file.name).unlink() + + except Exception as e: + logger.error(f" โœ— Failed to upload image {i}: {e}") + file_uris.append(None) + mime_types.append(None) + + if images_to_upload > 0: + successful = sum(1 for uri in file_uris if uri is not None) + logger.info(f" โœ“ Uploaded {successful}/{images_to_upload} images successfully") + + return file_uris, mime_types + + def _create_batch_jsonl( + self, + requests: List[Dict], + file_uris: List[Optional[str]], + mime_types: List[Optional[str]] + ) -> Path: + """ + Create JSONL file for batch job. 
+ + Args: + requests: List of request dicts + file_uris: List of uploaded file URIs + mime_types: List of MIME types for uploaded files + + Returns: + Path to created JSONL file + """ + timestamp = int(time.time()) + jsonl_path = self.temp_dir / f"batch_{timestamp}.jsonl" + + with open(jsonl_path, 'w', encoding='utf-8') as f: + for i, (request, file_uri, mime_type) in enumerate(zip(requests, file_uris, mime_types)): + # Combine system and user prompts + system_prompt = request.get('system_prompt', '') + user_prompt = request.get('user_prompt', '') + full_prompt = f"{system_prompt}\n\n{user_prompt}".strip() + + # Build request parts + parts = [{"text": full_prompt}] + + if file_uri: + parts.append({ + "file_data": { + "file_uri": file_uri, + "mime_type": mime_type or "image/png" # Use actual MIME type + } + }) + + # Gemini Batch API format according to official docs + # Reference: https://ai.google.dev/gemini-api/docs/batch-inference + # NOTE: The "request" wrapper is REQUIRED for Gemini 2.5 batch API + batch_request = { + "custom_id": f"request-{i}", + "request": { + "contents": [{ + "role": "user", + "parts": parts + }] + } + } + + f.write(json.dumps(batch_request, ensure_ascii=False) + '\n') + + logger.info(f" ๐Ÿ“ Created JSONL file: {jsonl_path.name} ({len(requests)} requests)") + return jsonl_path + + def _submit_batch_job(self, jsonl_path: Path) -> str: + """ + Submit batch job to Gemini. 
    def _submit_batch_job(self, jsonl_path: Path) -> str:
        """
        Upload the JSONL file and create a Gemini batch job from it.

        The upload is attempted twice (Path object, then absolute string
        path) because the google-genai SDK has been observed to behave
        differently depending on the argument type.

        Args:
            jsonl_path: Path to JSONL file

        Returns:
            Batch job name (used later for polling and result retrieval)

        Raises:
            RuntimeError: If the upload or job creation fails.
            ValueError: If the JSONL file contains an invalid line.
        """
        # Upload JSONL file
        # Try multiple methods as the google-genai SDK can be finicky
        try:
            logger.info(f"  📤 Uploading JSONL file: {jsonl_path.name}")

            # Read and validate file content before spending an upload on it
            with open(jsonl_path, 'r', encoding='utf-8') as f:
                content = f.read()
                line_count = len(content.strip().split('\n'))
                logger.debug(f"  📄 JSONL: {len(content)} bytes, {line_count} lines")

            # Validate JSONL format: every line must parse as standalone JSON
            for line_num, line in enumerate(content.strip().split('\n'), 1):
                try:
                    json.loads(line)
                except json.JSONDecodeError as e:
                    logger.error(f"  ❌ Invalid JSON at line {line_num}: {e}")
                    logger.error(f"     Content: {line[:100]}...")
                    raise ValueError(f"Invalid JSONL format at line {line_num}") from e

            # Method 1: Try uploading with Path object
            logger.info(f"  🔄 Upload method 1: Using Path object...")
            try:
                jsonl_file = self.client.files.upload(
                    file=jsonl_path,
                    config=types.UploadFileConfig(
                        display_name=f'gepa-batch-{int(time.time())}',
                        mime_type='application/json'  # Try application/json instead of application/jsonl
                    )
                )
                logger.info(f"  ✓ JSONL file uploaded: {jsonl_file.name}")

            except Exception as e1:
                logger.warning(f"  ⚠️ Method 1 failed: {e1}")
                logger.info(f"  🔄 Upload method 2: Using string path...")

                # Method 2: Fallback to string path
                try:
                    jsonl_file = self.client.files.upload(
                        file=str(jsonl_path.absolute()),
                        config=types.UploadFileConfig(
                            display_name=f'gepa-batch-{int(time.time())}',
                            mime_type='application/json'
                        )
                    )
                    logger.info(f"  ✓ JSONL file uploaded (method 2): {jsonl_file.name}")
                except Exception as e2:
                    logger.error(f"  ❌ Method 2 also failed: {e2}")
                    raise e2

        except KeyError as e:
            # NOTE(review): a KeyError here presumably surfaces from inside the
            # SDK when the API response shape changes — confirm against the SDK.
            logger.error(f"❌ KeyError during JSONL upload: {e}")
            logger.error(f"   This suggests the Gemini API response format changed")
            logger.error(f"   Try updating google-genai: pip install --upgrade google-genai")
            raise RuntimeError(f"Gemini Batch API response format error: {e}") from e
        except Exception as e:
            logger.error(f"❌ Failed to upload JSONL file: {e}")
            logger.error(f"   File path: {jsonl_path}")
            logger.error(f"   File exists: {jsonl_path.exists()}")
            logger.error(f"   File size: {jsonl_path.stat().st_size if jsonl_path.exists() else 'N/A'} bytes")
            raise RuntimeError(f"Gemini Batch API file upload failed: {e}") from e

        # Wait for JSONL to be active (uploads are processed asynchronously)
        try:
            logger.info(f"  ⏳ Waiting for JSONL file to be processed...")
            self._wait_for_file_active(jsonl_file)
        except Exception as e:
            logger.error(f"❌ JSONL file processing failed: {e}")
            raise

        # Create batch job referencing the uploaded file by name
        try:
            logger.info(f"  🚀 Creating batch job...")
            batch_job = self.client.batches.create(
                model=self.model_name,
                src=jsonl_file.name,
                config={'display_name': f'gepa-opt-{int(time.time())}'}
            )

            logger.info(f"  ✓ Batch job submitted: {batch_job.name}")
            return batch_job.name

        except Exception as e:
            logger.error(f"❌ Failed to create batch job: {e}")
            raise RuntimeError(f"Batch job creation failed: {e}") from e
+ + Args: + job_name: Batch job name + timeout: Maximum seconds to wait + + Raises: + TimeoutError: If polling exceeds timeout + RuntimeError: If batch job fails + """ + logger.info(f" โณ Polling for completion (checking every {self.polling_interval}s)...") + + start_time = time.time() + poll_count = 0 + + while True: + elapsed = time.time() - start_time + + if elapsed > timeout: + raise TimeoutError( + f"Batch job timeout after {elapsed:.0f}s " + f"(max: {timeout}s)" + ) + + try: + batch_job = self.client.batches.get(name=job_name) + state = batch_job.state.name + + # Success states + if state in ['JOB_STATE_SUCCEEDED', 'SUCCEEDED']: + logger.info(f" โœ“ Batch job completed in {elapsed:.0f}s") + return + + # Failure states + if state in ['JOB_STATE_FAILED', 'FAILED']: + raise RuntimeError(f"Batch job failed with state: {state}") + + if state in ['JOB_STATE_CANCELLED', 'CANCELLED']: + raise RuntimeError(f"Batch job was cancelled: {state}") + + # Still processing + poll_count += 1 + if poll_count % 5 == 0: # Log every 5 polls + logger.info(f" ... still processing ({elapsed:.0f}s elapsed, state: {state})") + + time.sleep(self.polling_interval) + + except (TimeoutError, RuntimeError): + raise + except Exception as e: + logger.warning(f" โš ๏ธ Error checking job status: {e}, retrying...") + time.sleep(5) + + def _retrieve_batch_results(self, job_name: str) -> List[Dict[str, Any]]: + """ + Retrieve and parse batch results. 
+ + Args: + job_name: Batch job name + + Returns: + List of result dicts + """ + batch_job = self.client.batches.get(name=job_name) + + # Check for inline responses (preferred) + if hasattr(batch_job.dest, 'inlined_responses') and batch_job.dest.inlined_responses: + logger.info(f" ๐Ÿ“ฅ Processing inline responses...") + return self._parse_inline_results(batch_job.dest.inlined_responses) + + # Download results file (fallback) + if hasattr(batch_job.dest, 'file_name') and batch_job.dest.file_name: + logger.info(f" ๐Ÿ“ฅ Downloading results file: {batch_job.dest.file_name}") + file_data = self.client.files.download(file=batch_job.dest.file_name) + return self._parse_file_results(file_data) + + raise RuntimeError("No results available from batch job") + + def _parse_inline_results(self, inline_responses) -> List[Dict[str, Any]]: + """Parse inline batch results.""" + results = [] + + for response_obj in inline_responses: + if hasattr(response_obj, 'response') and response_obj.response: + text = self._extract_text_from_response(response_obj.response) + results.append({ + "content": text, + "role": "assistant", + "model": self.model_name, + "provider": "google" + }) + else: + error_msg = str(getattr(response_obj, 'error', 'Unknown error')) + logger.warning(f" โš ๏ธ Response error: {error_msg}") + results.append({ + "content": "", + "error": error_msg + }) + + return results + + def _parse_file_results(self, file_data) -> List[Dict[str, Any]]: + """Parse JSONL results file.""" + if isinstance(file_data, bytes): + jsonl_content = file_data.decode('utf-8') + else: + jsonl_content = file_data + + results = [] + + for line_num, line in enumerate(jsonl_content.strip().split('\n'), 1): + if not line.strip(): + continue + + try: + result = json.loads(line) + + if 'response' in result: + text = self._extract_text_from_dict(result['response']) + results.append({ + "content": text, + "role": "assistant", + "model": self.model_name, + "provider": "google" + }) + else: + error_msg = 
result.get('error', 'Unknown error') + logger.warning(f" โš ๏ธ Line {line_num} error: {error_msg}") + results.append({ + "content": "", + "error": error_msg + }) + + except json.JSONDecodeError as e: + logger.error(f" โœ— Line {line_num}: JSON decode error: {e}") + results.append({"content": "", "error": f"JSON decode error: {e}"}) + + return results + + def _extract_text_from_response(self, response_obj) -> str: + """Extract text from response object.""" + try: + # Direct text attribute + if hasattr(response_obj, 'text'): + return response_obj.text + + # Navigate through candidates + if hasattr(response_obj, 'candidates') and response_obj.candidates: + candidate = response_obj.candidates[0] + if hasattr(candidate, 'content'): + content = candidate.content + if hasattr(content, 'parts') and content.parts: + part = content.parts[0] + if hasattr(part, 'text'): + return part.text + + # Fallback to string representation + return str(response_obj) + + except Exception as e: + logger.error(f"Error extracting text from response: {e}") + return "" + + def _extract_text_from_dict(self, response_dict: Dict) -> str: + """Extract text from response dictionary.""" + try: + # Direct text key + if 'text' in response_dict: + return response_dict['text'] + + # Navigate through candidates + if 'candidates' in response_dict and response_dict['candidates']: + candidate = response_dict['candidates'][0] + if 'content' in candidate and 'parts' in candidate['content']: + parts = candidate['content']['parts'] + if parts and 'text' in parts[0]: + return parts[0]['text'] + + # Fallback to JSON string + return json.dumps(response_dict) + + except Exception as e: + logger.error(f"Error extracting text from dict: {e}") + return "" + + def _wait_for_file_active(self, uploaded_file, timeout: int = 60): + """ + Wait for uploaded file to become active. 
+ + Args: + uploaded_file: Uploaded file object + timeout: Maximum seconds to wait + + Raises: + TimeoutError: If file processing exceeds timeout + RuntimeError: If file processing fails + """ + start_time = time.time() + + while uploaded_file.state.name == "PROCESSING": + if time.time() - start_time > timeout: + raise TimeoutError(f"File processing timeout: {uploaded_file.name}") + + time.sleep(1) + uploaded_file = self.client.files.get(name=uploaded_file.name) + + if uploaded_file.state.name != "ACTIVE": + raise RuntimeError( + f"File processing failed: {uploaded_file.name} " + f"(state: {uploaded_file.state.name})" + ) + + def get_model_info(self) -> Dict[str, str]: + """Get model information for logging and debugging.""" + return { + 'provider': self.provider, + 'model_name': self.model_name, + 'class': self.__class__.__name__, + 'mode': 'batch', + 'batch_size': str(self.batch_size), + 'polling_interval': f'{self.polling_interval}s' + } + diff --git a/src/gepa_optimizer/llms/llego_enhanced_llm.py b/src/gepa_optimizer/llms/llego_enhanced_llm.py new file mode 100644 index 0000000000000000000000000000000000000000..8785cffea3eb2ef716139f07e92b0f62d3d5cde3 --- /dev/null +++ b/src/gepa_optimizer/llms/llego_enhanced_llm.py @@ -0,0 +1,1625 @@ +""" +LLEGO-Enhanced LLM Client Wrapper + +This wrapper intercepts LLM calls and uses LLEGO genetic operators +when generating new prompt candidates during GEPA's reflection phase. +""" + +import logging +import re +from typing import Optional, Dict, Any, Callable, List +from .base_llm import BaseLLMClient + +logger = logging.getLogger(__name__) + +# Fallback system prompt for sequential generation (when JSON parsing fails) +# Uses Linear Command structure for reliability when complex JSON generation fails +_FALLBACK_SYSTEM_PROMPT = """You are a Prompt Optimization Engine operating in **SAFE MODE**. + + +Rewrite the prompt based on the feedback provided below. + + + +1. Output **ONLY** the new prompt text. +2. No JSON. 
No Explanations. No "Here is the prompt". +3. The prompt must be fully functional and self-contained. +4. START directly with the prompt content (e.g., "You are a..." or task instructions). +5. Preserve the core task/domain - only improve HOW it's described. + + + +- Be specific and concrete (no vague instructions) +- Use clear, imperative language +- Include edge case handling if feedback identifies confusion +- Ensure the prompt is self-contained and unambiguous +- Add explicit constraints for format/output if needed + + + +- Analysis of what went wrong +- Explanations of your changes +- Meta-text like "Here's an improved version..." +- Anything other than the raw prompt text + + +Start of New Prompt:""" + + +class LLEGOEnhancedLLMClient(BaseLLMClient): + """ + Wrapper around BaseLLMClient that uses LLEGO for candidate generation. + + This wrapper detects when GEPA is asking for new prompt candidates + and routes those requests through LLEGO's genetic operators instead + of standard LLM generation. + """ + + def __init__( + self, + base_llm: BaseLLMClient, + llego_layer, + config=None, + verbose: bool = True + ): + """ + Initialize LLEGO-enhanced LLM client. + + Args: + base_llm: The underlying LLM client (VisionLLMClient, etc.) 
+ llego_layer: LLEGOIntegrationLayer instance + config: Optional OptimizationConfig for hybrid mode settings + verbose: Whether to log LLEGO operations + """ + self.base_llm = base_llm + self.llego = llego_layer + self.config = config + self.verbose = verbose + + # Get log level from config (default to INFO) + self.log_level = getattr(config, 'log_level', 'INFO') if config else 'INFO' + + # Track context for detecting reflection calls + self.reflection_context = { + 'current_prompt': None, + 'feedback': None, + 'in_reflection': False + } + + # Queue for hybrid mode candidates (GEPA will call generate() multiple times) + self._candidate_queue = [] + self._hybrid_generation_complete = False + + # ๐Ÿ”ฅ CRITICAL: Queue for adapter-generated candidates (from make_reflective_dataset) + # When adapter generates candidates at adapter level, they're stored here + # GEPA will call generate() for proposals, and we'll return these candidates + self._adapter_generated_candidates = [] + + + # ๐Ÿ”ฅ FORMAT AWARENESS: Store format info from adapter for use in candidate generation + self._detected_format = None # Will be set by adapter after format detection + + # FIX #5: Circuit breaker for LLEGO failures + self._llego_failures = 0 + self._llego_disabled = False + self._llego_failure_threshold = 3 # Disable after 3 consecutive failures + + logger.info("๐Ÿงฌ LLEGO-Enhanced LLM Client initialized") + logger.info(f" Base LLM: {base_llm.__class__.__name__}") + logger.info(f" LLEGO enabled: {llego_layer is not None}") + if config and hasattr(config, 'enable_gepa_reflection_with_llego'): + logger.info(f" Hybrid mode: {config.enable_gepa_reflection_with_llego}") + logger.debug(f" Log level: {self.log_level}") + + def _should_log_debug(self) -> bool: + """ + Check if DEBUG logging is enabled. 
    def _extract_clean_prompt_from_reflection(self, reflection_output: str) -> str:
        """
        Defensive fallback: extract a clean prompt if the LLM wrapped it in
        analysis text despite the system-prompt instructions.

        The system prompt already tells the LLM to output ONLY the prompt
        text; this cascade is a safety net for outputs like
        "Based on the performance analysis... ### Revised Prompt Example:
        [PROMPT] ### Conclusion...".

        Extraction order:
          1. Heading-style markers ("Revised/Optimized/New Prompt:").
          2. A substantial block starting with "You are".
          3. Pass-through when the output is short and free of analysis
             keywords (already looks like a clean prompt).
          4. A looser "You are" block (may still contain analysis).
          5. Original output, stripped, with loud warnings.

        Args:
            reflection_output: Full reflection output (ideally already a
                clean prompt, but may contain surrounding analysis).

        Returns:
            str: The extracted prompt, or the original output when no
            extraction succeeds (non-string inputs are returned unchanged).
        """
        if not reflection_output or not isinstance(reflection_output, str):
            return reflection_output

        # Pattern 1: Look for "Revised Prompt Example:" or "### Revised Prompt Example:"
        # and similar heading variants; capture runs until the next heading/divider.
        patterns = [
            r'(?:###\s*)?Revised\s+Prompt\s+(?:Example|:)?\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
            r'(?:###\s*)?Revised\s+Prompt\s*:\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
            r'(?:###\s*)?Optimized\s+Prompt\s*:\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
            r'(?:###\s*)?New\s+Prompt\s*:\s*\n(.*?)(?:\n###|\n##|\n---|\Z)',
            r'(?:Here\s+is|Here\'s)\s+a?\s*refined?\s+(?:version\s+of\s+)?(?:the\s+)?prompt\s*[:\n](.*?)(?:\n###|\n##|\n---|\Z)',
        ]

        for pattern in patterns:
            match = re.search(pattern, reflection_output, re.IGNORECASE | re.DOTALL)
            if match:
                extracted = match.group(1).strip()
                # Clean up common artifacts (markdown code fences around the prompt)
                extracted = re.sub(r'^```(?:plaintext|markdown|text)?\s*\n', '', extracted, flags=re.MULTILINE)
                extracted = re.sub(r'\n```\s*$', '', extracted, flags=re.MULTILINE)
                extracted = extracted.strip()

                if len(extracted) > 50:  # Reasonable minimum length for a prompt
                    logger.debug(f"✅ Extracted clean prompt using pattern: {pattern[:50]}...")
                    logger.debug(f"   Original length: {len(reflection_output)} chars")
                    logger.debug(f"   Extracted length: {len(extracted)} chars")
                    return extracted

        # Pattern 2: If output starts with a quote or prompt-like structure.
        # Look for text that starts with "You are..." and is substantial.
        if 'You are' in reflection_output:
            # Find the longest continuous block that starts with "You are"
            prompt_match = re.search(r'(You are[^#]*?)(?:\n###|\n##|###|##|Conclusion|\Z)',
                                    reflection_output, re.IGNORECASE | re.DOTALL)
            if prompt_match:
                extracted = prompt_match.group(1).strip()
                if len(extracted) > 50:
                    logger.debug(f"✅ Extracted prompt starting with 'You are...'")
                    return extracted

        # Pattern 3: If the reflection output is actually just a clean prompt
        # (no analysis). Heuristic: short output with no analysis keywords.
        analysis_keywords = ['recommendation', 'suggestion', 'improvement', 'conclusion',
                            'optimization', 'analysis', 'feedback']
        if (len(reflection_output) < 2000 and
            not any(keyword in reflection_output.lower() for keyword in analysis_keywords)):
            # Likely a clean prompt, return as-is
            logger.debug(f"✅ Reflection output appears to be a clean prompt (no analysis detected)")
            return reflection_output.strip()

        # Fallback: Try to extract ANY valid prompt-like text.
        # Looser "You are" match: stops at a paragraph break, heading, or
        # a sentence leading into commentary ("Here/This/These/The above").
        if 'You are' in reflection_output:
            potential_prompt = re.search(
                r'(You are(?:[^\.]|\.(?!\s*(?:Here|This|These|The above)))*?)(?:\n\n|\n###|Conclusion|\Z)',
                reflection_output,
                re.IGNORECASE | re.DOTALL
            )
            if potential_prompt and len(potential_prompt.group(1)) > 100:
                extracted = potential_prompt.group(1).strip()
                logger.warning(f"⚠️ Could not extract clean prompt using standard patterns")
                logger.warning(f"   Falling back to 'You are...' block (length: {len(extracted)} chars)")
                logger.warning(f"   This may still contain some analysis text")
                return extracted

        # Final fallback: If still nothing, return original but log strongly
        logger.warning(f"⚠️ Could not extract clean prompt from reflection output")
        logger.warning(f"   Output length: {len(reflection_output)} chars")
        logger.warning(f"   Output preview: {reflection_output[:200]}...")
        logger.warning(f"   ⚠️ WARNING: Returning original output (may contain analysis text or be invalid)")
        logger.warning(f"   This candidate may perform poorly - consider improving extraction logic")
        return reflection_output.strip()
```) + - Extra text before/after JSON + - Trailing commas + - Comments in JSON + - Newlines in strings + """ + import json + import re + + if not response_text or not isinstance(response_text, str): + raise ValueError("Empty or invalid response text") + + # ๐Ÿ”ฅ PREPROCESSING: Clean LLM output + cleaned = response_text.strip() + + # Remove BOM and invisible chars + cleaned = cleaned.lstrip('\ufeff\u200b\u200c\u200d') + + # Strategy 0: Handle Python dict syntax (single quotes -> double quotes) + # LLMs sometimes return Python dict syntax {'key': 'value'} instead of JSON {"key": "value"} + if "'variations'" in cleaned or (cleaned.startswith("{'") or cleaned.startswith("{'variations'")): + try: + import ast + # Try to parse as Python literal (handles single quotes, True/False, None) + python_dict = ast.literal_eval(cleaned) + if isinstance(python_dict, dict) and 'variations' in python_dict: + # Convert to JSON-compatible format + json_str = json.dumps(python_dict) + data = json.loads(json_str) + if 'variations' in data: + # #region agent log + import json as _json_debug + import time as _time_debug + import os as _os_debug + _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log" + _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True) + with open(_debug_log_path, "a") as _f: + _f.write(_json_debug.dumps({"hypothesisId": "JSON_FIX", "location": "llego_enhanced_llm.py:python_dict_parse", "message": "Successfully parsed Python dict syntax", "data": {"num_expected": num_expected, "parsed_variations": len(data.get('variations', []))}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n") + # #endregion + return self._extract_variations_from_json(data, num_expected) + except (ValueError, SyntaxError, TypeError) as e: + # If ast.literal_eval fails, try string replacement as fallback + try: + # Simple conversion: replace single quotes with double quotes (with escaping) + # This is a heuristic and 
may not work for all cases + converted = cleaned.replace("'", '"') + data = json.loads(converted) + if 'variations' in data: + # #region agent log + import json as _json_debug + import time as _time_debug + import os as _os_debug + _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log" + _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True) + with open(_debug_log_path, "a") as _f: + _f.write(_json_debug.dumps({"hypothesisId": "JSON_FIX", "location": "llego_enhanced_llm.py:python_dict_string_replace", "message": "Parsed Python dict via string replacement", "data": {"num_expected": num_expected, "parsed_variations": len(data.get('variations', []))}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n") + # #endregion + return self._extract_variations_from_json(data, num_expected) + except json.JSONDecodeError: + pass + + # Strategy 1: Direct JSON parse (cleanest case) + try: + data = json.loads(cleaned) + if 'variations' in data: + return self._extract_variations_from_json(data, num_expected) + except json.JSONDecodeError: + pass + + # Strategy 2: Extract from markdown code block + # More permissive regex that handles various formats + code_block_patterns = [ + r'```(?:json|JSON)?\s*(\{[\s\S]*?\})\s*```', # Standard markdown + r'```\s*(\{[\s\S]*"variations"[\s\S]*\})\s*```', # With "variations" keyword + ] + + for pattern in code_block_patterns: + json_match = re.search(pattern, cleaned) + if json_match: + json_str = json_match.group(1) + try: + data = json.loads(json_str) + if 'variations' in data: + return self._extract_variations_from_json(data, num_expected) + except json.JSONDecodeError: + # Try repair + repaired = self._repair_json_string(json_str) + try: + data = json.loads(repaired) + if 'variations' in data: + return self._extract_variations_from_json(data, num_expected) + except json.JSONDecodeError: + pass + + # Strategy 3: Balanced brace extraction (handles nested objects) + 
json_str = self._extract_balanced_json(cleaned) + if json_str: + try: + data = json.loads(json_str) + if 'variations' in data: + return self._extract_variations_from_json(data, num_expected) + except json.JSONDecodeError: + repaired = self._repair_json_string(json_str) + try: + data = json.loads(repaired) + if 'variations' in data: + return self._extract_variations_from_json(data, num_expected) + except json.JSONDecodeError: + pass + + # Strategy 4: Find JSON object with "variations" keyword + # Use greedy matching to get the full object + json_match = re.search(r'(\{[\s\S]*"variations"[\s\S]*\})', cleaned) + if json_match: + json_str = json_match.group(1) + # Find the balanced JSON within + balanced = self._extract_balanced_json(json_str) + if balanced: + try: + data = json.loads(balanced) + if 'variations' in data: + return self._extract_variations_from_json(data, num_expected) + except json.JSONDecodeError: + repaired = self._repair_json_string(balanced) + try: + data = json.loads(repaired) + if 'variations' in data: + return self._extract_variations_from_json(data, num_expected) + except json.JSONDecodeError: + pass + + # Strategy 5: Fallback to numbered sections + logger.warning(f"JSON parsing failed, trying numbered section fallback...") + try: + return self._parse_numbered_section_variations(response_text, num_expected) + except ValueError: + pass + + # #region agent log + import json as _json_debug + import time as _time_debug + _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log" + with open(_debug_log_path, "a") as _f: + _f.write(_json_debug.dumps({"hypothesisId": "D", "location": "llego_enhanced_llm.py:json_parse_fail", "message": "JSON parsing failed completely", "data": {"num_expected": num_expected, "response_preview": response_text[:500] if response_text else "EMPTY", "response_length": len(response_text) if response_text else 0}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n") + # 
#endregion + + raise ValueError(f"Could not parse {num_expected} variations from response") + + def _extract_balanced_json(self, text: str) -> Optional[str]: + """Extract JSON with balanced braces.""" + brace_count = 0 + start_idx = -1 + in_string = False + escape_next = False + + for i, char in enumerate(text): + # Handle string escaping + if escape_next: + escape_next = False + continue + if char == '\\' and in_string: + escape_next = True + continue + if char == '"' and not escape_next: + in_string = not in_string + continue + + # Skip characters inside strings + if in_string: + continue + + if char == '{': + if brace_count == 0: + start_idx = i + brace_count += 1 + elif char == '}': + brace_count -= 1 + if brace_count == 0 and start_idx >= 0: + return text[start_idx:i+1] + + return None + + def _repair_json_string(self, json_str: str) -> str: + """ + Repair common JSON issues from LLM output. + + Fixes: + - Trailing commas + - Comments + - Unescaped newlines in strings + """ + repaired = json_str + + # Remove trailing commas before } or ] + repaired = re.sub(r',\s*}', '}', repaired) + repaired = re.sub(r',\s*]', ']', repaired) + + # Remove single-line comments + repaired = re.sub(r'//[^\n]*\n', '\n', repaired) + + # Remove multi-line comments + repaired = re.sub(r'/\*[\s\S]*?\*/', '', repaired) + + return repaired + + def _extract_variations_from_json(self, data: Dict[str, Any], num_expected: int) -> List[str]: + """Extract and validate variations from parsed JSON data.""" + + if not isinstance(data, dict): + raise ValueError("JSON data is not a dictionary") + + variations_list = data.get('variations', []) + if not isinstance(variations_list, list): + raise ValueError("'variations' field is not a list") + + # Extract and sort by index + variations_with_index = [] + for var in variations_list: + if not isinstance(var, dict): + continue + index = var.get('index', 0) + prompt = var.get('prompt', '') + if prompt and isinstance(prompt, str): + 
                variations_with_index.append((index, prompt.strip()))

        # Restore the LLM's intended ordering via the explicit 'index' field.
        variations_with_index.sort(key=lambda x: x[0])
        variations = [v[1] for v in variations_with_index]

        # Validate count
        if len(variations) < num_expected:
            logger.warning(f"Only {len(variations)} valid variations found, expected {num_expected}")
            # Pad by duplicating the last valid variation (or "" if none) so
            # callers always receive exactly num_expected entries.
            while len(variations) < num_expected:
                variations.append(variations[-1] if variations else "")

        variations = variations[:num_expected]

        # An empty entry means padding started from nothing — fail loudly.
        if not all(v for v in variations):
            raise ValueError(f"Some variations are empty after parsing")

        return variations

    def _parse_numbered_section_variations(self, response_text: str, num_expected: int) -> List[str]:
        """Fallback parser: Extract variations from numbered sections.

        Tries three layouts in order of specificity:
        ``--- VARIATION N ---`` headers, ``Variation N:`` headers, and bare
        ``N.`` numbered sections. Raises ValueError if none yields enough
        variations.
        """
        import re  # NOTE(review): redundant — re is already used module-wide; harmless.

        variations = []

        pattern1 = r'---\s*VARIATION\s+(\d+)\s*---\s*\n(.*?)(?=\n---\s*VARIATION|\Z)'
        matches1 = re.findall(pattern1, response_text, re.DOTALL | re.IGNORECASE)

        pattern2 = r'Variation\s+(\d+)\s*:?\s*\n(.*?)(?=\nVariation\s+\d+|$)'
        matches2 = re.findall(pattern2, response_text, re.DOTALL | re.IGNORECASE)

        pattern3 = r'(\d+)\.\s*\n(.*?)(?=\n\d+\.|$)'
        matches3 = re.findall(pattern3, response_text, re.DOTALL)

        # Prefer the most specific layout that produced enough matches.
        matches = matches1 if len(matches1) >= num_expected else (matches2 if len(matches2) >= num_expected else matches3)

        if len(matches) >= num_expected:
            # Sort by the captured section number, then keep the first N bodies.
            matches.sort(key=lambda x: int(x[0]))
            variations = [match[1].strip() for match in matches[:num_expected]]

        if len(variations) != num_expected:
            raise ValueError(f"Numbered section parsing found {len(variations)} variations, expected {num_expected}")

        return variations

    def _is_valid_prompt(self, prompt: str) -> bool:
        """
        Validate that extracted text is actually a valid system prompt.

        Uses minimal, conservative filtering: only rejects OBVIOUSLY wrong text.
        Let evaluation decide on quality - false negatives (rejecting good prompts)
        are worse than false positives (accepting bad prompts).

        Args:
            prompt: Extracted text to validate

        Returns:
            True if appears to be a valid prompt, False if obviously wrong
        """
        # Empty/whitespace-only text can never be a usable prompt.
        if not prompt or not prompt.strip():
            return False

        prompt_lower = prompt.lower().strip()

        # STRONG indicators of analysis text (high confidence rejection)
        # These are phrases that almost never appear in actual prompts
        strong_analysis_patterns = [
            'in conclusion',
            'to summarize',
            'based on the analysis',
            'the analysis shows',
            'here are some suggestions',
            'it seems you\'re looking for',
        ]

        # Check first 200 characters for strong patterns
        first_200 = prompt_lower[:200]
        for pattern in strong_analysis_patterns:
            if pattern in first_200:
                if self._should_log_debug():
                    logger.debug(f"Rejected prompt: contains analysis pattern '{pattern}'")
                return False

        # POSITIVE indicators of valid prompt (high confidence acceptance)
        # These are common prompt starters
        valid_starters = [
            'you are',
            'you\'re',
            'your task',
            'your role',
            'analyze',
            'identify',
            'select',
            'determine',
            'given',
            'when',
        ]

        # If starts with valid prompt pattern, accept immediately
        first_100 = prompt_lower[:100]
        if any(first_100.startswith(starter) for starter in valid_starters):
            return True

        # DEFAULT: Accept everything else and let evaluation decide
        # This is conservative - we'd rather evaluate a bad prompt than reject a good one
        return True

    def set_reflection_context(
        self,
        current_prompt: Optional[str] = None,
        feedback: Optional[Any] = None,
        in_reflection: bool = False
    ) -> None:
        """
        Set context for the next generate() call.

        Args:
            current_prompt: The prompt being reflected upon
            feedback: Evaluation feedback
            in_reflection: Whether we're in reflection mode
        """
        self.reflection_context = {
            'current_prompt': current_prompt,
            'feedback': feedback,
            'in_reflection': in_reflection
        }

        # Reset candidate queue when entering new reflection phase
        if in_reflection:
            self._candidate_queue = []
            self._hybrid_generation_complete = False
            if self._should_log_debug():
                logger.debug("๐Ÿ”„ Entering LLEGO reflection mode (queue reset)")
            else:
                logger.info("๐Ÿ”„ Entering LLEGO reflection mode")

    def generate(
        self,
        system_prompt: str = "",
        user_prompt: str = "",
        image_base64: str = "",
        **kwargs
    ) -> Dict[str, Any]:
        """
        Generate response, using LLEGO for reflection calls.

        ๐Ÿ”ฅ CRITICAL: This method intercepts ALL LLM calls. For candidate generation,
        it checks if we have pre-generated candidates from hybrid mode and returns those.

        Args:
            system_prompt: System prompt
            user_prompt: User prompt
            image_base64: Base64-encoded image (if any)
            **kwargs: Additional arguments

        Returns:
            Dict with 'content' key containing the generated text
        """
        # ๐Ÿ” DEBUG: Log generate calls (full details at DEBUG level)
        if self._should_log_debug():
            logger.debug(f"๐Ÿ” LLEGO Wrapper: generate() called")
            logger.debug(f"   system_prompt: '{system_prompt[:100]}...' (truncated)")
            logger.debug(f"   user_prompt length: {len(user_prompt)} chars")
            logger.debug(f"   in_reflection: {self.reflection_context['in_reflection']}")
            logger.debug(f"   has_image: {bool(image_base64)}")

        # #region agent log
        # NOTE(review): leftover debug instrumentation writing to a hardcoded
        # developer-machine path — should be removed or made configurable.
        try:
            import json as _json_debug
            import time as _time_debug
            import os as _os_debug
            _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
            _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
            with open(_debug_log_path, "a") as _f:
                _f.write(_json_debug.dumps({"hypothesisId": "INTERCEPTION", "location": "llego_enhanced_llm.py:generate", "message": "Generate called", "data": {"system_prompt_len": len(system_prompt), "user_prompt_len": len(user_prompt), "has_image": bool(image_base64), "has_candidates": len(getattr(self, '_adapter_generated_candidates', [])), "in_reflection": self.reflection_context.get('in_reflection', False)}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
        except Exception:
            pass
        # #endregion

        # ๐Ÿ”ฅ CRITICAL: Check if we have pre-generated candidates from adapter-level generation
        # This happens when GEPA calls adapter.llm_client to generate candidates
        # We intercept and return our pre-generated candidates instead
        # ๐Ÿ”ฅ NEW: Select BEST candidate instead of FIFO
        # ๐Ÿ”ฅ FIX: DON'T intercept evaluation calls (those have images!)
        # Only intercept proposal calls (no images, just asking for new candidate)
        # ๐Ÿ”ฅ FIX 2: DON'T intercept TEST EVALUATION calls!
        # Test evaluation has no images but uses the OPTIMIZED prompt to execute tasks
        # We detect test evaluation by checking if this is a TASK EXECUTION call (not reflection)
        is_task_execution = (
            # Task execution prompts contain task instructions, not optimization requests
            not any(kw in system_prompt.lower() for kw in ['evolutionary', 'mutation', 'variation', 'optimize', 'improve prompt', 'rewrite', 'generate variations']) and
            # Short prompts are usually task prompts, not optimization prompts
            len(system_prompt) < 1000 and
            # User prompt is the actual task input (short), not feedback (long)
            len(user_prompt) < 2000
        )

        # Log task execution detection for debugging
        if is_task_execution and hasattr(self, '_adapter_generated_candidates') and self._adapter_generated_candidates:
            logger.info(f"๐Ÿ”’ NOT intercepting: Task execution detected (not optimization)")
            logger.debug(f"   system_prompt_len={len(system_prompt)}, user_prompt_len={len(user_prompt)}")

        if hasattr(self, '_adapter_generated_candidates') and self._adapter_generated_candidates and not image_base64 and not is_task_execution:
            # ๐Ÿ”ฅ BEST-CANDIDATE SELECTION: Find candidate with highest Dpareto score
            # This ensures we use the best candidate for the current iteration
            best_candidate = None
            best_score = -float('inf')
            best_idx = -1

            # Check if candidates have scores stored
            for idx, cand in enumerate(self._adapter_generated_candidates):
                if isinstance(cand, dict):
                    # Try to get score from candidate dict
                    score = cand.get('score', -float('inf'))

                    # If score not in dict, try to get from Pareto logger
                    # NOTE(review): import inside the loop — could be hoisted
                    # above the loop (module caches make it cheap, but it is
                    # needless repeated work).
                    if score == -float('inf'):
                        from ..utils.pareto_logger import get_pareto_logger
                        pareto_log = get_pareto_logger()

                        # Look up score in Pareto front or evaluated candidates
                        # by comparing quote-stripped, whitespace-trimmed text.
                        cand_prompt = cand.get('prompt', '')
                        if cand_prompt:
                            normalized = cand_prompt.strip().strip('"\'')
                            # Check in Pareto front
                            for front_cand in pareto_log.pareto_front:
                                if front_cand.get('prompt', '').strip().strip('"\'') == normalized:
                                    score = front_cand.get('score', -float('inf'))
                                    break

                            # If not in front, check evaluated candidates
                            if score == -float('inf'):
                                for eval_cand in pareto_log.candidates_evaluated:
                                    if eval_cand.get('prompt', '').strip().strip('"\'') == normalized:
                                        score = eval_cand.get('score', -float('inf'))
                                        break

                    if score > best_score:
                        best_score = score
                        best_candidate = cand
                        best_idx = idx

            # If no scores found, fall back to FIFO (first candidate)
            # (also covers plain-string candidates, which the loop above skips)
            if best_candidate is None and self._adapter_generated_candidates:
                best_candidate = self._adapter_generated_candidates[0]
                best_idx = 0
                logger.info(f"โš ๏ธ No scores found for candidates - using FIFO selection")

            # Remove selected candidate from queue
            if best_idx >= 0:
                self._adapter_generated_candidates.pop(best_idx)

            # Important event - keep at INFO
            if best_score > -float('inf'):
                logger.info(f"๐ŸŽฏ INTERCEPTING GEPA PROPOSAL CALL - Returning BEST candidate (score: {best_score:.4f})!")
                logger.info(f"๐ŸŽฏ Remaining candidates: {len(self._adapter_generated_candidates)}")
            else:
                logger.info(f"๐ŸŽฏ INTERCEPTING GEPA PROPOSAL CALL - Returning pre-generated candidate!")
                logger.info(f"๐ŸŽฏ Remaining candidates: {len(self._adapter_generated_candidates)}")

            if isinstance(best_candidate, dict) and 'prompt' in best_candidate:
                prompt = best_candidate['prompt']

                # Detailed logging only in DEBUG mode
                if self._should_log_debug():
                    logger.debug(f"โœ… Pre-generated candidate details:")
                    logger.debug(f"{'โ–“'*80}")
                    logger.debug(f"{prompt}")
                    logger.debug(f"{'โ–“'*80}")
                else:
                    source = best_candidate.get('source', 'unknown')
                    score_info = f" (score: {best_score:.4f})" if best_score > -float('inf') else ""
                    logger.info(f"โœ… Candidate length: {len(prompt)} chars, Source: {source}{score_info}")

                return {'content': prompt, 'source': best_candidate.get('source', 'adapter_generated')}
            elif isinstance(best_candidate, str):
                if self._should_log_debug():
                    logger.debug(f"โœ… Pre-generated candidate (string format):")
                    logger.debug(f"{'โ–“'*80}")
                    logger.debug(f"{best_candidate}")
                    logger.debug(f"{'โ–“'*80}")
                else:
                    logger.info(f"โœ… Candidate length: {len(best_candidate)} chars")
                return {'content': best_candidate, 'source': 'adapter_generated'}

        # ๐Ÿ”ฅ ENHANCED CALL TYPE DETECTION
        # We need to distinguish between 4 types of calls:
        # 1. Evaluation calls: Image + task command โ†’ identify element (pass through)
        # 2. Judge calls: Image + "prompt engineer" โ†’ analyze feedback (pass through)
        # 3. Proposal calls: No image + feedback โ†’ generate candidate (intercept)
        # 4. JSON batch calls: JSON generation request (pass through)

        # FIX: DON'T intercept JSON batch generation calls
        is_json_batch_request = (
            '"variations"' in system_prompt or
            'MUST BE VALID JSON' in system_prompt or
            'Output ONLY the JSON object' in system_prompt or
            '```json' in system_prompt.lower()
        )

        # FIX: DON'T intercept LLM-as-Judge calls (they analyze feedback with images)
        is_judge_call = (
            'prompt engineer' in system_prompt.lower() or
            'analyzing mobile ui automation' in system_prompt.lower() or
            'expert prompt engineer' in system_prompt.lower() or
            ('analyze' in system_prompt.lower() and 'screenshot with numbered bounding boxes' in system_prompt.lower() and image_base64)
        )

        # Check if this is a reflection call (GEPA asking for new candidate)
        is_reflection_call = (
            self.reflection_context['in_reflection'] or
            self._detect_reflection_call(system_prompt, user_prompt)
        )

        # Proposal calls are reflection calls WITHOUT images and NOT judge/JSON calls
        # These are the calls we want to intercept with LLEGO
        is_proposal_call = (
            not is_json_batch_request and  # Not a JSON generation request
            not is_judge_call and  # Not an LLM-as-Judge analysis
            not image_base64 and  # No image = not an evaluation/judge call
            (
                is_reflection_call or
                'improve' in system_prompt.lower() or
                'optimize' in system_prompt.lower() or
                'suggest' in system_prompt.lower() or
                'feedback' in system_prompt.lower() or
                'reflection' in system_prompt.lower()
            ) and
            len(user_prompt) > 100  # Proposal calls have substantial feedback
        )

        # Detailed call detection logging only in DEBUG mode
        if self._should_log_debug():
            logger.debug(f"   is_json_batch_request: {is_json_batch_request}")
            logger.debug(f"   is_judge_call: {is_judge_call}")
            logger.debug(f"   is_reflection_call: {is_reflection_call}")
            logger.debug(f"   is_proposal_call: {is_proposal_call}")
            logger.debug(f"   has_image: {bool(image_base64)}")
            logger.debug(f"   has_llego: {self.llego is not None}")

        # Only intercept proposal calls (not judge, not evaluation, not JSON)
        if is_proposal_call and self.llego:
            # FIX #5: Check if LLEGO is disabled due to repeated failures
            if self._llego_disabled:
                logger.warning("โš ๏ธ LLEGO is disabled (circuit breaker), using base LLM")
                return self.base_llm.generate(
                    system_prompt=system_prompt,
                    user_prompt=user_prompt,
                    image_base64=image_base64,
                    **kwargs
                )

            # Important event - keep at INFO
            logger.info("๐Ÿ”ฅ INTERCEPTING REFLECTION/PROPOSAL CALL FOR CANDIDATE GENERATION")
            return self._llego_generate(system_prompt, user_prompt, image_base64=image_base64, **kwargs)
        else:
            # Standard LLM call (for evaluation, not reflection)
            if self._should_log_debug():
                logger.debug("   โ†’ Standard LLM call (evaluation, not reflection)")
            return self.base_llm.generate(
                system_prompt=system_prompt,
                user_prompt=user_prompt,
                image_base64=image_base64,
                **kwargs
            )

    def _clean_reflection_feedback(self, feedback_text: str, max_length: int = 50000) -> str:
        """
        Clean reflection feedback by removing base64 images and truncating.

        ๐Ÿ”ฅ CRITICAL: GEPA's feedback can include massive base64 images (7MB+).
        This function removes them and keeps feedback concise.

        Args:
            feedback_text: Original feedback (may contain base64)
            max_length: Maximum length after cleaning (default: 50K chars)

        Returns:
            Cleaned feedback without base64, within size limits
        """
        if not feedback_text:
            return feedback_text

        # Step 1: Remove very long base64-like sequences (5K+ chars of alphanumeric)
        base64_pattern = r'[A-Za-z0-9+/=]{5000,}'
        cleaned = re.sub(base64_pattern, '[IMAGE_DATA_REMOVED]', feedback_text)

        # Step 2: Remove explicit image_base64 references and their values
        cleaned = re.sub(r'image_base64["\']?\s*[:=]\s*["\']?[A-Za-z0-9+/=]+["\']?',
                        'image_base64: [REMOVED]', cleaned, flags=re.IGNORECASE)

        # Step 3: Remove detailed_scores sections that might contain base64
        cleaned = re.sub(r'##\s+detailed_scores[^\n]*\n[^#]*(?:image_base64|base64)[^\n]*(?:\n[^#]*)*',
                        '## detailed_scores: [REMOVED_FOR_BREVITY]', cleaned, flags=re.IGNORECASE | re.MULTILINE)

        # Step 4: Remove any remaining very long strings (likely base64)
        cleaned = re.sub(r'"[A-Za-z0-9+/=]{10000,}"', '[LARGE_DATA_STRING_REMOVED]', cleaned)

        # Step 5: Truncate if still too long (keep beginning which has most important info)
        if len(cleaned) > max_length:
            truncated_size = len(cleaned) - max_length
            cleaned = cleaned[:max_length] + f"\n\n[TRUNCATED {truncated_size} characters - keeping essential feedback only]"
            logger.warning(f"โš ๏ธ Reflection feedback truncated: {len(feedback_text)} โ†’ {len(cleaned)} chars")

        return cleaned

    def _detect_reflection_call(self, system_prompt: str, user_prompt: str) -> bool:
        """
        Heuristic to detect if this is a reflection call from GEPA.

        GEPA's reflection calls typically contain feedback/error analysis.
        """
        # Keyword sniff over both prompts; deliberately broad — generate()
        # layers further checks (image presence, prompt lengths) on top.
        reflection_keywords = [
            'improve', 'feedback', 'error', 'failure', 'reflection',
            'better prompt', 'modify', 'enhance', 'optimize'
        ]

        combined = (system_prompt + " " + user_prompt).lower()
        return any(keyword in combined for keyword in reflection_keywords)

    def _llego_generate(
        self,
        system_prompt: str,
        user_prompt: str,
        image_base64: str = "",
        **kwargs
    ) -> Dict[str, Any]:
        """
        Use LLEGO (or Hybrid mode) to generate new prompt candidates.

        Args:
            system_prompt: System prompt
            user_prompt: User prompt (contains reflection feedback)
            image_base64: Image data (for reflection, always empty)
            **kwargs: Additional arguments (may contain image_base64, will be removed)

        Returns:
            Dict with 'content' key containing a new prompt candidate
        """
        try:
            # ๐Ÿ”ฅ CRITICAL: Remove image_base64 from kwargs to avoid duplicate argument error
            kwargs.pop('image_base64', None)  # Remove if present to avoid conflict

            # ๐Ÿ”ฅ HYBRID MODE: Generate from BOTH GEPA reflection AND LLEGO
            if (self.config and
                hasattr(self.config, 'enable_gepa_reflection_with_llego') and
                self.config.enable_gepa_reflection_with_llego):

                return self._hybrid_generate(system_prompt, user_prompt, image_base64=image_base64, **kwargs)

            # STANDARD LLEGO MODE (LLEGO only)
            return self._llego_only_generate(system_prompt, user_prompt, image_base64=image_base64, **kwargs)

        except Exception as e:
            # FIX #5: Circuit breaker - track failures and disable LLEGO if needed
            # NOTE(review): counter is incremented on every failure but never
            # reset on success in the visible code — confirm whether the
            # "consecutive" wording in the log matches actual behavior.
            self._llego_failures += 1

            logger.error(f"โŒ LLEGO generation failed ({self._llego_failures}/{self._llego_failure_threshold}): {e}")
            logger.error("โš ๏ธ Falling back to base LLM")

            if self._llego_failures >= self._llego_failure_threshold:
                self._llego_disabled = True
                logger.error(f"๐Ÿšซ LLEGO DISABLED - {self._llego_failures} consecutive failures detected")
                logger.error("   All future requests will use base LLM only")

            import traceback
            logger.debug(traceback.format_exc())

            # Fallback to base LLM - ensure image_base64 is not in kwargs
            kwargs.pop('image_base64', None)
            return self.base_llm.generate(
                system_prompt=system_prompt,
                user_prompt=user_prompt,
                image_base64=image_base64,
                **kwargs
            )

    def _hybrid_generate(
        self,
        system_prompt: str,
        user_prompt: str,
        image_base64: str = "",
        **kwargs
    ) -> Dict[str, Any]:
        """
        ๐Ÿ”ฅ HYBRID MODE: Generate candidates from BOTH GEPA reflection AND LLEGO operators.

        Smart Compensation Strategy:
        - When crossover can't run (< 2 parents), compensates with extra GEPA reflection
        - GEPA is smarter than mutation (uses semantic understanding of feedback)
        - Crossover only runs when we have 2+ scored parents to combine

        GEPA will call generate() multiple times. On first call, we generate all candidates
        and queue them. Subsequent calls return from the queue.
        """
        # If we already generated candidates, return next from queue
        if self._hybrid_generation_complete and self._candidate_queue:
            candidate = self._candidate_queue.pop(0)
            source = candidate.get('source', 'unknown')
            logger.info(f"๐Ÿ“ฆ Returning queued candidate (source: {source}, {len(self._candidate_queue)} remaining)")
            return {'content': candidate['prompt'], 'source': source}

        # First call: Generate ALL candidates
        from ..utils.clean_logger import get_clean_logger
        clean_log = get_clean_logger()

        all_candidates = []

        # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
        # PHASE 0: Check if crossover will be possible
        # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
        from ..utils.pareto_logger import get_pareto_logger
        pareto_log = get_pareto_logger()
        gepa_pareto_front = pareto_log.pareto_front

        # Determine if we need to compensate for crossover
        crossover_possible = len(gepa_pareto_front) >= 2
        n_crossover_config = self.config.n_crossover if hasattr(self.config, 'n_crossover') else 2
        crossover_compensation = 0 if crossover_possible else n_crossover_config

        if not crossover_possible:
            logger.info(f"โš ๏ธ Crossover NOT possible (have {len(gepa_pareto_front)} parents, need 2+)")
            logger.info(f"   โ†’ Smart compensation: +{crossover_compensation} extra GEPA reflection candidates")

        # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
        # PHASE 1: GEPA REFLECTION (Semantic Understanding)
        # More GEPA = better, it understands WHY things fail
        # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
        base_gepa_count = self.config.num_gepa_reflection_candidates if hasattr(self.config, 'num_gepa_reflection_candidates') else 3

        # ๐Ÿ”ฅ SMART COMPENSATION: More GEPA when crossover can't run
        num_gepa = base_gepa_count + crossover_compensation

        logger.info("โ”€" * 80)
        logger.info("PHASE 1: GEPA REFLECTION (Semantic Understanding)")
        if crossover_compensation > 0:
            logger.info(f"Generating {num_gepa} candidates ({base_gepa_count} base + {crossover_compensation} compensation for skipped crossover)")
        else:
            logger.info(f"Generating {num_gepa} candidates")
        logger.info("โ”€" * 80)

        # ๐Ÿ”ฅ OPTIMIZED: Single call with JSON format for multiple variations
        try:
            # Clean user_prompt before sending to LLM
            cleaned_user_prompt = self._clean_reflection_feedback(user_prompt)

            # Build diversity requirements based on num_gepa
            # NOTE(review): diversity_requirements is not referenced in the
            # visible remainder of this method — presumably interpolated into
            # the template below before section tags were lost; confirm.
            diversity_requirements = self._build_diversity_requirements(num_gepa)

            # ๐Ÿ”ฅ FORMAT AWARENESS: Get format constraint if available
            format_constraint = ""
            if self._detected_format and self._detected_format.get('format_constraint'):
                format_constraint = self._detected_format['format_constraint']
                logger.info(f"๐Ÿ“ Injecting format constraint into candidate generation")
                # #region agent log
                # NOTE(review): leftover debug instrumentation with a
                # hardcoded developer-machine path — remove before release.
                import json as _json_debug
                import time as _time_debug
                import os as _os_debug
                _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
                _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
                with open(_debug_log_path, "a") as _f:
                    _f.write(_json_debug.dumps({"hypothesisId": "FORMAT_CONSTRAINT", "location": "llego_enhanced_llm.py:format_injection", "message": "Format constraint injected", "data": {"format_type": self._detected_format.get('format_type', 'unknown'), "constraint_length": len(format_constraint), "avg_length": self._detected_format.get('avg_length', 0)}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
                # #endregion
            else:
                format_constraint = "No specific format detected - ensure output is CONCISE and matches expected examples."
                # #region agent log
                import json as _json_debug
                import time as _time_debug
                import os as _os_debug
                _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log"
                _os_debug.makedirs(_os_debug.path.dirname(_debug_log_path), exist_ok=True)
                with open(_debug_log_path, "a") as _f:
                    _f.write(_json_debug.dumps({"hypothesisId": "FORMAT_CONSTRAINT", "location": "llego_enhanced_llm.py:format_injection", "message": "No format constraint available", "data": {"has_detected_format": bool(self._detected_format)}, "timestamp": int(_time_debug.time() * 1000), "sessionId": "debug-session"}) + "\n")
                # #endregion

            # ๐Ÿ”ฅ EVOLUTIONARY PROMPT ENGINEER: Forces radically different mutations
            # Each variation MUST use a distinct genetic strategy to maximize search space
            # NOTE(review): this template appears to have lost XML-like section
            # tags in transit (blank runs below) — verify against the original
            # source before relying on the exact literal.
            optimization_system_prompt = f"""
You are an **Evolutionary Prompt Engineer**. Your task is to mutate a [FAILING_PROMPT] into a high-performance instruction set using genetic strategies.
You must generate {num_gepa} radically different prompt variations based on the [FAILURE_FEEDBACK].




{cleaned_user_prompt}




You MUST use a different strategy for each variation. Assign strategies in order:

1. **STRATEGY A: The Strict Auditor (Constraints)**
   - Focus: Add "Negative Constraints" (e.g., "Do NOT...", "NEVER...", "FORBIDDEN:").
   - Use strict XML tagging for the output schema.
   - Goal: Fix hallucinations and formatting errors.

2. **STRATEGY B: The Reasoning Expert (Chain of Thought)**
   - Focus: Add a "Reasoning Steps" section.
   - Instruct the model to "Think step-by-step" before generating the final output.
   - Goal: Fix logic errors and complex multi-step reasoning failures.

3. **STRATEGY C: The Few-Shot Teacher (Examples)**
   - Focus: Generate a *synthetic* example of Input -> Correct Output within the prompt.
   - Goal: Fix understanding of abstract concepts or strict schema requirements.

4. **STRATEGY D: The Role-Player (Persona)**
   - Focus: Change the persona to a hyper-specific expert (e.g., "Senior Data Engineer at Fortune 500" vs "Coder").
   - Add domain-specific vocabulary and expertise markers.
   - Goal: Fix domain-specific terminology errors.

5. **STRATEGY E: The Structure Architect (Format)**
   - Focus: Add explicit output schema with field-by-field instructions.
   - Use markdown or XML headers to organize the prompt.
   - Goal: Fix output structure and field naming errors.



1. **Self-Contained**: Each variation must be the FULL prompt text (100-500 words), ready to run.
2. **No Meta-Talk**: Do not explain your strategy inside the prompt. Just output the optimized prompt.
3. **Preserve Core Task**: Keep the original task/domain - only improve HOW it's described.
4. **JSON Output**: Follow the schema below exactly.
5. **ENFORCE OUTPUT FORMAT**: The generated prompt MUST instruct the model to output in the EXACT format shown in examples.



๐Ÿšจ THE GENERATED PROMPTS MUST INCLUDE EXPLICIT OUTPUT FORMAT INSTRUCTIONS!
Common failure: The model generates explanations/prose instead of the required concise format.

{format_constraint}

Your generated prompts MUST include:
- Explicit instruction to output ONLY in the required format
- "Do NOT explain", "No reasoning", "Output ONLY [format]" constraints
- Length constraint to prevent verbose responses



You MUST output ONLY valid JSON. No comments, no explanations, no markdown code blocks.

Generate exactly {num_gepa} variations in this exact format:

{{
    "variations": [
        {{
            "index": 1,
            "strategy": "Strict Auditor",
            "prompt": "[FULL PROMPT TEXT - Complete, self-contained, ready to use]"
        }},
        {{
            "index": 2,
            "strategy": "Reasoning Expert",
            "prompt": "[FULL PROMPT TEXT - Complete, self-contained, ready to use]"
        }}
    ]
}}

CRITICAL RULES:
1. Output ONLY the JSON object - no text before or after
2. Do NOT use markdown code blocks (no ```json)
3. Do NOT include comments (no // or /* */)
4. Ensure all strings are properly escaped
5. Generate exactly {num_gepa} variations
6. Each variation must have: index (number), strategy (string), prompt (string)

"""

            # Standard GEPA reflection call
            call_kwargs = {k: v for k, v in kwargs.items() if k != 'image_base64'}
            result = self.base_llm.generate(
                system_prompt=optimization_system_prompt,
                user_prompt=cleaned_user_prompt,
                image_base64=image_base64,
                **call_kwargs
            )

            if isinstance(result, dict):
                response_text = result.get("content", str(result))
            else:
                response_text = str(result)

            # Parse JSON variations
            gepa_variations = self._parse_json_variations(response_text, num_gepa)

            # Add all variations to candidates
            for idx, variation_prompt in enumerate(gepa_variations, 1):
                # ๐Ÿ›ก๏ธ DEFENSIVE FALLBACK: Extract clean prompt if LLM adds analysis
                gepa_candidate = self._extract_clean_prompt_from_reflection(variation_prompt)

                # Validate extracted prompt before adding
                if not self._is_valid_prompt(gepa_candidate):
                    logger.warning(f"   โš ๏ธ Variation {idx} appears invalid, skipping")
                    continue

                # ๐Ÿ” DIAGNOSTIC: Log candidate length to help diagnose scoring issues
                if self._should_log_debug():
                    logger.debug(f"   Candidate {idx} length: {len(gepa_candidate)} chars")
                    logger.debug(f"   Candidate {idx} preview: {gepa_candidate[:100]}...")

                all_candidates.append({
                    'prompt': gepa_candidate,
                    'source': 'gepa_reflection',
                    'index': idx
                })

                clean_log.log_gepa_reflection_candidate(idx, gepa_candidate)

            gepa_count = len(all_candidates)
            logger.info(f"โœ… GEPA Reflection: {gepa_count} candidates generated in single optimized call")

        except Exception as e:
            logger.error(f"โŒ Error generating GEPA reflection candidates: {e}")
            logger.warning(f"   Falling back to sequential generation...")
            import traceback
            logger.debug(traceback.format_exc())

            # Fallback: Sequential generation (when JSON parsing fails)
            gepa_count = self._fallback_sequential_gepa_generation(
                num_gepa, user_prompt, image_base64, kwargs, all_candidates, clean_log
            )

        if gepa_count > 0:
            logger.info(f"GEPA Reflection Complete: {gepa_count} candidates")

        # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
        # PHASE 2: LLEGO GENETIC OPERATORS
        # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
        logger.info("โ”€" * 80)
        logger.info("PHASE 2: LLEGO GENETIC OPERATORS")
        logger.info("โ”€" * 80)

        # Extract current prompt from context
        current_prompt = self.reflection_context.get('current_prompt', '')
        if not current_prompt:
            current_prompt = self._extract_prompt_from_feedback(user_prompt)

        if not current_prompt and self.llego.population:
            current_prompt = self.llego.population[0].prompt
            logger.info(f"   Using population prompt (length: {len(current_prompt)})")

        # Convert GEPA Pareto front to PromptCandidate format (already fetched in Phase 0)
        pareto_candidates = self.llego._convert_gepa_pareto_to_candidates(gepa_pareto_front)
        pareto_front = pareto_candidates

        logger.info(f"   Pareto front: {len(pareto_front)} candidates with scores")
        for idx, p in enumerate(pareto_front, 1):
            notation = p.metadata.get('notation', 'S') if p.metadata else 'S'
            logger.info(f"   {notation}: fitness={p.fitness:.3f}")

        # Create LLM callable for LLEGO genetic operations (crossover/mutation)
        call_kwargs = {k: v for k, v in kwargs.items() if k != 'image_base64'}

        # LLEGO genetic prompt with SAFETY LOCKS to prevent task drift
        # Directed mutations ensure prompts improve without losing core functionality
        # NOTE(review): like the Phase-1 template, section tags may have been
        # lost in transit — verify against the original source.
        genetic_operator_system_prompt = """
You are a **Prompt Mutation Engine**. Your input is a [PARENT_PROMPT]. Your output is a [MUTATED_CHILD].



Apply ONE of the following micro-mutations to improve the prompt:

1. **COMPRESS**: Remove fluff words ("please", "ensure to", "kindly"). Make it telegraphic and efficient.
2. **INTENSIFY**: Capitalize key constraints (e.g., "must return JSON" -> "**MUST** return **VALID JSON**").
3. **STRUCTURIZE**: Add markdown headers or XML tags to organize a messy prompt.
4. **CLARIFY**: Expand vague nouns (e.g., "code" -> "production-ready Python code with type hints").
5. **CONSTRAIN**: Add negative constraints ("Do NOT include explanations", "NEVER output markdown").



1. **IMMUTABLE CORE**: You MUST NOT change the core task (e.g., do not change "Extract JSON" to "Write a Summary").
2. **NO EXPLANATION**: Output ONLY the new prompt string. No meta-commentary.
3. **VALIDITY**: The output must remain a functional system prompt.
4. **LENGTH LIMIT**: Keep mutations within 20% of original length (no excessive expansion).
"""

        def llm_callable(genetic_prompt: str) -> str:
            # Adapter: LLEGO expects a plain text->text callable.
            result = self.base_llm.generate(
                system_prompt=genetic_operator_system_prompt,
                user_prompt=genetic_prompt,
                image_base64="",
                **call_kwargs
            )
            if isinstance(result, dict):
                return result.get('content', str(result))
            return str(result)

        # Generate LLEGO offspring (crossover will be skipped if < 2 parents)
        llego_prompts = self.llego.evolve_generation(
            llm=llm_callable,
            pareto_front=pareto_front
        )

        # Track actual crossover count from LLEGO (it tracks internally now)
        actual_crossover = getattr(self.llego, '_actual_crossover_count', 0)
        crossover_skipped = getattr(self.llego, '_crossover_skipped', False)

        crossover_idx = 1
        mutation_idx = 1

        # LLEGO returns crossover offspring first, then mutations.
        for i, prompt in enumerate(llego_prompts):
            if i < actual_crossover:
                source = 'llego_crossover'
                clean_log.log_llego_crossover_candidate(crossover_idx, prompt)
                crossover_idx += 1
            else:
                source = 'llego_mutation'
                clean_log.log_llego_mutation_candidate(mutation_idx, prompt)
                mutation_idx += 1

            all_candidates.append({
                'prompt': prompt,
                'source': source,
                'index': i + 1
            })

        mutation_count = len(llego_prompts) - actual_crossover
        logger.info(f"๐Ÿงฌ LLEGO: {actual_crossover} crossover + {mutation_count} mutation = {len(llego_prompts)} candidates")
        if crossover_skipped:
            logger.info(f"   (Crossover was skipped - compensated with extra GEPA reflection)")

        # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
        # SUMMARY
        # โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
        total_gepa = len([c for c in all_candidates if c.get('source') == 'gepa_reflection'])
        total_crossover = len([c for c in all_candidates if c.get('source') == 'llego_crossover'])
        total_mutation = len([c for c in all_candidates if c.get('source') == 'llego_mutation'])

        logger.info("โ”€" * 80)
        logger.info("CANDIDATE GENERATION SUMMARY")
        logger.info("โ”€" * 80)
        logger.info(f"   GEPA Reflection: {total_gepa} candidates (semantic understanding)")
        logger.info(f"   LLEGO Crossover: {total_crossover} candidates (combine best)")
        logger.info(f"   LLEGO Mutation: {total_mutation} candidates (exploration)")
        logger.info(f"   TOTAL: {len(all_candidates)} candidates")
        if crossover_skipped:
            logger.info(f"   ๐Ÿ“ Note: Crossover skipped (waiting for 2+ scored parents)")
        logger.info("โ”€" * 80)

        clean_log.log_candidate_generation_summary()

        # Store in queue (skip first one - return it now)
        self._candidate_queue = all_candidates[1:] if len(all_candidates) > 1 else []
        self._hybrid_generation_complete = True

        # Return first candidate
        if all_candidates:
            first = all_candidates[0]
            logger.info(f"๐Ÿ“ค Returning FIRST candidate (source: {first['source']})")
            return {'content': first['prompt'], 'source': first['source']}
        else:
            logger.error("โŒ No candidates generated!")
            return {'content': '', 'source': 'error'}

    def _llego_only_generate(
        self,
        system_prompt: str,
        user_prompt: str,
image_base64: str = "", + **kwargs + ) -> Dict[str, Any]: + """ + STANDARD LLEGO MODE: Generate candidates using only LLEGO operators. + """ + # ๐Ÿ”ฅ CRITICAL: Remove image_base64 from kwargs to avoid duplicate argument error + kwargs.pop('image_base64', None) + + # ๐Ÿ”ฅ FIX: Clean user_prompt if it contains feedback (might have base64) + cleaned_user_prompt = self._clean_reflection_feedback(user_prompt) + + # Extract current prompt from context or user_prompt + current_prompt = self.reflection_context.get('current_prompt', '') + + if not current_prompt: + # Try to extract from cleaned user_prompt + current_prompt = self._extract_prompt_from_feedback(cleaned_user_prompt) + + logger.info(f"๐Ÿงฌ LLEGO: Evolving prompt...") + if self._should_log_debug(): + logger.debug(f" Current prompt: '{current_prompt[:100]}...' (length: {len(current_prompt)} chars)") + else: + logger.info(f" Prompt length: {len(current_prompt)} chars") + + # ๐Ÿ”ฅ FIX 2: Get Pareto front from GEPA (not LLEGO population) + # This ensures LLEGO operators use true non-dominated solutions + from ..utils.pareto_logger import get_pareto_logger + pareto_log = get_pareto_logger() + gepa_pareto_front = pareto_log.pareto_front + + # Convert GEPA Pareto front to PromptCandidate format + pareto_candidates = self.llego._convert_gepa_pareto_to_candidates(gepa_pareto_front) + pareto_front = pareto_candidates + + logger.info(f" Using GEPA Pareto front (size: {len(gepa_pareto_front)})") + logger.info(f" Converted to {len(pareto_front)} PromptCandidate objects") + + # Create LLM callable for LLEGO genetic operations + # Uses Genetic Mutation Engine prompt for micro-mutations + call_kwargs = {k: v for k, v in kwargs.items() if k != 'image_base64'} + + genetic_system_prompt = """You are a **Genetic Mutation Engine** for Text Prompts. + + +Apply a specific micro-mutation to the provided prompt to increase its clarity, strictness, or effectiveness. + + + +1. 
**Compress**: Shorten verbose instructions without losing meaning. +2. **Expand**: Add detail to vague nouns (e.g., "code" -> "production-ready Python 3.10 code"). +3. **Emphasize**: Highlight CRITICAL constraints using caps, bold, or explicit markers. +4. **Constrain**: Add explicit boundaries (what NOT to do, format rules, length limits). +5. **Exemplify**: Add a brief example if the task is ambiguous. + + + +1. Output ONLY the mutated prompt text. +2. Do NOT change the core intent or task domain. +3. Do NOT add explanations or meta-commentary. +4. Apply ONE primary mutation type while preserving all existing strengths. +""" + + def llm_callable(prompt: str) -> str: + # Clean prompt before sending (might contain base64 if from feedback) + cleaned_prompt = self._clean_reflection_feedback(prompt) + result = self.base_llm.generate( + system_prompt=genetic_system_prompt, + user_prompt=cleaned_prompt, + image_base64="", # Always empty for LLEGO genetic operations + **call_kwargs + ) + if isinstance(result, dict): + return result.get('content', str(result)) + return str(result) + + # Generate offspring using LLEGO + new_prompts = self.llego.evolve_generation( + llm=llm_callable, + pareto_front=pareto_front + ) + + if new_prompts: + new_prompt = new_prompts[0] + logger.info(f"โœ… LLEGO generated new candidate (length: {len(new_prompt)} chars)") + + if self._should_log_debug(): + logger.debug(f" Full prompt:") + logger.debug(f" '{new_prompt}'") + + return { + 'content': new_prompt, + 'source': 'llego', + 'num_candidates': len(new_prompts) + } + else: + logger.warning("โš ๏ธ LLEGO returned no candidates, falling back to base LLM") + return self.base_llm.generate( + system_prompt=system_prompt, + user_prompt=user_prompt, + image_base64="", + **kwargs + ) + + def _build_diversity_requirements(self, num_gepa: int) -> str: + """ + Build diversity requirements using research-backed Prompt Design Patterns. 
+ + These are proven strategies from prompt engineering literature: + - Chain-of-Thought (CoT) + - Few-Shot Learning + - Negative Constraints + - Persona Pattern + + Args: + num_gepa: Number of GEPA variations to generate + + Returns: + String with diversity requirements for the optimization prompt + """ + # Research-backed Prompt Design Patterns that solve specific classes of problems + strategies = [ + """ + + **STRATEGY: COGNITIVE DECOMPOSITION (Chain-of-Thought)** + - **Goal**: Fixes logic/reasoning errors. + - **Action**: Add a thinking process section that forces step-by-step reasoning. + - **Implementation**: Include instructions like "First analyze..., then identify..., finally conclude..." + - **Pattern**: Force the model to "Plan before executing". + + """, + + """ + + **STRATEGY: FEW-SHOT SIMULATION (In-Context Learning)** + - **Goal**: Fixes formatting/syntax errors and output structure issues. + - **Action**: Invent 1-2 realistic "Input -> Output" examples that mirror the expected format. + - **Implementation**: Add "Example: Given [input], respond with: [expected output format]" + - **Pattern**: Show, don't just tell. Demonstrate the gold standard. + + """, + + """ + + **STRATEGY: SEMANTIC CONSTRAINING (Negative Constraints)** + - **Goal**: Fixes hallucinations, verbosity, and off-topic responses. + - **Action**: Add explicit forbidden actions and boundaries. + - **Implementation**: Include "Do NOT explain your reasoning", "Do NOT add preambles", "Do NOT include information not asked for" + - **Pattern**: Define the walls, not just the path. + + """, + + """ + + **STRATEGY: PERSONA & ROLE HARDENING** + - **Goal**: Fixes tone, domain knowledge gaps, and inconsistent behavior. + - **Action**: Define a hyper-specific expert role with clear responsibilities. 
+ - **Implementation**: Instead of "You are a helpful assistant", use "You are a Senior Data Analyst with 10 years of experience in [domain]" + - **Pattern**: Adopt the mental model and rigorous standards of a real expert. + + """, + + """ + + **STRATEGY: OUTPUT SCHEMA ENFORCEMENT** + - **Goal**: Fixes structural and format compliance issues. + - **Action**: Define an explicit output schema with field names and types. + - **Implementation**: Include "Your response MUST follow this exact format: {field1: type, field2: type}" + - **Pattern**: Leave no ambiguity about what the output should look like. + + """, + + """ + + **STRATEGY: SELF-VERIFICATION LOOP** + - **Goal**: Fixes errors that could be caught by double-checking. + - **Action**: Add instructions for the model to verify its own output. + - **Implementation**: Include "Before responding, verify: 1) Does this match the required format? 2) Did I include all requested information?" + - **Pattern**: Build in quality control before submission. + + """, + + """ + + **STRATEGY: TASK DECOMPOSITION** + - **Goal**: Fixes complex tasks that overwhelm the model. + - **Action**: Break the task into numbered sub-tasks. + - **Implementation**: "Step 1: [subtask]. Step 2: [subtask]. Step 3: Combine results." + - **Pattern**: Divide and conquer complexity. + + """ + ] + + # Select strategies based on num_gepa + selected = strategies[:min(num_gepa, len(strategies))] + + requirements = "\n" + requirements += "Each variation MUST use a DIFFERENT strategy from the list below:\n" + requirements += "\n".join(selected) + requirements += "\n" + + requirements += """ + + + 1. Each variation must apply its assigned strategy comprehensively. + 2. Each variation must ALSO address ALL issues mentioned in the feedback. + 3. The strategies are not mutually exclusive - but the PRIMARY focus of each variation should be its assigned strategy. + 4. Do not just add a single line - transform the prompt structure according to the strategy. 
+ +""" + + return requirements + + def _fallback_sequential_gepa_generation( + self, + num_gepa: int, + user_prompt: str, + image_base64: str, + kwargs: dict, + all_candidates: list, + clean_log + ) -> int: + """ + Fallback to sequential generation when JSON parsing fails. + + Args: + num_gepa: Number of candidates to generate + user_prompt: The feedback/context + image_base64: Image data (if any) + kwargs: Additional kwargs + all_candidates: List to append candidates to + clean_log: Logger for clean output + + Returns: + Number of candidates generated + """ + generated_count = 0 + + for i in range(num_gepa): + logger.debug(f"Generating Reflection Candidate #{i+1}/{num_gepa} (fallback mode)...") + try: + cleaned_user_prompt = self._clean_reflection_feedback(user_prompt) + + # Use research-backed strategy for each variation + strategy_prompts = [ + "\nApply CHAIN-OF-THOUGHT: Add step-by-step reasoning instructions. Force the model to 'think before answering'.\n", + "\nApply FEW-SHOT LEARNING: Add 1-2 concrete input/output examples within the prompt. Show, don't just tell.\n", + "\nApply NEGATIVE CONSTRAINTS: Add explicit 'Do NOT' rules. Define what the model must avoid.\n", + "\nApply PERSONA HARDENING: Define a specific expert role with clear responsibilities and standards.\n", + "\nApply OUTPUT SCHEMA: Define the exact output format with field names and types. Leave no ambiguity.\n", + ] + + strategy = strategy_prompts[i % len(strategy_prompts)] + + fallback_prompt = f"""You are a Prompt Optimization Engine in **SAFE MODE**. 
+ +{strategy} + +{_FALLBACK_SYSTEM_PROMPT}""" + + call_kwargs = {k: v for k, v in kwargs.items() if k != 'image_base64'} + result = self.base_llm.generate( + system_prompt=fallback_prompt, + user_prompt=cleaned_user_prompt, + image_base64=image_base64, + **call_kwargs + ) + + if isinstance(result, dict): + gepa_candidate_raw = result.get("content", str(result)) + else: + gepa_candidate_raw = str(result) + + gepa_candidate = self._extract_clean_prompt_from_reflection(gepa_candidate_raw) + + if not self._is_valid_prompt(gepa_candidate): + logger.warning(f" โš ๏ธ Fallback candidate #{i+1} appears invalid, skipping") + continue + + all_candidates.append({ + 'prompt': gepa_candidate, + 'source': 'gepa_reflection', + 'index': i + 1 + }) + + clean_log.log_gepa_reflection_candidate(i + 1, gepa_candidate) + generated_count += 1 + + except Exception as fallback_error: + logger.error(f"โŒ Error in fallback generation #{i+1}: {fallback_error}") + + return generated_count + + def _extract_prompt_from_feedback(self, user_prompt: str) -> str: + """ + Try to extract the current prompt from GEPA's reflection feedback. 
+ + Args: + user_prompt: The feedback text from GEPA + + Returns: + Extracted prompt or empty string + """ + # Look for common patterns in GEPA's feedback + if "current prompt:" in user_prompt.lower(): + lines = user_prompt.split('\n') + for i, line in enumerate(lines): + if "current prompt:" in line.lower(): + # Return the next line(s) as the prompt + return '\n'.join(lines[i+1:i+10]) + + return "" + + # Forward other methods to base LLM + def get_model_info(self) -> str: + """Get model information.""" + return f"LLEGO({self.base_llm.get_model_info()})" + + def __getattr__(self, name): + """Forward unknown attributes to base LLM.""" + return getattr(self.base_llm, name) + diff --git a/src/gepa_optimizer/llms/vision_llm.py b/src/gepa_optimizer/llms/vision_llm.py new file mode 100644 index 0000000000000000000000000000000000000000..79f84dac705b901341c722f2955af1b8b473561c --- /dev/null +++ b/src/gepa_optimizer/llms/vision_llm.py @@ -0,0 +1,813 @@ +""" +Vision LLM Client for GEPA Optimizer +""" + +import json +import logging +import time +from enum import Enum +import requests +from typing import Dict, Optional, Any, TYPE_CHECKING, Union + +# Assuming APIKeyManager is available from utils +from ..utils.api_keys import APIKeyManager + +# Import ModelConfig only for type checking to avoid circular imports +if TYPE_CHECKING: + from ..models.config import ModelConfig + +from .base_llm import BaseLLMClient + +class ProviderType(str, Enum): + OPENAI = "openai" + ANTHROPIC = "anthropic" + HUGGINGFACE = "huggingface" + VLLM = "vllm" + GOOGLE = "google" + GEMINI = "gemini" + +class ErrorType(str, Enum): + API_ERROR = "api_error" + VALIDATION_ERROR = "validation_error" + NETWORK_ERROR = "network_error" + RATE_LIMIT = "rate_limit" + TIMEOUT = "timeout" + +class GepaLLMError(Exception): + """Base exception for GEPA LLM related errors""" + def __init__(self, message: str, error_type: ErrorType, status_code: Optional[int] = None): + self.message = message + self.error_type = 
class GepaLLMError(Exception):
    """Base exception for GEPA LLM related errors."""

    def __init__(self, message: str, error_type: ErrorType, status_code: Optional[int] = None):
        super().__init__(message)
        # Keep the structured pieces around so callers can branch on
        # error_type / status_code instead of parsing the rendered message.
        self.message = message
        self.error_type = error_type
        self.status_code = status_code

    def __str__(self):
        # Include the HTTP status only when one was supplied.
        if not self.status_code:
            return f"{self.error_type.value}: {self.message}"
        return f"{self.error_type.value} (HTTP {self.status_code}): {self.message}"
+ """ + # Initialize parent class + super().__init__(provider=str(provider), model_name=model_name, **{ + 'api_key': api_key, + 'base_url': base_url, + 'temperature': temperature, + 'max_tokens': max_tokens, + 'top_p': top_p, + 'frequency_penalty': frequency_penalty, + 'presence_penalty': presence_penalty, + 'timeout': timeout, + 'max_retries': max_retries + }) + + # Initialize the actual client + self._initialize_client(provider, model_name, api_key, base_url, temperature, + max_tokens, top_p, frequency_penalty, presence_penalty, + timeout, max_retries) + + def _initialize_client(self, provider, model_name, api_key, base_url, temperature, + max_tokens, top_p, frequency_penalty, presence_penalty, + timeout, max_retries): + """Initialize the actual client (existing logic)""" + # Input validation + try: + self.provider = ProviderType(provider.lower()) + except ValueError: + raise GepaLLMError( + f"Unsupported provider: {provider}. " + f"Supported providers: {[p.value for p in ProviderType]}", + ErrorType.VALIDATION_ERROR + ) + + if not model_name: + raise GepaLLMError("model_name cannot be empty", ErrorType.VALIDATION_ERROR) + + if not isinstance(temperature, (int, float)) or not 0 <= temperature <= 2: + raise GepaLLMError( + f"temperature must be between 0 and 2, got {temperature}", + ErrorType.VALIDATION_ERROR + ) + + if not isinstance(max_tokens, int) or max_tokens <= 0: + raise GepaLLMError( + f"max_tokens must be a positive integer, got {max_tokens}", + ErrorType.VALIDATION_ERROR + ) + + # Initialize API key + try: + self.api_key = api_key or APIKeyManager().get_api_key(self.provider.value) + if not self.api_key: + raise GepaLLMError( + f"No API key found for provider: {self.provider}", + ErrorType.VALIDATION_ERROR + ) + except Exception as e: + raise GepaLLMError( + f"Failed to initialize API key: {str(e)}", + ErrorType.API_ERROR + ) from e + + self.model_name = model_name + self.base_url = base_url or OPENAI_API_URL + self.temperature = temperature + 
self.max_tokens = max_tokens + self.top_p = top_p + self.frequency_penalty = frequency_penalty + self.presence_penalty = presence_penalty + self.timeout = timeout + self.max_retries = max_retries + self.logger = logging.getLogger(f"{__name__}.{self.__class__.__name__}") + + # Configure session with retry + self.session = requests.Session() + retry_strategy = requests.adapters.Retry( + total=max_retries, + backoff_factor=1, + status_forcelist=[429, 500, 502, 503, 504], + allowed_methods=["POST"] + ) + adapter = requests.adapters.HTTPAdapter(max_retries=retry_strategy) + self.session.mount("https://", adapter) + self.session.mount("http://", adapter) + + # No hardcoded model restrictions - user can specify any model name + # The API provider will validate if the model exists and supports vision + + def _get_api_key(self) -> Optional[str]: + """Get API key based on provider""" + if self.provider == 'openai': + return APIKeyManager().get_api_key('openai') + elif self.provider == 'anthropic': + return APIKeyManager().get_api_key('anthropic') + elif self.provider in ['google', 'gemini']: + return APIKeyManager().get_api_key('google') + # Add other providers as needed + return None + + @classmethod + def from_config(cls, config: 'ModelConfig') -> 'VisionLLMClient': + """Create a VisionLLMClient from a ModelConfig object. 
+ + Args: + config: ModelConfig instance with provider and model settings + + Returns: + Configured VisionLLMClient instance + + Example: + ```python + config = ModelConfig( + provider="openai", + model_name="gpt-4-vision-preview", + temperature=0.7 + ) + client = VisionLLMClient.from_config(config) + ``` + """ + return cls( + provider=config.provider, + model_name=config.model_name, + api_key=config.api_key, + base_url=config.base_url, + temperature=config.temperature, + max_tokens=config.max_tokens, + top_p=config.top_p, + frequency_penalty=config.frequency_penalty, + presence_penalty=config.presence_penalty + ) + + @classmethod + def from_model_string(cls, model_string: str, **kwargs) -> 'VisionLLMClient': + """Create a VisionLLMClient from a model string like "provider/model-name". + + Args: + model_string: Model identifier in format "provider/model-name" or just "model-name" + Examples: "google/gemini-2.0-flash", "openai/gpt-4o", "gemini-1.5-pro" + **kwargs: Additional configuration options (temperature, max_tokens, etc.) 
+ + Returns: + Configured VisionLLMClient instance + + Example: + ```python + # With provider + client = VisionLLMClient.from_model_string("google/gemini-2.0-flash") + + # Without provider (defaults to openai) + client = VisionLLMClient.from_model_string("gpt-4o") + + # With additional options + client = VisionLLMClient.from_model_string( + "google/gemini-2.0-flash", + temperature=0.5, + max_tokens=4096 + ) + ``` + """ + import os + + # Parse "provider/model-name" format + if "/" in model_string: + provider, model_name = model_string.split("/", 1) + else: + # Default to openai if no provider specified + provider = "openai" + model_name = model_string + + # Normalize provider names + provider = provider.lower() + if provider == "gemini": + provider = "google" + + # Get API key from environment if not provided + api_key = kwargs.pop('api_key', None) + if not api_key: + env_var_map = { + "openai": "OPENAI_API_KEY", + "anthropic": "ANTHROPIC_API_KEY", + "google": "GOOGLE_API_KEY", + } + env_var = env_var_map.get(provider, f"{provider.upper()}_API_KEY") + api_key = os.getenv(env_var) + + return cls( + provider=provider, + model_name=model_name, + api_key=api_key, + **kwargs + ) + + def generate( + self, + system_prompt: str, + user_prompt: str, + image_base64: Optional[str] = None, + **generation_kwargs + ) -> Dict[str, Any]: + """ + Generates a response from the Vision LLM. + + Args: + system_prompt: The system-level instructions for the LLM. + user_prompt: The user's query or task. + image_base64: Optional Base64 encoded image string. + **generation_kwargs: Additional model-specific generation parameters + + Returns: + A dictionary containing the generated response and metadata. 
+ + Raises: + GepaLLMError: If there's an error during generation + + Example: + ```python + response = client.generate( + system_prompt="You are a helpful assistant.", + user_prompt="What's in this image?", + image_base64="base64_encoded_image" + ) + ``` + """ + if not system_prompt or not user_prompt: + raise GepaLLMError( + "system_prompt and user_prompt are required", + ErrorType.VALIDATION_ERROR + ) + + try: + if self.provider == ProviderType.OPENAI: + return self._generate_openai(system_prompt, user_prompt, image_base64, **generation_kwargs) + elif self.provider in [ProviderType.GOOGLE, ProviderType.GEMINI]: + return self._generate_google(system_prompt, user_prompt, image_base64, **generation_kwargs) + else: + raise GepaLLMError( + f"Provider {self.provider} is not yet supported", + ErrorType.VALIDATION_ERROR + ) + except requests.exceptions.RequestException as e: + self.logger.error(f"Network error during generation: {str(e)}") + raise GepaLLMError( + f"Network error: {str(e)}", + ErrorType.NETWORK_ERROR, + getattr(e.response, 'status_code', None) if hasattr(e, 'response') else None + ) from e + except GepaLLMError: + raise + except Exception as e: + self.logger.error(f"Unexpected error during generation: {str(e)}") + raise GepaLLMError( + f"Generation failed: {str(e)}", + ErrorType.API_ERROR + ) from e + + def _generate_openai( + self, + system_prompt: str, + user_prompt: str, + image_base64: Optional[str] = None, + **generation_kwargs + ) -> Dict[str, Any]: + """ + Generate response using OpenAI's API with configured parameters. 
+ + Args: + system_prompt: System instructions for the model + user_prompt: User's input prompt + image_base64: Optional base64 encoded image + + Returns: + Dictionary containing the API response + + Raises: + GepaDependencyError: If API call fails + """ + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {self.api_key}", + "User-Agent": "GepaOptimizer/1.0 (Python)" + } + + messages = [ + {"role": "system", "content": system_prompt}, + { + "role": "user", + "content": [ + {"type": "text", "text": user_prompt} + ] + } + ] + + if image_base64: + # #region agent log + import json as _json_debug + import time as _time_debug + _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log" + try: + with open(_debug_log_path, "a") as _f: + _f.write(_json_debug.dumps({ + "id": f"log_{int(_time_debug.time() * 1000)}", + "timestamp": int(_time_debug.time() * 1000), + "location": "vision_llm.py:_generate_openai", + "message": "Image base64 BEFORE processing", + "data": { + "image_base64_length": len(image_base64) if image_base64 else 0, + "has_data_uri_prefix": image_base64.startswith("data:image") if image_base64 else False, + "prefix": image_base64[:50] if image_base64 and len(image_base64) > 50 else image_base64, + "is_none": image_base64 is None, + "is_empty": image_base64 == "" if image_base64 else True + }, + "sessionId": "debug-session", + "runId": "run1", + "hypothesisId": "A,C,D" + }) + "\n") + except Exception: + pass + # #endregion + + # Detect and extract image format + detected_format = "jpeg" # Default fallback + clean_base64 = image_base64 + + # Extract format from data URI prefix if present + if image_base64.startswith("data:image"): + # Parse format from prefix: data:image/png;base64,... 
+ if "," in image_base64: + prefix_part = image_base64.split(",", 1)[0] + clean_base64 = image_base64.split(",", 1)[1] + # Extract format from "data:image/PNG;base64" or "data:image/png" + if "/" in prefix_part and ";" in prefix_part: + detected_format = prefix_part.split("/")[1].split(";")[0].lower() + elif "/" in prefix_part: + detected_format = prefix_part.split("/")[1].lower() + else: + # Fallback: try to extract format + if "/" in image_base64: + detected_format = image_base64.split("/")[1].split(";")[0].lower() if ";" in image_base64 else "jpeg" + clean_base64 = image_base64.replace("data:image/", "").replace(";base64", "") + + # If no format detected from prefix, try to detect from image data + if detected_format == "jpeg" or not detected_format: + try: + import base64 as b64 + from PIL import Image + import io + image_data = b64.b64decode(clean_base64) + img = Image.open(io.BytesIO(image_data)) + if img.format: + detected_format = img.format.lower() + # Normalize format names + if detected_format in ["jpg", "jpeg"]: + detected_format = "jpeg" + except Exception: + # If detection fails, keep default + pass + + # Normalize format for data URI (OpenAI accepts: jpeg, png, gif, webp) + format_map = { + "jpg": "jpeg", + "jpeg": "jpeg", + "png": "png", + "gif": "gif", + "webp": "webp", + "bmp": "png", # Convert BMP to PNG (OpenAI doesn't support BMP) + "tiff": "png", # Convert TIFF to PNG + "tif": "png" + } + final_format = format_map.get(detected_format, "jpeg") + + final_url = f"data:image/{final_format};base64,{clean_base64}" + + # #region agent log + try: + with open(_debug_log_path, "a") as _f: + _f.write(_json_debug.dumps({ + "id": f"log_{int(_time_debug.time() * 1000)}", + "timestamp": int(_time_debug.time() * 1000), + "location": "vision_llm.py:_generate_openai", + "message": "Image URL AFTER processing", + "data": { + "detected_format": detected_format, + "final_format": final_format, + "clean_base64_length": len(clean_base64), + "final_url_length": 
len(final_url), + "final_url_prefix": final_url[:60] + }, + "sessionId": "debug-session", + "runId": "run1", + "hypothesisId": "A,B" + }) + "\n") + except Exception: + pass + # #endregion + + messages[1]["content"].append({ + "type": "image_url", + "image_url": { + "url": final_url + } + }) + + payload = { + "model": self.model_name, + "messages": messages, + # "temperature": self.temperature, + # "max_tokens": self.max_tokens, + "top_p": self.top_p, + "frequency_penalty": self.frequency_penalty, + "presence_penalty": self.presence_penalty + } + + self.logger.debug(f"Sending request to {self.base_url} with model {self.model_name}") + + try: + self.logger.debug(f"Sending request to {self.model_name}") + + # Make the API request with retry + response = self.session.post( + self.base_url, + headers=headers, + json=payload, + timeout=300 + ) + + # Handle rate limiting + if response.status_code == 429: + retry_after = int(response.headers.get('Retry-After', 5)) + self.logger.warning(f"Rate limited. 
Retrying after {retry_after} seconds...") + time.sleep(retry_after) + return self._generate_openai(system_prompt, user_prompt, image_base64, **generation_kwargs) + + response.raise_for_status() + + result = response.json() + self.logger.debug(f"Received response from {self.model_name}") + + # Extract and validate the response + try: + message = result["choices"][0]["message"] + llm_response_content = message["content"] + + # Log token usage if available + if "usage" in result: + usage = result["usage"] + self.logger.info( + f"Tokens used - Prompt: {usage.get('prompt_tokens', 'N/A')}, " + f"Completion: {usage.get('completion_tokens', 'N/A')}, " + f"Total: {usage.get('total_tokens', 'N/A')}" + ) + + # Try to parse JSON if the response looks like JSON + if isinstance(llm_response_content, str) and ( + llm_response_content.startswith('{') or + llm_response_content.startswith('[') + ): + try: + return json.loads(llm_response_content) + except json.JSONDecodeError: + pass + + # Default response format + return { + "content": llm_response_content, + "role": message.get("role", "assistant"), + "model": self.model_name, + "provider": self.provider.value + } + + except (KeyError, IndexError) as e: + self.logger.error(f"Unexpected response format: {result}") + raise GepaLLMError( + f"Unexpected response format from {self.provider} API", + ErrorType.API_ERROR, + response.status_code + ) from e + + except requests.exceptions.HTTPError as e: + status_code = e.response.status_code if hasattr(e, 'response') else None + error_msg = f"HTTP error {status_code} from {self.provider} API" + + try: + error_data = e.response.json() + error_msg = error_data.get('error', {}).get('message', error_msg) + except Exception: + error_msg = str(e) + + self.logger.error(f"{error_msg}: {error_data if 'error_data' in locals() else str(e)}") + raise GepaLLMError( + error_msg, + ErrorType.RATE_LIMIT if status_code == 429 else ErrorType.API_ERROR, + status_code + ) from e + + except 
requests.exceptions.Timeout: + self.logger.error(f"Request to {self.provider} API timed out after {self.timeout} seconds") + raise GepaLLMError( + f"Request timed out after {self.timeout} seconds", + ErrorType.TIMEOUT + ) + + except requests.exceptions.RequestException as e: + self.logger.error(f"Network error: {str(e)}") + raise GepaLLMError( + f"Network error: {str(e)}", + ErrorType.NETWORK_ERROR + ) from e + + except Exception as e: + self.logger.error(f"Unexpected error: {str(e)}", exc_info=True) + raise GepaLLMError( + f"Unexpected error: {str(e)}", + ErrorType.API_ERROR + ) from e + + def _generate_google( + self, + system_prompt: str, + user_prompt: str, + image_base64: Optional[str] = None, + **generation_kwargs + ) -> Dict[str, Any]: + """ + Generate response using Google Gemini API with configured parameters. + + Args: + system_prompt: System instructions for the model + user_prompt: User's input prompt + image_base64: Optional base64 encoded image + + Returns: + Dictionary containing the API response + + Raises: + GepaLLMError: If API call fails + """ + try: + import google.generativeai as genai + import base64 + from PIL import Image + import io + except ImportError as e: + raise GepaLLMError( + f"Required dependencies for Google Gemini not installed: {str(e)}. 
" + f"Please install: pip install google-generativeai Pillow", + ErrorType.VALIDATION_ERROR + ) from e + + # Configure Gemini + genai.configure(api_key=self.api_key) + + # Use the model name directly as specified by the user + # No hardcoded mappings or restrictions - fully configurable + # The Gemini API will validate if the model exists + gemini_model_name = self.model_name + + try: + model = genai.GenerativeModel(gemini_model_name) + except Exception as e: + raise GepaLLMError( + f"Failed to initialize Gemini model {gemini_model_name}: {str(e)}", + ErrorType.API_ERROR + ) from e + + # Prepare content + content_parts = [] + + # Add system prompt and user prompt + full_prompt = f"{system_prompt}\n\n{user_prompt}" + content_parts.append(full_prompt) + + # Add image if provided + if image_base64: + # #region agent log + import json as _json_debug + import time as _time_debug + _debug_log_path = "/Users/suhas/Desktop/Projects/Prompt-Optimizer/.cursor/debug.log" + try: + with open(_debug_log_path, "a") as _f: + _f.write(_json_debug.dumps({ + "id": f"log_{int(_time_debug.time() * 1000)}", + "timestamp": int(_time_debug.time() * 1000), + "location": "vision_llm.py:_generate_google", + "message": "Image base64 BEFORE processing (Google)", + "data": { + "image_base64_length": len(image_base64) if image_base64 else 0, + "has_data_uri_prefix": image_base64.startswith("data:image") if image_base64 else False, + "prefix": image_base64[:50] if image_base64 and len(image_base64) > 50 else image_base64, + "is_none": image_base64 is None, + "is_empty": image_base64 == "" if image_base64 else True + }, + "sessionId": "debug-session", + "runId": "run1", + "hypothesisId": "A,C,D" + }) + "\n") + except Exception: + pass + # #endregion + + try: + # Strip data URI prefix if present (hypothesis A fix) + clean_base64 = image_base64 + if image_base64.startswith("data:image"): + # Extract just the base64 part after the comma + if "," in image_base64: + clean_base64 = 
image_base64.split(",", 1)[1] + else: + clean_base64 = image_base64.replace("data:image/", "").replace(";base64", "") + + # Decode base64 image + image_data = base64.b64decode(clean_base64) + image = Image.open(io.BytesIO(image_data)) + content_parts.append(image) + self.logger.debug(f"Added image to Gemini request") + except Exception as e: + self.logger.warning(f"Failed to process image for Gemini: {str(e)}") + # Continue without image rather than failing + + self.logger.debug(f"Sending request to Gemini model {gemini_model_name}") + + try: + # Generate response with retry logic + max_retries = 3 + for attempt in range(max_retries): + try: + # Configure generation parameters + generation_config = genai.types.GenerationConfig( + temperature=self.temperature, + max_output_tokens=self.max_tokens, + top_p=self.top_p, + ) + + response = model.generate_content( + content_parts, + generation_config=generation_config + ) + + # Check if response was blocked + if response.prompt_feedback and response.prompt_feedback.block_reason: + raise GepaLLMError( + f"Gemini blocked the prompt: {response.prompt_feedback.block_reason}", + ErrorType.VALIDATION_ERROR + ) + + # Check if response was blocked + if not response.text: + if response.candidates and response.candidates[0].finish_reason: + finish_reason = response.candidates[0].finish_reason + if finish_reason == genai.types.FinishReason.SAFETY: + raise GepaLLMError( + "Gemini response blocked due to safety concerns", + ErrorType.VALIDATION_ERROR + ) + elif finish_reason == genai.types.FinishReason.RECITATION: + raise GepaLLMError( + "Gemini response blocked due to recitation concerns", + ErrorType.VALIDATION_ERROR + ) + raise GepaLLMError( + "Gemini returned empty response", + ErrorType.API_ERROR + ) + + self.logger.debug(f"Received response from Gemini model {gemini_model_name}") + + # Log usage information if available + if hasattr(response, 'usage_metadata') and response.usage_metadata: + usage = response.usage_metadata + 
self.logger.info( + f"Tokens used - Prompt: {usage.prompt_token_count}, " + f"Completion: {usage.candidates_token_count}, " + f"Total: {usage.total_token_count}" + ) + + # Try to parse JSON if the response looks like JSON + response_text = response.text + if isinstance(response_text, str) and ( + response_text.startswith('{') or + response_text.startswith('[') + ): + try: + return json.loads(response_text) + except json.JSONDecodeError: + pass + + # Default response format + return { + "content": response_text, + "role": "assistant", + "model": gemini_model_name, + "provider": "google" + } + + except Exception as e: + if attempt < max_retries - 1: + self.logger.warning(f"Gemini API attempt {attempt + 1} failed: {str(e)}. Retrying...") + time.sleep(2 ** attempt) # Exponential backoff + continue + else: + raise + + except GepaLLMError: + raise + except Exception as e: + self.logger.error(f"Unexpected error with Gemini API: {str(e)}") + raise GepaLLMError( + f"Gemini API error: {str(e)}", + ErrorType.API_ERROR + ) from e diff --git a/src/gepa_optimizer/models/__init__.py b/src/gepa_optimizer/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1afaf5b607bb0fdddf1d62b08d6348a034e5f8a0 --- /dev/null +++ b/src/gepa_optimizer/models/__init__.py @@ -0,0 +1,15 @@ +""" +Models module for GEPA Optimizer +""" + +from .config import ModelConfig, OptimizationConfig +from .dataset import DatasetItem +from .result import OptimizationResult, OptimizedResult + +__all__ = [ + "ModelConfig", + "OptimizationConfig", + "DatasetItem", + "OptimizationResult", + "OptimizedResult" +] diff --git a/src/gepa_optimizer/models/config.py b/src/gepa_optimizer/models/config.py new file mode 100644 index 0000000000000000000000000000000000000000..6f6193d450b891b1d66255966b843fecbc45d25c --- /dev/null +++ b/src/gepa_optimizer/models/config.py @@ -0,0 +1,488 @@ +""" +Configuration models for GEPA Optimizer +""" + +import os +from dataclasses import dataclass, field 
@dataclass
class ModelConfig:
    """Provider-agnostic configuration for a single LLM endpoint.

    The three required fields are provider (e.g. "openai"), model_name and
    api_key; everything else has a sensible sampling default.
    """
    provider: str                    # e.g. "openai", "anthropic", "huggingface", "vllm"
    model_name: str                  # actual model identifier at the provider
    api_key: str                     # credential for the provider
    base_url: Optional[str] = None   # custom endpoint URL, if any
    temperature: float = 0.7
    max_tokens: int = 2048
    top_p: float = 1.0
    frequency_penalty: float = 0.0
    presence_penalty: float = 0.0

    def __post_init__(self):
        """Reject configurations missing any of the three required fields."""
        if not self.provider:
            raise ValueError("Provider is required (e.g., 'openai', 'anthropic', 'huggingface')")
        if not self.model_name:
            raise ValueError("Model name is required (e.g., 'gpt-4', 'claude-3-opus')")
        if not self.api_key:
            raise ValueError(f"API key is required for {self.provider} provider")

    @classmethod
    def from_string(cls, model_string: str) -> 'ModelConfig':
        """Build a config from 'provider/model' (a bare 'model' is assumed OpenAI)."""
        provider, sep, remainder = model_string.partition("/")
        if not sep:
            # No provider segment — default to OpenAI.
            provider, remainder = "openai", model_string

        # The key must already be present in the environment.
        key = cls._get_api_key_for_provider(provider)
        if not key:
            raise ValueError(
                f"No API key found for {provider}. Please set {provider.upper()}_API_KEY environment variable"
            )

        return cls(
            provider=provider,
            model_name=remainder,
            api_key=key
        )

    @classmethod
    def from_dict(cls, config_dict: dict) -> 'ModelConfig':
        """Build a config directly from a field dictionary."""
        return cls(**config_dict)

    def to_dict(self) -> dict:
        """Serialize every field into a plain dictionary."""
        return {
            'provider': self.provider,
            'model_name': self.model_name,
            'api_key': self.api_key,
            'base_url': self.base_url,
            'temperature': self.temperature,
            'max_tokens': self.max_tokens,
            'top_p': self.top_p,
            'frequency_penalty': self.frequency_penalty,
            'presence_penalty': self.presence_penalty
        }

    @staticmethod
    def _get_api_key_for_provider(provider: str) -> Optional[str]:
        """Look up the provider's API key from environment variables."""
        known_env_vars = {
            "openai": "OPENAI_API_KEY",
            "anthropic": "ANTHROPIC_API_KEY",
            "huggingface": "HUGGINGFACE_API_KEY",
            "cohere": "COHERE_API_KEY",
            "ai21": "AI21_API_KEY",
            "together": "TOGETHER_API_KEY",
            "replicate": "REPLICATE_API_TOKEN",
            "groq": "GROQ_API_KEY",
            "ollama": "OLLAMA_API_KEY"
        }

        mapped = known_env_vars.get(provider.lower())
        if mapped:
            return os.getenv(mapped)

        # Unknown provider: fall back to the generic <PROVIDER>_API_KEY pattern.
        return os.getenv(f"{provider.upper()}_API_KEY")
@dataclass
class DataSplitConfig:
    """Configuration for dataset splitting into train/val/test sets.

    Adaptive splitting automatically adjusts ratios based on dataset size:
    - Small datasets (< 15): prioritize validation (70/25/5) for reliable ranking
    - Medium datasets (15-50): balanced split (60/20/20)
    - Large datasets (50+): more training data (70/15/15)
    """

    # Default ratios (must sum to 1.0); the 'adaptive' strategy overrides
    # them per dataset size.
    train_ratio: float = 0.6   # reflection examples (Dfeedback)
    val_ratio: float = 0.2     # Pareto selection (Dpareto)
    test_ratio: float = 0.2    # held-out final evaluation

    # Floors for each split.
    min_train_samples: int = 3
    min_val_samples: int = 3   # validation drives candidate ranking, so keep >= 3
    min_test_samples: int = 1  # test set is only consulted once

    # How to behave when the dataset is too small for a full 3-way split:
    # 'adaptive', 'duplicate_val', 'no_test', or 'error'.
    small_dataset_strategy: str = 'adaptive'

    def __post_init__(self):
        """Validate ratios and the small-dataset strategy."""
        ratio_sum = self.train_ratio + self.val_ratio + self.test_ratio
        # Tolerate tiny floating point drift around 1.0.
        if not (0.99 <= ratio_sum <= 1.01):
            raise ValueError(
                f"Split ratios must sum to 1.0, got {ratio_sum:.3f} "
                f"(train={self.train_ratio}, val={self.val_ratio}, test={self.test_ratio})"
            )

        if self.train_ratio <= 0 or self.val_ratio <= 0 or self.test_ratio < 0:
            raise ValueError("Split ratios must be positive (test_ratio can be 0 to disable)")

        if self.small_dataset_strategy not in {'adaptive', 'duplicate_val', 'no_test', 'error'}:
            raise ValueError(
                f"Invalid small_dataset_strategy: {self.small_dataset_strategy}. "
                f"Must be 'adaptive', 'duplicate_val', 'no_test', or 'error'"
            )

    def get_adaptive_ratios(self, dataset_size: int) -> Tuple[float, float, float]:
        """Return (train, val, test) ratios tuned to the dataset size.

        Validation is prioritized on small datasets because it is consulted
        for every candidate evaluation; large datasets shift weight to train.

        Args:
            dataset_size: Total number of samples in dataset

        Returns:
            Tuple of (train_ratio, val_ratio, test_ratio)
        """
        if dataset_size < 15:
            return (0.70, 0.25, 0.05)   # small: maximize validation reliability
        if dataset_size < 50:
            return (0.60, 0.20, 0.20)   # medium: balanced
        return (0.70, 0.15, 0.15)       # large: more training data

    def get_split_indices(self, dataset_size: int) -> Tuple[int, int, int, int]:
        """Compute slice boundaries for a dataset, applying adaptive ratios.

        Args:
            dataset_size: Total number of samples in dataset

        Returns:
            Tuple of (train_end, val_end, test_end, dataset_size) indices

        Raises:
            ValueError: If the dataset is too small and strategy is 'error'
        """
        strategy = self.small_dataset_strategy

        # Adaptive strategy picks ratios per dataset size; others use the
        # configured defaults.
        if strategy == 'adaptive':
            use_train, use_val, _ = self.get_adaptive_ratios(dataset_size)
        else:
            use_train, use_val = self.train_ratio, self.val_ratio

        if dataset_size < self.min_train_samples + self.min_val_samples:
            if strategy == 'error':
                raise ValueError(
                    f"Dataset too small ({dataset_size} samples). "
                    f"Need at least {self.min_train_samples + self.min_val_samples} samples."
                )

        # Ideal split points, never dropping below the configured floors.
        train_end = max(self.min_train_samples, int(dataset_size * use_train))
        val_end = train_end + max(self.min_val_samples, int(dataset_size * use_val))

        # Enough samples remain for a test slice — done.
        if val_end < dataset_size:
            return train_end, val_end, dataset_size, dataset_size

        # Train+val already consume the dataset; resolve per strategy.
        if strategy in {'adaptive', 'duplicate_val'}:
            # Guarantee minimum validation samples; test gets any remainder.
            val_end = min(dataset_size, train_end + self.min_val_samples)
        elif strategy == 'no_test':
            # Small dataset: skip the test split entirely.
            val_end = dataset_size
        else:  # 'error'
            raise ValueError(
                f"Dataset too small ({dataset_size} samples) for train/val/test split. "
                f"Need at least {self.min_train_samples + self.min_val_samples + self.min_test_samples} samples."
            )

        return train_end, val_end, dataset_size, dataset_size
@dataclass
class OptimizationConfig:
    """Configuration class for GEPA optimization process."""

    # --- Required: core models (string "provider/model" or ModelConfig) ---
    model: Union[str, ModelConfig]
    reflection_model: Union[str, ModelConfig]

    # --- Required: optimization budget, chosen by the user ---
    max_iterations: int
    max_metric_calls: int
    batch_size: int

    # Dataset splitting configuration
    data_split: DataSplitConfig = field(default_factory=DataSplitConfig)

    # Reflection settings (independent of the evaluation batch_size)
    reflection_examples: int = 3  # examples per reflection — keep small

    # Optional knobs with sensible fallbacks
    early_stopping: bool = True
    learning_rate: float = 0.01

    # Multi-objective optimization
    multi_objective: bool = False
    objectives: List[str] = field(default_factory=lambda: ["accuracy"])

    # Advanced settings
    custom_metrics: Optional[Dict[str, Any]] = None
    use_cache: bool = True
    parallel_evaluation: bool = False

    # Deprecated — prefer data_split=DataSplitConfig(...)
    train_split_ratio: Optional[float] = None
    min_dataset_size: int = 2

    # User-controlled cost/time budget
    max_cost_usd: Optional[float] = None
    timeout_seconds: Optional[int] = None

    # GEPA-specific parameters (mirroring the GEPA library)
    candidate_selection_strategy: str = 'pareto'   # Pareto selection strategy
    skip_perfect_score: bool = False               # True enables early stop on perfect
    reflection_minibatch_size: Optional[int] = None  # defaults to reflection_examples
    perfect_score: float = 1.0
    module_selector: str = 'round_robin'           # component selection strategy
    verbose: bool = True                           # detailed GEPA logging

    # Evaluate the final prompt on the held-out test set
    evaluate_on_test: bool = True

    # LLEGO genetic operators (ICLR 2025 "Decision Tree Induction Through LLMs
    # via Semantically-Aware Evolution"); tuned for small datasets
    use_llego_operators: bool = False

    # Hybrid mode: GEPA reflection + LLEGO operators together for diversity
    enable_gepa_reflection_with_llego: bool = False
    num_gepa_reflection_candidates: int = 3  # per iteration; valid range 1-5

    # Fitness-guided crossover (conservative alpha)
    alpha: float = 0.05     # 5% fitness extrapolation above best parent
    n_crossover: int = 2    # offspring from crossover per iteration

    # Diversity-guided mutation
    tau: float = 8.0        # diversity temperature
    nu: int = 3             # parent arity
    n_mutation: int = 2     # offspring from mutation per iteration

    # Population management for genetic operators
    population_size: int = 8

    # LLM-as-Judge feedback (Phase 2)
    use_llm_as_judge: bool = True
    llm_as_judge_threshold: float = 0.8  # judge used for scores below this
    llm_as_judge_model: Optional[ModelConfig] = None  # defaults to reflection_model

    # Logging configuration (Phase 3): "DEBUG", "INFO", "WARNING", "ERROR"
    log_level: str = "INFO"

    def __post_init__(self):
        """Normalize model specs, apply compatibility shims, and validate."""
        # Backwards compatibility: translate the deprecated 2-way split ratio.
        if self.train_split_ratio is not None and self.train_split_ratio != 0.8:
            import warnings
            warnings.warn(
                "train_split_ratio is deprecated. Use data_split=DataSplitConfig(...) instead. "
                "Converting to 3-way split with your ratio.",
                DeprecationWarning,
                stacklevel=2
            )
            # Split the remainder evenly between val and test.
            leftover = 1.0 - self.train_split_ratio
            self.data_split = DataSplitConfig(
                train_ratio=self.train_split_ratio,
                val_ratio=leftover * 0.5,
                test_ratio=leftover * 0.5
            )

        # Coerce string model specs into ModelConfig objects.
        self.model = self._parse_model_config(self.model, "model")
        self.reflection_model = self._parse_model_config(self.reflection_model, "reflection_model")

        # reflection_minibatch_size falls back to reflection_examples.
        if self.reflection_minibatch_size is None:
            self.reflection_minibatch_size = self.reflection_examples

        self._validate_required_params()
        self._validate_ranges()

    def _parse_model_config(self, model: Union[str, ModelConfig], field_name: str) -> ModelConfig:
        """Coerce a 'provider/model' string into a ModelConfig (pass-through otherwise)."""
        if isinstance(model, ModelConfig):
            return model

        if isinstance(model, str):
            provider, sep, name = model.partition("/")
            if not sep:
                # No provider segment — default to OpenAI.
                provider, name = "openai", model

            api_key = self._get_api_key_for_provider(provider)
            if not api_key:
                raise ValueError(
                    f"No API key found for {provider}. Please set environment variable "
                    f"or provide ModelConfig with api_key for {field_name}"
                )

            return ModelConfig(
                provider=provider,
                model_name=name,
                api_key=api_key
            )

        raise ValueError(f"{field_name} must be either a string or ModelConfig object")

    def _get_api_key_for_provider(self, provider: str) -> Optional[str]:
        """Delegate environment lookup to ModelConfig."""
        return ModelConfig._get_api_key_for_provider(provider)

    def _validate_required_params(self):
        """Ensure the user explicitly provided every budget parameter."""
        for name, value in (
            ("max_iterations", self.max_iterations),
            ("max_metric_calls", self.max_metric_calls),
            ("batch_size", self.batch_size),
        ):
            if value is None:
                raise ValueError(f"{name} is required and must be specified by user")

    def _validate_ranges(self):
        """Range-check numeric/string knobs; raise ValueError on violations."""
        for name, value in (
            ("max_iterations", self.max_iterations),
            ("max_metric_calls", self.max_metric_calls),
            ("batch_size", self.batch_size),
        ):
            if value <= 0:
                raise ValueError(f"{name} must be positive")

        if self.reflection_examples <= 0 or self.reflection_examples > 10:
            raise ValueError("reflection_examples must be between 1 and 10 (recommended: 2-5)")

        if self.reflection_minibatch_size <= 0:
            raise ValueError("reflection_minibatch_size must be positive")

        if hasattr(self.model, 'max_tokens') and self.model.max_tokens <= 0:
            raise ValueError("model.max_tokens must be a positive integer")

        # Hybrid mode is only meaningful with LLEGO enabled.
        if self.enable_gepa_reflection_with_llego and not self.use_llego_operators:
            raise ValueError("enable_gepa_reflection_with_llego requires use_llego_operators=True")

        if self.num_gepa_reflection_candidates <= 0 or self.num_gepa_reflection_candidates > 5:
            raise ValueError("num_gepa_reflection_candidates must be between 1 and 5 (recommended: 3 for balanced exploration)")

        valid_log_levels = ["DEBUG", "INFO", "WARNING", "ERROR"]
        if self.log_level.upper() not in valid_log_levels:
            raise ValueError(f"log_level must be one of {valid_log_levels}, got: {self.log_level}")

    def validate_api_connectivity(self) -> Dict[str, bool]:
        """Best-effort check that both model configs carry enough info to connect."""
        results = {}
        for label, cfg in (("model", self.model), ("reflection_model", self.reflection_model)):
            try:
                # A real connectivity probe is not implemented yet; we only
                # verify the three mandatory fields are populated.
                results[label] = bool(cfg.api_key and cfg.provider and cfg.model_name)
            except Exception:
                results[label] = False
        return results

    def get_estimated_cost(self) -> Dict[str, Any]:
        """Summarize the budget factors that drive cost (pricing not yet wired in)."""
        return {
            "max_calls": self.max_metric_calls,
            "estimated_cost_range": "To be calculated based on provider pricing",
            "cost_factors": {
                "model_calls": self.max_metric_calls,
                "reflection_calls": self.max_iterations,
                "batch_size": self.batch_size
            }
        }

    @classmethod
    def create_example_config(cls, provider: str = "openai") -> str:
        """Return ready-to-paste example configuration code for the given provider."""
        examples = {
            "openai": '''
# Example OpenAI Configuration
config = OptimizationConfig(
    model="openai/gpt-4-turbo",  # or ModelConfig(...)
    reflection_model="openai/gpt-4-turbo",
    max_iterations=50,  # Your choice based on budget
    max_metric_calls=300,  # Your choice based on budget
    batch_size=8,  # Your choice based on memory
    early_stopping=True,
    learning_rate=0.01
)
''',
            "anthropic": '''
# Example Anthropic Configuration
config = OptimizationConfig(
    model=ModelConfig(
        provider="anthropic",
        model_name="claude-3-opus-20240229",
        api_key="your-anthropic-key",
        temperature=0.7
    ),
    reflection_model="anthropic/claude-3-sonnet-20240229",
    max_iterations=30,
    max_metric_calls=200,
    batch_size=4
)
''',
            "mixed": '''
# Example Mixed Providers Configuration
config = OptimizationConfig(
    model="openai/gpt-4-turbo",  # Main model
    reflection_model="anthropic/claude-3-opus",  # Reflection model
    max_iterations=25,
    max_metric_calls=250,
    batch_size=6,
    max_cost_usd=100.0,  # Budget limit
    timeout_seconds=3600  # 1 hour limit
)
'''
        }

        return examples.get(provider, examples["openai"])
@dataclass
class DatasetItem:
    """Single item in a dataset."""

    # Identifier (random UUID unless supplied)
    item_id: str = field(default_factory=lambda: str(uuid.uuid4()))

    # Core payload
    input_data: Any = ""
    expected_output: Optional[str] = None
    image_base64: Optional[str] = None

    # Free-form metadata and labels
    metadata: Dict[str, Any] = field(default_factory=dict)
    tags: List[str] = field(default_factory=list)

    # Paths of any files backing this item
    file_paths: List[str] = field(default_factory=list)

    # Quality bookkeeping
    quality_score: float = 1.0
    is_validated: bool = False
    validation_notes: List[str] = field(default_factory=list)

    def __post_init__(self):
        """Reject out-of-range quality scores."""
        if self.quality_score < 0 or self.quality_score > 1:
            raise ValueError("quality_score must be between 0 and 1")

    def add_tag(self, tag: str):
        """Attach a tag to this item, ignoring duplicates."""
        if tag not in self.tags:
            self.tags.append(tag)

    def mark_validated(self, notes: Optional[List[str]] = None):
        """Flag this item as validated, optionally recording reviewer notes."""
        self.is_validated = True
        if notes:
            self.validation_notes.extend(notes)
@dataclass
class ProcessedDataset:
    """Dataset after processing for GEPA optimization."""

    # Identity
    dataset_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    name: str = "Untitled Dataset"

    # Items and their train/val partitions
    items: List[DatasetItem] = field(default_factory=list)
    train_split: List[DatasetItem] = field(default_factory=list)
    val_split: List[DatasetItem] = field(default_factory=list)

    # Provenance and processing info
    source_info: Dict[str, Any] = field(default_factory=dict)
    processing_stats: Dict[str, Any] = field(default_factory=dict)

    # Derived quality metrics (computed in __post_init__)
    total_items: int = 0
    validated_items: int = 0
    avg_quality_score: float = 0.0

    def __post_init__(self):
        """Derive counts and the mean quality score from the item list."""
        self.total_items = len(self.items)

        if self.items:
            self.validated_items = sum(1 for entry in self.items if entry.is_validated)
            self.avg_quality_score = sum(entry.quality_score for entry in self.items) / len(self.items)

    def get_stats(self) -> Dict[str, Any]:
        """Summarize size, validation coverage, split sizes, and label availability."""
        coverage = self.validated_items / self.total_items if self.total_items > 0 else 0
        return {
            'total_items': self.total_items,
            'validated_items': self.validated_items,
            'validation_rate': coverage,
            'avg_quality_score': self.avg_quality_score,
            'train_size': len(self.train_split),
            'val_size': len(self.val_split),
            'has_expected_outputs': sum(1 for entry in self.items if entry.expected_output),
        }
@dataclass
class OptimizationResult:
    """Complete optimization result with all metadata."""

    # Session identity
    session_id: str = field(default_factory=lambda: str(uuid.uuid4()))

    # Prompts before/after optimization
    original_prompt: str = ""
    optimized_prompt: str = ""

    # Metrics
    improvement_data: Dict[str, Any] = field(default_factory=dict)
    baseline_metrics: Dict[str, float] = field(default_factory=dict)
    final_metrics: Dict[str, float] = field(default_factory=dict)

    # Process metadata
    optimization_time: float = 0.0
    dataset_size: int = 0
    total_iterations: int = 0

    # Lifecycle: pending -> running -> completed | failed
    status: str = "pending"
    error_message: Optional[str] = None

    # Timestamps
    created_at: datetime = field(default_factory=datetime.now)
    completed_at: Optional[datetime] = None

    # Reflection trace
    reflection_history: List[Dict[str, Any]] = field(default_factory=list)

    # Resource accounting
    estimated_cost: Optional[float] = None
    api_calls_made: int = 0

    def mark_completed(self):
        """Record successful completion and stamp the finish time."""
        self.status = "completed"
        self.completed_at = datetime.now()

    def mark_failed(self, error: str):
        """Record failure with its message and stamp the finish time."""
        self.status = "failed"
        self.error_message = error
        self.completed_at = datetime.now()
+ status: str = "pending", + error_message: Optional[str] = None, + detailed_result: Optional[OptimizationResult] = None, + session_id: Optional[str] = None): + """ + Initialize OptimizedResult with individual parameters + + Args: + original_prompt: Original seed prompt + optimized_prompt: Optimized prompt + improvement_data: Performance improvement data + optimization_time: Time taken for optimization + dataset_size: Size of dataset used + total_iterations: Number of optimization iterations + status: Optimization status + error_message: Error message if failed + detailed_result: Optional detailed OptimizationResult + session_id: Optional session ID + """ + if improvement_data is None: + improvement_data = {} + + # Create internal OptimizationResult + self._result = OptimizationResult( + session_id=session_id or str(uuid.uuid4()), + original_prompt=original_prompt, + optimized_prompt=optimized_prompt, + improvement_data=improvement_data, + optimization_time=optimization_time, + dataset_size=dataset_size, + total_iterations=total_iterations, + status=status, + error_message=error_message + ) + + # If detailed_result is provided, use it instead + if detailed_result is not None: + self._result = detailed_result + + @property + def prompt(self) -> str: + """The optimized prompt ready for production use""" + return self._result.optimized_prompt + + @property + def original_prompt(self) -> str: + """The original seed prompt for reference""" + return self._result.original_prompt + + @property + def session_id(self) -> str: + """Unique session identifier""" + return self._result.session_id + + @property + def improvement_data(self) -> Dict[str, Any]: + """Performance improvement data""" + return self._result.improvement_data + + @property + def status(self) -> str: + """Optimization status""" + return self._result.status + + @property + def error_message(self) -> Optional[str]: + """Error message if optimization failed""" + return self._result.error_message + + @property + 
def is_successful(self) -> bool: + """Whether optimization completed successfully""" + return ( + self._result.status == "completed" and + self._result.error_message is None + ) + + @property + def optimization_time(self) -> float: + """Time taken for optimization in seconds""" + return self._result.optimization_time + + @property + def dataset_size(self) -> int: + """Size of dataset used for optimization""" + return self._result.dataset_size + + @property + def total_iterations(self) -> int: + """Total optimization iterations performed""" + return self._result.total_iterations + + @property + def estimated_cost(self) -> Optional[float]: + """Estimated cost in USD""" + return self._result.estimated_cost + + def get_improvement_summary(self) -> Dict[str, Any]: + """Get summary of improvements made""" + summary = { + 'has_improvement': bool(self._result.improvement_data), + 'optimization_time': self.optimization_time, + 'iterations': self.total_iterations, + 'dataset_size': self.dataset_size + } + + # Add improvement percentage if available + if 'improvement_percent' in self._result.improvement_data: + summary['improvement_percent'] = self._result.improvement_data['improvement_percent'] + + return summary + + def get_reflection_summary(self) -> Dict[str, Any]: + """Get summary of reflection process""" + if not self._result.reflection_history: + return {'total_reflections': 0} + + return { + 'total_reflections': len(self._result.reflection_history), + 'reflection_points': [ + r.get('summary', 'No summary') + for r in self._result.reflection_history[:3] # First 3 + ] + } + + def get_detailed_result(self) -> OptimizationResult: + """Get the full detailed result for advanced users""" + return self._result + + def __str__(self) -> str: + """String representation""" + status_emoji = "โœ…" if self.is_successful else "โŒ" if self.status == "failed" else "โณ" + return f"OptimizedResult({status_emoji} {self.status}, time={self.optimization_time:.2f}s)" + + def __repr__(self) 
"""
LLEGO genetic operators for GEPA.

Public operator API:
- FitnessGuidedCrossover: combines high-performing prompts
- DiversityGuidedMutation: explores diverse variations
- LLEGOIntegrationLayer: manages the genetic algorithm workflow

Based on: Decision Tree Induction Through LLMs via Semantically-Aware Evolution (ICLR 2025)
"""

# Abstract interfaces (SOLID: Interface Segregation)
from .base_operator import (
    BaseGeneticOperator,
    BaseCrossoverOperator,
    BaseMutationOperator,
)

# Core data structures shared by every operator
from .models import (
    PromptCandidate,
    PromptMetadata,
)

# Concrete operators (SOLID: Single Responsibility)
from .crossover import FitnessGuidedCrossover
from .mutation import DiversityGuidedMutation

# Workflow / integration layer
from .llego_operators import LLEGOIntegrationLayer

__all__ = [
    # Base interfaces
    'BaseGeneticOperator',
    'BaseCrossoverOperator',
    'BaseMutationOperator',
    # Data models
    'PromptCandidate',
    'PromptMetadata',
    # Operators
    'FitnessGuidedCrossover',
    'DiversityGuidedMutation',
    # Integration
    'LLEGOIntegrationLayer',
]
+""" + +from abc import ABC, abstractmethod +from typing import List, Callable +import logging + +logger = logging.getLogger(__name__) + + +class BaseGeneticOperator(ABC): + """ + Abstract base class for genetic operators. + + All genetic operators (crossover, mutation, etc.) should inherit from this + class and implement the __call__ method. + + Design Principles: + - Single Responsibility: Each operator does one thing + - Open/Closed: Extend via inheritance, don't modify + - Liskov Substitution: Any operator works where base is expected + - Interface Segregation: Minimal required interface + - Dependency Inversion: Depend on abstractions (LLM callable) + """ + + @abstractmethod + def __call__(self, *args, **kwargs) -> str: + """ + Execute the genetic operation. + + Returns: + str: New prompt generated by the operation + """ + pass + + @abstractmethod + def _build_prompt(self, *args, **kwargs) -> str: + """ + Build the LLM prompt for this operation. + + Returns: + str: Prompt to send to the LLM + """ + pass + + +class BaseCrossoverOperator(BaseGeneticOperator): + """ + Abstract base class for crossover operators. + + Crossover combines multiple parent prompts to create offspring + that inherit good traits from both parents. + """ + + @abstractmethod + def __call__( + self, + parents: List, # List[PromptCandidate] + target_fitness: float, + llm: Callable[[str], str] + ) -> str: + """ + Combine parent prompts to create offspring. + + Args: + parents: List of parent PromptCandidate objects + target_fitness: Desired fitness for offspring + llm: Language model callable + + Returns: + str: Offspring prompt + """ + pass + + +class BaseMutationOperator(BaseGeneticOperator): + """ + Abstract base class for mutation operators. + + Mutation creates variations of a parent prompt to explore + new regions of the search space. 
+ """ + + @abstractmethod + def __call__( + self, + parent, # PromptCandidate + population: List, # List[PromptCandidate] + llm: Callable[[str], str] + ) -> str: + """ + Mutate a parent prompt to create a variation. + + Args: + parent: Parent PromptCandidate to mutate + population: Current population for diversity guidance + llm: Language model callable + + Returns: + str: Mutated prompt + """ + pass + diff --git a/src/gepa_optimizer/operators/crossover.py b/src/gepa_optimizer/operators/crossover.py new file mode 100644 index 0000000000000000000000000000000000000000..fff82d7d619550b72e33299a8f78c2ccd28b7e48 --- /dev/null +++ b/src/gepa_optimizer/operators/crossover.py @@ -0,0 +1,120 @@ +""" +Fitness-Guided Crossover Operator. + +Adapts LLEGO's fitness-guided crossover for text prompts. +Based on: Decision Tree Induction Through LLMs via Semantically-Aware Evolution (ICLR 2025) +""" + +from typing import List, Callable, TYPE_CHECKING +import logging + +from .base_operator import BaseCrossoverOperator + +if TYPE_CHECKING: + from .models import PromptCandidate + +logger = logging.getLogger(__name__) + + +class FitnessGuidedCrossover(BaseCrossoverOperator): + """ + Fitness-guided crossover for text prompts. + + Combines high-performing parent prompts to generate offspring + that target specific fitness levels using LLM semantic understanding. + + From LLEGO paper: + "Fitness-guided crossover exploits high-performing regions of the search space + by combining parent trees targeting a desired fitness level f* = f_max + ฮฑ(f_max - f_min)" + + Reference: https://github.com/nicolashuynh/LLEGO + """ + + def __init__(self, alpha: float = 0.1): + """ + Initialize crossover operator. + + Args: + alpha: Fitness extrapolation parameter. + Higher ฮฑ = target higher fitness than parents. + Default 0.1 from LLEGO paper (target 10% above best parent). 
+ """ + self.alpha = alpha + logger.debug(f"FitnessGuidedCrossover initialized with ฮฑ={alpha}") + + def __call__( + self, + parents: List["PromptCandidate"], + target_fitness: float, + llm: Callable[[str], str] + ) -> str: + """ + Combine parent prompts targeting specific fitness. + + Args: + parents: List of PromptCandidate objects (2+ parents) + target_fitness: Desired fitness for offspring + llm: Language model callable + + Returns: + str: Offspring prompt + + Raises: + ValueError: If fewer than 2 parents provided + """ + if len(parents) < 2: + raise ValueError("Crossover requires at least 2 parents") + + # Sort parents by fitness (best first) + sorted_parents = sorted(parents, key=lambda p: p.fitness, reverse=True) + + logger.debug(f"Crossover: {len(parents)} parents, target fitness={target_fitness:.3f}") + + # Build crossover prompt and call LLM + crossover_prompt = self._build_prompt(sorted_parents, target_fitness) + new_prompt = llm(crossover_prompt) + + return new_prompt + + def _build_prompt( + self, + parents: List["PromptCandidate"], + target_fitness: float + ) -> str: + """ + Build LLM prompt for crossover operation. + + Args: + parents: Sorted list of parent candidates (best first) + target_fitness: Target fitness for offspring + + Returns: + str: Prompt for LLM + """ + # Truncate parents to prevent safety filter issues + MAX_PARENT_LENGTH = 350 + + # Build parent descriptions (limit to top 2) + parent_descriptions = [] + for i, parent in enumerate(parents[:2]): + truncated = parent.prompt[:MAX_PARENT_LENGTH] + if len(parent.prompt) > MAX_PARENT_LENGTH: + truncated += "..." + parent_descriptions.append( + f"P{i+1} (f={parent.fitness:.2f}): {truncated}\n" + ) + + prompt = f"""Combine these prompts into ONE improved version (target fitness: {target_fitness:.2f}). + +{' '.join(parent_descriptions)} +Instructions: +1. Merge the best rules/principles from both parents +2. Organize logic clearly (e.g., "For X tasks: do Y", "If Z: then A") +3. 
Add structure to handle different cases systematically +4. Keep output format (Element: X, Description:, Reason:) +5. Max 600 chars + +Output ONLY the combined prompt:""" + + return prompt + diff --git a/src/gepa_optimizer/operators/llego_operators.py b/src/gepa_optimizer/operators/llego_operators.py new file mode 100644 index 0000000000000000000000000000000000000000..6be082eea011484ab510e6e63789db43a0c06ff6 --- /dev/null +++ b/src/gepa_optimizer/operators/llego_operators.py @@ -0,0 +1,364 @@ +""" +LLEGO Integration Layer for GEPA. + +This module provides the integration layer that wraps LLEGO genetic operators +for use with the GEPA optimization framework. + +Based on: Decision Tree Induction Through LLMs via Semantically-Aware Evolution (ICLR 2025) +GitHub: https://github.com/nicolashuynh/LLEGO +""" + +from typing import List, Callable, Dict, Any, Optional, Literal +import numpy as np +import logging + +# Import from modular files (SOLID: Single Responsibility) +from .models import PromptCandidate, PromptMetadata +from .crossover import FitnessGuidedCrossover +from .mutation import DiversityGuidedMutation + +logger = logging.getLogger(__name__) + + +class LLEGOIntegrationLayer: + """ + Integration layer that wraps LLEGO operators for GEPA. + + This class manages the genetic algorithm workflow: + - Population initialization + - Parent selection (fitness-based) + - Crossover and mutation operations + - Population management + + Design Principles: + - Composition over inheritance (uses crossover_op, mutation_op) + - Single Responsibility: Only manages GA workflow + - Open/Closed: New operators can be added without modifying this class + """ + + def __init__( + self, + alpha: float = 0.05, + tau: float = 10.0, + nu: int = 4, + population_size: int = 10, + n_crossover: int = 2, + n_mutation: int = 3 + ): + """ + Initialize LLEGO integration layer. 
"""
LLEGO Integration Layer for GEPA.

This module provides the integration layer that wraps LLEGO genetic operators
for use with the GEPA optimization framework.

Based on: Decision Tree Induction Through LLMs via Semantically-Aware Evolution (ICLR 2025)
GitHub: https://github.com/nicolashuynh/LLEGO
"""

from typing import List, Callable, Dict, Any, Optional, Literal
import numpy as np
import logging

# Import from modular files (SOLID: Single Responsibility)
from .models import PromptCandidate, PromptMetadata
from .crossover import FitnessGuidedCrossover
from .mutation import DiversityGuidedMutation

logger = logging.getLogger(__name__)


class LLEGOIntegrationLayer:
    """
    Integration layer that wraps LLEGO operators for GEPA.

    Manages the genetic algorithm workflow:
    - Population initialization
    - Parent selection (fitness-based)
    - Crossover and mutation operations
    - Population management

    Design Principles:
    - Composition over inheritance (uses crossover_op, mutation_op)
    - Single Responsibility: only manages the GA workflow
    - Open/Closed: new operators can be added without modifying this class
    """

    def __init__(
        self,
        alpha: float = 0.05,
        tau: float = 10.0,
        nu: int = 4,
        population_size: int = 10,
        n_crossover: int = 2,
        n_mutation: int = 3
    ):
        """
        Initialize LLEGO integration layer.

        Args:
            alpha: Fitness extrapolation for crossover (default 0.05)
            tau: Diversity temperature for mutation
            nu: Parent arity for diversity sampling
            population_size: Maximum population size
            n_crossover: Number of crossover offspring per generation
            n_mutation: Number of mutation offspring per generation
        """
        self.crossover_op = FitnessGuidedCrossover(alpha=alpha)
        self.mutation_op = DiversityGuidedMutation(tau=tau, nu=nu)
        self.population_size = population_size
        self.n_crossover = n_crossover
        self.n_mutation = n_mutation
        self.population: List[PromptCandidate] = []
        self.current_generation = 0

        # Track metadata for prompts generated in the current generation
        self._generation_metadata: Dict[str, PromptMetadata] = {}

        # BUG FIX: these flags were previously created only inside
        # evolve_generation(), so a caller inspecting crossover status before
        # the first generation hit AttributeError. Initialize them up front.
        self._crossover_skipped = False
        self._crossover_deficit = 0
        self._actual_crossover_count = 0

        logger.debug(f"LLEGO initialized: pop_size={population_size}, crossover={n_crossover}, mutation={n_mutation}")

    def initialize_population(self, seed_prompt: str, initial_fitness: float = 0.5):
        """Initialize population with a single seed prompt candidate."""
        seed_candidate = PromptCandidate(
            prompt=seed_prompt,
            fitness=initial_fitness,
            metadata={
                'generation': 0,
                'operator': 'seed',
                'parent_indices': None,
                'parent_prompts': None,
                'target_fitness': None,
                'diversity_score': None,
                'sample_scores': None,
                'num_diverse_parents': None
            }
        )
        self.population = [seed_candidate]
        logger.debug(f"Population initialized with seed prompt ({len(seed_prompt)} chars)")

    def create_candidate_with_metadata(
        self,
        prompt: str,
        fitness: float,
        generation: int,
        operator: Literal['crossover', 'mutation'],
        parent_indices: Optional[List[int]] = None,
        parent_prompts: Optional[List[str]] = None,
        target_fitness: Optional[float] = None,
        diversity_score: Optional[float] = None,
        sample_scores: Optional[List[float]] = None,
        num_diverse_parents: Optional[int] = None
    ) -> PromptCandidate:
        """Create a PromptCandidate with properly populated metadata."""
        return PromptCandidate(
            prompt=prompt,
            fitness=fitness,
            metadata={
                'generation': generation,
                'operator': operator,
                'parent_indices': parent_indices,
                'parent_prompts': parent_prompts,
                'target_fitness': target_fitness,
                'diversity_score': diversity_score,
                'sample_scores': sample_scores,
                'num_diverse_parents': num_diverse_parents
            }
        )

    def evolve_generation(
        self,
        llm: Callable[[str], str],
        pareto_front: List[PromptCandidate]
    ) -> List[str]:
        """
        Evolve one generation using LLEGO operators.

        When crossover cannot run (< 2 parents with scores), it is skipped.
        The caller should compensate by generating extra GEPA reflection candidates.

        Args:
            llm: Language model callable
            pareto_front: Current Pareto front (non-dominated prompts with scores)

        Returns:
            List of new prompt candidates to evaluate
        """
        new_prompts = []
        self.current_generation += 1
        self._generation_metadata = {}

        # Reset crossover status for this generation so the caller can
        # decide whether to compensate with extra GEPA candidates.
        self._crossover_skipped = False
        self._crossover_deficit = 0
        self._actual_crossover_count = 0

        logger.info(f"๐Ÿงฌ LLEGO Generation {self.current_generation}: pareto_front={len(pareto_front)}, population={len(self.population)}")

        # Crossover: Combine BEST parents (requires >= 2 parents WITH SCORES)
        if len(pareto_front) >= 2:
            # Sort by fitness - always use TOP scored parents for crossover
            sorted_front = sorted(pareto_front, key=lambda p: p.fitness, reverse=True)

            for i in range(self.n_crossover):
                # Always use top 2 highest-scored parents
                parents = sorted_front[:2]
                target_fitness = self._calculate_target_fitness(parents)

                offspring = self.crossover_op(parents, target_fitness, llm)
                new_prompts.append(offspring)
                self._actual_crossover_count += 1

                # Store metadata with parent fitness info.
                # NOTE: keyed by prompt text, so identical LLM outputs overwrite.
                self._generation_metadata[offspring] = {
                    'generation': self.current_generation,
                    'operator': 'crossover',
                    'parent_indices': [self.population.index(p) for p in parents if p in self.population],
                    'parent_prompts': [p.prompt for p in parents],
                    'parent_fitnesses': [p.fitness for p in parents],
                    'target_fitness': target_fitness,
                    'diversity_score': None,
                    'sample_scores': None,
                    'num_diverse_parents': len(parents)
                }

                logger.info(f" Oโ‚“โ‚’{i+1}: Crossed top parents (f={parents[0].fitness:.3f} ร— f={parents[1].fitness:.3f}) โ†’ target f*={target_fitness:.3f}")
        else:
            # Signal that crossover was skipped - caller should compensate with GEPA
            self._crossover_skipped = True
            self._crossover_deficit = self.n_crossover
            logger.info(f"โš ๏ธ Crossover SKIPPED: need 2+ scored parents, have {len(pareto_front)}")
            logger.info(f" โ†’ Caller should compensate with {self._crossover_deficit} extra GEPA reflection candidates")

        # Mutation: Explore diverse variations (requires >= 1 parent)
        # Use pareto_front if available, otherwise fall back to population
        mutation_source = pareto_front if pareto_front else self.population

        if len(mutation_source) >= 1:
            for i in range(self.n_mutation):
                parent = self._select_parent_for_mutation(mutation_source)

                offspring = self.mutation_op(parent, self.population, llm)
                new_prompts.append(offspring)

                parent_idx = self.population.index(parent) if parent in self.population else -1
                self._generation_metadata[offspring] = {
                    'generation': self.current_generation,
                    'operator': 'mutation',
                    'parent_indices': [parent_idx] if parent_idx >= 0 else None,
                    'parent_prompts': [parent.prompt],
                    'parent_fitness': parent.fitness,
                    'target_fitness': None,
                    'diversity_score': None,
                    'sample_scores': None,
                    'num_diverse_parents': min(self.mutation_op.nu, len(self.population))
                }

        crossover_count = len([p for p in new_prompts if self._generation_metadata.get(p, {}).get('operator') == 'crossover'])
        mutation_count = len([p for p in new_prompts if self._generation_metadata.get(p, {}).get('operator') == 'mutation'])

        logger.info(f"๐Ÿงฌ LLEGO Generated {len(new_prompts)} candidates: {crossover_count} crossover, {mutation_count} mutation")

        return new_prompts

    def get_prompt_metadata(self, prompt: str) -> Optional[PromptMetadata]:
        """Retrieve metadata for a prompt generated in the current generation."""
        return self._generation_metadata.get(prompt)

    def _convert_gepa_pareto_to_candidates(
        self,
        gepa_pareto_front: List[Dict[str, Any]]
    ) -> List[PromptCandidate]:
        """
        Convert GEPA Pareto front entries to PromptCandidate format.

        Args:
            gepa_pareto_front: List of dicts with 'prompt', 'score', 'type', 'notation'

        Returns:
            List of PromptCandidate objects (malformed entries are skipped)
        """
        if not gepa_pareto_front:
            return []

        # De-duplicate Pareto front (first occurrence of each prompt wins)
        seen_prompts = set()
        deduplicated_front = []

        for entry in gepa_pareto_front:
            if isinstance(entry, dict) and 'prompt' in entry:
                prompt_text = entry['prompt']
                if prompt_text not in seen_prompts:
                    seen_prompts.add(prompt_text)
                    deduplicated_front.append(entry)

        candidates = []

        for idx, entry in enumerate(deduplicated_front):
            try:
                if not isinstance(entry, dict):
                    continue

                prompt = entry.get('prompt')
                if not prompt or not isinstance(prompt, str):
                    continue

                score = entry.get('score')
                if score is None:
                    continue

                try:
                    fitness = float(score)
                except (ValueError, TypeError):
                    continue

                candidate_type = entry.get('type', 'unknown')
                notation = entry.get('notation', 'S')

                metadata: PromptMetadata = {
                    'generation': self.current_generation,
                    'operator': 'gepa_pareto_front',
                    'parent_indices': None,
                    'parent_prompts': None,
                    'target_fitness': None,
                    'diversity_score': None,
                    'sample_scores': None,
                    'num_diverse_parents': None,
                    'candidate_type': candidate_type,
                    'notation': notation,
                    'prompt_length': len(prompt),
                    'word_count': len(prompt.split()),
                    'from_gepa_pareto': True
                }

                candidate = PromptCandidate(
                    prompt=prompt,
                    fitness=fitness,
                    metadata=metadata
                )

                candidates.append(candidate)

            except Exception as e:
                logger.error(f"Error converting Pareto entry #{idx+1}: {e}")
                continue

        return candidates

    def update_population(self, new_candidates: List[PromptCandidate]):
        """Update population with new evaluated candidates, dedupe, and truncate."""
        self.population.extend(new_candidates)

        # Remove duplicates (normalized on surrounding whitespace/quotes)
        seen_prompts = set()
        unique_population = []
        for p in self.population:
            normalized = p.prompt.strip().strip('"\'')
            if normalized not in seen_prompts:
                seen_prompts.add(normalized)
                unique_population.append(p)
        self.population = unique_population

        # Keep top population_size by fitness
        self.population.sort(key=lambda p: p.fitness, reverse=True)
        self.population = self.population[:self.population_size]

        if self.population:
            logger.debug(f"Population updated: {len(self.population)} candidates, best={self.population[0].fitness:.3f}")

    def _select_parents_for_crossover(self, pareto_front: List[PromptCandidate], k: int = 2) -> List[PromptCandidate]:
        """Select top-k parents (by fitness) for crossover."""
        sorted_front = sorted(pareto_front, key=lambda p: p.fitness, reverse=True)
        return sorted_front[:k]

    def _select_parent_for_mutation(self, pareto_front: List[PromptCandidate]) -> PromptCandidate:
        """Select a parent for mutation (fitness-proportionate sampling)."""
        if len(pareto_front) == 1:
            return pareto_front[0]

        fitnesses = np.array([p.fitness for p in pareto_front])
        # Floor at 0.01 so zero/negative-fitness candidates keep a nonzero
        # probability and the normalization never divides by zero.
        fitnesses = np.maximum(fitnesses, 0.01)
        probs = fitnesses / fitnesses.sum()

        idx = np.random.choice(len(pareto_front), p=probs)
        return pareto_front[idx]

    def _calculate_target_fitness(self, parents: List[PromptCandidate]) -> float:
        """Calculate target fitness for crossover using LLEGO formula: f* = f_max + ฮฑ(f_max - f_min)"""
        fitnesses = [p.fitness for p in parents]
        f_max = max(fitnesses)
        f_min = min(fitnesses)

        target_fitness = f_max + self.crossover_op.alpha * (f_max - f_min)
        # Fitness is a [0, 1] score; clamp the extrapolated target.
        return min(target_fitness, 1.0)

    def get_best_candidate(self) -> Optional[PromptCandidate]:
        """Get the current best prompt candidate, or None if the population is empty."""
        if not self.population:
            return None
        return max(self.population, key=lambda p: p.fitness)

    def get_stats(self) -> Dict[str, Any]:
        """Get population statistics as plain Python numbers."""
        if not self.population:
            return {"population_size": 0, "best_fitness": 0.0, "avg_fitness": 0.0}

        fitnesses = [p.fitness for p in self.population]
        return {
            "population_size": len(self.population),
            "best_fitness": max(fitnesses),
            # BUG FIX: np.mean/np.std return numpy scalars, which break
            # JSON serialization of these stats; cast to builtin float.
            "avg_fitness": float(np.mean(fitnesses)),
            "min_fitness": min(fitnesses),
            "fitness_std": float(np.std(fitnesses))
        }
+ """ + generation: int # Which iteration created this prompt + operator: Literal['seed', 'crossover', 'mutation'] # How the prompt was created + parent_indices: Optional[List[int]] # Indices of parent prompts + parent_prompts: Optional[List[str]] # Actual parent prompt texts + target_fitness: Optional[float] # Target fitness for crossover + diversity_score: Optional[float] # Diversity from population + sample_scores: Optional[List[float]] # Performance per sample + num_diverse_parents: Optional[int] # Diverse parents count (mutation) + created_at: str # Creation timestamp + prompt_length: int # Character count + word_count: int # Word count + candidate_type: Optional[str] # Type for GEPA notation + + +@dataclass +class PromptCandidate: + """ + Represents a prompt candidate with fitness score and evolution metadata. + + Attributes: + prompt: The actual prompt text + fitness: Fitness score (0-1) from evaluation + metadata: Tracking information about prompt creation and performance + """ + prompt: str + fitness: float + metadata: Optional[PromptMetadata] = field(default_factory=dict) + + def __post_init__(self): + """Initialize metadata if not provided.""" + if self.metadata is None: + self.metadata = {} + + # Auto-populate prompt statistics + if 'prompt_length' not in self.metadata: + self.metadata['prompt_length'] = len(self.prompt) + if 'word_count' not in self.metadata: + self.metadata['word_count'] = len(self.prompt.split()) + if 'created_at' not in self.metadata: + self.metadata['created_at'] = datetime.now().isoformat() + diff --git a/src/gepa_optimizer/operators/mutation.py b/src/gepa_optimizer/operators/mutation.py new file mode 100644 index 0000000000000000000000000000000000000000..dc850d93bb20c0e770f48f857e932bf01d6d7394 --- /dev/null +++ b/src/gepa_optimizer/operators/mutation.py @@ -0,0 +1,185 @@ +""" +Diversity-Guided Mutation Operator. + +Adapts LLEGO's diversity-guided mutation for text prompts. 
"""
Diversity-Guided Mutation Operator.

Adapts LLEGO's diversity-guided mutation for text prompts.
Based on: Decision Tree Induction Through LLMs via Semantically-Aware Evolution (ICLR 2025)
"""

from typing import List, Callable, TYPE_CHECKING
import numpy as np
import logging

from .base_operator import BaseMutationOperator

if TYPE_CHECKING:
    from .models import PromptCandidate

logger = logging.getLogger(__name__)


class DiversityGuidedMutation(BaseMutationOperator):
    """
    Diversity-guided mutation for text prompts.

    Generates variations of a parent prompt, supplying the LLM with
    temperature-sampled diverse population members as context.

    From LLEGO paper:
    "Diversity-guided mutation enables efficient global exploration by sampling
    diverse parents with temperature parameter ฯ„"

    Reference: https://github.com/nicolashuynh/LLEGO
    """

    def __init__(self, tau: float = 10.0, nu: int = 4):
        """
        Initialize the mutation operator.

        Args:
            tau: Diversity temperature; larger values flatten the sampling
                 distribution (more exploration). LLEGO paper default: 10.0.
            nu: Parent arity - diverse parents to sample. LLEGO default: 4.
        """
        self.tau = tau
        self.nu = nu
        logger.debug(f"DiversityGuidedMutation initialized with ฯ„={tau}, ฮฝ={nu}")

    def __call__(
        self,
        parent: "PromptCandidate",
        population: List["PromptCandidate"],
        llm: Callable[[str], str]
    ) -> str:
        """
        Mutate a parent prompt to explore new regions.

        Args:
            parent: Parent PromptCandidate to mutate.
            population: Current population used for diversity guidance.
            llm: Language model callable.

        Returns:
            str: Mutated prompt.
        """
        logger.debug(f"Mutation: parent fitness={parent.fitness:.3f}")

        # Gather diverse population members, then delegate mutation to the LLM.
        context_parents = self._sample_diverse_parents(parent, population)
        return llm(self._build_prompt(parent, context_parents))

    def _sample_diverse_parents(
        self,
        parent: "PromptCandidate",
        population: List["PromptCandidate"]
    ) -> List["PromptCandidate"]:
        """
        Sample diverse parents using temperature-based selection.

        Args:
            parent: Current parent.
            population: Population to sample from.

        Returns:
            List of diverse parent candidates (falls back to [parent] when
            no other candidate exists).
        """
        # Score every other candidate by its diversity from the parent.
        scored = [
            (candidate, self._calculate_diversity(parent.prompt, candidate.prompt))
            for candidate in population
            if candidate.prompt != parent.prompt
        ]

        if not scored:
            return [parent]

        # Softmax-style, temperature-controlled sampling weights.
        raw = np.array([diversity for _, diversity in scored])
        weights = np.exp(raw / self.tau)
        weights = weights / weights.sum()

        # Draw up to nu distinct candidates.
        draw_count = min(self.nu, len(scored))
        chosen = np.random.choice(
            len(scored),
            size=draw_count,
            replace=False,
            p=weights
        )

        return [scored[i][0] for i in chosen]

    def _calculate_diversity(self, prompt1: str, prompt2: str) -> float:
        """
        Calculate semantic diversity between two prompts.

        Uses Jaccard distance on lowercase word sets as a lightweight metric.

        Args:
            prompt1: First prompt.
            prompt2: Second prompt.

        Returns:
            float: Diversity score in [0, 1]; higher means more diverse.
        """
        left = set(prompt1.lower().split())
        right = set(prompt2.lower().split())

        union = left | right
        if not union:
            # Two empty prompts: treated as maximally diverse (matches
            # the jaccard=0 convention when the union is empty).
            return 1.0

        shared = left & right
        return 1 - len(shared) / len(union)

    def _build_prompt(
        self,
        parent: "PromptCandidate",
        diverse_parents: List["PromptCandidate"]
    ) -> str:
        """
        Build the LLM prompt for the mutation operation.

        Args:
            parent: Parent candidate to mutate.
            diverse_parents: Diverse parents supplying variation context.

        Returns:
            str: Prompt for the LLM.
        """
        MAX_PARENT_LENGTH = 350
        MAX_DIVERSE_LENGTH = 200

        def clip(text: str, limit: int) -> str:
            # Ellipsis marks any truncation.
            return text if len(text) <= limit else text[:limit] + "..."

        parent_truncated = clip(parent.prompt, MAX_PARENT_LENGTH)

        # At most two diverse variants as context lines.
        diversity_context = [
            f"V{i+1}: {clip(candidate.prompt, MAX_DIVERSE_LENGTH)}"
            for i, candidate in enumerate(diverse_parents[:2])
        ]

        prompt = f"""Create a variation of this prompt with different decision logic (fitness: {parent.fitness:.2f}).

Parent: {parent_truncated}

{chr(10).join(diversity_context) if diversity_context else ""}

Instructions:
1. Explore NEW ways to categorize tasks (e.g., by element type, by action, by hierarchy)
2. Add handling for edge cases the parent might miss
3. Keep the structured, logical approach
4. Keep format (Element: X, Description:, Reason:)
5. Max 600 chars

Output ONLY the new prompt:"""

        return prompt
EvaluationSummary(TypedDict): + """Summary of evaluation results.""" + total_samples: int + correct_predictions: int + accuracy: float + average_score: float + + +# ============================================================================ +# LLM Types +# ============================================================================ + +class LLMResponse(TypedDict, total=False): + """Response from LLM generation.""" + content: str # Generated text + usage: Dict[str, int] # Token usage stats + model: str # Model used + finish_reason: str # Why generation stopped + source: str # Source (gepa_reflection, llego_crossover, etc.) + + +class LLMClientProtocol(Protocol): + """Protocol for LLM client implementations.""" + + def generate( + self, + system_prompt: str, + user_prompt: str, + image_base64: str = "", + **kwargs: Any + ) -> Union[str, Dict[str, Any]]: + """Generate a response from the LLM.""" + ... + + +class BatchLLMClientProtocol(Protocol): + """Protocol for batch LLM client implementations.""" + + def submit_batch( + self, + tasks: List[Dict[str, Any]], + **kwargs: Any + ) -> str: + """Submit a batch of tasks. Returns batch ID.""" + ... + + def get_batch_results( + self, + batch_id: str, + **kwargs: Any + ) -> List[Dict[str, Any]]: + """Get results for a submitted batch.""" + ... + + +# ============================================================================ +# Evaluator Types +# ============================================================================ + +class EvaluatorProtocol(Protocol): + """Protocol for evaluator implementations.""" + + def evaluate( + self, + predicted: str, + expected: str, + **kwargs: Any + ) -> Dict[str, float]: + """Evaluate a prediction against expected output.""" + ... + + def get_composite_score( + self, + metrics: Dict[str, float] + ) -> float: + """Calculate composite score from individual metrics.""" + ... 
# ============================================================================
# Optimization Types
# ============================================================================

class CandidateDict(TypedDict, total=False):
    """A prompt candidate tracked during optimization (all keys optional)."""
    system_prompt: str   # the prompt text
    prompt: str          # alias for system_prompt
    fitness: float       # fitness score
    score: float         # alias for fitness
    source: str          # origin (seed, gepa_reflection, llego_crossover, ...)
    type: str            # alias for source
    notation: str        # mathematical notation label for the candidate
    index: int           # candidate index


class ParetoCandidate(TypedDict):
    """A candidate stored on the Pareto front (all keys required)."""
    prompt: str
    score: float
    type: str
    notation: str


class OptimizationState(TypedDict, total=False):
    """Snapshot of the optimizer's progress."""
    iteration: int
    best_score: float
    best_prompt: str
    pareto_front: List[ParetoCandidate]
    baseline_score: Optional[float]


# ============================================================================
# Configuration Types
# ============================================================================

class DataSplitConfig(TypedDict, total=False):
    """Options controlling train/val/test dataset splitting."""
    train_ratio: float
    val_ratio: float
    test_ratio: float
    shuffle: bool
    seed: Optional[int]


class LLEGOConfig(TypedDict, total=False):
    """Options controlling the LLEGO genetic operators."""
    mode: Literal["hybrid", "llego_only", "disabled"]
    population_size: int
    num_crossover_candidates: int
    num_mutation_candidates: int
    crossover_enabled: bool
    mutation_enabled: bool


# ============================================================================
# Type Variables
# ============================================================================

T = TypeVar("T")
DatasetT = TypeVar("DatasetT", bound=Dict[str, Any])
ResultT = TypeVar("ResultT")
# ============================================================================
# Callback Types
# ============================================================================

# (predicted, expected) -> structured evaluation result
EvaluationCallback = Callable[[str, str], EvaluationResult]
# prompt text -> generated completion
GenerationCallback = Callable[[str], str]
# (current_step, total_steps, best_score) -> None; used for progress reporting
ProgressCallback = Callable[[int, int, float], None]


# ============================================================================
# Export
# ============================================================================

__all__ = [
    # Dataset
    "DatasetItem",
    "DatasetSplit",
    "DatasetList",
    "DatasetSplitTuple",
    # Evaluation
    "EvaluationResult",
    "EvaluationSummary",
    "EvaluatorProtocol",
    # LLM
    "LLMResponse",
    "LLMClientProtocol",
    "BatchLLMClientProtocol",
    # Optimization
    "CandidateDict",
    "ParetoCandidate",
    "OptimizationState",
    # Configuration
    "DataSplitConfig",
    "LLEGOConfig",
    # Callbacks
    "EvaluationCallback",
    "GenerationCallback",
    "ProgressCallback",
    # Type Variables
    "T",
    "DatasetT",
    "ResultT",
]
"""
API Key Management for GEPA Optimizer
"""

import os
from typing import Optional, Dict, List


class APIKeyManager:
    """Handles API keys securely without hardcoding.

    Keys come from environment variables (optionally loaded from a ``.env``
    file when python-dotenv is available) and can be overridden at runtime
    via :meth:`set_api_key`. The ``'google'`` and ``'gemini'`` providers are
    aliases for the same underlying credential.
    """

    def __init__(self):
        # FIX: python-dotenv is an optional convenience, not a hard
        # requirement — environment variables still work without it, so
        # don't fail at import/construction time when it isn't installed.
        try:
            from dotenv import load_dotenv
            load_dotenv()
        except ImportError:
            pass
        self._keys: Dict[str, str] = {}
        self._load_from_env()

    def _load_from_env(self):
        """Load API keys from environment variables into the internal map."""
        env_mappings = {
            'openai': 'OPENAI_API_KEY',
            'anthropic': 'ANTHROPIC_API_KEY',
            'huggingface': 'HUGGINGFACE_API_KEY',
            'cohere': 'COHERE_API_KEY',
            'ai21': 'AI21_API_KEY',
            'together': 'TOGETHER_API_KEY',
            'replicate': 'REPLICATE_API_TOKEN',
            'groq': 'GROQ_API_KEY',
            'ollama': 'OLLAMA_API_KEY',
            'google': 'GEMINI_API_KEY',
            'gemini': 'GEMINI_API_KEY'
        }

        for provider, env_var in env_mappings.items():
            key = os.getenv(env_var)
            if key:
                self._keys[provider] = key

    def get_api_key(self, provider: str) -> Optional[str]:
        """Get API key for a specific provider (case-insensitive)."""
        return self._keys.get(provider.lower())

    def set_api_key(self, provider: str, key: str):
        """Set API key for a provider at runtime."""
        provider_lower = provider.lower()
        self._keys[provider_lower] = key

        # Keep the google/gemini aliases in sync in both directions.
        if provider_lower == 'google':
            self._keys['gemini'] = key
        elif provider_lower == 'gemini':
            self._keys['google'] = key

    def has_key(self, provider: str) -> bool:
        """Check if an API key exists for provider."""
        return provider.lower() in self._keys

    def get_missing_keys(self, providers: List[str]) -> List[str]:
        """Get list of providers missing API keys."""
        return [p for p in providers if not self.has_key(p)]

    def validate_keys(self, providers: List[str]) -> Dict[str, bool]:
        """Validate API keys for multiple providers."""
        return {provider: self.has_key(provider) for provider in providers}

    # ---- Legacy methods kept for backward compatibility ----

    def set_openai_key(self, key: str):
        """Set OpenAI API key at runtime."""
        self.set_api_key('openai', key)

    def set_anthropic_key(self, key: str):
        """Set Anthropic API key at runtime."""
        self.set_api_key('anthropic', key)

    def set_google_key(self, key: str):
        """Set Google API key at runtime."""
        self.set_api_key('google', key)

    def set_gemini_key(self, key: str):
        """Set Gemini API key at runtime (alias for Google)."""
        self.set_api_key('google', key)

    def get_openai_key(self) -> str:
        """Get OpenAI key or raise RuntimeError if missing."""
        key = self.get_api_key('openai')
        if not key:
            raise RuntimeError(
                "OpenAI API key missing. Set via:\n"
                "1. Environment variable: OPENAI_API_KEY=your_key\n"
                "2. .env file: OPENAI_API_KEY=your_key\n"
                "3. Code: api_manager.set_api_key('openai', 'your_key')"
            )
        return key

    def get_anthropic_key(self) -> Optional[str]:
        """Get Anthropic key (optional)."""
        return self.get_api_key('anthropic')

    def get_google_key(self) -> Optional[str]:
        """Get Google key (optional)."""
        return self.get_api_key('google')

    def get_gemini_key(self) -> Optional[str]:
        """Get Gemini key (alias for Google)."""
        return self.get_api_key('google')

    def has_required_keys(self) -> bool:
        """Check if the required (OpenAI) key is available."""
        return bool(self.get_api_key('openai'))
"""
Candidate and Feedback Collector for Presentation

Collects every candidate generated during optimization together with its
score, feedback, and metadata so a run can be inspected or presented later.
"""

import json
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional
from dataclasses import dataclass, asdict, field


@dataclass
class CandidateInfo:
    """Information about a single candidate prompt."""
    iteration: int
    candidate_id: str
    source: str  # "GEPA_Reflection", "LLEGO_Crossover", "LLEGO_Mutation", "Seed"
    prompt: str
    score: Optional[float] = None
    feedback: Optional[str] = None
    feedback_details: Optional[Dict[str, Any]] = None
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())


@dataclass
class IterationInfo:
    """Information about a single optimization iteration."""
    iteration: int
    candidates: List[CandidateInfo] = field(default_factory=list)
    best_candidate: Optional[CandidateInfo] = None
    best_score: Optional[float] = None
    timestamp: str = field(default_factory=lambda: datetime.now().isoformat())


class CandidateCollector:
    """
    Collects all candidates and feedback during optimization for presentation.

    Note: the same ``CandidateInfo`` object is stored both in
    ``all_candidates`` and in its iteration's candidate list, so field
    updates made through either view are visible in both.
    """

    def __init__(self, output_dir: str = "presentation_data"):
        """
        Initialize the collector.

        Args:
            output_dir: Directory to save collected data (created if missing).
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(exist_ok=True)

        self.iterations: List[IterationInfo] = []
        self.current_iteration: Optional[IterationInfo] = None
        self.all_candidates: List[CandidateInfo] = []

        # Seed prompt kept for reference in reports.
        self.seed_prompt: Optional[str] = None

    def set_seed_prompt(self, seed_prompt: str):
        """Set the seed prompt for reference."""
        self.seed_prompt = seed_prompt

    def start_iteration(self, iteration: int):
        """Start tracking a new iteration."""
        self.current_iteration = IterationInfo(iteration=iteration)
        self.iterations.append(self.current_iteration)

    def add_candidate(
        self,
        iteration: int,
        candidate_id: str,
        source: str,
        prompt: str,
        score: Optional[float] = None,
        feedback: Optional[str] = None,
        feedback_details: Optional[Dict[str, Any]] = None
    ):
        """
        Add a candidate to the collection.

        Args:
            iteration: Iteration number.
            candidate_id: Unique identifier for the candidate.
            source: Source of the candidate ("GEPA_Reflection", "LLEGO_Crossover", etc.).
            prompt: The candidate prompt text.
            score: Evaluation score (if available).
            feedback: Feedback text (if available).
            feedback_details: Additional feedback details (if available).
        """
        candidate = CandidateInfo(
            iteration=iteration,
            candidate_id=candidate_id,
            source=source,
            prompt=prompt,
            score=score,
            feedback=feedback,
            feedback_details=feedback_details
        )

        # Attach to the current iteration (when it matches) and keep the
        # running best-of-iteration up to date.
        if self.current_iteration and self.current_iteration.iteration == iteration:
            self.current_iteration.candidates.append(candidate)

            if score is not None:
                if (self.current_iteration.best_score is None or
                        score > self.current_iteration.best_score):
                    self.current_iteration.best_candidate = candidate
                    self.current_iteration.best_score = score

        self.all_candidates.append(candidate)

    def add_feedback(
        self,
        candidate_id: str,
        feedback: str,
        feedback_details: Optional[Dict[str, Any]] = None
    ):
        """
        Add feedback to an existing candidate.

        Args:
            candidate_id: ID of the candidate to update.
            feedback: Feedback text.
            feedback_details: Additional feedback details.
        """
        # Objects are shared between all_candidates and the iteration lists,
        # so updating the first match here updates both views.
        for candidate in self.all_candidates:
            if candidate.candidate_id == candidate_id:
                candidate.feedback = feedback
                candidate.feedback_details = feedback_details
                break

    def add_score(self, candidate_id: str, score: float):
        """
        Add score to an existing candidate and refresh best-of-iteration.

        Args:
            candidate_id: ID of the candidate to update.
            score: Evaluation score.
        """
        for candidate in self.all_candidates:
            if candidate.candidate_id == candidate_id:
                candidate.score = score
                break

        # Refresh best-candidate bookkeeping in the iteration that holds it.
        for iteration in self.iterations:
            for candidate in iteration.candidates:
                if candidate.candidate_id == candidate_id:
                    candidate.score = score
                    if iteration.best_score is None or score > iteration.best_score:
                        iteration.best_candidate = candidate
                        iteration.best_score = score
                    break

    def save_to_json(self, filename: Optional[str] = None) -> Path:
        """
        Save collected data to a JSON file.

        Args:
            filename: Optional filename (auto-generated if not provided).

        Returns:
            Path to saved file.
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"candidates_and_feedback_{timestamp}.json"

        filepath = self.output_dir / filename

        data = {
            "seed_prompt": self.seed_prompt,
            "total_iterations": len(self.iterations),
            "total_candidates": len(self.all_candidates),
            "iterations": [asdict(iter_info) for iter_info in self.iterations],
            "all_candidates": [asdict(candidate) for candidate in self.all_candidates],
            "timestamp": datetime.now().isoformat()
        }

        with open(filepath, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

        return filepath

    def save_to_markdown(self, filename: Optional[str] = None) -> Path:
        """
        Save collected data to a Markdown file (presentation-ready format).

        Args:
            filename: Optional filename (auto-generated if not provided).

        Returns:
            Path to saved file.
        """
        if filename is None:
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"candidates_and_feedback_{timestamp}.md"

        filepath = self.output_dir / filename

        with open(filepath, 'w', encoding='utf-8') as f:
            # Header
            f.write("# Optimization Candidates and Feedback\n\n")
            f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            f.write(f"**Total Iterations:** {len(self.iterations)}\n")
            f.write(f"**Total Candidates:** {len(self.all_candidates)}\n\n")

            # Seed Prompt
            if self.seed_prompt:
                f.write("---\n\n")
                f.write("## 🌱 Seed Prompt\n\n")
                f.write("```\n")
                f.write(self.seed_prompt)
                f.write("\n```\n\n")

            # Iterations
            for iter_info in self.iterations:
                f.write("---\n\n")
                f.write(f"## 🔄 Iteration {iter_info.iteration}\n\n")

                # Best candidate for this iteration
                if iter_info.best_candidate:
                    f.write(f"### 🏆 Best Candidate (Score: {iter_info.best_score:.4f})\n\n")
                    f.write(f"**Source:** {iter_info.best_candidate.source}\n\n")
                    f.write("**Prompt:**\n```\n")
                    f.write(iter_info.best_candidate.prompt)
                    f.write("\n```\n\n")

                    if iter_info.best_candidate.feedback:
                        f.write("**Feedback:**\n\n")
                        f.write(f"{iter_info.best_candidate.feedback}\n\n")

                # All candidates in this iteration
                f.write(f"### 📝 All Candidates ({len(iter_info.candidates)})\n\n")

                for idx, candidate in enumerate(iter_info.candidates, 1):
                    f.write(f"#### Candidate {idx}: {candidate.source}\n\n")
                    f.write(f"**ID:** `{candidate.candidate_id}`\n\n")

                    if candidate.score is not None:
                        f.write(f"**Score:** `{candidate.score:.4f}`\n\n")

                    f.write("**Prompt:**\n```\n")
                    f.write(candidate.prompt)
                    f.write("\n```\n\n")

                    if candidate.feedback:
                        f.write("**Feedback:**\n\n")
                        f.write(f"{candidate.feedback}\n\n")

                    if candidate.feedback_details:
                        f.write("**Feedback Details:**\n\n")
                        f.write("```json\n")
                        f.write(json.dumps(candidate.feedback_details, indent=2))
                        f.write("\n```\n\n")

                    f.write("---\n\n")

            # Summary by source
            f.write("---\n\n")
            f.write("## 📊 Summary by Source\n\n")

            sources: Dict[str, List[CandidateInfo]] = {}
            for candidate in self.all_candidates:
                sources.setdefault(candidate.source, []).append(candidate)

            for source, candidates in sources.items():
                f.write(f"### {source} ({len(candidates)} candidates)\n\n")
                for candidate in candidates:
                    # FIX: test against None, not truthiness — a legitimate
                    # score of 0.0 was previously reported as "No score".
                    score_str = f"Score: {candidate.score:.4f}" if candidate.score is not None else "No score"
                    f.write(f"- **{candidate.candidate_id}** (Iteration {candidate.iteration}, {score_str})\n")
                f.write("\n")

        return filepath

    def get_summary(self) -> Dict[str, Any]:
        """Get a summary of collected data."""
        sources: Dict[str, int] = {}
        for candidate in self.all_candidates:
            sources[candidate.source] = sources.get(candidate.source, 0) + 1

        scored_candidates = [c for c in self.all_candidates if c.score is not None]
        avg_score = (sum(c.score for c in scored_candidates) / len(scored_candidates)
                     if scored_candidates else None)

        return {
            "total_iterations": len(self.iterations),
            "total_candidates": len(self.all_candidates),
            "candidates_by_source": sources,
            "candidates_with_scores": len(scored_candidates),
            "average_score": avg_score,
            "candidates_with_feedback": len([c for c in self.all_candidates if c.feedback])
        }
"""
Clean Logger for GEPA + LLEGO Optimization
Provides simple, visual logging similar to diagram format.

Uses the centralized logging infrastructure with a custom handler
for clean, user-friendly console output.
"""

import logging
import sys
from typing import List, Optional

# Dedicated logger so the clean output bypasses app-wide formatting.
_clean_output_logger = logging.getLogger("gepa_optimizer.clean_output")


def _setup_clean_logger():
    """Attach a message-only stdout handler exactly once."""
    if not _clean_output_logger.handlers:
        handler = logging.StreamHandler(sys.stdout)
        handler.setLevel(logging.INFO)
        # Minimal formatter - just the message.
        handler.setFormatter(logging.Formatter("%(message)s"))
        _clean_output_logger.addHandler(handler)
        _clean_output_logger.setLevel(logging.INFO)
        # Don't propagate to the root logger (avoids duplicate output).
        _clean_output_logger.propagate = False


# Initialize on module load.
_setup_clean_logger()


class CleanLogger:
    """
    Simple, visual logging for the optimization workflow.

    Tracks per-iteration candidate counts and renders banner-style sections
    via a dedicated message-only logger.
    """

    def __init__(self):
        self.current_iteration = 0
        self.gepa_reflection_count = 0
        self.llego_crossover_count = 0
        self.llego_mutation_count = 0
        self._logger = _clean_output_logger

    def log_iteration_start(self, iteration: int, seed_prompt: Optional[str] = None):
        """Log the start of a new iteration and reset per-iteration counters."""
        self.current_iteration = iteration
        self.gepa_reflection_count = 0
        self.llego_crossover_count = 0
        self.llego_mutation_count = 0

        self._logger.info("")
        self._logger.info("═" * 80)
        # Iteration 1 only evaluates the seed; later iterations also generate.
        if iteration == 1:
            self._logger.info(f" ITERATION {iteration}: EVALUATING SEED PROMPT")
        else:
            self._logger.info(f" ITERATION {iteration}: EVALUATING & GENERATING CANDIDATES")
        self._logger.info("═" * 80)

        # NOTE(review): the seed prompt only prints for iteration == 0, while
        # the banner above treats 1 as the first iteration — confirm which
        # numbering callers actually use.
        if seed_prompt and iteration == 0:
            self._logger.info("")
            self._logger.info("SEED PROMPT:")
            self._logger.info("─" * 80)
            self._logger.info(seed_prompt)
            self._logger.info("─" * 80)

    def _log_candidate(self, label: str, candidate_num: int, prompt: str):
        """Shared renderer for one candidate prompt (header + delimited body).

        Factored out of the three public log_*_candidate methods, whose
        bodies were identical apart from the label.
        """
        self._logger.info("")
        self._logger.info(f"{label} Candidate #{candidate_num}:")
        self._logger.info("─" * 80)
        if prompt and prompt.strip():
            self._logger.info(prompt)  # Show full prompt at INFO level
        else:
            self._logger.warning("⚠️ Empty candidate prompt!")
        self._logger.info("─" * 80)

    def log_gepa_reflection_candidate(self, candidate_num: int, prompt: str):
        """Log a GEPA reflection candidate."""
        self.gepa_reflection_count += 1
        self._log_candidate("GEPA Reflection", candidate_num, prompt)

    def log_llego_crossover_candidate(self, candidate_num: int, prompt: str):
        """Log a LLEGO crossover candidate."""
        self.llego_crossover_count += 1
        self._log_candidate("LLEGO Crossover", candidate_num, prompt)

    def log_llego_mutation_candidate(self, candidate_num: int, prompt: str):
        """Log a LLEGO mutation candidate."""
        self.llego_mutation_count += 1
        self._log_candidate("LLEGO Mutation", candidate_num, prompt)

    def log_candidate_generation_summary(self):
        """Log summary of candidates generated this iteration."""
        total = self.gepa_reflection_count + self.llego_crossover_count + self.llego_mutation_count

        self._logger.info("")
        self._logger.info("CANDIDATES GENERATED THIS ITERATION:")
        self._logger.info(f"  GEPA Reflection: {self.gepa_reflection_count}")
        self._logger.info(f"  LLEGO Crossover: {self.llego_crossover_count}")
        self._logger.info(f"  LLEGO Mutation: {self.llego_mutation_count}")
        self._logger.info(f"  TOTAL: {total}")

    def log_evaluation_results(self, candidate_prompts: List[str], scores: List[float]):
        """Log evaluation results for all candidates."""
        self._logger.info("")
        self._logger.info("═" * 80)
        self._logger.info(" EVALUATION RESULTS")
        self._logger.info("═" * 80)

        for i, (prompt, score) in enumerate(zip(candidate_prompts, scores), 1):
            self._logger.info("")
            self._logger.info(f"Candidate #{i}:")
            self._logger.info(f"  Score: {score:.4f}")
            self._logger.info(f"  Prompt Preview: {prompt[:100]}...")

    def log_pareto_front_update(self, pareto_size: int, best_score: float):
        """Log Pareto front update."""
        self._logger.info("")
        self._logger.info("═" * 80)
        self._logger.info(" PARETO FRONT UPDATE")
        self._logger.info("═" * 80)
        self._logger.info(f"  Front Size: {pareto_size} candidates")
        self._logger.info(f"  Best Score: {best_score:.4f}")

    def log_iteration_summary(self, iteration: int, total_candidates: int, best_score: float):
        """Log iteration summary."""
        self._logger.info("")
        self._logger.info("═" * 80)
        self._logger.info(f" ITERATION {iteration} SUMMARY")
        self._logger.info("═" * 80)
        self._logger.info(f"  Candidates Evaluated: {total_candidates}")
        self._logger.info(f"  Best Score: {best_score:.4f}")
        self._logger.info(f"  GEPA Reflection: {self.gepa_reflection_count}")
        self._logger.info(f"  LLEGO Crossover: {self.llego_crossover_count}")
        self._logger.info(f"  LLEGO Mutation: {self.llego_mutation_count}")


# Global instance
_clean_logger_instance = CleanLogger()


def get_clean_logger() -> CleanLogger:
    """Get global clean logger instance."""
    return _clean_logger_instance
class GepaOptimizerError(Exception):
    """Root of the GEPA Optimizer exception hierarchy; catch this to handle any optimizer failure."""
    pass


class GepaDependencyError(GepaOptimizerError):
    """Raised when the underlying GEPA library dependency is missing or broken."""
    pass


class InvalidInputError(GepaOptimizerError):
    """Raised when user-supplied input fails validation."""
    pass


class DatasetError(GepaOptimizerError):
    """Raised for problems with the training/evaluation dataset."""
    pass


class TestSetEvaluationError(GepaOptimizerError):
    """Raised when evaluation on the held-out test set fails."""
    pass


class ConfigurationError(GepaOptimizerError):
    """Raised for an invalid optimizer configuration."""
    pass
import re
import json
from typing import List, Dict, Any, Optional


def detect_output_format(expected_outputs: List[str]) -> Dict[str, Any]:
    """
    Analyze expected outputs to detect the common format pattern.

    Args:
        expected_outputs: List of expected output strings from the dataset.

    Returns:
        Dictionary containing:
        - format_type: 'json', 'key_value', 'list', 'structured_text',
          'free_text', or 'unknown'
        - format_spec: Human-readable format specification
        - format_example: Example showing the format
        - format_constraint: Constraint text to add to prompts
        - detected_keys: Keys/fields detected (for structured formats)
        - avg_length: Average output length (used to enforce conciseness)
        - max_length: Maximum output length
    """
    if not expected_outputs:
        # FIX: include 'max_length' so every return path exposes the same
        # result schema (all other paths provide it).
        return {
            'format_type': 'unknown',
            'format_spec': 'Unknown format',
            'format_example': '',
            'format_constraint': '',
            'detected_keys': [],
            'avg_length': 0,
            'max_length': 0
        }

    # Ignore empty/whitespace-only outputs.
    valid_outputs = [o for o in expected_outputs if o and o.strip()]
    if not valid_outputs:
        return _create_format_result('unknown', 'Unknown format', '', [], 0)

    # Length statistics drive the conciseness constraints.
    avg_length = sum(len(o) for o in valid_outputs) // len(valid_outputs)
    max_length = max(len(o) for o in valid_outputs)

    # Run detectors from most to least specific; the first match wins.
    for detector in (_detect_json_format, _detect_key_value_format,
                     _detect_list_format, _detect_structured_text):
        result = detector(valid_outputs, avg_length, max_length)
        if result:
            return result

    # Fallback: free-form text with a length constraint.
    return _create_format_result(
        'free_text',
        f'Free-form text response (typically {avg_length} characters)',
        valid_outputs[0][:100] if valid_outputs else '',
        [],
        avg_length,
        max_length
    )


def _detect_json_format(outputs: List[str], avg_length: int, max_length: int) -> Optional[Dict[str, Any]]:
    """Detect whether at least 70% of outputs parse as JSON objects."""
    json_count = 0
    all_keys = []

    for output in outputs:
        stripped = output.strip()
        if stripped.startswith('{') and stripped.endswith('}'):
            try:
                parsed = json.loads(stripped)
                if isinstance(parsed, dict):
                    json_count += 1
                    all_keys.extend(parsed.keys())
            except json.JSONDecodeError:
                pass

    if json_count >= len(outputs) * 0.7:
        # Keep keys present in at least half of the JSON outputs.
        key_counts: Dict[str, int] = {}
        for key in all_keys:
            key_counts[key] = key_counts.get(key, 0) + 1

        common_keys = [k for k, v in key_counts.items() if v >= json_count * 0.5]

        format_spec = f"JSON object with keys: {', '.join(common_keys)}"
        format_example = outputs[0][:200] if outputs else '{}'

        return _create_format_result(
            'json',
            format_spec,
            format_example,
            common_keys,
            avg_length,
            max_length
        )

    return None


def _detect_key_value_format(outputs: List[str], avg_length: int, max_length: int) -> Optional[Dict[str, Any]]:
    """Detect key-value formats like 'Department: X | Sentiment: Y'."""
    # Common separators between key-value pairs.
    separators = ['|', '\n', ';', ',']
    key_patterns = [
        r'([A-Za-z_][A-Za-z0-9_\s]*)\s*[:=]\s*([^|;\n,]+)',  # Key: Value or Key = Value
    ]

    all_keys = []
    kv_count = 0
    detected_separator = None

    for output in outputs:
        for pattern in key_patterns:
            matches = re.findall(pattern, output)
            if len(matches) >= 2:  # require at least two key-value pairs
                kv_count += 1
                for key, _ in matches:
                    all_keys.append(key.strip())

                # Remember which separator this output uses.
                for sep in separators:
                    if sep in output:
                        detected_separator = sep
                        break
            break

    if kv_count >= len(outputs) * 0.6:
        # Keep the most frequent keys (case-insensitive), top 5.
        key_counts: Dict[str, int] = {}
        for key in all_keys:
            normalized = key.strip().lower()
            key_counts[normalized] = key_counts.get(normalized, 0) + 1

        common_keys = [k for k, v in sorted(key_counts.items(), key=lambda x: -x[1])
                       if v >= kv_count * 0.4][:5]

        sep_display = detected_separator if detected_separator else ' | '
        format_spec = f"Key-value pairs: {sep_display.join([f'{k}: [value]' for k in common_keys])}"
        format_example = outputs[0] if outputs else ''

        return _create_format_result(
            'key_value',
            format_spec,
            format_example,
            common_keys,
            avg_length,
            max_length
        )

    return None


def _detect_list_format(outputs: List[str], avg_length: int, max_length: int) -> Optional[Dict[str, Any]]:
    """Detect bullet/numbered list formats."""
    list_patterns = [
        r'^[-*•]\s+',    # bullet points
        r'^\d+[.)]\s+',  # numbered list
    ]

    list_count = 0

    for output in outputs:
        lines = output.strip().split('\n')
        list_lines = 0
        for line in lines:
            for pattern in list_patterns:
                if re.match(pattern, line.strip()):
                    list_lines += 1
                    break

        if list_lines >= len(lines) * 0.5:  # majority of lines are list items
            list_count += 1

    if list_count >= len(outputs) * 0.6:
        return _create_format_result(
            'list',
            'Bullet or numbered list format',
            outputs[0][:200] if outputs else '',
            [],
            avg_length,
            max_length
        )

    return None


def _detect_structured_text(outputs: List[str], avg_length: int, max_length: int) -> Optional[Dict[str, Any]]:
    """Detect multi-line structured text (two or more lines on average)."""
    line_counts = [len(o.strip().split('\n')) for o in outputs]
    avg_lines = sum(line_counts) // len(line_counts) if line_counts else 1

    if avg_lines >= 2:
        return _create_format_result(
            'structured_text',
            f'Structured text with ~{avg_lines} lines',
            outputs[0][:200] if outputs else '',
            [],
            avg_length,
            max_length
        )

    return None


def _create_format_result(
    format_type: str,
    format_spec: str,
    format_example: str,
    detected_keys: List[str],
    avg_length: int,
    max_length: int = 0
) -> Dict[str, Any]:
    """Create a standardized format detection result, including the
    per-format constraint text injected into prompts."""
    if format_type == 'json':
        constraint = f"""OUTPUT FORMAT REQUIREMENT:
- Return ONLY a valid JSON object
- Required keys: {', '.join(detected_keys) if detected_keys else 'as shown in examples'}
- NO explanations, NO prose, NO markdown code blocks
- Maximum length: ~{max_length} characters
- Example format: {format_example[:150]}"""

    elif format_type == 'key_value':
        constraint = f"""OUTPUT FORMAT REQUIREMENT:
- Return ONLY in key-value format: {format_spec}
- NO explanations, NO reasoning, NO additional text
- Be CONCISE - output should be ~{avg_length} characters max
- Example: {format_example}"""

    elif format_type == 'list':
        constraint = f"""OUTPUT FORMAT REQUIREMENT:
- Return as a bullet or numbered list
- NO explanations before or after the list
- Keep it concise (~{avg_length} characters)"""

    elif format_type == 'structured_text':
        constraint = f"""OUTPUT FORMAT REQUIREMENT:
- Follow the structured format shown in examples
- NO additional explanations or commentary
- Keep output concise (~{avg_length} characters)"""

    else:
        constraint = f"""OUTPUT FORMAT REQUIREMENT:
- Keep response CONCISE and DIRECT
- NO lengthy explanations or reasoning
- Target length: ~{avg_length} characters (max {max_length})
- Match the format/style of the expected examples"""

    return {
        'format_type': format_type,
        'format_spec': format_spec,
        'format_example': format_example[:200] if format_example else '',
        'format_constraint': constraint,
        'detected_keys': detected_keys,
        'avg_length': avg_length,
        'max_length': max_length
    }
def build_format_aware_reflection_prompt(
    base_prompt: str,
    format_info: Dict[str, Any],
    include_example: bool = True
) -> str:
    """
    Enhance a reflection prompt with format awareness.

    Args:
        base_prompt: The original reflection prompt
        format_info: Format detection result from detect_output_format()
        include_example: Whether to include format example

    Returns:
        Enhanced prompt with format constraints (base_prompt unchanged when
        no usable format information is available)
    """
    # Nothing useful to add when detection failed or produced 'unknown'.
    if not format_info or format_info.get('format_type') == 'unknown':
        return base_prompt

    format_section = f"""

๐ŸŽฏ CRITICAL FORMAT REQUIREMENT:
The optimized prompt MUST produce outputs that match this EXACT format:

{format_info['format_constraint']}

โš ๏ธ COMMON FAILURE MODES TO AVOID:
1. Generating explanations when only the answer is needed
2. Adding "Here's the analysis..." or similar preambles
3. Producing verbose output when concise is required
4. Wrong structure (e.g., prose instead of key-value pairs)
"""

    if include_example and format_info.get('format_example'):
        format_section += f"""
๐Ÿ“‹ EXAMPLE OF CORRECT OUTPUT FORMAT:
{format_info['format_example']}
"""

    # Insert format section near the end of the prompt but before any final instructions
    return base_prompt + format_section


def generate_format_feedback(
    predicted_output: str,
    expected_output: str,
    format_info: Dict[str, Any]
) -> str:
    """
    Generate specific feedback about format compliance.

    Args:
        predicted_output: What the model actually produced
        expected_output: The ground truth output
        format_info: Format detection result

    Returns:
        Specific format-related feedback, or an empty string when no issues
        are detected
    """
    predicted_len = len(predicted_output) if predicted_output else 0

    issues = []

    # Check length discrepancy
    if format_info.get('avg_length', 0) > 0:
        # BUGFIX: guard the max_length comparison. The previous expression
        # `predicted_len > format_info.get('max_length', predicted_len) * 2`
        # flagged EVERY non-empty output as "TOO LONG" whenever max_length
        # was present but 0 (the default in _create_format_result).
        max_len = format_info.get('max_length') or 0
        if predicted_len > format_info['avg_length'] * 3:
            issues.append(f"OUTPUT TOO VERBOSE: Generated {predicted_len} chars, expected ~{format_info['avg_length']} chars")
        elif max_len > 0 and predicted_len > max_len * 2:
            issues.append(f"OUTPUT TOO LONG: {predicted_len} chars vs max expected {format_info.get('max_length', 'unknown')}")

    # Check format type compliance
    format_type = format_info.get('format_type', 'unknown')

    if format_type == 'json':
        try:
            json.loads(predicted_output.strip() if predicted_output else '{}')
        except json.JSONDecodeError:
            issues.append("FORMAT ERROR: Expected JSON but got non-JSON output")

    elif format_type == 'key_value':
        # Check if output has key-value structure
        if predicted_output and ':' not in predicted_output:
            issues.append("FORMAT ERROR: Expected key-value pairs (Key: Value) but output lacks this structure")

    # Check for common verbose patterns
    verbose_indicators = [
        'let me', 'i will', 'here is', "here's", 'analysis:', 'step-by-step',
        'first,', 'to begin', 'in order to', 'the following', 'please note'
    ]

    if predicted_output:
        lower_output = predicted_output.lower()
        found_verbose = [v for v in verbose_indicators if v in lower_output]
        if found_verbose:
            issues.append(f"VERBOSITY WARNING: Output contains explanatory phrases: {', '.join(found_verbose[:3])}")

    if not issues:
        return ""

    return "\n๐Ÿšจ FORMAT ISSUES DETECTED:\n" + "\n".join(f"  โ€ข {issue}" for issue in issues)
# --- src/gepa_optimizer/utils/helpers.py ---

def sanitize_prompt(prompt: str) -> str:
    """
    Sanitize and validate prompt string

    Args:
        prompt: Input prompt string to sanitize

    Returns:
        str: Cleaned and validated prompt
    """
    # Coerce non-string inputs (e.g. None, numbers) instead of raising.
    if not isinstance(prompt, str):
        prompt = str(prompt)

    prompt = prompt.strip()

    # Fall back to a safe default so downstream callers never receive an
    # empty system prompt.
    if not prompt:
        prompt = "You are a helpful assistant."

    return prompt


# --- src/gepa_optimizer/utils/llm_judge_prompt.py ---

"""
LLM-as-Judge Prompt for Index Caching Use Case

This module provides a specialized LLM-as-Judge prompt template for analyzing
index caching evaluation results and generating actionable feedback for prompt improvement.
"""

from typing import Dict, Any, Optional


def build_index_caching_judge_prompt(
    task_command: str,
    predicted_dict: Dict[str, Any],
    expected_dict: Dict[str, Any],
    predicted_output: str,
    expected_output: str,
    current_prompt: Optional[str] = None,
    evaluation_results: Optional[Dict[str, Any]] = None,
    image_base64: Optional[str] = None
) -> str:
    """
    Build LLM-as-Judge prompt for index caching use case.

    This prompt analyzes why the LLM failed to correctly identify:
    - is_index_based (boolean)
    - index_value (int or null)
    - parent_element_id (string or null)
    - element_id_of_nth_child_of_parent (string or null)
    - selected_element_is_correct (boolean)

    Args:
        task_command: The natural language command
        predicted_dict: Parsed predicted JSON output
        expected_dict: Parsed expected JSON output
        predicted_output: Raw predicted output string
        expected_output: Raw expected output string
        current_prompt: Current system prompt being optimized
        evaluation_results: Full evaluation results with field scores
        image_base64: Optional base64 encoded screenshot

    Returns:
        Formatted judge prompt string
    """

    # Extract field values for comparison
    pred_is_index = predicted_dict.get("is_index_based")
    exp_is_index = expected_dict.get("is_index_based")
    pred_index_val = predicted_dict.get("index_value")
    exp_index_val = expected_dict.get("index_value")
    pred_parent = predicted_dict.get("parent_element_id")
    exp_parent = expected_dict.get("parent_element_id")
    pred_element = predicted_dict.get("element_id_of_nth_child_of_parent")
    exp_element = expected_dict.get("element_id_of_nth_child_of_parent")
    pred_selected = predicted_dict.get("selected_element_is_correct")
    exp_selected = expected_dict.get("selected_element_is_correct")

    # Extract notes/reasoning if available
    pred_notes = predicted_dict.get("notes", "")
    exp_notes = expected_dict.get("notes", "")

    # Get field scores from evaluation results
    # NOTE(review): the *_match keys read here must stay in sync with the
    # evaluator that produces evaluation_results — confirm against that module.
    field_scores = {}
    if evaluation_results:
        field_scores = {
            "is_index_based": evaluation_results.get("is_index_based_match", 0.0),
            "index_value": evaluation_results.get("index_value_match", 0.0),
            "parent_element_id": evaluation_results.get("parent_element_id_match", 0.0),
            "element_id_of_nth_child": evaluation_results.get("element_id_of_nth_child_match", 0.0),
            "selected_element_is_correct": evaluation_results.get("selected_element_correct_match", 0.0),
        }

    # Build field-by-field comparison
    field_comparisons = []

    # 1. is_index_based
    is_index_match = pred_is_index == exp_is_index
    field_comparisons.append(f"""
1. **is_index_based** ({'โœ… CORRECT' if is_index_match else 'โŒ WRONG'}):
   - Expected: {exp_is_index}
   - Predicted: {pred_is_index}
   - Score: {field_scores.get('is_index_based', 0.0):.0%}
""")

    # 2. index_value
    index_val_match = pred_index_val == exp_index_val
    field_comparisons.append(f"""
2. **index_value** ({'โœ… CORRECT' if index_val_match else 'โŒ WRONG'}):
   - Expected: {exp_index_val}
   - Predicted: {pred_index_val}
   - Score: {field_scores.get('index_value', 0.0):.0%}
""")

    # 3. parent_element_id
    parent_match = pred_parent == exp_parent
    field_comparisons.append(f"""
3. **parent_element_id** ({'โœ… CORRECT' if parent_match else 'โŒ WRONG'}):
   - Expected: {exp_parent}
   - Predicted: {pred_parent}
   - Score: {field_scores.get('parent_element_id', 0.0):.0%}
""")

    # 4. element_id_of_nth_child_of_parent
    element_match = pred_element == exp_element
    field_comparisons.append(f"""
4. **element_id_of_nth_child_of_parent** ({'โœ… CORRECT' if element_match else 'โŒ WRONG'}):
   - Expected: {exp_element}
   - Predicted: {pred_element}
   - Score: {field_scores.get('element_id_of_nth_child', 0.0):.0%}
""")

    # 5. selected_element_is_correct
    selected_match = pred_selected == exp_selected
    field_comparisons.append(f"""
5. **selected_element_is_correct** ({'โœ… CORRECT' if selected_match else 'โŒ WRONG'}):
   - Expected: {exp_selected}
   - Predicted: {pred_selected}
   - Score: {field_scores.get('selected_element_is_correct', 0.0):.0%}
""")

    # Visual analysis instruction
    visual_instruction = ""
    if image_base64:
        visual_instruction = """
๐Ÿ–ผ๏ธ VISUAL ANALYSIS (You can see the screenshot):
- Look at the annotated screenshot with bounding boxes
- Identify which element is highlighted (the target element)
- Understand the UI structure and hierarchy
- Analyze why the LLM might have misidentified the parent container or nth child
"""

    judge_prompt = f"""You are an expert prompt engineer specializing in mobile UI automation and index-based element selection prompts.

{"You can SEE the mobile app screenshot with annotated bounding boxes." if image_base64 else "You are analyzing text descriptions only (no image provided)."}

TASK: Improve the SYSTEM PROMPT to better guide the LLM in correctly identifying index-based element selection.

CONTEXT:
- Task Command: "{task_command}"

FULL EXPECTED OUTPUT (Ground Truth JSON):
```json
{expected_output}
```

FULL PREDICTED OUTPUT (What the LLM Actually Returned):
```json
{predicted_output}
```

FIELD-BY-FIELD COMPARISON:
{''.join(field_comparisons)}
{visual_instruction if image_base64 else ""}

EXPECTED REASONING (from notes):
{exp_notes if exp_notes else "N/A - No reasoning provided in expected output"}

PREDICTED REASONING (from notes):
{pred_notes if pred_notes else "N/A - No reasoning provided in predicted output"}

CURRENT SYSTEM PROMPT (being optimized):
{current_prompt if current_prompt else "N/A"}

ANALYSIS REQUIRED:

1. **is_index_based Analysis** (CRITICAL):
   - Why did the LLM classify this as {"index-based" if pred_is_index else "non-index-based"} when it should be {"index-based" if exp_is_index else "non-index-based"}?
   - What specific words or patterns in the command "{task_command}" should have led to the correct classification?
   - What instruction in the prompt failed to guide correct classification?
   - What edge case or ambiguity caused the misclassification?

2. **index_value Analysis** (if is_index_based should be true):
   - Why did the LLM extract index_value={pred_index_val} when it should be {exp_index_val}?
   - What ordinal word ("first", "second", "third", etc.) in "{task_command}" should have been converted to {exp_index_val}?
   - Did the LLM fail to recognize the ordinal, or did it count incorrectly?
   - What instruction would help the LLM correctly parse ordinals?

3. **parent_element_id Analysis** (if is_index_based should be true):
   - Why did the LLM identify parent_element_id="{pred_parent}" when it should be "{exp_parent}"?
   - What container in the XML hierarchy should have been identified as the parent?
   - Did the LLM fail to walk up the hierarchy correctly?
   - Did the LLM include non-item children (like headers) in the parent container?
   - What instruction would help the LLM identify the correct parent container?

4. **element_id_of_nth_child_of_parent Analysis** (if is_index_based should be true):
   - Why did the LLM identify element_id_of_nth_child_of_parent="{pred_element}" when it should be "{exp_element}"?
   - What is the outermost component representing the nth item?
   - Did the LLM select a nested child instead of the full item?
   - Did the LLM count items incorrectly (wrong nth position)?
   - What instruction would help the LLM identify the correct outermost item?

5. **selected_element_is_correct Analysis**:
   - Why did the LLM determine selected_element_is_correct={pred_selected} when it should be {exp_selected}?
   - Is the highlighted element actually the correct target for the command?
   - What visual or structural cue did the LLM miss or misinterpret?

6. **Prompt Weakness Identification**:
   - Which specific instruction in the current system prompt is missing, unclear, or misleading?
   - What concept from the expected reasoning should the prompt emphasize more?
   - What edge case handling is missing?
   - What example or clarification would help?

7. **Actionable Prompt Improvement**:
   - What exact instruction should be ADDED to fix each failing field?
   - What should be REMOVED or CLARIFIED?
   - What specific wording would guide the LLM to the correct field values?
   - How can the prompt help the LLM follow the same logic as the expected output?

OUTPUT FORMAT (JSON):
{{
    "is_index_based_error": "Specific explanation of why is_index_based classification was wrong. Reference the command and explain what pattern should have been recognized.",
    "index_value_error": "If index_value was wrong, explain why. What ordinal word should have been converted to which number?",
    "parent_element_id_error": "If parent_element_id was wrong, explain why. What container should have been identified and why?",
    "element_id_of_nth_child_error": "If element_id_of_nth_child_of_parent was wrong, explain why. What item should have been selected and why?",
    "selected_element_correct_error": "If selected_element_is_correct was wrong, explain why. Is the highlighted element actually correct?",
    "key_weakness": "The single most important prompt weakness that caused the most errors",
    "missing_instruction": "What specific instruction should be added to address the key weakness",
    "improvement_suggestion": "Specific, actionable prompt improvement that addresses all field errors",
    "example_instruction": "An example instruction that would help the LLM correctly identify all 5 fields"
}}

CRITICAL: Your analysis must focus on WHY each of the 5 fields was wrong. Be specific about:
- Command interpretation (for is_index_based)
- Ordinal parsing (for index_value)
- XML hierarchy traversal (for parent_element_id and element_id_of_nth_child_of_parent)
- Element correctness assessment (for selected_element_is_correct)

Reference the task command, expected vs predicted values, and provide actionable improvements to the system prompt."""

    return judge_prompt


def format_index_caching_judge_feedback(
    judge_output: str,
    predicted_dict: Dict[str, Any],
    expected_dict: Dict[str, Any],
    task_command: str,
    field_scores: Dict[str, float]
) -> str:
    """
    Format LLM-as-Judge output into structured feedback.

    Args:
        judge_output: Raw output from LLM-as-Judge
        predicted_dict: Parsed predicted JSON
        expected_dict: Parsed expected JSON
        task_command: The task command
        field_scores: Field-by-field scores from evaluation

    Returns:
        Formatted feedback string
    """
    import json
    import re

    # Try to parse JSON from judge output
    # NOTE(review): this regex only handles one level of brace nesting;
    # deeper nesting in the judge's JSON will be truncated — confirm the
    # judge schema stays flat.
    json_match = re.search(r'\{[^{}]*(?:\{[^{}]*\}[^{}]*)*\}', judge_output, re.DOTALL)
    if json_match:
        try:
            analysis = json.loads(json_match.group(0))

            # Build formatted feedback
            feedback = f"""โŒ INDEX CACHING EVALUATION FAILURE

๐Ÿ“‹ FIELD-BY-FIELD ANALYSIS:

๐Ÿ” is_index_based Error:
   Expected: {expected_dict.get('is_index_based')}, Predicted: {predicted_dict.get('is_index_based')}
   {analysis.get('is_index_based_error', 'N/A')}

๐Ÿ” index_value Error:
   Expected: {expected_dict.get('index_value')}, Predicted: {predicted_dict.get('index_value')}
   {analysis.get('index_value_error', 'N/A')}

๐Ÿ” parent_element_id Error:
   Expected: {expected_dict.get('parent_element_id')}, Predicted: {predicted_dict.get('parent_element_id')}
   {analysis.get('parent_element_id_error', 'N/A')}

๐Ÿ” element_id_of_nth_child_of_parent Error:
   Expected: {expected_dict.get('element_id_of_nth_child_of_parent')}, Predicted: {predicted_dict.get('element_id_of_nth_child_of_parent')}
   {analysis.get('element_id_of_nth_child_error', 'N/A')}

๐Ÿ” selected_element_is_correct Error:
   Expected: {expected_dict.get('selected_element_is_correct')}, Predicted: {predicted_dict.get('selected_element_is_correct')}
   {analysis.get('selected_element_correct_error', 'N/A')}

๐Ÿ” KEY WEAKNESS:
{analysis.get('key_weakness', 'N/A')}

๐Ÿ’ก MISSING INSTRUCTION:
{analysis.get('missing_instruction', 'N/A')}

๐Ÿ’ก IMPROVEMENT SUGGESTION:
{analysis.get('improvement_suggestion', 'N/A')}

๐Ÿ“ EXAMPLE INSTRUCTION:
{analysis.get('example_instruction', 'N/A')}

๐Ÿ’ญ CONTEXT:
- Task: "{task_command}"
- Field Scores: is_index_based={field_scores.get('is_index_based', 0.0):.0%}, index_value={field_scores.get('index_value', 0.0):.0%}, parent_element_id={field_scores.get('parent_element_id', 0.0):.0%}, element_id_of_nth_child={field_scores.get('element_id_of_nth_child', 0.0):.0%}, selected_element_is_correct={field_scores.get('selected_element_is_correct', 0.0):.0%}"""

            return feedback
        except json.JSONDecodeError:
            # Malformed JSON from the judge: fall through to the raw output.
            pass

    # Fallback to raw output
    return f"LLM-as-Judge Analysis (Index Caching):\n{judge_output}"
class OptimizationLogParser:
    """Parse optimization logs to extract candidates and feedback.

    The parser works on the raw text of an optimization run: it splits the
    log into per-iteration sections, then scrapes candidate prompts, feedback
    snippets and scores out of each section with regex heuristics, and
    finally associates each candidate with its nearest following score and
    feedback.
    """

    def __init__(self, log_file: str):
        """
        Initialize parser with log file path.

        Args:
            log_file: Path to log file
        """
        self.log_file = Path(log_file)
        self.content = ""
        # Missing files are tolerated: the parser simply sees empty content.
        if self.log_file.exists():
            with open(self.log_file, 'r', encoding='utf-8') as f:
                self.content = f.read()

    def extract_iterations(self) -> List[Dict]:
        """Extract iteration information from logs.

        Returns:
            List of dicts with 'iteration' (number), 'content' (the log text
            for that iteration) and 'start_pos' (offset into the log).
        """
        iterations = []

        # Pattern to find iteration markers
        iteration_pattern = r'Iteration\s+(\d+)|Starting GEPA optimization|๐Ÿš€ Starting GEPA optimization'

        # PERF FIX: locate all markers once up front. The previous
        # implementation re-ran re.finditer over the whole log for every
        # marker, which was O(n^2) on large logs.
        markers = list(re.finditer(iteration_pattern, self.content))

        for idx, match in enumerate(markers):
            # Markers without an explicit number default to iteration 1.
            iter_num = int(match.group(1)) if match.group(1) else 1

            # Each iteration's section runs until the next marker (or EOF).
            start_pos = match.start()
            if idx + 1 < len(markers):
                iter_content = self.content[start_pos:markers[idx + 1].start()]
            else:
                iter_content = self.content[start_pos:]

            iterations.append({
                'iteration': iter_num,
                'content': iter_content,
                'start_pos': start_pos
            })

        return iterations

    def extract_candidates(self, iteration_content: str) -> List[Dict]:
        """
        Extract candidate prompts from iteration content.

        Args:
            iteration_content: Content for a single iteration

        Returns:
            List of candidate dictionaries ('source', 'prompt', 'position'),
            sorted by position in the log
        """
        candidates = []

        # Pattern 1: GEPA Reflection candidates
        # Look for "PROPOSED PROMPT" or "๐Ÿ“ PROPOSED PROMPT"
        gepa_patterns = [
            r'๐Ÿ“ PROPOSED PROMPT.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐Ÿ“Š|๐Ÿš€|$)',
            r'PROPOSED PROMPT.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐Ÿ“Š|๐Ÿš€|$)',
            r'GEPA REFLECTION.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐Ÿ“Š|๐Ÿš€|$)',
        ]

        for pattern in gepa_patterns:
            for match in re.finditer(pattern, iteration_content, re.DOTALL):
                prompt = match.group(1).strip()
                if prompt and len(prompt) > 20:  # Valid prompt
                    candidates.append({
                        'source': 'GEPA_Reflection',
                        'prompt': prompt,
                        'position': match.start()
                    })

        # Pattern 2: LLEGO Crossover candidates
        crossover_patterns = [
            r'๐Ÿงฌ Crossover.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐Ÿ“Š|๐Ÿš€|$)',
            r'Crossover.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐Ÿ“Š|๐Ÿš€|$)',
        ]

        for pattern in crossover_patterns:
            for match in re.finditer(pattern, iteration_content, re.DOTALL):
                prompt = match.group(1).strip()
                if prompt and len(prompt) > 20:
                    candidates.append({
                        'source': 'LLEGO_Crossover',
                        'prompt': prompt,
                        'position': match.start()
                    })

        # Pattern 3: LLEGO Mutation candidates
        mutation_patterns = [
            r'๐ŸŽฒ Mutation.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐Ÿ“Š|๐Ÿš€|$)',
            r'Mutation.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐Ÿ“Š|๐Ÿš€|$)',
        ]

        for pattern in mutation_patterns:
            for match in re.finditer(pattern, iteration_content, re.DOTALL):
                prompt = match.group(1).strip()
                if prompt and len(prompt) > 20:
                    candidates.append({
                        'source': 'LLEGO_Mutation',
                        'prompt': prompt,
                        'position': match.start()
                    })

        # Pattern 4: Generic candidate markers
        # Look for prompts in quotes or code blocks
        generic_patterns = [
            r'"([^"]{50,})"',  # Quoted prompts
            r'```\s*(.*?)\s*```',  # Code blocks
        ]

        for pattern in generic_patterns:
            for match in re.finditer(pattern, iteration_content, re.DOTALL):
                prompt = match.group(1).strip()
                # Check if it looks like a prompt (contains task instructions)
                if (len(prompt) > 50 and
                        any(keyword in prompt.lower() for keyword in
                            ['you are', 'task', 'instruction', 'element', 'identify', 'select'])):
                    # Check if we haven't already captured this
                    if not any(c['prompt'] == prompt for c in candidates):
                        candidates.append({
                            'source': 'Unknown',
                            'prompt': prompt,
                            'position': match.start()
                        })

        # Sort by position
        candidates.sort(key=lambda x: x['position'])

        return candidates

    def extract_feedback(self, iteration_content: str) -> List[Dict]:
        """
        Extract feedback from iteration content.

        Args:
            iteration_content: Content for a single iteration

        Returns:
            List of feedback dictionaries ('feedback', 'position', and
            optionally 'source'), sorted by position
        """
        feedback_list = []

        # Pattern 1: Explicit feedback markers
        feedback_patterns = [
            r'๐Ÿ’ฌ FEEDBACK:\s*(.*?)(?=\n\n|\n๐Ÿ“Š|\n๐Ÿš€|\n๐Ÿ’ก|$)',
            r'FEEDBACK:\s*(.*?)(?=\n\n|\n๐Ÿ“Š|\n๐Ÿš€|\n๐Ÿ’ก|$)',
            r'Feedback:\s*(.*?)(?=\n\n|\n๐Ÿ“Š|\n๐Ÿš€|\n๐Ÿ’ก|$)',
        ]

        for pattern in feedback_patterns:
            for match in re.finditer(pattern, iteration_content, re.DOTALL):
                feedback_text = match.group(1).strip()
                if feedback_text and len(feedback_text) > 10:
                    feedback_list.append({
                        'feedback': feedback_text,
                        'position': match.start()
                    })

        # Pattern 2: LLM-as-Judge feedback
        judge_patterns = [
            r'LLM-as-Judge.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐Ÿ“Š|๐Ÿš€|$)',
            r'Judge Feedback.*?----------------------------------------\s*(.*?)(?=----------------------------------------|๐Ÿ“Š|๐Ÿš€|$)',
        ]

        for pattern in judge_patterns:
            for match in re.finditer(pattern, iteration_content, re.DOTALL):
                feedback_text = match.group(1).strip()
                if feedback_text and len(feedback_text) > 10:
                    feedback_list.append({
                        'feedback': feedback_text,
                        'position': match.start(),
                        'source': 'LLM-as-Judge'
                    })

        # Sort by position
        feedback_list.sort(key=lambda x: x['position'])

        return feedback_list

    def extract_scores(self, iteration_content: str) -> List[Dict]:
        """
        Extract scores from iteration content.

        Args:
            iteration_content: Content for a single iteration

        Returns:
            List of score dictionaries ('score', 'position'), sorted by
            position
        """
        scores = []

        # Pattern for scores
        score_patterns = [
            r'Score:\s*([\d.]+)',
            r'Average score:\s*([\d.]+)',
            r'๐ŸŽฏ SCORE:\s*([\d.]+)',
            r'๐Ÿ“Š Score:\s*([\d.]+)',
        ]

        for pattern in score_patterns:
            for match in re.finditer(pattern, iteration_content):
                scores.append({
                    'score': float(match.group(1)),
                    'position': match.start()
                })

        # Sort by position
        scores.sort(key=lambda x: x['position'])

        return scores

    def parse_all(self) -> Dict:
        """
        Parse entire log file and extract all information.

        Returns:
            Dictionary with 'iterations' (per-iteration candidates, feedback
            and scores), 'total_iterations', 'all_candidates' and
            'all_feedback'
        """
        iterations = self.extract_iterations()

        result = {
            'iterations': [],
            'total_iterations': len(iterations),
            'all_candidates': [],
            'all_feedback': []
        }

        for iter_info in iterations:
            iter_content = iter_info['content']

            candidates = self.extract_candidates(iter_content)
            feedback = self.extract_feedback(iter_content)
            scores = self.extract_scores(iter_content)

            for candidate in candidates:
                candidate_pos = candidate['position']

                # Scores are position-sorted, so the first one after the
                # candidate is also the nearest.
                nearest_score = next(
                    (s['score'] for s in scores if s['position'] > candidate_pos),
                    None
                )
                if nearest_score is not None:
                    candidate['score'] = nearest_score

                # Same for feedback, but only within a reasonable distance.
                nearest_feedback = next(
                    (fb['feedback'] for fb in feedback
                     if candidate_pos < fb['position'] < candidate_pos + 5000),
                    None
                )
                if nearest_feedback:
                    candidate['feedback'] = nearest_feedback

            result['iterations'].append({
                'iteration': iter_info['iteration'],
                'candidates': candidates,
                'feedback': feedback,
                'scores': scores
            })

            result['all_candidates'].extend(candidates)
            result['all_feedback'].extend(feedback)

        return result
# --- src/gepa_optimizer/utils/logging.py (backward-compatible shim) ---

def get_logger(name: str) -> logging.Logger:
    """
    Return a configured logger for *name*.

    Backward-compatible shim; new code should import get_logger directly
    from gepa_optimizer.infrastructure.logging.

    Args:
        name: Module name (typically __name__)

    Returns:
        Configured Logger instance
    """
    return _get_logger(name)


# --- src/gepa_optimizer/utils/metrics.py ---

def calculate_metrics(original_prompt: str,
                      optimized_prompt: str,
                      performance_data: Optional[Dict[str, Any]] = None) -> Dict[str, float]:
    """
    Calculate comprehensive improvement metrics between original and optimized prompts.

    Args:
        original_prompt: Original seed prompt
        optimized_prompt: GEPA-optimized prompt
        performance_data: Optional performance metrics from GEPA; numeric
            entries are copied through with a 'gepa_' key prefix

    Returns:
        Dict[str, float]: Length, word-count, complexity and similarity metrics
    """

    def _pct_change(before: int, after: int) -> float:
        # Percent change relative to the original; 0.0 when the original is empty.
        return ((after - before) / before) * 100 if before > 0 else 0.0

    orig_len, opt_len = len(original_prompt), len(optimized_prompt)
    orig_words, opt_words = len(original_prompt.split()), len(optimized_prompt.split())
    orig_cx = calculate_text_complexity(original_prompt)
    opt_cx = calculate_text_complexity(optimized_prompt)

    metrics: Dict[str, float] = {
        'length_change_percent': _pct_change(orig_len, opt_len),
        'original_length': orig_len,
        'optimized_length': opt_len,
        'word_change_percent': _pct_change(orig_words, opt_words),
        'original_words': orig_words,
        'optimized_words': opt_words,
        'original_complexity': orig_cx,
        'optimized_complexity': opt_cx,
        'complexity_change': opt_cx - orig_cx,
        'similarity_score': calculate_similarity(original_prompt, optimized_prompt),
    }

    # Fold in any numeric GEPA performance data under a 'gepa_' prefix.
    if performance_data:
        metrics.update({
            f'gepa_{key}': float(value)
            for key, value in performance_data.items()
            if isinstance(value, (int, float))
        })

    return metrics


def calculate_text_complexity(text: str) -> float:
    """
    Calculate a simple complexity score for text.

    The score is a weighted blend of average word length, lexical diversity
    and average sentence length.

    Args:
        text: Text to analyze

    Returns:
        float: Complexity score rounded to 3 decimals (higher = more complex)
    """
    if not text:
        return 0.0

    tokens = text.split()
    n_words = len(tokens)
    # Whitespace-only input has no words — avoid division by zero.
    if n_words == 0:
        return 0.0

    n_sentences = len(re.findall(r'[.!?]+', text))
    avg_word_len = len(text) / n_words
    diversity = len(set(text.lower().split())) / n_words
    avg_sentence_len = n_words / max(n_sentences, 1)

    # Weighted complexity score
    return round(avg_word_len * 0.3 + diversity * 0.4 + avg_sentence_len * 0.3, 3)


def calculate_similarity(text1: str, text2: str) -> float:
    """
    Calculate similarity between two texts using simple word overlap.

    Args:
        text1: First text
        text2: Second text

    Returns:
        float: Jaccard similarity between 0 and 1, rounded to 3 decimals
    """
    if not text1 or not text2:
        return 0.0

    vocab_a = set(text1.lower().split())
    vocab_b = set(text2.lower().split())

    union = vocab_a | vocab_b
    if not union:
        return 0.0

    return round(len(vocab_a & vocab_b) / len(union), 3)
+def track_optimization_progress(iteration: int, + score: float, + improvement: float, + time_elapsed: float) -> Dict[str, Any]: + """ + Track progress during optimization iterations + + Args: + iteration: Current iteration number + score: Current performance score + improvement: Improvement over baseline + time_elapsed: Time elapsed in seconds + + Returns: + Dict[str, Any]: Progress metrics + """ + return { + 'iteration': iteration, + 'score': round(score, 4), + 'improvement': round(improvement, 4), + 'time_elapsed': round(time_elapsed, 2), + 'score_per_second': round(score / max(time_elapsed, 0.001), 4) + } + +def calculate_cost_efficiency(improvement_percent: float, + estimated_cost: float) -> Dict[str, float]: + """ + Calculate cost efficiency metrics + + Args: + improvement_percent: Performance improvement percentage + estimated_cost: Estimated cost in USD + + Returns: + Dict[str, float]: Cost efficiency metrics + """ + if estimated_cost <= 0: + return {'improvement_per_dollar': 0.0, 'cost_efficiency': 0.0} + + improvement_per_dollar = improvement_percent / estimated_cost + + # Cost efficiency score (higher is better) + cost_efficiency = min(improvement_per_dollar / 10.0, 1.0) # Normalized to 0-1 + + return { + 'improvement_per_dollar': round(improvement_per_dollar, 3), + 'cost_efficiency': round(cost_efficiency, 3), + 'estimated_cost': estimated_cost + } + +def summarize_optimization_results(metrics: Dict[str, float]) -> str: + """ + Create a human-readable summary of optimization results + + Args: + metrics: Metrics dictionary from calculate_metrics + + Returns: + str: Human-readable summary + """ + summary_parts = [] + + # Length changes + length_change = metrics.get('length_change_percent', 0) + if length_change > 5: + summary_parts.append(f"Prompt expanded by {length_change:.1f}%") + elif length_change < -5: + summary_parts.append(f"Prompt condensed by {abs(length_change):.1f}%") + else: + summary_parts.append("Prompt length remained similar") + + # 
"""
Pareto Front Logger - Tracks candidate comparisons and Pareto front updates
"""

from typing import Dict, List, Optional
from collections import defaultdict
import logging

logger = logging.getLogger(__name__)

class ParetoLogger:
    """Tracks candidate evaluations and maintains a single-objective Pareto front.

    With a single fitness score, "dominance" reduces to a strict score
    comparison, so the front holds the best-scoring candidates seen so far
    (score ties are kept as non-dominated).

    Invariants:
    - The seed prompt (S₀) is always admitted and defines the baseline f(S₀).
    - Non-seed candidates must strictly beat the baseline to be considered.
    """

    def __init__(self):
        # Every evaluation observed: {'prompt', 'score', 'type', 'dataset'}
        self.candidates_evaluated = []
        # Current front, kept sorted by score desc: {'prompt', 'score', 'type', 'notation'}
        self.pareto_front = []
        # f(S₀); must be set before any non-seed candidate is evaluated
        self.baseline_score = None

    @staticmethod
    def _notation_for(candidate_type: str) -> str:
        """Map a candidate type to its display notation (S₀, Sᵣ, Oₓₒ, Oₘᵤₜ).

        Uses prefix matching so numbered variants (llego_crossover1, ...)
        map consistently, matching the logic in batch_update_pareto_front.
        """
        if candidate_type == 'gepa_reflection':
            return 'Sᵣ'
        if candidate_type and candidate_type.startswith('llego_crossover'):
            return 'Oₓₒ'
        if candidate_type and candidate_type.startswith('llego_mutation'):
            return 'Oₘᵤₜ'
        if candidate_type == 'seed':
            return 'S₀'
        return 'S'

    def log_candidate_evaluation(self, prompt: str, score: float, candidate_type: str, dataset_type: str):
        """Record an evaluation; Dpareto evaluations also update the front."""
        self.candidates_evaluated.append({
            'prompt': prompt,
            'score': score,
            'type': candidate_type,
            'dataset': dataset_type
        })

        # Only candidates evaluated on Dpareto compete for the Pareto front
        if dataset_type == 'dpareto':
            self._check_pareto_update(prompt, score, candidate_type)

    def _check_pareto_update(self, prompt: str, score: float, candidate_type: str):
        """Check if a candidate should be added to the Pareto front.

        CRITICAL RULE: a candidate must beat the baseline f(S₀) to enter the
        Pareto front.  Exception: the seed prompt (S₀) itself is always added
        as the baseline.
        """
        cand_notation = self._notation_for(candidate_type)

        logger.info("\n" + "═" * 80)
        logger.info(f"📊 PARETO FRONT P ANALYSIS - Evaluating {cand_notation}")
        logger.info("═" * 80)
        logger.info(f"\n   📊 Evaluating: {cand_notation} with f({cand_notation}) = {score:.4f}")

        if candidate_type == 'seed':
            # Seed is always admitted and defines the baseline
            logger.info(f"\n   ✅ {cand_notation} is seed prompt - always added as baseline")
            if self.baseline_score is None:
                self.baseline_score = score
                logger.info(f"   💡 Setting baseline: f(S₀) = {score:.4f}")

            self.pareto_front.append({
                'prompt': prompt,
                'score': score,
                'type': candidate_type,
                'notation': cand_notation
            })
            self.pareto_front.sort(key=lambda x: x['score'], reverse=True)

            front_notations = [c.get('notation', 'S') for c in self.pareto_front]
            logger.info(f"\n   ✅ ADDED to Pareto Front P (baseline)")
            logger.info(f"      P = {{{', '.join(front_notations)}}}")
            self._display_pareto_front()
            return  # Seed is always added - no dominance check needed

        # Non-seed candidates must beat the baseline to proceed
        if self.baseline_score is None:
            # Baseline is a hard precondition; reject to maintain correctness.
            # (Previously this branch also wrote to a hard-coded absolute
            # debug-log path on the author's machine, which crashed with
            # FileNotFoundError everywhere else; logging is sufficient.)
            logger.error(f"\n   ❌ CRITICAL ERROR: Baseline score not set!")
            logger.error(f"      Cannot evaluate {cand_notation} without baseline f(S₀)")
            logger.error(f"      💡 Seed prompt must be evaluated on Dpareto first")
            logger.error(f"      💡 Rejecting candidate to maintain correctness")
            return

        if score <= self.baseline_score:
            logger.info(f"\n   ❌ {cand_notation} does NOT meet baseline requirement:")
            logger.info(f"      f(S₀) = {self.baseline_score:.4f} (baseline)")
            logger.info(f"      f({cand_notation}) = {score:.4f}")
            logger.info(f"      f({cand_notation}) ≤ f(S₀) → NOT ADDED to Pareto front")
            logger.info(f"      💡 Difference: {score - self.baseline_score:.4f} (needs to be > 0)")
            return

        logger.info(f"\n   ✅ {cand_notation} meets baseline requirement:")
        logger.info(f"      f(S₀) = {self.baseline_score:.4f} (baseline)")
        logger.info(f"      f({cand_notation}) = {score:.4f} → can be added to Pareto front")
        logger.info(f"      Improvement over baseline: +{score - self.baseline_score:.4f}")

        # Dominance pass: higher score dominates (single objective)
        dominated = [i for i, fc in enumerate(self.pareto_front) if score > fc['score']]
        for i in dominated:
            fc = self.pareto_front[i]
            logger.info(f"\n   ✅ {cand_notation} DOMINATES {fc.get('notation', 'S')}: "
                        f"f({cand_notation})={score:.4f} > {fc['score']:.4f}")

        if dominated:
            # Remove dominated entries (reverse order keeps indices valid)
            for i in reversed(dominated):
                removed = self.pareto_front.pop(i)
                logger.info(f"      ➡️ Removing {removed.get('notation', 'S')} from P (dominated by {cand_notation})")
            self._add_to_front(prompt, score, candidate_type, cand_notation)
        else:
            # Is this candidate dominated by anything already on the front?
            dominating = [c for c in self.pareto_front if c['score'] > score]
            if dominating:
                for dom in dominating:
                    logger.info(f"\n   ❌ {cand_notation} is DOMINATED by {dom.get('notation', 'S')}: "
                                f"f({dom.get('notation', 'S')})={dom['score']:.4f} > f({cand_notation})={score:.4f}")
                logger.info(f"\n   ❌ NOT ADDED to Pareto Front P (dominated)")
            else:
                # Non-dominated (possibly score-tied) → keep on the front
                equal = [c.get('notation', 'S') for c in self.pareto_front
                         if abs(c['score'] - score) < 1e-6]
                self._add_to_front(prompt, score, candidate_type, cand_notation)
                if equal:
                    logger.info(f"      f({cand_notation}) = {score:.4f} (same score as {', '.join(equal)})")

        self._display_pareto_front()

    def _add_to_front(self, prompt: str, score: float, candidate_type: str, notation: str):
        """Append a candidate to the front, re-sort, and log the new front."""
        self.pareto_front.append({
            'prompt': prompt,
            'score': score,
            'type': candidate_type,
            'notation': notation
        })
        self.pareto_front.sort(key=lambda x: x['score'], reverse=True)
        front_notations = [c.get('notation', 'S') for c in self.pareto_front]
        logger.info(f"\n   ✅ ADDED to Pareto Front P")
        logger.info(f"      P = {{{', '.join(front_notations)}}}")

    def _display_pareto_front(self):
        """Display the current Pareto front with candidate notation."""
        logger.info(f"\n📋 CURRENT PARETO FRONT P (Size: |P| = {len(self.pareto_front)}):")
        logger.info("─" * 80)

        if not self.pareto_front:
            logger.info("   P = {} (Empty - no candidates added yet)")
            logger.info("   💡 NOTATION: P = Pareto front (non-dominated solutions)")
            return

        front_notations = [c.get('notation', 'S') for c in self.pareto_front]
        logger.info(f"   P = {{{', '.join(front_notations)}}}")

        # Human-readable labels per candidate type
        type_labels = {
            'seed': ('🌱 Seed Prompt', 'S₀'),
            'gepa_reflection': ('📝 GEPA Reflection Candidate', 'Sᵣ'),
            'llego_crossover': ('🔀 LLEGO Crossover Offspring', 'Oₓₒ'),
            'llego_mutation': ('🎲 LLEGO Mutation Offspring', 'Oₘᵤₜ'),
            'unknown': ('🔄 Unknown Candidate', 'S')
        }

        for candidate in self.pareto_front:
            notation = candidate.get('notation', 'S')
            cand_type = candidate.get('type', 'unknown')
            type_label, type_notation = type_labels.get(cand_type, (f'🔄 {cand_type}', notation))
            # Prefer the stored notation; fall back to the type's default
            display_notation = notation if notation != 'S' else type_notation

            logger.info(f"\n   {display_notation}: {type_label}")
            logger.info(f"      f({display_notation}) = {candidate['score']:.4f}")
            prompt_text = candidate['prompt']
            preview = prompt_text[:150]
            suffix = '...' if len(prompt_text) > 150 else ''
            logger.info(f"      Prompt ({len(prompt_text)} chars): {preview}{suffix}")

        logger.info(f"\n   💡 NOTATION EXPLANATION:")
        logger.info(f"      P = Pareto front (set of non-dominated solutions)")
        logger.info(f"      S₀ = Seed prompt (baseline)")
        logger.info(f"      Sᵣ = GEPA Reflection candidate")
        logger.info(f"      Oₓₒ = LLEGO Crossover offspring (combines parents)")
        logger.info(f"      Oₘᵤₜ = LLEGO Mutation offspring (explores variations)")
        logger.info(f"      f({', '.join(front_notations[:3])}) = Fitness scores of candidates in Pareto front")
        logger.info("─" * 80)

    def set_baseline(self, score: float):
        """Set the baseline score f(S₀) used to gate non-seed candidates."""
        self.baseline_score = score
        # Annotate the seed entry if a front already exists (first is usually
        # the seed — presumably guaranteed by the adapter; verify at call site)
        if self.pareto_front:
            self.pareto_front[0]['baseline_score'] = score

    def batch_update_pareto_front(self, candidates_with_scores: List[Dict]) -> List[Dict]:
        """
        🔥 BATCH PARETO FRONT UPDATE

        Efficiently update the Pareto front with multiple candidates:
          1. Filter by baseline (score > baseline_score; seed always passes)
          2. Keep only candidates non-dominated within the batch
          3. Remove current front members dominated by the batch
          4. Add batch members not dominated by the remaining front

        Args:
            candidates_with_scores: dicts with keys 'prompt', 'score', 'type'
                and optionally 'notation' (generated from 'type' if missing)

        Returns:
            List of candidates that were added to the Pareto front
        """
        if not candidates_with_scores:
            return []

        logger.info("\n" + "═" * 80)
        logger.info(f"🔥 BATCH PARETO FRONT UPDATE - Processing {len(candidates_with_scores)} candidates")
        logger.info("═" * 80)

        # Step 1: baseline filter
        if self.baseline_score is None:
            logger.error("❌ Baseline score not set - cannot perform batch update")
            logger.error("   💡 Seed prompt must be evaluated on Dpareto first")
            return []

        baseline = self.baseline_score
        filtered = []
        for cand in candidates_with_scores:
            score = cand.get('score', 0.0)
            notation = cand.get('notation', 'S')
            if cand.get('type', 'unknown') == 'seed':
                # Seed is the baseline itself - always included
                filtered.append(cand)
            elif score > baseline:
                filtered.append(cand)
                logger.info(f"   ✅ {notation} passes baseline: f={score:.4f} > f(S₀)={baseline:.4f}")
            else:
                logger.info(f"   ❌ {notation} fails baseline: f={score:.4f} ≤ f(S₀)={baseline:.4f}")

        if not filtered:
            logger.info(f"\n   ❌ No candidates pass baseline filter (baseline: {baseline:.4f})")
            return []
        logger.info(f"\n   📊 After baseline filter: {len(filtered)}/{len(candidates_with_scores)} candidates remain")

        # Step 2: intra-batch dominance (sorted desc so only earlier entries
        # can dominate; equal scores survive as non-dominated)
        filtered_sorted = sorted(filtered, key=lambda x: x.get('score', 0.0), reverse=True)
        non_dominated_batch = []
        for i, cand in enumerate(filtered_sorted):
            cand_score = cand.get('score', 0.0)
            if any(other.get('score', 0.0) > cand_score for other in filtered_sorted[:i]):
                logger.info(f"   ❌ {cand.get('notation', 'S')} dominated within batch: f={cand_score:.4f}")
            else:
                non_dominated_batch.append(cand)
                logger.info(f"   ✅ {cand.get('notation', 'S')} is non-dominated in batch: f={cand_score:.4f}")

        if not non_dominated_batch:
            logger.info("   ❌ No non-dominated candidates in batch")
            return []
        logger.info(f"\n   📊 After batch dominance check: {len(non_dominated_batch)}/{len(filtered)} non-dominated candidates")

        # Step 3a: drop current front members dominated by the batch
        candidates_to_remove = []
        for front_cand in self.pareto_front:
            front_score = front_cand.get('score', 0.0)
            if any(nc.get('score', 0.0) > front_score for nc in non_dominated_batch):
                candidates_to_remove.append(front_cand)
                logger.info(f"   ➡️ {front_cand.get('notation', 'S')} will be removed (dominated): f={front_score:.4f}")
        for cand_to_remove in candidates_to_remove:
            if cand_to_remove in self.pareto_front:
                self.pareto_front.remove(cand_to_remove)

        # Step 3b: add batch members not dominated by the remaining front
        added_to_front = []
        for new_cand in non_dominated_batch:
            new_score = new_cand.get('score', 0.0)
            new_type = new_cand.get('type', 'unknown')
            new_prompt = new_cand.get('prompt', '')
            new_notation = new_cand.get('notation', 'S')
            if 'notation' not in new_cand:
                new_notation = self._notation_for(new_type)

            if any(fc.get('score', 0.0) > new_score for fc in self.pareto_front):
                logger.info(f"   ❌ {new_notation} dominated by existing front member: f={new_score:.4f}")
                continue

            self.pareto_front.append({
                'prompt': new_prompt,
                'score': new_score,
                'type': new_type,
                'notation': new_notation
            })
            added_to_front.append(new_cand)
            # Also record the evaluation for tracking purposes
            self.candidates_evaluated.append({
                'prompt': new_prompt,
                'score': new_score,
                'type': new_type,
                'dataset': 'dpareto'
            })
            logger.info(f"   ✅ {new_notation} ADDED to Pareto front: f={new_score:.4f}")

        self.pareto_front.sort(key=lambda x: x.get('score', 0.0), reverse=True)

        logger.info(f"\n{'═'*80}")
        logger.info(f"✅ BATCH UPDATE COMPLETE")
        logger.info(f"   Added: {len(added_to_front)} candidates")
        logger.info(f"   Removed: {len(candidates_to_remove)} dominated candidates")
        logger.info(f"   Pareto front size: |P| = {len(self.pareto_front)}")
        front_notations = [c.get('notation', 'S') for c in self.pareto_front]
        logger.info(f"   P = {{{', '.join(front_notations)}}}")
        self._display_pareto_front()
        logger.info("═" * 80 + "\n")

        return added_to_front

# Global instance (reset per optimization run via reset_pareto_logger)
_pareto_logger = ParetoLogger()

def get_pareto_logger() -> ParetoLogger:
    """Get the global Pareto logger instance."""
    return _pareto_logger

def reset_pareto_logger() -> ParetoLogger:
    """Reset the global Pareto logger instance (for new runs).

    Note: a previous revision appended a JSON record to a hard-coded
    absolute path under the author's home directory here, which raised
    FileNotFoundError on every other machine; the write has been removed.
    """
    global _pareto_logger
    _pareto_logger = ParetoLogger()
    logger.debug("Pareto logger reset (baseline cleared, front emptied)")
    return _pareto_logger
from typing import Dict, Any, Optional


def _display_or_placeholder(text: Optional[str], placeholder: str) -> str:
    """Return *text*, or *placeholder* when it is None/empty/whitespace-only.

    Shared by the predicted/expected display logic so the empty-output
    handling cannot drift between the two code paths.
    """
    if not text or text.strip() == '':
        return placeholder
    return text


def build_universal_judge_prompt(
    task_input: str,
    predicted_output: str,
    expected_output: str,
    current_prompt: Optional[str] = None,
    evaluation_results: Optional[Dict[str, Any]] = None,
    image_base64: Optional[str] = None
) -> str:
    """
    Build a universal LLM-as-Judge prompt for ANY task type.

    Works for text extraction, JSON generation, classification, QA, code
    generation, and multi-modal tasks (with images).

    Args:
        task_input: The input given to the LLM (task/question/text to process)
        predicted_output: What the LLM actually returned
        expected_output: The ground truth / desired output
        current_prompt: The system prompt being optimized
        evaluation_results: Optional evaluation scores (composite_score,
            semantic_similarity, structural_similarity)
        image_base64: Optional image for multi-modal tasks (only its presence
            is used here; the image itself is sent separately)

    Returns:
        Formatted judge prompt string
    """
    predicted_display = _display_or_placeholder(
        predicted_output, "[EMPTY - No output generated]")
    expected_display = _display_or_placeholder(
        expected_output, "[EMPTY - No expected output provided]")

    # Optional scores block, included only when evaluation data is supplied
    eval_context = ""
    if evaluation_results:
        score = evaluation_results.get('composite_score', 0.0)
        semantic = evaluation_results.get('semantic_similarity', 0.0)
        structural = evaluation_results.get('structural_similarity', 0.0)
        eval_context = f"""
EVALUATION SCORES:
- Composite Score: {score:.2%}
- Semantic Similarity: {semantic:.2%}
- Structural Similarity: {structural:.2%}
"""

    # Multi-modal note, included only when an image accompanies the task
    image_context = ""
    if image_base64:
        image_context = """
NOTE: An image was provided with this task. The LLM should have analyzed the image content.
Consider whether the predicted output accurately reflects the image content.
"""

    # 3-Layer Forensic Analysis template: Syntax -> Structure -> Semantics.
    # NOTE(review): the section delimiters between blocks below appear to
    # have been lost in extraction (blank runs where tags likely were) —
    # confirm against the original template.
    judge_prompt = f"""
You are a **Principal Forensic Prompt Auditor**. Your specialty is analyzing failures in Enterprise AI systems.
Your goal is to compare a [PREDICTED_OUTPUT] against an [EXPECTED_OUTPUT] to identify the *exact* root cause of failure in the [SYSTEM_PROMPT].


{task_input}


{current_prompt if current_prompt else "[No system prompt provided - Baseline Test]"}


{expected_display}


{predicted_display}

{eval_context}
{image_context}


You must evaluate the prediction using a 3-Layer Depth approach:

1. **SYNTAX LAYER (Format)**:
   - Is the output valid JSON/XML/Code?
   - Are data types correct? (e.g., string "100" vs number 100).
   - Are required headers or markdown tags present?

2. **STRUCTURAL LAYER (Schema)**:
   - For JSON: Do specific paths match? (e.g., check `orders[0].items[3].price`).
   - For Lists: Is the count correct? Are items in the correct order?
   - **CRITICAL**: Identify the *exact* nested key that failed.

3. **SEMANTIC LAYER (Meaning)**:
   - "Phone" vs "Mobile Device" (Acceptable Synonym).
   - "User is 25" vs "Age: 25" (Acceptable Logic).
   - Hallucinations: Did the model invent data not in the source?


Return a JSON object analyzing the failure. NO preamble.
{{
  "match_status": "FULL_MATCH" | "PARTIAL_MATCH" | "CRITICAL_FAILURE",
  "structural_analysis": {{
    "format_valid": true,
    "schema_compliance": true,
    "deep_diff": ["List specific paths that failed, e.g., 'data.users[0].id expected int, got string'"]
  }},
  "semantic_analysis": {{
    "meaning_preserved": true,
    "hallucinations": ["List specific invented facts"],
    "missed_constraints": ["List specific constraints from prompt that were ignored"]
  }},
  "root_cause_hypothesis": "Why did the prompt fail? (e.g., 'Ambiguity in field naming', 'Lack of negative constraint for X')",
  "surgical_fix": "The EXACT instruction to add/change. (e.g., 'Change: Extract entities -> To: Extract entities and return as JSON list of objects')"
}}
"""

    return judge_prompt


def get_universal_judge_system_prompt(has_image: bool = False) -> str:
    """
    Get the system prompt for the universal LLM-as-Judge.

    Args:
        has_image: Whether an image is involved in the task; appends an
            extra instruction about visual-content accuracy when True

    Returns:
        System prompt string for the judge
    """
    base_prompt = """You are a **Principal Forensic Prompt Auditor** specializing in Enterprise AI system failures.

Your task is to:
1. Perform 3-Layer Analysis: SYNTAX (format) → STRUCTURE (schema) → SEMANTICS (meaning)
2. Identify the EXACT nested path that failed (e.g., `data.items[2].price`)
3. Provide a ROOT CAUSE hypothesis for why the prompt failed
4. Deliver a SURGICAL FIX - the exact instruction to add or change

Key principles:
- DEEP DIFF: Traverse nested JSON structures to find exact failure points
- SEMANTIC FLEXIBILITY: "Phone" == "Mobile Device" (synonyms OK)
- STRICT DATA: Wrong IDs, numbers, or hallucinated facts = CRITICAL_FAILURE
- ROOT CAUSE: Explain WHY the prompt failed (ambiguity? missing constraint?)

Return your analysis as valid JSON only. No preamble."""

    if has_image:
        base_prompt += """

Note: This task involved image analysis. Factor visual content accuracy into your
SEMANTIC LAYER analysis. Did the model correctly interpret the image?"""

    return base_prompt


def format_universal_judge_feedback(
    judge_output: str,
    task_input: str,
    predicted_output: str,
    expected_output: str,
    score: float = 0.0
) -> str:
    """
    Format the LLM-as-Judge output into readable feedback.

    Handles the forensic analysis JSON schema with structural/semantic
    layers; falls back to echoing the raw judge output when no parseable
    JSON object is found.

    Args:
        judge_output: Raw output from the judge LLM
        task_input: The original task input
        predicted_output: The LLM's predicted output
        expected_output: The expected output
        score: Evaluation score

    Returns:
        Formatted feedback string
    """
    import json
    import re

    # Greedy first-{ to last-} capture: assumes the judge emits one JSON
    # object, per its instructions
    json_match = re.search(r'\{[\s\S]*\}', judge_output)

    if json_match:
        try:
            analysis = json.loads(json_match.group(0))

            match_status = analysis.get('match_status', 'CRITICAL_FAILURE')
            status_icon = ('✅' if match_status == 'FULL_MATCH'
                           else '⚠️' if match_status == 'PARTIAL_MATCH' else '❌')

            structural = analysis.get('structural_analysis', {})
            deep_diff = structural.get('deep_diff', [])
            deep_diff_str = '\n  - '.join(deep_diff) if deep_diff else 'No structural issues'

            semantic = analysis.get('semantic_analysis', {})
            hallucinations = semantic.get('hallucinations', [])
            hallucinations_str = '\n  - '.join(hallucinations) if hallucinations else 'None detected'
            missed_constraints = semantic.get('missed_constraints', [])
            missed_str = '\n  - '.join(missed_constraints) if missed_constraints else 'None'

            feedback = f"""{status_icon} Forensic Analysis (Score: {score:.2%}) - {match_status}

📊 STRUCTURAL ANALYSIS (Schema Layer):
  Format Valid: {'✅' if structural.get('format_valid', True) else '❌'}
  Schema Compliance: {'✅' if structural.get('schema_compliance', True) else '❌'}
  Deep Diff Issues:
  - {deep_diff_str}

🧠 SEMANTIC ANALYSIS (Meaning Layer):
  Meaning Preserved: {'✅' if semantic.get('meaning_preserved', True) else '❌'}
  Hallucinations:
  - {hallucinations_str}
  Missed Constraints:
  - {missed_str}

🔬 ROOT CAUSE HYPOTHESIS:
{analysis.get('root_cause_hypothesis', 'Unable to determine root cause')}

💉 SURGICAL FIX:
{analysis.get('surgical_fix', 'No specific fix suggested')}

💭 CONTEXT:
- Task: "{task_input[:200]}{'...' if len(task_input) > 200 else ''}"
- Expected: {expected_output[:200]}{'...' if len(expected_output) > 200 else ''}
- Predicted: {predicted_output[:200] if predicted_output else '[EMPTY]'}{'...' if predicted_output and len(predicted_output) > 200 else ''}"""

            return feedback

        except json.JSONDecodeError:
            # Unparseable JSON → deliberately fall through to raw fallback
            pass

    # Fallback: return raw output with a header
    return f"""Forensic Analysis (Score: {score:.2%}):

{judge_output}

💭 CONTEXT:
- Task: "{task_input[:200]}{'...' if len(task_input) > 200 else ''}"
- Expected: {expected_output[:200]}{'...' if len(expected_output) > 200 else ''}
- Predicted: {predicted_output[:200] if predicted_output else '[EMPTY]'}"""
Issues: + - {deep_diff_str} + +๐Ÿง  SEMANTIC ANALYSIS (Meaning Layer): + Meaning Preserved: {'โœ…' if semantic.get('meaning_preserved', True) else 'โŒ'} + Hallucinations: + - {hallucinations_str} + Missed Constraints: + - {missed_str} + +๐Ÿ”ฌ ROOT CAUSE HYPOTHESIS: +{analysis.get('root_cause_hypothesis', 'Unable to determine root cause')} + +๐Ÿ’‰ SURGICAL FIX: +{analysis.get('surgical_fix', 'No specific fix suggested')} + +๐Ÿ’ญ CONTEXT: +- Task: "{task_input[:200]}{'...' if len(task_input) > 200 else ''}" +- Expected: {expected_output[:200]}{'...' if len(expected_output) > 200 else ''} +- Predicted: {predicted_output[:200] if predicted_output else '[EMPTY]'}{'...' if predicted_output and len(predicted_output) > 200 else ''}""" + + return feedback + + except json.JSONDecodeError: + pass + + # Fallback: return raw output with header + return f"""Forensic Analysis (Score: {score:.2%}): + +{judge_output} + +๐Ÿ’ญ CONTEXT: +- Task: "{task_input[:200]}{'...' if len(task_input) > 200 else ''}" +- Expected: {expected_output[:200]}{'...' if len(expected_output) > 200 else ''} +- Predicted: {predicted_output[:200] if predicted_output else '[EMPTY]'}""" + + +def build_empty_output_feedback( + task_input: str, + expected_output: str, + current_prompt: Optional[str] = None +) -> str: + """ + Generate feedback specifically for when the LLM produces no output. + + Args: + task_input: The task input + expected_output: What was expected + current_prompt: The current system prompt + + Returns: + Feedback explaining the empty output issue + """ + return f"""โŒ CRITICAL: Empty Output Generated + +๐Ÿ” PROBLEM: +The LLM produced NO OUTPUT for this task. + +๐Ÿ“‹ TASK INPUT: +{task_input[:500]}{'...' if len(task_input) > 500 else ''} + +๐Ÿ“‹ EXPECTED OUTPUT: +{expected_output[:500]}{'...' if len(expected_output) > 500 else ''} + +๐Ÿ“‹ CURRENT PROMPT: +{current_prompt[:300] if current_prompt else '[No prompt provided]'}{'...' 
if current_prompt and len(current_prompt) > 300 else ''} + +๐Ÿ” LIKELY CAUSES: +1. Prompt is too vague - doesn't clearly specify what output is expected +2. Prompt lacks output format instructions +3. Prompt might be confusing the LLM about what action to take +4. Task input might not align with prompt expectations + +๐Ÿ’ก SUGGESTED FIX: +Add explicit output instructions to the prompt: +- "You MUST provide a response for every input" +- "Always output in the following format: ..." +- "Extract and return: [specific fields]" + +๐Ÿ“ EXAMPLE IMPROVEMENT: +If extracting JSON, add: "Extract the following fields and return as JSON: [list expected fields from expected output]" +""" + diff --git a/src/gepa_optimizer/version.py b/src/gepa_optimizer/version.py new file mode 100644 index 0000000000000000000000000000000000000000..4aa6349bd4b32a6d902f3d2b32367b0436f33142 --- /dev/null +++ b/src/gepa_optimizer/version.py @@ -0,0 +1,5 @@ +""" +Version information for GEPA Optimizer +""" + +__version__ = "0.1.0"