# Source: Hugging Face Space "Suhasdev / Universal-prompt-Optimizer" (status: Sleeping)
# File size: 9,918 Bytes
# Revision: cacd4d0

"""
Index Caching Dataset Loader

Loads index caching dataset from JSON file (note2_debug.json format) and converts to GEPA-compatible format.
"""

import os
import json
import base64
import logging
from typing import List, Dict, Any, Optional
from pathlib import Path

logger = logging.getLogger(__name__)


class IndexCachingDatasetLoader:
    """
    Loads an index caching dataset from a JSON file and converts it to GEPA format.

    Expected JSON format:
    [
        {
            "command": "Tap on first option from the suggestion",
            "image": "element_images/QMxgc_14_0_tap_IkALe_element.png",
            "xml": "xml/IkALe__debug.xml",
            "expected": {
                "is_index_based": true,
                "index_value": 1,
                "parent_element_id": "aaaabf",
                "element_id_of_nth_child_of_parent": "aaaabg",
                "selected_element_is_correct": true
            }
        },
        ...
    ]

    Each entry is converted to a GEPA item with these keys:
    - input: command text followed by the XML content in a fenced xml block
    - reflection_input: command text only (no XML)
    - output: JSON string with the expected values
    - image_base64: base64 encoded image (TOP LEVEL for UniversalConverter)
    - metadata: original fields plus resolved paths, XML content and index
    """

    def __init__(self, json_path: Optional[str] = None, base_dir: Optional[str] = None):
        """
        Initialize index caching dataset loader.

        Args:
            json_path: Path to JSON file. Default: value of the
                    INDEX_CACHING_DATASET_PATH env var, else "./note2_debug.json"
            base_dir: Base directory for resolving relative paths in JSON.
                    Default: Directory containing JSON file

        Raises:
            FileNotFoundError: If the JSON path doesn't exist or is not a file
        """
        # Get JSON path from env or use default
        if json_path is None:
            json_path = os.getenv("INDEX_CACHING_DATASET_PATH", "./note2_debug.json")

        self.json_path = Path(json_path).resolve()

        # is_file() (not exists()) so that a directory path is rejected here
        # instead of failing later inside open().
        if not self.json_path.is_file():
            raise FileNotFoundError(
                f"Dataset file not found: {self.json_path}\n"
                f"Make sure note2_debug.json exists in the project root."
            )

        # Base directory for resolving relative paths
        if base_dir is None:
            base_dir = self.json_path.parent
        self.base_dir = Path(base_dir).resolve()

    def _resolve_data_file(self, relative_path: str, kind: str, idx: int, command: str) -> Path:
        """Resolve an entry's path against base_dir and ensure it is a regular file."""
        abs_path = (self.base_dir / relative_path).resolve()
        # An empty path resolves to base_dir itself, which exists but is a
        # directory; is_file() rejects that case as well as missing files.
        if not abs_path.is_file():
            raise FileNotFoundError(
                f"{kind} file not found: {abs_path}\n"
                f"Entry {idx + 1}: {command}"
            )
        return abs_path

    def load_dataset(self) -> List[Dict[str, Any]]:
        """
        Load dataset from JSON file and convert to GEPA format.

        Returns:
            List of dataset items in GEPA format:
            [
                {
                    "input": "<command>\\n\\nXML Content:\\n\\n<fenced XML>",  # Command + XML
                    "reflection_input": "Tap on first option from the suggestion",  # Command only
                    "output": '{"is_index_based": true, "index_value": 1, ...}',  # Expected JSON
                    "image_base64": "<base64_encoded_image>",  # TOP LEVEL
                    "metadata": {
                        "command": "...",
                        "image_path": "...",
                        "xml_path": "...",
                        "abs_image_path": "...",
                        "abs_xml_path": "...",
                        "xml_content": "...",
                        "expected": {...},
                        "dataset_index": 0
                    }
                },
                ...
            ]

        Raises:
            FileNotFoundError: If an image or XML path is missing or not a file
            json.JSONDecodeError: If JSON file is invalid
            ValueError: If the JSON top level is not a list or an entry is not an object
        """
        # Load JSON file
        with open(self.json_path, "r", encoding="utf-8") as f:
            dataset = json.load(f)

        if not isinstance(dataset, list):
            raise ValueError(
                f"Dataset file must contain a JSON array of entries, "
                f"got {type(dataset).__name__}: {self.json_path}"
            )

        gepa_dataset = []

        for idx, entry in enumerate(dataset):
            if not isinstance(entry, dict):
                raise ValueError(
                    f"Entry {idx + 1} must be a JSON object, "
                    f"got {type(entry).__name__}"
                )

            command = entry.get("command", "")
            # "or ''" also covers an explicit null, which Path cannot join.
            image_path = entry.get("image") or ""
            xml_path = entry.get("xml") or ""
            expected = entry.get("expected", {})

            # Resolve and validate both paths before reading either file
            abs_image_path = self._resolve_data_file(image_path, "Image", idx, command)
            abs_xml_path = self._resolve_data_file(xml_path, "XML", idx, command)

            # Load and encode image
            image_base64 = base64.b64encode(abs_image_path.read_bytes()).decode("utf-8")

            # Load XML content
            xml_content = abs_xml_path.read_text(encoding="utf-8")

            # Convert expected to JSON string
            expected_json = json.dumps(expected, ensure_ascii=False)

            # Create user prompt with command + XML content
            # The XML will be included in the user prompt text (as the agent does)
            user_prompt = f"{command}\n\nXML Content:\n\n```xml\n{xml_content}\n```"

            # Create GEPA format item
            gepa_dataset.append({
                "input": user_prompt,  # Command + XML content (for evaluation)
                # Reflection improves the prompt from evaluation feedback and
                # does not need the XML, so it gets the bare command.
                "reflection_input": command,
                "output": expected_json,  # Expected output as JSON string
                "image_base64": image_base64,  # TOP LEVEL for UniversalConverter
                "metadata": {
                    "command": command,
                    "image_path": str(image_path),
                    "xml_path": str(xml_path),
                    "abs_image_path": str(abs_image_path),
                    "abs_xml_path": str(abs_xml_path),
                    "xml_content": xml_content,  # Store XML separately in metadata
                    "expected": expected,
                    "dataset_index": idx,
                },
            })

        return gepa_dataset

    def load_split(
        self,
        train_ratio: float = 0.6,
        val_ratio: float = 0.4
    ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
        """
        Load dataset and split into train/val sets (no test set).

        The split is positional (no shuffling): the first
        int(total * train_ratio) items form the training set and the
        remainder forms the validation set.

        Args:
            train_ratio: Ratio for training set (default: 0.6)
            val_ratio: Ratio for validation set (default: 0.4)

        Returns:
            Tuple of (train_set, val_set)

        Raises:
            ValueError: If a ratio is negative or the ratios don't sum to 1.0
                (within a tolerance of 0.01)
        """
        # A negative ratio can still satisfy the sum check (e.g. 1.5 / -0.5)
        # and would produce a meaningless slice boundary.
        if train_ratio < 0 or val_ratio < 0:
            raise ValueError(
                f"Split ratios must be non-negative, got "
                f"train_ratio={train_ratio}, val_ratio={val_ratio}"
            )

        if abs(train_ratio + val_ratio - 1.0) > 0.01:
            raise ValueError(
                f"Split ratios must sum to 1.0, got {train_ratio + val_ratio:.3f}"
            )

        dataset = self.load_dataset()
        train_end = int(len(dataset) * train_ratio)

        return dataset[:train_end], dataset[train_end:]


def load_index_caching_dataset(
    json_path: Optional[str] = None,
    base_dir: Optional[str] = None
) -> List[Dict[str, Any]]:
    """
    Build an IndexCachingDatasetLoader and return its full dataset.

    Both arguments are forwarded unchanged to the loader's constructor.

    Args:
        json_path: Path to the dataset JSON file
        base_dir: Directory used to resolve relative paths found in the JSON

    Returns:
        The result of the loader's load_dataset() call
    """
    return IndexCachingDatasetLoader(
        json_path=json_path,
        base_dir=base_dir,
    ).load_dataset()


def load_index_caching_split(
    json_path: Optional[str] = None,
    base_dir: Optional[str] = None,
    train_ratio: float = 0.6,
    val_ratio: float = 0.4
) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Build an IndexCachingDatasetLoader and return its train/val split.

    The path arguments go to the loader's constructor; the ratios go to
    its load_split() method.

    Args:
        json_path: Path to the dataset JSON file
        base_dir: Directory used to resolve relative paths found in the JSON
        train_ratio: Share of items for the training set
        val_ratio: Share of items for the validation set

    Returns:
        The (train_set, val_set) tuple from load_split() - there is no test set
    """
    split_source = IndexCachingDatasetLoader(json_path=json_path, base_dir=base_dir)
    train_set, val_set = split_source.load_split(
        train_ratio=train_ratio,
        val_ratio=val_ratio,
    )
    return train_set, val_set


# Example usage: manual smoke test, executed only when the module is run
# directly. Expects ./note2_debug.json in the current working directory.
if __name__ == "__main__":
    print("🚀 Testing Index Caching Dataset Loader...")

    # Test loading
    try:
        loader = IndexCachingDatasetLoader(json_path="./note2_debug.json")
        dataset = loader.load_dataset()

        print(f"\n✅ Loaded {len(dataset)} items")

        # Show sample
        if dataset:
            sample = dataset[0]
            print("\n📝 Sample Item:")
            # sample['input'] holds the command plus the whole XML dump, so
            # print the bare command from metadata and only the prompt length.
            print(f"   Command: {sample['metadata']['command']}")
            print(f"   Prompt length: {len(sample['input'])}")
            print(f"   Image path: {sample['metadata']['image_path']}")
            print(f"   XML path: {sample['metadata']['xml_path']}")
            print(f"   Expected: {sample['output'][:100]}...")
            print(f"   Image base64 length: {len(sample['image_base64'])}")
            print(f"   XML content length: {len(sample['metadata'].get('xml_content', ''))}")

        # Test split
        train, val = loader.load_split()
        print("\n📊 Dataset Split:")
        print(f"   Training: {len(train)} samples")
        print(f"   Validation: {len(val)} samples")
        print("   Test: Not used (no test set)")

    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero so that
        # shells and CI can detect that the smoke test did not pass.
        print(f"❌ Error: {e}")
        raise SystemExit(1)