""" Index Caching Dataset Loader Loads index caching dataset from JSON file (note2_debug.json format) and converts to GEPA-compatible format. """ import os import json import base64 import logging from typing import List, Dict, Any, Optional from pathlib import Path logger = logging.getLogger(__name__) class IndexCachingDatasetLoader: """ Loads index caching dataset from JSON file. Expected JSON format: [ { "command": "Tap on first option from the suggestion", "image": "element_images/QMxgc_14_0_tap_IkALe_element.png", "xml": "xml/IkALe__debug.xml", "expected": { "is_index_based": true, "index_value": 1, "parent_element_id": "aaaabf", "element_id_of_nth_child_of_parent": "aaaabg", "selected_element_is_correct": true } }, ... ] Converts to GEPA format: - input: command text (seed prompt will be provided in test script) - output: JSON string with expected values - image_base64: base64 encoded image (TOP LEVEL for UniversalConverter) - input: Command + XML content (combined in user prompt) - metadata: All original fields plus converted values """ def __init__(self, json_path: Optional[str] = None, base_dir: Optional[str] = None): """ Initialize index caching dataset loader. Args: json_path: Path to JSON file. Default: "./note2_debug.json" or from env var base_dir: Base directory for resolving relative paths in JSON. Default: Directory containing JSON file Raises: FileNotFoundError: If JSON file doesn't exist json.JSONDecodeError: If JSON file is invalid """ # Get JSON path from env or use default if json_path is None: json_path = os.getenv("INDEX_CACHING_DATASET_PATH", "./note2_debug.json") self.json_path = Path(json_path).resolve() if not self.json_path.exists(): raise FileNotFoundError( f"Dataset file not found: {self.json_path}\n" f"Make sure note2_debug.json exists in the project root." ) # Base directory for resolving relative paths if base_dir is None: base_dir = self.json_path.parent self.base_dir = Path(base_dir).resolve() def load_dataset(self) -> List[Dict[str, Any]]: """ Load dataset from JSON file and convert to GEPA format. Returns: List of dataset items in GEPA format: [ { "input": "Tap on first option from the suggestion", # Command only "output": '{"is_index_based": true, "index_value": 1, ...}', # Expected JSON "image_base64": "", # TOP LEVEL "metadata": { "command": "...", "image_path": "...", "xml_path": "...", "expected": {...} } }, ... ] Raises: FileNotFoundError: If image or XML file doesn't exist json.JSONDecodeError: If JSON file is invalid """ # Load JSON file with open(self.json_path, "r", encoding="utf-8") as f: dataset = json.load(f) gepa_dataset = [] for idx, entry in enumerate(dataset): command = entry.get("command", "") image_path = entry.get("image", "") xml_path = entry.get("xml", "") expected = entry.get("expected", {}) # Resolve paths relative to base_dir abs_image_path = (self.base_dir / image_path).resolve() abs_xml_path = (self.base_dir / xml_path).resolve() # Validate paths if not abs_image_path.exists(): raise FileNotFoundError( f"Image file not found: {abs_image_path}\n" f"Entry {idx + 1}: {command}" ) if not abs_xml_path.exists(): raise FileNotFoundError( f"XML file not found: {abs_xml_path}\n" f"Entry {idx + 1}: {command}" ) # Load and encode image with open(abs_image_path, "rb") as f: image_data = f.read() image_base64 = base64.b64encode(image_data).decode("utf-8") # Load XML content with open(abs_xml_path, "r", encoding="utf-8") as f: xml_content = f.read() # Convert expected to JSON string expected_json = json.dumps(expected, ensure_ascii=False) # Create user prompt with command + XML content # The XML will be included in the user prompt text (as the agent does) user_prompt = f"{command}\n\nXML Content:\n\n```xml\n{xml_content}\n```" # For reflection, we don't need full XML - just the command is enough # Reflection is about improving the prompt based on evaluation feedback, # not analyzing specific XML structures reflection_input = command # Just the command, no XML # Create GEPA format item gepa_item = { "input": user_prompt, # Command + XML content (for evaluation) "reflection_input": reflection_input, # Just command (for reflection) "output": expected_json, # Expected output as JSON string "image_base64": image_base64, # TOP LEVEL for UniversalConverter "metadata": { "command": command, "image_path": str(image_path), "xml_path": str(xml_path), "abs_image_path": str(abs_image_path), "abs_xml_path": str(abs_xml_path), "xml_content": xml_content, # Store XML separately in metadata "expected": expected, "dataset_index": idx } } gepa_dataset.append(gepa_item) return gepa_dataset def load_split( self, train_ratio: float = 0.6, val_ratio: float = 0.4 ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: """ Load dataset and split into train/val sets (no test set). Args: train_ratio: Ratio for training set (default: 0.6) val_ratio: Ratio for validation set (default: 0.4) Returns: Tuple of (train_set, val_set) Raises: ValueError: If ratios don't sum to 1.0 """ if abs(train_ratio + val_ratio - 1.0) > 0.01: raise ValueError( f"Split ratios must sum to 1.0, got {train_ratio + val_ratio:.3f}" ) dataset = self.load_dataset() total = len(dataset) train_end = int(total * train_ratio) train_set = dataset[:train_end] val_set = dataset[train_end:] return train_set, val_set def load_index_caching_dataset( json_path: Optional[str] = None, base_dir: Optional[str] = None ) -> List[Dict[str, Any]]: """ Convenience function to load index caching dataset. Args: json_path: Path to JSON file base_dir: Base directory for resolving relative paths Returns: List of dataset items in GEPA format """ loader = IndexCachingDatasetLoader(json_path=json_path, base_dir=base_dir) return loader.load_dataset() def load_index_caching_split( json_path: Optional[str] = None, base_dir: Optional[str] = None, train_ratio: float = 0.6, val_ratio: float = 0.4 ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]: """ Convenience function to load and split index caching dataset. Args: json_path: Path to JSON file base_dir: Base directory for resolving relative paths train_ratio: Ratio for training set val_ratio: Ratio for validation set Returns: Tuple of (train_set, val_set) - no test set """ loader = IndexCachingDatasetLoader(json_path=json_path, base_dir=base_dir) return loader.load_split(train_ratio=train_ratio, val_ratio=val_ratio) # Example usage if __name__ == "__main__": print("šŸš€ Testing Index Caching Dataset Loader...") # Test loading try: loader = IndexCachingDatasetLoader(json_path="./note2_debug.json") dataset = loader.load_dataset() print(f"\nāœ… Loaded {len(dataset)} items") # Show sample if dataset: sample = dataset[0] print(f"\nšŸ“ Sample Item:") print(f" Command: {sample['input']}") print(f" Image path: {sample['metadata']['image_path']}") print(f" XML path: {sample['metadata']['xml_path']}") print(f" Expected: {sample['output'][:100]}...") print(f" Image base64 length: {len(sample['image_base64'])}") print(f" XML content length: {len(sample['metadata'].get('xml_content', ''))}") # Test split train, val = loader.load_split() print(f"\nšŸ“Š Dataset Split:") print(f" Training: {len(train)} samples") print(f" Validation: {len(val)} samples") print(f" Test: Not used (no test set)") except Exception as e: print(f"āŒ Error: {e}")