Spaces:
Sleeping
Sleeping
| """ | |
| Index Caching Dataset Loader | |
| Loads index caching dataset from JSON file (note2_debug.json format) and converts to GEPA-compatible format. | |
| """ | |
| import os | |
| import json | |
| import base64 | |
| import logging | |
| from typing import List, Dict, Any, Optional | |
| from pathlib import Path | |
| logger = logging.getLogger(__name__) | |
class IndexCachingDatasetLoader:
    """
    Loads the index-caching dataset from a JSON file (note2_debug.json format)
    and converts it to a GEPA-compatible format.

    Expected JSON format:
        [
            {
                "command": "Tap on first option from the suggestion",
                "image": "element_images/QMxgc_14_0_tap_IkALe_element.png",
                "xml": "xml/IkALe__debug.xml",
                "expected": {
                    "is_index_based": true,
                    "index_value": 1,
                    "parent_element_id": "aaaabf",
                    "element_id_of_nth_child_of_parent": "aaaabg",
                    "selected_element_is_correct": true
                }
            },
            ...
        ]

    Converted GEPA item layout:
        - input: command + XML content combined into one user prompt
        - reflection_input: command only (XML is not needed for reflection)
        - output: JSON string with the expected values
        - image_base64: base64-encoded image (TOP LEVEL for UniversalConverter)
        - metadata: all original fields plus resolved paths and raw XML
    """

    def __init__(self, json_path: Optional[str] = None, base_dir: Optional[str] = None):
        """
        Initialize the index-caching dataset loader.

        Args:
            json_path: Path to the JSON file. Defaults to the
                INDEX_CACHING_DATASET_PATH env var, or "./note2_debug.json".
            base_dir: Base directory for resolving relative paths in the JSON.
                Defaults to the directory containing the JSON file.

        Raises:
            FileNotFoundError: If the JSON file doesn't exist.
        """
        if json_path is None:
            json_path = os.getenv("INDEX_CACHING_DATASET_PATH", "./note2_debug.json")
        self.json_path = Path(json_path).resolve()
        if not self.json_path.exists():
            # Report the resolved path and the actual configuration options so a
            # wrong cwd, custom path, or env var is easy to diagnose.
            raise FileNotFoundError(
                f"Dataset file not found: {self.json_path}\n"
                f"Pass json_path explicitly or set INDEX_CACHING_DATASET_PATH."
            )
        # Relative image/xml paths inside the JSON are resolved against base_dir.
        self.base_dir = Path(base_dir).resolve() if base_dir is not None else self.json_path.parent

    def load_dataset(self) -> List[Dict[str, Any]]:
        """
        Load the dataset from the JSON file and convert it to GEPA format.

        Returns:
            List of dataset items in GEPA format:
                {
                    "input": "<command>\\n\\nXML Content:\\n\\n```xml\\n...\\n```",
                    "reflection_input": "<command>",
                    "output": '{"is_index_based": true, "index_value": 1, ...}',
                    "image_base64": "<base64_encoded_image>",   # TOP LEVEL
                    "metadata": {"command": ..., "image_path": ..., ...}
                }

        Raises:
            FileNotFoundError: If a referenced image or XML file doesn't exist.
            json.JSONDecodeError: If the JSON file is invalid.
            ValueError: If the top-level JSON value is not a list.
        """
        raw = json.loads(self.json_path.read_text(encoding="utf-8"))
        if not isinstance(raw, list):
            # Fail fast with a clear message instead of a confusing
            # AttributeError later when iterating a dict's keys.
            raise ValueError(
                f"Expected a JSON list at the top level of {self.json_path}, "
                f"got {type(raw).__name__}"
            )
        return [self._convert_entry(idx, entry) for idx, entry in enumerate(raw)]

    def _convert_entry(self, idx: int, entry: Dict[str, Any]) -> Dict[str, Any]:
        """Convert one raw JSON entry into a GEPA-format item (helper for load_dataset)."""
        command = entry.get("command", "")
        image_path = entry.get("image", "")
        xml_path = entry.get("xml", "")
        expected = entry.get("expected", {})

        # Resolve the entry's relative paths against base_dir; validate before
        # reading so the error message identifies the offending entry.
        abs_image_path = (self.base_dir / image_path).resolve()
        abs_xml_path = (self.base_dir / xml_path).resolve()
        if not abs_image_path.exists():
            raise FileNotFoundError(
                f"Image file not found: {abs_image_path}\n"
                f"Entry {idx + 1}: {command}"
            )
        if not abs_xml_path.exists():
            raise FileNotFoundError(
                f"XML file not found: {abs_xml_path}\n"
                f"Entry {idx + 1}: {command}"
            )

        image_base64 = base64.b64encode(abs_image_path.read_bytes()).decode("utf-8")
        xml_content = abs_xml_path.read_text(encoding="utf-8")

        # The evaluation prompt embeds the full XML (mirroring what the agent
        # sends). Reflection only needs the command: reflection improves the
        # prompt from evaluation feedback, not by analyzing XML structure.
        user_prompt = f"{command}\n\nXML Content:\n\n```xml\n{xml_content}\n```"

        return {
            "input": user_prompt,            # command + XML content (for evaluation)
            "reflection_input": command,     # command only (for reflection)
            "output": json.dumps(expected, ensure_ascii=False),
            "image_base64": image_base64,    # TOP LEVEL for UniversalConverter
            "metadata": {
                "command": command,
                "image_path": str(image_path),
                "xml_path": str(xml_path),
                "abs_image_path": str(abs_image_path),
                "abs_xml_path": str(abs_xml_path),
                "xml_content": xml_content,  # raw XML kept separately in metadata
                "expected": expected,
                "dataset_index": idx,
            },
        }

    def load_split(
        self,
        train_ratio: float = 0.6,
        val_ratio: float = 0.4
    ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
        """
        Load the dataset and split it into train/val sets (no test set).

        Args:
            train_ratio: Ratio for the training set (default: 0.6).
            val_ratio: Ratio for the validation set (default: 0.4).

        Returns:
            Tuple of (train_set, val_set).

        Raises:
            ValueError: If the ratios don't sum to 1.0 (within a 0.01 tolerance).
        """
        if abs(train_ratio + val_ratio - 1.0) > 0.01:
            raise ValueError(
                f"Split ratios must sum to 1.0, got {train_ratio + val_ratio:.3f}"
            )
        dataset = self.load_dataset()
        train_end = int(len(dataset) * train_ratio)
        return dataset[:train_end], dataset[train_end:]
def load_index_caching_dataset(
    json_path: Optional[str] = None,
    base_dir: Optional[str] = None
) -> List[Dict[str, Any]]:
    """
    Convenience wrapper: build an IndexCachingDatasetLoader and load everything.

    Args:
        json_path: Path to the JSON dataset file.
        base_dir: Base directory used to resolve relative paths.

    Returns:
        List of dataset items in GEPA format.
    """
    return IndexCachingDatasetLoader(json_path=json_path, base_dir=base_dir).load_dataset()
def load_index_caching_split(
    json_path: Optional[str] = None,
    base_dir: Optional[str] = None,
    train_ratio: float = 0.6,
    val_ratio: float = 0.4
) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Convenience wrapper: load the index-caching dataset and split it.

    Args:
        json_path: Path to the JSON dataset file.
        base_dir: Base directory used to resolve relative paths.
        train_ratio: Fraction of items assigned to the training set.
        val_ratio: Fraction of items assigned to the validation set.

    Returns:
        Tuple of (train_set, val_set) — no test set is produced.
    """
    loader = IndexCachingDatasetLoader(json_path=json_path, base_dir=base_dir)
    return loader.load_split(train_ratio=train_ratio, val_ratio=val_ratio)
# Example usage / manual smoke test
if __name__ == "__main__":
    print("🚀 Testing Index Caching Dataset Loader...")
    try:
        # Load the default dataset and report what we got.
        dl = IndexCachingDatasetLoader(json_path="./note2_debug.json")
        items = dl.load_dataset()
        print(f"\n✅ Loaded {len(items)} items")

        if items:
            # Dump the first item's key fields for a quick sanity check.
            first = items[0]
            meta = first["metadata"]
            print(f"\n📝 Sample Item:")
            print(f" Command: {first['input']}")
            print(f" Image path: {meta['image_path']}")
            print(f" XML path: {meta['xml_path']}")
            print(f" Expected: {first['output'][:100]}...")
            print(f" Image base64 length: {len(first['image_base64'])}")
            print(f" XML content length: {len(meta.get('xml_content', ''))}")

        # Exercise the train/val split (no test set is produced).
        train_split, val_split = dl.load_split()
        print(f"\n📊 Dataset Split:")
        print(f" Training: {len(train_split)} samples")
        print(f" Validation: {len(val_split)} samples")
        print(f" Test: Not used (no test set)")
    except Exception as err:
        print(f"❌ Error: {err}")