# NOTE(review): the three lines below were Hugging Face page residue
# (author, commit message, commit hash) accidentally captured into the
# source file; kept as comments so the module remains valid Python.
# Suhasdev's picture
# Deploy Universal Prompt Optimizer to HF Spaces (clean)
# cacd4d0
"""
Index Caching Dataset Loader
Loads index caching dataset from JSON file (note2_debug.json format) and converts to GEPA-compatible format.
"""
# Standard-library imports only; paths are handled via pathlib throughout.
import os
import json
import base64
import logging
from typing import List, Dict, Any, Optional
from pathlib import Path

# Module-level logger (currently unused in this module; kept for callers
# that configure logging and for future diagnostics).
logger = logging.getLogger(__name__)
class IndexCachingDatasetLoader:
    """
    Loads index caching dataset from JSON file.

    Expected JSON format:
    [
        {
            "command": "Tap on first option from the suggestion",
            "image": "element_images/QMxgc_14_0_tap_IkALe_element.png",
            "xml": "xml/IkALe__debug.xml",
            "expected": {
                "is_index_based": true,
                "index_value": 1,
                "parent_element_id": "aaaabf",
                "element_id_of_nth_child_of_parent": "aaaabg",
                "selected_element_is_correct": true
            }
        },
        ...
    ]

    Converts to GEPA format:
    - input: command + XML content combined into a single user prompt
    - reflection_input: command only (XML omitted; reflection is about
      improving the prompt from evaluation feedback, not XML analysis)
    - output: JSON string with expected values
    - image_base64: base64 encoded image (TOP LEVEL for UniversalConverter)
    - metadata: all original fields plus resolved paths and XML content
    """

    def __init__(self, json_path: Optional[str] = None, base_dir: Optional[str] = None):
        """
        Initialize index caching dataset loader.

        Args:
            json_path: Path to JSON file. Default: the
                INDEX_CACHING_DATASET_PATH env var, or "./note2_debug.json".
            base_dir: Base directory for resolving relative paths in JSON.
                Default: directory containing the JSON file.

        Raises:
            FileNotFoundError: If the JSON file doesn't exist.
        """
        # Get JSON path from env or use default
        if json_path is None:
            json_path = os.getenv("INDEX_CACHING_DATASET_PATH", "./note2_debug.json")
        self.json_path = Path(json_path).resolve()
        if not self.json_path.exists():
            raise FileNotFoundError(
                f"Dataset file not found: {self.json_path}\n"
                f"Make sure note2_debug.json exists in the project root."
            )
        # Relative "image"/"xml" entries in the JSON are resolved against this.
        if base_dir is None:
            base_dir = self.json_path.parent
        self.base_dir = Path(base_dir).resolve()

    def load_dataset(self) -> List[Dict[str, Any]]:
        """
        Load dataset from JSON file and convert to GEPA format.

        Returns:
            List of dataset items in GEPA format:
            [
                {
                    "input": "<command>\\n\\nXML Content:\\n\\n```xml\\n...\\n```",
                    "reflection_input": "<command>",  # command only
                    "output": '{"is_index_based": true, "index_value": 1, ...}',
                    "image_base64": "<base64_encoded_image>",  # TOP LEVEL
                    "metadata": {
                        "command": "...",
                        "image_path": "...",
                        "xml_path": "...",
                        "expected": {...},
                        ...
                    }
                },
                ...
            ]

        Raises:
            FileNotFoundError: If a referenced image or XML file doesn't exist.
            json.JSONDecodeError: If the JSON file is invalid.
        """
        with open(self.json_path, "r", encoding="utf-8") as f:
            dataset = json.load(f)
        return [self._convert_entry(idx, entry) for idx, entry in enumerate(dataset)]

    def _convert_entry(self, idx: int, entry: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert one raw JSON entry into a GEPA-format item.

        Args:
            idx: Zero-based position of the entry in the dataset (stored in
                metadata and used in error messages as a 1-based index).
            entry: Raw dict from the JSON file.

        Raises:
            FileNotFoundError: If the entry's image or XML file is missing.
        """
        command = entry.get("command", "")
        image_path = entry.get("image", "")
        xml_path = entry.get("xml", "")
        expected = entry.get("expected", {})
        # Resolve paths relative to base_dir and fail fast with a message
        # that identifies the offending entry.
        abs_image_path = (self.base_dir / image_path).resolve()
        abs_xml_path = (self.base_dir / xml_path).resolve()
        if not abs_image_path.exists():
            raise FileNotFoundError(
                f"Image file not found: {abs_image_path}\n"
                f"Entry {idx + 1}: {command}"
            )
        if not abs_xml_path.exists():
            raise FileNotFoundError(
                f"XML file not found: {abs_xml_path}\n"
                f"Entry {idx + 1}: {command}"
            )
        # Load and encode image; load XML content.
        image_base64 = base64.b64encode(abs_image_path.read_bytes()).decode("utf-8")
        xml_content = abs_xml_path.read_text(encoding="utf-8")
        expected_json = json.dumps(expected, ensure_ascii=False)
        # User prompt carries the command plus the full XML (as the agent does).
        user_prompt = f"{command}\n\nXML Content:\n\n```xml\n{xml_content}\n```"
        return {
            "input": user_prompt,               # command + XML (for evaluation)
            "reflection_input": command,        # command only (for reflection)
            "output": expected_json,            # expected output as JSON string
            "image_base64": image_base64,       # TOP LEVEL for UniversalConverter
            "metadata": {
                "command": command,
                "image_path": str(image_path),
                "xml_path": str(xml_path),
                "abs_image_path": str(abs_image_path),
                "abs_xml_path": str(abs_xml_path),
                "xml_content": xml_content,     # XML stored separately too
                "expected": expected,
                "dataset_index": idx,
            },
        }

    def load_split(
        self,
        train_ratio: float = 0.6,
        val_ratio: float = 0.4
    ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
        """
        Load dataset and split into train/val sets (no test set).

        Args:
            train_ratio: Ratio for training set (default: 0.6)
            val_ratio: Ratio for validation set (default: 0.4)

        Returns:
            Tuple of (train_set, val_set)

        Raises:
            ValueError: If ratios don't sum to 1.0 (within 0.01 tolerance).
        """
        if abs(train_ratio + val_ratio - 1.0) > 0.01:
            raise ValueError(
                f"Split ratios must sum to 1.0, got {train_ratio + val_ratio:.3f}"
            )
        dataset = self.load_dataset()
        # Truncating split point: train gets floor(total * ratio) items,
        # val gets the remainder.
        train_end = int(len(dataset) * train_ratio)
        return dataset[:train_end], dataset[train_end:]
def load_index_caching_dataset(
    json_path: Optional[str] = None,
    base_dir: Optional[str] = None
) -> List[Dict[str, Any]]:
    """
    Convenience function to load index caching dataset.

    Args:
        json_path: Path to JSON file
        base_dir: Base directory for resolving relative paths

    Returns:
        List of dataset items in GEPA format
    """
    # Delegate construction and loading to the loader class in one expression.
    return IndexCachingDatasetLoader(json_path=json_path, base_dir=base_dir).load_dataset()
def load_index_caching_split(
    json_path: Optional[str] = None,
    base_dir: Optional[str] = None,
    train_ratio: float = 0.6,
    val_ratio: float = 0.4
) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Convenience function to load and split index caching dataset.

    Args:
        json_path: Path to JSON file
        base_dir: Base directory for resolving relative paths
        train_ratio: Ratio for training set
        val_ratio: Ratio for validation set

    Returns:
        Tuple of (train_set, val_set) - no test set
    """
    # Build the loader, then hand ratio validation and splitting to it.
    dataset_loader = IndexCachingDatasetLoader(json_path=json_path, base_dir=base_dir)
    return dataset_loader.load_split(train_ratio=train_ratio, val_ratio=val_ratio)
# Example usage
if __name__ == "__main__":
    print("🚀 Testing Index Caching Dataset Loader...")
    try:
        # Smoke-test loading against the default dataset file.
        dataset_loader = IndexCachingDatasetLoader(json_path="./note2_debug.json")
        items = dataset_loader.load_dataset()
        print(f"\n✅ Loaded {len(items)} items")
        if items:
            first = items[0]
            meta = first["metadata"]
            print(f"\n📝 Sample Item:")
            print(f" Command: {first['input']}")
            print(f" Image path: {meta['image_path']}")
            print(f" XML path: {meta['xml_path']}")
            print(f" Expected: {first['output'][:100]}...")
            print(f" Image base64 length: {len(first['image_base64'])}")
            print(f" XML content length: {len(meta.get('xml_content', ''))}")
        # Smoke-test the train/val split with default ratios.
        train_set, val_set = dataset_loader.load_split()
        print(f"\n📊 Dataset Split:")
        print(f" Training: {len(train_set)} samples")
        print(f" Validation: {len(val_set)} samples")
        print(f" Test: Not used (no test set)")
    except Exception as e:
        # Demo script: report any failure instead of crashing.
        print(f"❌ Error: {e}")