# NOTE(review): the three lines below were Hugging Face page residue
# (author, commit message, commit hash) accidentally captured into the
# source file; kept as comments so the module remains valid Python.
# Suhasdev's picture
# Deploy Universal Prompt Optimizer to HF Spaces (clean)
# cacd4d0
"""
Index Caching Dataset Loader
Loads index caching dataset from JSON file (note2_debug.json format) and converts to GEPA-compatible format.
"""
# Standard-library imports only; paths are handled via pathlib throughout.
import os
import json
import base64
import logging
from typing import List, Dict, Any, Optional
from pathlib import Path

# Module-level logger (currently unused in this module; kept for callers
# that configure logging and for future diagnostics).
logger = logging.getLogger(__name__)
class IndexCachingDatasetLoader:
    """
    Loads index caching dataset from JSON file.

    Expected JSON format:
    [
        {
            "command": "Tap on first option from the suggestion",
            "image": "element_images/QMxgc_14_0_tap_IkALe_element.png",
            "xml": "xml/IkALe__debug.xml",
            "expected": {
                "is_index_based": true,
                "index_value": 1,
                "parent_element_id": "aaaabf",
                "element_id_of_nth_child_of_parent": "aaaabg",
                "selected_element_is_correct": true
            }
        },
        ...
    ]

    Converts to GEPA format:
    - input: command + XML content combined into a single user prompt
    - reflection_input: command only (XML omitted; reflection is about
      improving the prompt from evaluation feedback, not XML analysis)
    - output: JSON string with expected values
    - image_base64: base64 encoded image (TOP LEVEL for UniversalConverter)
    - metadata: all original fields plus resolved paths and XML content
    """

    def __init__(self, json_path: Optional[str] = None, base_dir: Optional[str] = None):
        """
        Initialize index caching dataset loader.

        Args:
            json_path: Path to JSON file. Default: the
                INDEX_CACHING_DATASET_PATH env var, or "./note2_debug.json".
            base_dir: Base directory for resolving relative paths in JSON.
                Default: directory containing the JSON file.

        Raises:
            FileNotFoundError: If the JSON file doesn't exist.
        """
        # Get JSON path from env or use default
        if json_path is None:
            json_path = os.getenv("INDEX_CACHING_DATASET_PATH", "./note2_debug.json")
        self.json_path = Path(json_path).resolve()
        if not self.json_path.exists():
            raise FileNotFoundError(
                f"Dataset file not found: {self.json_path}\n"
                f"Make sure note2_debug.json exists in the project root."
            )
        # Relative "image"/"xml" entries in the JSON are resolved against this.
        if base_dir is None:
            base_dir = self.json_path.parent
        self.base_dir = Path(base_dir).resolve()

    def load_dataset(self) -> List[Dict[str, Any]]:
        """
        Load dataset from JSON file and convert to GEPA format.

        Returns:
            List of dataset items in GEPA format:
            [
                {
                    "input": "<command>\\n\\nXML Content:\\n\\n```xml\\n...\\n```",
                    "reflection_input": "<command>",  # command only
                    "output": '{"is_index_based": true, "index_value": 1, ...}',
                    "image_base64": "<base64_encoded_image>",  # TOP LEVEL
                    "metadata": {
                        "command": "...",
                        "image_path": "...",
                        "xml_path": "...",
                        "expected": {...},
                        ...
                    }
                },
                ...
            ]

        Raises:
            FileNotFoundError: If a referenced image or XML file doesn't exist.
            json.JSONDecodeError: If the JSON file is invalid.
        """
        with open(self.json_path, "r", encoding="utf-8") as f:
            dataset = json.load(f)
        return [self._convert_entry(idx, entry) for idx, entry in enumerate(dataset)]

    def _convert_entry(self, idx: int, entry: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert one raw JSON entry into a GEPA-format item.

        Args:
            idx: Zero-based position of the entry in the dataset (stored in
                metadata and used in error messages as a 1-based index).
            entry: Raw dict from the JSON file.

        Raises:
            FileNotFoundError: If the entry's image or XML file is missing.
        """
        command = entry.get("command", "")
        image_path = entry.get("image", "")
        xml_path = entry.get("xml", "")
        expected = entry.get("expected", {})
        # Resolve paths relative to base_dir and fail fast with a message
        # that identifies the offending entry.
        abs_image_path = (self.base_dir / image_path).resolve()
        abs_xml_path = (self.base_dir / xml_path).resolve()
        if not abs_image_path.exists():
            raise FileNotFoundError(
                f"Image file not found: {abs_image_path}\n"
                f"Entry {idx + 1}: {command}"
            )
        if not abs_xml_path.exists():
            raise FileNotFoundError(
                f"XML file not found: {abs_xml_path}\n"
                f"Entry {idx + 1}: {command}"
            )
        # Load and encode image; load XML content.
        image_base64 = base64.b64encode(abs_image_path.read_bytes()).decode("utf-8")
        xml_content = abs_xml_path.read_text(encoding="utf-8")
        expected_json = json.dumps(expected, ensure_ascii=False)
        # User prompt carries the command plus the full XML (as the agent does).
        user_prompt = f"{command}\n\nXML Content:\n\n```xml\n{xml_content}\n```"
        return {
            "input": user_prompt,               # command + XML (for evaluation)
            "reflection_input": command,        # command only (for reflection)
            "output": expected_json,            # expected output as JSON string
            "image_base64": image_base64,       # TOP LEVEL for UniversalConverter
            "metadata": {
                "command": command,
                "image_path": str(image_path),
                "xml_path": str(xml_path),
                "abs_image_path": str(abs_image_path),
                "abs_xml_path": str(abs_xml_path),
                "xml_content": xml_content,     # XML stored separately too
                "expected": expected,
                "dataset_index": idx,
            },
        }

    def load_split(
        self,
        train_ratio: float = 0.6,
        val_ratio: float = 0.4
    ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
        """
        Load dataset and split into train/val sets (no test set).

        Args:
            train_ratio: Ratio for training set (default: 0.6)
            val_ratio: Ratio for validation set (default: 0.4)

        Returns:
            Tuple of (train_set, val_set)

        Raises:
            ValueError: If ratios don't sum to 1.0 (within 0.01 tolerance).
        """
        if abs(train_ratio + val_ratio - 1.0) > 0.01:
            raise ValueError(
                f"Split ratios must sum to 1.0, got {train_ratio + val_ratio:.3f}"
            )
        dataset = self.load_dataset()
        # Truncating split point: train gets floor(total * ratio) items,
        # val gets the remainder.
        train_end = int(len(dataset) * train_ratio)
        return dataset[:train_end], dataset[train_end:]
def load_index_caching_dataset(
    json_path: Optional[str] = None,
    base_dir: Optional[str] = None
) -> List[Dict[str, Any]]:
    """
    Convenience function to load index caching dataset.

    Args:
        json_path: Path to JSON file
        base_dir: Base directory for resolving relative paths

    Returns:
        List of dataset items in GEPA format
    """
    # Delegate construction and loading to the loader class in one expression.
    return IndexCachingDatasetLoader(json_path=json_path, base_dir=base_dir).load_dataset()
def load_index_caching_split(
    json_path: Optional[str] = None,
    base_dir: Optional[str] = None,
    train_ratio: float = 0.6,
    val_ratio: float = 0.4
) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Convenience function to load and split index caching dataset.

    Args:
        json_path: Path to JSON file
        base_dir: Base directory for resolving relative paths
        train_ratio: Ratio for training set
        val_ratio: Ratio for validation set

    Returns:
        Tuple of (train_set, val_set) - no test set
    """
    # Build the loader, then hand ratio validation and splitting to it.
    dataset_loader = IndexCachingDatasetLoader(json_path=json_path, base_dir=base_dir)
    return dataset_loader.load_split(train_ratio=train_ratio, val_ratio=val_ratio)
# Example usage
if __name__ == "__main__":
    print("🚀 Testing Index Caching Dataset Loader...")
    try:
        # Smoke-test loading against the default dataset file.
        dataset_loader = IndexCachingDatasetLoader(json_path="./note2_debug.json")
        items = dataset_loader.load_dataset()
        print(f"\n✅ Loaded {len(items)} items")
        if items:
            first = items[0]
            meta = first["metadata"]
            print(f"\n📝 Sample Item:")
            print(f" Command: {first['input']}")
            print(f" Image path: {meta['image_path']}")
            print(f" XML path: {meta['xml_path']}")
            print(f" Expected: {first['output'][:100]}...")
            print(f" Image base64 length: {len(first['image_base64'])}")
            print(f" XML content length: {len(meta.get('xml_content', ''))}")
        # Smoke-test the train/val split with default ratios.
        train_set, val_set = dataset_loader.load_split()
        print(f"\n📊 Dataset Split:")
        print(f" Training: {len(train_set)} samples")
        print(f" Validation: {len(val_set)} samples")
        print(f" Test: Not used (no test set)")
    except Exception as e:
        # Demo script: report any failure instead of crashing.
        print(f"❌ Error: {e}")