File size: 12,820 Bytes
cacd4d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
"""
Scroll Element Dataset Loader for Drizz Mobile App Testing

Loads screenshots with bounding boxes and commands to identify scroll elements.
Converts to GEPA-compatible format for prompt optimization.
"""

import base64
import logging
import random
import re
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

logger = logging.getLogger(__name__)


class ScrollDatasetLoader:
    """
    Generic dataset loader for image-based tasks.

    This is a LIBRARY class - NO hardcoded assumptions about:
    - What the task is (OCR, element detection, classification, etc.)
    - Input format (questions, commands, descriptions, etc.)
    - Output format (IDs, text, JSON, etc.)

    Users define their dataset in the test script and pass it here.

    Dataset format per item: (image_filename, input_text, expected_output)

    Example usage (ANY task):
        my_dataset = [
            ("img1.png", "What is the main color?", "blue"),
            ("img2.png", "Count the objects", "5"),
            ("img3.png", "Describe the scene", "A cat on a sofa"),
        ]
        loader = ScrollDatasetLoader(images_dir="images", dataset_config=my_dataset)
        data = loader.load_dataset()
    """

    # Inclusive range of plausible UI element IDs used to validate extraction.
    _MIN_ELEMENT_ID = 1
    _MAX_ELEMENT_ID = 100

    # "Element: 4" or "Element 4" (case-insensitive). Compiled once at class
    # creation instead of on every _extract_element_id call.
    _ELEMENT_PATTERN = re.compile(r'element[:\s]+(\d+)', re.IGNORECASE)
    # Fallback: first standalone 1-3 digit number anywhere in the text.
    _NUMBER_PATTERN = re.compile(r'\b(\d{1,3})\b')

    def __init__(
        self,
        images_dir: str = "images",
        dataset_config: Optional[List[Tuple[str, str, str]]] = None
    ):
        """
        Initialize dataset loader.

        Args:
            images_dir: Directory containing images
            dataset_config: List of (image_filename, input_text, expected_output) tuples.
                           REQUIRED - no hardcoded defaults to keep library generic.

        Raises:
            FileNotFoundError: If images_dir doesn't exist
            ValueError: If dataset_config is None
        """
        self.images_dir = Path(images_dir)

        if not self.images_dir.exists():
            raise FileNotFoundError(f"Images directory not found: {images_dir}")

        if dataset_config is None:
            raise ValueError(
                "dataset_config is required. This is a library class - define your "
                "dataset in the test script:\n"
                "  dataset = [('img1.png', 'your input', 'expected output'), ...]\n"
                "  loader = ScrollDatasetLoader(images_dir='...', dataset_config=dataset)"
            )

        self.dataset_config = dataset_config

    def load_dataset(self) -> List[Dict[str, Any]]:
        """
        Load complete dataset with images.

        Items whose image is missing or unreadable are skipped with a warning
        rather than aborting the whole load.

        Returns:
            List of dataset items in GEPA format:
            [
                {
                    "input": "Command: Scroll down by 70%",
                    "output": "3",
                    "image_base64": "<base64_encoded_image>",  # TOP LEVEL
                    "metadata": {
                        "image_path": "images/5.png",
                        "input_text": "Command: Scroll down by 70%",
                        "expected_output": "3",
                        "image_filename": "5.png",
                        "element_id": 3  # Extracted integer (None if extraction fails)
                    }
                },
                ...
            ]

        Raises:
            ValueError: If no item yields a valid, readable image.
        """
        dataset: List[Dict[str, Any]] = []

        # Generic variable names - no assumptions about data type
        for image_filename, input_text, expected_output in self.dataset_config:
            image_path = self.images_dir / image_filename

            # Validate image exists
            if not image_path.exists():
                logger.warning("Image not found: %s", image_path)
                continue

            # Read and encode image
            try:
                image_base64 = self._encode_image(image_path)
            except Exception as e:
                logger.warning("Error encoding %s: %s", image_filename, e)
                continue

            # Extract element_id from expected_output for robust evaluation.
            element_id = self._extract_element_id(expected_output)
            if element_id is None:
                logger.warning(
                    "Could not extract element_id from '%s' in %s",
                    expected_output, image_filename
                )

            # Create dataset item - completely generic: image + input text +
            # expected output text. The library doesn't know what the task is.
            # IMPORTANT: image_base64 must be TOP LEVEL for UniversalConverter.
            dataset.append({
                "input": input_text,  # Generic input text (ANY format)
                "output": expected_output,  # Generic expected output (ANY format)
                "image_base64": image_base64,  # TOP LEVEL for converter
                "metadata": {
                    "image_path": str(image_path),
                    "input_text": input_text,
                    "expected_output": expected_output,
                    "image_filename": image_filename,
                    "element_id": element_id  # Extracted element ID (int or None)
                }
            })

        if not dataset:
            raise ValueError("No valid images found in dataset")

        logger.info("Loaded %d scroll element detection samples", len(dataset))
        return dataset

    def _extract_element_id(self, expected_output: str) -> Optional[int]:
        """
        Extract element ID from expected output string.

        Handles multiple formats:
        - "Element: 4"
        - "Element 4"
        - "4" (standalone)
        - "Element: 4, Description: ..." (full reasoning)

        Args:
            expected_output: Full expected output string with reasoning

        Returns:
            Element ID as integer, or None if not found or out of the
            plausible range [_MIN_ELEMENT_ID, _MAX_ELEMENT_ID].
        """
        if not expected_output:
            return None

        # Preferred: an explicit "Element: X" / "Element X" marker.
        match = self._ELEMENT_PATTERN.search(expected_output)
        if match:
            element_id = int(match.group(1))
            if self._MIN_ELEMENT_ID <= element_id <= self._MAX_ELEMENT_ID:
                return element_id

        # Fallback: first standalone number, accepted only if it looks like a
        # reasonable UI element ID.
        match = self._NUMBER_PATTERN.search(expected_output)
        if match:
            element_id = int(match.group(1))
            if self._MIN_ELEMENT_ID <= element_id <= self._MAX_ELEMENT_ID:
                return element_id

        return None

    def _encode_image(self, image_path: Path) -> str:
        """
        Encode image to base64 string.

        Args:
            image_path: Path to image file

        Returns:
            Base64 encoded image string
        """
        return base64.b64encode(image_path.read_bytes()).decode('utf-8')

    def split_dataset(
        self,
        dataset: List[Dict[str, Any]],
        train_size: int = 4,
        val_size: int = 1,
        test_size: int = 1,
        shuffle: bool = True,
        seed: Optional[int] = None
    ) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
        """
        Split dataset into train, validation, and test sets.

        Shuffling ensures a different image distribution across splits,
        preventing hard images from always landing in the validation set.
        A private ``random.Random`` instance is used so seeding never
        mutates the caller's global random state.

        Args:
            dataset: Complete dataset
            train_size: Number of samples for training (default: 4)
            val_size: Number of samples for validation (default: 1)
            test_size: Number of samples for test (default: 1)
            shuffle: Whether to shuffle dataset before splitting (default: True)
            seed: Random seed for reproducible shuffling (default: None = random)

        Returns:
            Tuple of (train_set, val_set, test_set)
        """
        n = len(dataset)

        # Validate split sizes; shrink proportionally if they exceed the data.
        total_size = train_size + val_size + test_size
        if total_size > n:
            logger.warning(
                "Requested split (%d) exceeds dataset size (%d). "
                "Adjusting split proportionally...", total_size, n
            )
            ratio = n / total_size
            train_size = int(train_size * ratio)
            val_size = int(val_size * ratio)
            test_size = n - train_size - val_size

        # Work on a copy so the caller's list order is never modified.
        dataset_copy = dataset.copy()
        if shuffle:
            # Local RNG: random.Random(None) seeds from OS entropy, so the
            # global `random` module state is left untouched either way.
            rng = random.Random(seed)
            if seed is not None:
                logger.debug("Shuffling dataset with seed=%s for reproducible splits", seed)
            else:
                logger.debug("Shuffling dataset randomly (no seed)")
            rng.shuffle(dataset_copy)
        else:
            logger.warning("Not shuffling dataset - using original order")

        # Split (possibly shuffled) dataset
        train_set = dataset_copy[:train_size]
        val_set = dataset_copy[train_size:train_size + val_size]
        test_set = dataset_copy[train_size + val_size:train_size + val_size + test_size]

        logger.info(
            "Dataset split: %d train, %d val, %d test",
            len(train_set), len(val_set), len(test_set)
        )

        # Log which images landed in each split for debugging (via logger, not
        # print: library code should never write directly to stdout).
        if shuffle:
            train_images = [item['metadata'].get('image_filename', 'N/A') for item in train_set]
            val_images = [item['metadata'].get('image_filename', 'N/A') for item in val_set]
            test_images = [item['metadata'].get('image_filename', 'N/A') for item in test_set]
            logger.info("Train images: %s%s", train_images[:5], '...' if len(train_images) > 5 else '')
            logger.info("Val images:   %s", val_images)
            logger.info("Test images:  %s%s", test_images[:5], '...' if len(test_images) > 5 else '')

        return train_set, val_set, test_set


def load_scroll_dataset(
    images_dir: str = "images",
    dataset_config: Optional[List[Tuple[str, str, str]]] = None,
    split: bool = True
) -> Tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Convenience function to load an image-based dataset (GENERIC).

    Args:
        images_dir: Directory containing images
        dataset_config: List of (image_filename, input_text, expected_output)
                        tuples. Required; a ValueError is raised by the loader
                        if omitted.
        split: Whether to split into train/val/test

    Returns:
        If split=True: (train_set, val_set, test_set)
        If split=False: (full_dataset, [], [])

    Raises:
        FileNotFoundError: If images_dir doesn't exist.
        ValueError: If dataset_config is None or no valid images are found.

    Example (works for ANY task):
        dataset_config = [
            ("img1.png", "What color is the sky?", "blue"),
            ("img2.png", "Count the dogs", "2"),
        ]
        train, val, test = load_scroll_dataset(
            images_dir="images",
            dataset_config=dataset_config
        )
    """
    loader = ScrollDatasetLoader(images_dir, dataset_config=dataset_config)
    dataset = loader.load_dataset()

    if split:
        return loader.split_dataset(dataset)
    # No split requested: everything goes in the first slot.
    return dataset, [], []


# Example usage (for testing the library loader itself)
if __name__ == "__main__":
    # Demo/self-test entry point: the loader is a generic library, so the
    # dataset definition always lives in the calling script.
    _usage_lines = [
        "🚀 Testing Scroll Dataset Loader...",
        "⚠️  NOTE: This is a library class. Define your dataset in your test script.",
        "\nExample:",
        "  dataset_config = [",
        "      ('image1.png', 'Scroll down by 50%', '3'),",
        "      ('image2.png', 'Swipe left', '4'),",
        "  ]",
        "  train, val, test = load_scroll_dataset(",
        "      images_dir='images',",
        "      dataset_config=dataset_config",
        "  )",
    ]
    print("\n".join(_usage_lines))