""" Validation Dataset Loader for UI Validation Use Case Loads validation datapoints from SQLite database and converts to GEPA-compatible format. Supports filtering by data_type (trainset/valset/testset) and confirmed status. """ import os import sqlite3 import base64 import logging from typing import List, Dict, Any, Optional, Literal from pathlib import Path logger = logging.getLogger(__name__) class ValidationDatasetLoader: """ Loads validation dataset from SQLite database. Database schema: - validation_data: id, image_id, command, result (0/1), reasoning, data_type, confirmed, created_at - images: image_id, mime, bytes (BLOB), created_at Converts to GEPA format: - input: command text (seed prompt will be provided in test script) - output: "true" or "false" (converted from 0/1) - image_base64: base64 encoded image (TOP LEVEL for UniversalConverter) - metadata: All original fields plus converted values Note: The seed prompt is NOT stored in database - it will be provided in the test script. The input field contains just the command, and the image is at top level. """ def __init__( self, db_path: Optional[str] = None, confirmed_only: bool = True ): """ Initialize validation dataset loader. Args: db_path: Path to SQLite database file. Default: "./validation_data.db" or from VD_DB_PATH env var confirmed_only: If True, only load datapoints where confirmed=1. Default: True (only manually reviewed data) Raises: FileNotFoundError: If database file doesn't exist sqlite3.Error: If database connection fails """ # Get database path from env or use default if db_path is None: db_path = os.getenv("VD_DB_PATH", "./validation_data.db") self.db_path = Path(db_path).resolve() if not self.db_path.exists(): raise FileNotFoundError( f"Database file not found: {self.db_path}\n" f"Make sure validation_data_ui_server_async.py has been run at least once to create the database." ) self.confirmed_only = confirmed_only def load_dataset( self, data_type: Optional[Literal["trainset", "valset", "testset"]] = None, confirmed_only: Optional[bool] = None ) -> List[Dict[str, Any]]: """ Load dataset from database and convert to GEPA format. Args: data_type: Filter by data_type. If None, loads all types. Options: "trainset", "valset", "testset" confirmed_only: Override instance default. If True, only load confirmed datapoints. If None, uses instance default (self.confirmed_only) Returns: List of dataset items in GEPA format: [ { "input": "Validate Submit button is visible", # Command only (seed prompt in test script) "output": "true", # or "false" (converted from 0/1) "image_base64": "", # TOP LEVEL (image + command together) "metadata": { "id": 1, "image_id": "abc123...", "command": "Validate Submit button is visible", "result": True, # Boolean "result_int": 1, # Original 0/1 "reasoning": "Detailed explanation...", "data_type": "trainset", "confirmed": True, "created_at": "2024-01-01 12:00:00" } }, ... ] Note: Seed prompt is provided separately in test script, not in database. Raises: sqlite3.Error: If database query fails ValueError: If no datapoints found matching criteria """ # Use provided confirmed_only or instance default use_confirmed = confirmed_only if confirmed_only is not None else self.confirmed_only conn = sqlite3.connect(str(self.db_path)) conn.row_factory = sqlite3.Row # Access columns by name dataset = [] try: # Build query with filters query = """ SELECT v.id, v.image_id, v.command, v.result, v.reasoning, v.data_type, v.confirmed, v.created_at, i.mime, i.bytes FROM validation_data v INNER JOIN images i ON v.image_id = i.image_id WHERE 1=1 """ params = [] # Add filters if use_confirmed: query += " AND v.confirmed = 1" if data_type: query += " AND v.data_type = ?" params.append(data_type) query += " ORDER BY v.id ASC" # Execute query cursor = conn.execute(query, params) rows = cursor.fetchall() if not rows: filter_msg = [] if use_confirmed: filter_msg.append("confirmed=1") if data_type: filter_msg.append(f"data_type='{data_type}'") filter_str = " with filters: " + ", ".join(filter_msg) if filter_msg else "" raise ValueError( f"No datapoints found{filter_str} in database: {self.db_path}\n" f"Make sure you have generated and saved datapoints using the validation UI." ) # Convert rows to GEPA format for row in rows: # Convert 0/1 to "true"/"false" string for GEPA result_str = "true" if row["result"] == 1 else "false" # Encode image bytes to base64 image_base64 = base64.b64encode(row["bytes"]).decode("utf-8") # Create GEPA format item # Input: command (seed prompt will be provided in test script) # Image: separate at top level (image_base64) # Output: "true" or "false" (converted from 0/1) dataset_item = { "input": row["command"], # Just the command - seed prompt will be in test script "output": result_str, # "true" or "false" (string) "image_base64": image_base64, # TOP LEVEL for UniversalConverter (image + command together) "metadata": { "id": row["id"], "image_id": row["image_id"], "command": row["command"], # Keep original for reference "result": bool(row["result"]), # Boolean for reference "result_int": row["result"], # Original 0/1 for reference "reasoning": row["reasoning"], "data_type": row["data_type"], "confirmed": bool(row["confirmed"]), "created_at": row["created_at"], "mime": row["mime"], } } dataset.append(dataset_item) # Log summary data_type_str = f" ({data_type})" if data_type else "" confirmed_str = " (confirmed only)" if use_confirmed else " (all)" logger.info(f"Loaded {len(dataset)} validation datapoints{data_type_str}{confirmed_str}") return dataset finally: conn.close() def load_split_dataset( self, confirmed_only: Optional[bool] = None ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]: """ Load dataset split by data_type (trainset/valset/testset). Convenience method that loads all three splits at once. Args: confirmed_only: Override instance default. If True, only load confirmed datapoints. Returns: Tuple of (train_set, val_set, test_set) in GEPA format Example: loader = ValidationDatasetLoader(db_path="./validation_data.db") train, val, test = loader.load_split_dataset() """ train_set = self.load_dataset(data_type="trainset", confirmed_only=confirmed_only) val_set = self.load_dataset(data_type="valset", confirmed_only=confirmed_only) test_set = self.load_dataset(data_type="testset", confirmed_only=confirmed_only) logger.info(f"Dataset Split Summary: Training={len(train_set)}, Validation={len(val_set)}, Test={len(test_set)}, Total={len(train_set) + len(val_set) + len(test_set)}") return train_set, val_set, test_set def get_dataset_stats(self) -> Dict[str, Any]: """ Get statistics about the dataset in the database. Returns: Dictionary with dataset statistics: { "total": 100, "confirmed": 95, "unconfirmed": 5, "by_data_type": { "trainset": 70, "valset": 15, "testset": 15 }, "by_result": { "true": 50, "false": 50 } } """ conn = sqlite3.connect(str(self.db_path)) conn.row_factory = sqlite3.Row try: stats = {} # Total counts total = conn.execute("SELECT COUNT(*) FROM validation_data").fetchone()[0] confirmed = conn.execute("SELECT COUNT(*) FROM validation_data WHERE confirmed = 1").fetchone()[0] stats["total"] = total stats["confirmed"] = confirmed stats["unconfirmed"] = total - confirmed # By data_type data_type_rows = conn.execute(""" SELECT data_type, COUNT(*) as count FROM validation_data GROUP BY data_type """).fetchall() stats["by_data_type"] = {row["data_type"]: row["count"] for row in data_type_rows} # By result (true/false) result_rows = conn.execute(""" SELECT result, COUNT(*) as count FROM validation_data GROUP BY result """).fetchall() stats["by_result"] = { "true": sum(row["count"] for row in result_rows if row["result"] == 1), "false": sum(row["count"] for row in result_rows if row["result"] == 0) } return stats finally: conn.close() def load_validation_dataset( db_path: Optional[str] = None, data_type: Optional[Literal["trainset", "valset", "testset"]] = None, confirmed_only: bool = True ) -> List[Dict[str, Any]]: """ Convenience function to load validation dataset. Args: db_path: Path to SQLite database file. Default: "./validation_data.db" data_type: Filter by data_type. If None, loads all types. confirmed_only: If True, only load confirmed datapoints. Returns: List of dataset items in GEPA format Example: # Load all confirmed training data train_data = load_validation_dataset(data_type="trainset", confirmed_only=True) # Load all confirmed data all_data = load_validation_dataset(confirmed_only=True) """ loader = ValidationDatasetLoader(db_path=db_path, confirmed_only=confirmed_only) return loader.load_dataset(data_type=data_type, confirmed_only=confirmed_only) def load_validation_split( db_path: Optional[str] = None, confirmed_only: bool = True ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]: """ Convenience function to load validation dataset split by data_type. Args: db_path: Path to SQLite database file. Default: "./validation_data.db" confirmed_only: If True, only load confirmed datapoints. Returns: Tuple of (train_set, val_set, test_set) in GEPA format Example: train, val, test = load_validation_split(confirmed_only=True) """ loader = ValidationDatasetLoader(db_path=db_path, confirmed_only=confirmed_only) return loader.load_split_dataset(confirmed_only=confirmed_only) # Example usage and testing if __name__ == "__main__": print("šŸš€ Testing Validation Dataset Loader...") try: loader = ValidationDatasetLoader() # Get stats print("\nšŸ“Š Dataset Statistics:") stats = loader.get_dataset_stats() print(f" Total: {stats['total']}") print(f" Confirmed: {stats['confirmed']}") print(f" Unconfirmed: {stats['unconfirmed']}") print(f" By data_type: {stats['by_data_type']}") print(f" By result: {stats['by_result']}") # Load split dataset print("\nšŸ“¦ Loading split dataset...") train, val, test = loader.load_split_dataset() # Show sample if train: sample = train[0] print(f"\nšŸ“ Sample Training Item:") print(f" Input: {sample['input']}") print(f" Output: {sample['output']}") print(f" Image ID: {sample['metadata']['image_id'][:8]}...") print(f" Data Type: {sample['metadata']['data_type']}") print(f" Result: {sample['metadata']['result']} (int: {sample['metadata']['result_int']})") except FileNotFoundError as e: print(f"āŒ {e}") print("\nšŸ’” Make sure validation_data_ui_server_async.py has been run to create the database.") except ValueError as e: print(f"āŒ {e}") print("\nšŸ’” Generate and save some datapoints using the validation UI first.")