"""
Validation Dataset Loader for UI Validation Use Case
Loads validation datapoints from SQLite database and converts to GEPA-compatible format.
Supports filtering by data_type (trainset/valset/testset) and confirmed status.
"""
import os
import sqlite3
import base64
import logging
from typing import List, Dict, Any, Optional, Literal
from pathlib import Path
logger = logging.getLogger(__name__)
class ValidationDatasetLoader:
    """
    Loads validation dataset from SQLite database.

    Database schema:
        - validation_data: id, image_id, command, result (0/1), reasoning,
          data_type, confirmed, created_at
        - images: image_id, mime, bytes (BLOB), created_at

    Converts to GEPA format:
        - input: command text (seed prompt will be provided in test script)
        - output: "true" or "false" (converted from 0/1)
        - image_base64: base64 encoded image (TOP LEVEL for UniversalConverter)
        - metadata: all original fields plus converted values

    Note: The seed prompt is NOT stored in the database - it will be provided
    in the test script. The input field contains just the command, and the
    image is at top level.
    """

    def __init__(
        self,
        db_path: Optional[str] = None,
        confirmed_only: bool = True
    ):
        """
        Initialize validation dataset loader.

        Args:
            db_path: Path to SQLite database file.
                     Default: "./validation_data.db" or from VD_DB_PATH env var.
            confirmed_only: If True, only load datapoints where confirmed=1.
                            Default: True (only manually reviewed data).

        Raises:
            FileNotFoundError: If database file doesn't exist.
        """
        if db_path is None:
            # Fall back to the env var, then the conventional local filename.
            db_path = os.getenv("VD_DB_PATH", "./validation_data.db")
        self.db_path = Path(db_path).resolve()
        if not self.db_path.exists():
            raise FileNotFoundError(
                f"Database file not found: {self.db_path}\n"
                f"Make sure validation_data_ui_server_async.py has been run at least once to create the database."
            )
        self.confirmed_only = confirmed_only

    def _connect(self) -> sqlite3.Connection:
        """Open a connection with name-based row access (sqlite3.Row)."""
        conn = sqlite3.connect(str(self.db_path))
        conn.row_factory = sqlite3.Row  # access columns by name
        return conn

    @staticmethod
    def _build_query(
        use_confirmed: bool,
        data_type: Optional[str]
    ) -> tuple:
        """Build the validation_data/images JOIN with the active filters.

        Returns:
            (sql, params) ready for Connection.execute().
        """
        query = """
            SELECT
                v.id,
                v.image_id,
                v.command,
                v.result,
                v.reasoning,
                v.data_type,
                v.confirmed,
                v.created_at,
                i.mime,
                i.bytes
            FROM validation_data v
            INNER JOIN images i ON v.image_id = i.image_id
            WHERE 1=1
        """
        params: List[str] = []
        if use_confirmed:
            query += " AND v.confirmed = 1"
        if data_type:
            query += " AND v.data_type = ?"
            params.append(data_type)
        query += " ORDER BY v.id ASC"
        return query, params

    @staticmethod
    def _filter_description(
        use_confirmed: bool,
        data_type: Optional[str]
    ) -> str:
        """Human-readable summary of active filters for error messages ('' when unfiltered)."""
        filter_msg = []
        if use_confirmed:
            filter_msg.append("confirmed=1")
        if data_type:
            filter_msg.append(f"data_type='{data_type}'")
        return " with filters: " + ", ".join(filter_msg) if filter_msg else ""

    @staticmethod
    def _row_to_item(row: sqlite3.Row) -> Dict[str, Any]:
        """Convert one joined DB row into a GEPA-format dataset item."""
        # Convert 0/1 to "true"/"false" string for GEPA.
        result_str = "true" if row["result"] == 1 else "false"
        # Encode raw image bytes to base64 (top level for UniversalConverter).
        image_base64 = base64.b64encode(row["bytes"]).decode("utf-8")
        return {
            "input": row["command"],  # Just the command - seed prompt will be in test script
            "output": result_str,     # "true" or "false" (string)
            "image_base64": image_base64,
            "metadata": {
                "id": row["id"],
                "image_id": row["image_id"],
                "command": row["command"],       # Keep original for reference
                "result": bool(row["result"]),   # Boolean for reference
                "result_int": row["result"],     # Original 0/1 for reference
                "reasoning": row["reasoning"],
                "data_type": row["data_type"],
                "confirmed": bool(row["confirmed"]),
                "created_at": row["created_at"],
                "mime": row["mime"],
            }
        }

    def load_dataset(
        self,
        data_type: Optional[Literal["trainset", "valset", "testset"]] = None,
        confirmed_only: Optional[bool] = None
    ) -> List[Dict[str, Any]]:
        """
        Load dataset from database and convert to GEPA format.

        Args:
            data_type: Filter by data_type. If None, loads all types.
                       Options: "trainset", "valset", "testset".
            confirmed_only: Override instance default. If True, only load
                            confirmed datapoints. If None, uses the instance
                            default (self.confirmed_only).

        Returns:
            List of dataset items in GEPA format, each with keys
            "input" (command text), "output" ("true"/"false"),
            "image_base64" (top-level encoded image) and "metadata"
            (all original DB fields plus converted values).

        Note: Seed prompt is provided separately in test script, not in database.

        Raises:
            sqlite3.Error: If database query fails.
            ValueError: If no datapoints found matching criteria.
        """
        # Use provided confirmed_only or instance default.
        use_confirmed = confirmed_only if confirmed_only is not None else self.confirmed_only
        conn = self._connect()
        try:
            query, params = self._build_query(use_confirmed, data_type)
            rows = conn.execute(query, params).fetchall()
            if not rows:
                filter_str = self._filter_description(use_confirmed, data_type)
                raise ValueError(
                    f"No datapoints found{filter_str} in database: {self.db_path}\n"
                    f"Make sure you have generated and saved datapoints using the validation UI."
                )
            dataset = [self._row_to_item(row) for row in rows]
            # Log a one-line summary of what was loaded.
            data_type_str = f" ({data_type})" if data_type else ""
            confirmed_str = " (confirmed only)" if use_confirmed else " (all)"
            # getLogger(__name__) returns the same module logger object,
            # keeping this class usable standalone.
            logging.getLogger(__name__).info(
                f"Loaded {len(dataset)} validation datapoints{data_type_str}{confirmed_str}"
            )
            return dataset
        finally:
            conn.close()

    def load_split_dataset(
        self,
        confirmed_only: Optional[bool] = None
    ) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
        """
        Load dataset split by data_type (trainset/valset/testset).

        Convenience method that loads all three splits at once.

        Args:
            confirmed_only: Override instance default. If True, only load
                            confirmed datapoints.

        Returns:
            Tuple of (train_set, val_set, test_set) in GEPA format.

        Example:
            loader = ValidationDatasetLoader(db_path="./validation_data.db")
            train, val, test = loader.load_split_dataset()
        """
        train_set = self.load_dataset(data_type="trainset", confirmed_only=confirmed_only)
        val_set = self.load_dataset(data_type="valset", confirmed_only=confirmed_only)
        test_set = self.load_dataset(data_type="testset", confirmed_only=confirmed_only)
        logging.getLogger(__name__).info(
            f"Dataset Split Summary: Training={len(train_set)}, Validation={len(val_set)}, Test={len(test_set)}, Total={len(train_set) + len(val_set) + len(test_set)}"
        )
        return train_set, val_set, test_set

    def get_dataset_stats(self) -> Dict[str, Any]:
        """
        Get statistics about the dataset in the database.

        Returns:
            Dictionary with dataset statistics:
            {
                "total": 100,
                "confirmed": 95,
                "unconfirmed": 5,
                "by_data_type": {"trainset": 70, "valset": 15, "testset": 15},
                "by_result": {"true": 50, "false": 50}
            }
        """
        conn = self._connect()
        try:
            total = conn.execute("SELECT COUNT(*) FROM validation_data").fetchone()[0]
            confirmed = conn.execute(
                "SELECT COUNT(*) FROM validation_data WHERE confirmed = 1"
            ).fetchone()[0]
            data_type_rows = conn.execute("""
                SELECT data_type, COUNT(*) as count
                FROM validation_data
                GROUP BY data_type
            """).fetchall()
            result_rows = conn.execute("""
                SELECT result, COUNT(*) as count
                FROM validation_data
                GROUP BY result
            """).fetchall()
            return {
                "total": total,
                "confirmed": confirmed,
                "unconfirmed": total - confirmed,
                "by_data_type": {row["data_type"]: row["count"] for row in data_type_rows},
                "by_result": {
                    "true": sum(row["count"] for row in result_rows if row["result"] == 1),
                    "false": sum(row["count"] for row in result_rows if row["result"] == 0),
                },
            }
        finally:
            conn.close()
def load_validation_dataset(
    db_path: Optional[str] = None,
    data_type: Optional[Literal["trainset", "valset", "testset"]] = None,
    confirmed_only: bool = True
) -> List[Dict[str, Any]]:
    """
    Convenience function to load validation dataset.

    Builds a ValidationDatasetLoader and returns the result of a single
    load_dataset() call.

    Args:
        db_path: Path to SQLite database file. Default: "./validation_data.db".
        data_type: Filter by data_type. If None, loads all types.
        confirmed_only: If True, only load confirmed datapoints.

    Returns:
        List of dataset items in GEPA format.

    Example:
        # Load all confirmed training data
        train_data = load_validation_dataset(data_type="trainset", confirmed_only=True)
        # Load all confirmed data
        all_data = load_validation_dataset(confirmed_only=True)
    """
    return ValidationDatasetLoader(
        db_path=db_path,
        confirmed_only=confirmed_only,
    ).load_dataset(data_type=data_type, confirmed_only=confirmed_only)
def load_validation_split(
    db_path: Optional[str] = None,
    confirmed_only: bool = True
) -> tuple[List[Dict[str, Any]], List[Dict[str, Any]], List[Dict[str, Any]]]:
    """
    Convenience function to load validation dataset split by data_type.

    Builds a ValidationDatasetLoader and returns its three-way split.

    Args:
        db_path: Path to SQLite database file. Default: "./validation_data.db".
        confirmed_only: If True, only load confirmed datapoints.

    Returns:
        Tuple of (train_set, val_set, test_set) in GEPA format.

    Example:
        train, val, test = load_validation_split(confirmed_only=True)
    """
    return ValidationDatasetLoader(
        db_path=db_path,
        confirmed_only=confirmed_only,
    ).load_split_dataset(confirmed_only=confirmed_only)
# Example usage and testing
if __name__ == "__main__":
    print("🚀 Testing Validation Dataset Loader...")
    try:
        ds_loader = ValidationDatasetLoader()

        # Database-wide statistics first.
        print("\n📊 Dataset Statistics:")
        db_stats = ds_loader.get_dataset_stats()
        print(f" Total: {db_stats['total']}")
        print(f" Confirmed: {db_stats['confirmed']}")
        print(f" Unconfirmed: {db_stats['unconfirmed']}")
        print(f" By data_type: {db_stats['by_data_type']}")
        print(f" By result: {db_stats['by_result']}")

        # Then load the train/val/test split; only train is sampled below.
        print("\n📦 Loading split dataset...")
        train_items, _, _ = ds_loader.load_split_dataset()

        if train_items:
            first = train_items[0]
            meta = first["metadata"]
            print(f"\n📝 Sample Training Item:")
            print(f" Input: {first['input']}")
            print(f" Output: {first['output']}")
            print(f" Image ID: {meta['image_id'][:8]}...")
            print(f" Data Type: {meta['data_type']}")
            print(f" Result: {meta['result']} (int: {meta['result_int']})")
    except FileNotFoundError as e:
        print(f"❌ {e}")
        print("\n💡 Make sure validation_data_ui_server_async.py has been run to create the database.")
    except ValueError as e:
        print(f"❌ {e}")
        print("\n💡 Generate and save some datapoints using the validation UI first.")