AURA-Backend / python_backend /services /ui_vision_dataset.py
Vijayadhith7's picture
Upload 22 files
188709e verified
import os
import logging
from typing import Iterator, Dict, Any, Optional
# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("UIVisionDataset")
try:
from datasets import load_dataset
except ImportError:
logger.warning("Hugging Face 'datasets' package is not installed. To run this loader, install it using: pip install datasets")
class UIVisionDatasetManager:
"""
Manager to load, stream, and parse ServiceNow's UI-Vision dataset from Hugging Face.
Used to train Aura Assist's visual computer-use agents, multi-step workflow planning, and automation intelligence.
Dataset Reference: "ServiceNow/ui-vision"
"""
def __init__(self, cache_dir: str = "./memory/datasets/ui_vision"):
self.cache_dir = cache_dir
os.makedirs(self.cache_dir, exist_ok=True)
def load_ui_vision_dataset(self, token: Optional[str] = None) -> Any:
"""
Loads the ServiceNow/ui-vision dataset using Hugging Face datasets.
Requires Hugging Face auth token if the dataset is gated or requires agreement.
"""
logger.info("Loading ServiceNow/ui-vision dataset from Hugging Face Hub...")
try:
# Load dataset utilizing Hugging Face token if supplied
dataset = load_dataset(
"ServiceNow/ui-vision",
cache_dir=self.cache_dir,
token=token
)
logger.info("UI-Vision dataset loaded successfully!")
return dataset
except Exception as e:
logger.error(f"Error loading ServiceNow/ui-vision dataset: {e}")
logger.warning("Ensure you have run `huggingface-cli login` or provided a valid 'token' parameter.")
return None
def _parse_dataset_row(self, row: Dict[str, Any]) -> Dict[str, Any]:
"""
Formats a raw row from the UI-Vision dataset into a structured schema
compatible with Aura Assist's step-by-step action reasoning models.
"""
# Extract screenshot image (can be PIL Image object or path)
image = row.get("image")
# Extract instruction / action plan
instruction = row.get("instruction", "")
if not instruction:
instruction = row.get("task", "")
# Extract target actions and screen bounding boxes
actions = row.get("actions", [])
bbox = row.get("bbox", [])
return {
"instruction": instruction,
"screenshot": image,
"actions": actions,
"bounding_boxes": bbox,
"metadata": {
"source": "ServiceNow/ui-vision",
"difficulty": row.get("difficulty", "medium"),
"domain": row.get("domain", "web_app")
}
}
def stream_split(self, split: str = "train", token: Optional[str] = None) -> Iterator[Dict[str, Any]]:
"""
Streams structured screen-action-vision sequences from a specific dataset split (train, validation, or test).
Can be piped directly to fine-tune local reasoning layers.
"""
dataset = self.load_ui_vision_dataset(token=token)
if not dataset:
logger.warning("No active dataset loaded. Aborting stream.")
return
if split not in dataset:
logger.error(f"Requested split '{split}' not found in UI-Vision dataset. Available: {list(dataset.keys())}")
return
logger.info(f"Streaming UI-Vision dataset records from split: '{split}'...")
for row in dataset[split]:
try:
yield self._parse_dataset_row(row)
except Exception as e:
logger.error(f"Skipping corrupt UI-Vision row: {e}")
continue
# Quick test execution scaffold
if __name__ == "__main__":
manager = UIVisionDatasetManager()
logger.info("ServiceNow UI-Vision Dataset Manager Scaffolding initialized.")
print("UI-Vision Dataset Ready for visual computer-use agent training!")