Spaces:
Running
Running
File size: 4,071 Bytes
188709e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 | import os
import logging
from typing import Iterator, Dict, Any, Optional
# Logging setup: configure the root logger at INFO and create this module's named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("UIVisionDataset")

# The Hugging Face `datasets` package is treated as optional at import time:
# when it is missing we only emit a warning here, and the name `load_dataset`
# is left undefined for the rest of the module.
try:
    from datasets import load_dataset
except ImportError:
    logger.warning(
        "Hugging Face 'datasets' package is not installed. "
        "To run this loader, install it using: pip install datasets"
    )
class UIVisionDatasetManager:
    """
    Manager to load, stream, and parse ServiceNow's UI-Vision dataset from Hugging Face.

    Used to train Aura Assist's visual computer-use agents, multi-step workflow
    planning, and automation intelligence.
    Dataset Reference: "ServiceNow/ui-vision"
    """

    def __init__(self, cache_dir: str = "./memory/datasets/ui_vision"):
        """
        Store the cache location and make sure the directory exists.

        Args:
            cache_dir: Directory handed to ``load_dataset`` as its ``cache_dir``.
                It is created (including parents) if it does not exist yet.
        """
        self.cache_dir = cache_dir
        os.makedirs(self.cache_dir, exist_ok=True)

    def load_ui_vision_dataset(self, token: Optional[str] = None) -> Any:
        """
        Load the ServiceNow/ui-vision dataset using Hugging Face ``datasets``.

        Args:
            token: Optional Hugging Face auth token, forwarded unchanged to
                ``load_dataset``. It may be needed if the dataset is gated or
                requires accepting an agreement.

        Returns:
            Whatever ``load_dataset`` returns on success, or ``None`` when the
            ``datasets`` package is unavailable or loading fails for any
            reason (the failure is logged, never raised).
        """
        # The module-level import of `load_dataset` is guarded, so the name is
        # simply undefined when the package is missing. Detect that up front and
        # report it accurately instead of letting it look like an auth failure.
        try:
            loader = load_dataset
        except NameError:
            logger.error(
                "Cannot load ServiceNow/ui-vision: Hugging Face 'datasets' package "
                "is not installed. Install it using: pip install datasets"
            )
            return None

        logger.info("Loading ServiceNow/ui-vision dataset from Hugging Face Hub...")
        try:
            # Load dataset utilizing Hugging Face token if supplied
            dataset = loader(
                "ServiceNow/ui-vision",
                cache_dir=self.cache_dir,
                token=token,
            )
        except Exception as exc:
            # Best-effort boundary: callers rely on getting None rather than an exception.
            logger.error("Error loading ServiceNow/ui-vision dataset: %s", exc)
            logger.warning("Ensure you have run `huggingface-cli login` or provided a valid 'token' parameter.")
            return None

        logger.info("UI-Vision dataset loaded successfully!")
        return dataset

    def _parse_dataset_row(self, row: Dict[str, Any]) -> Dict[str, Any]:
        """
        Format a raw UI-Vision row into the structured schema used downstream.

        Args:
            row: Mapping for one dataset record. The keys read are ``image``,
                ``instruction`` (falling back to ``task`` when missing or
                empty), ``actions``, ``bbox``, ``difficulty`` and ``domain``.
                NOTE(review): the real column names of the dataset are not
                visible here -- confirm against the dataset card.

        Returns:
            A dict with ``instruction``, ``screenshot``, ``actions``,
            ``bounding_boxes`` and a ``metadata`` sub-dict. Missing keys fall
            back to ``None`` (image), ``""`` (instruction), ``[]`` (actions,
            bbox), ``"medium"`` (difficulty) and ``"web_app"`` (domain).
        """
        # Prefer the explicit instruction; fall back to the task description
        # when the instruction is absent or empty.
        instruction = row.get("instruction", "") or row.get("task", "")

        return {
            "instruction": instruction,
            # Screenshot is passed through untouched (presumably a PIL image or a path).
            "screenshot": row.get("image"),
            "actions": row.get("actions", []),
            "bounding_boxes": row.get("bbox", []),
            "metadata": {
                "source": "ServiceNow/ui-vision",
                "difficulty": row.get("difficulty", "medium"),
                "domain": row.get("domain", "web_app"),
            },
        }

    def stream_split(self, split: str = "train", token: Optional[str] = None) -> Iterator[Dict[str, Any]]:
        """
        Yield parsed records from one split of the dataset (e.g. train, validation, test).

        The dataset is (re)loaded through ``load_ui_vision_dataset`` on every
        call; no ``streaming`` argument is passed to ``load_dataset``, so the
        loader's default mode is used and rows are yielded one at a time from
        the loaded split.

        Args:
            split: Name of the split to iterate.
            token: Optional Hugging Face auth token forwarded to the loader.

        Yields:
            One structured dict per row, as produced by ``_parse_dataset_row``.
            Rows that fail to parse are logged and skipped. If the dataset
            cannot be loaded or the split does not exist, nothing is yielded.
        """
        dataset = self.load_ui_vision_dataset(token=token)
        if not dataset:
            logger.warning("No active dataset loaded. Aborting stream.")
            return

        if split not in dataset:
            logger.error(
                "Requested split '%s' not found in UI-Vision dataset. Available: %s",
                split,
                list(dataset.keys()),
            )
            return

        logger.info("Streaming UI-Vision dataset records from split: '%s'...", split)
        for row in dataset[split]:
            # Only the parsing step is guarded. The yield stays outside the try
            # so that an exception thrown into the generator by its consumer
            # propagates instead of being misreported as a corrupt row.
            try:
                parsed = self._parse_dataset_row(row)
            except Exception as exc:
                logger.error("Skipping corrupt UI-Vision row: %s", exc)
                continue
            yield parsed
# Quick test execution scaffold
def _run_scaffold() -> None:
    """Build a manager with its default cache directory and announce readiness."""
    # Constructing the manager is kept for its side effect: it creates the cache directory.
    UIVisionDatasetManager()
    logger.info("ServiceNow UI-Vision Dataset Manager Scaffolding initialized.")
    print("UI-Vision Dataset Ready for visual computer-use agent training!")


if __name__ == "__main__":
    _run_scaffold()
|