File size: 4,071 Bytes
188709e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
import os
import logging
from typing import Iterator, Dict, Any, Optional

# Configure logging
# NOTE: basicConfig configures the root logger at import time; this is kept
# because importers of this module currently rely on that side effect.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("UIVisionDataset")

# `datasets` is treated as an optional dependency: the module must stay
# importable without it so that the failure can be reported with a hint.
try:
    from datasets import load_dataset
except ImportError:
    # Always bind the name so later references never fail with a NameError;
    # a value of None means "loader unavailable".
    load_dataset = None
    logger.warning("Hugging Face 'datasets' package is not installed. To run this loader, install it using: pip install datasets")

class UIVisionDatasetManager:
    """
    Manager to load, iterate, and parse ServiceNow's UI-Vision dataset from Hugging Face.
    Used to train Aura Assist's visual computer-use agents, multi-step workflow planning, and automation intelligence.

    Dataset Reference: "ServiceNow/ui-vision"

    All failures (missing package, load errors, unknown split, unparsable
    rows) are logged through the module logger rather than raised.
    """

    def __init__(self, cache_dir: str = "./memory/datasets/ui_vision"):
        """
        Remember the cache location and make sure it exists on disk.

        Args:
            cache_dir: Directory handed to ``load_dataset`` as its cache;
                created (including parents) if it does not exist yet.
        """
        self.cache_dir = cache_dir
        os.makedirs(self.cache_dir, exist_ok=True)

    def load_ui_vision_dataset(self, token: Optional[str] = None) -> Any:
        """
        Loads the ServiceNow/ui-vision dataset using Hugging Face datasets.
        Requires Hugging Face auth token if the dataset is gated or requires agreement.

        Args:
            token: Optional Hugging Face access token forwarded to ``load_dataset``.

        Returns:
            Whatever ``load_dataset`` returns on success, or ``None`` when the
            ``datasets`` package is unavailable or loading fails.
        """
        logger.info("Loading ServiceNow/ui-vision dataset from Hugging Face Hub...")

        # The module-level import of `load_dataset` is optional, so the name may
        # be absent (or None). Detect that up front: otherwise the resulting
        # error would be reported below as if it were an authentication problem.
        loader = globals().get("load_dataset")
        if loader is None:
            logger.error(
                "Cannot load ServiceNow/ui-vision: Hugging Face 'datasets' package "
                "is not installed. Install it using: pip install datasets"
            )
            return None

        try:
            # Load dataset utilizing Hugging Face token if supplied
            dataset = loader(
                "ServiceNow/ui-vision",
                cache_dir=self.cache_dir,
                token=token,
            )
        except Exception as e:
            # Deliberately broad: loading is best-effort and callers expect None.
            logger.error("Error loading ServiceNow/ui-vision dataset: %s", e)
            logger.warning("Ensure you have run `huggingface-cli login` or provided a valid 'token' parameter.")
            return None

        logger.info("UI-Vision dataset loaded successfully!")
        return dataset

    def _parse_dataset_row(self, row: Dict[str, Any]) -> Dict[str, Any]:
        """
        Formats a raw row from the UI-Vision dataset into a structured schema
        compatible with Aura Assist's step-by-step action reasoning models.

        A key that is present but holds ``None`` is treated the same as a
        missing key, so the defaults below apply in both cases.

        Args:
            row: Mapping of column name to value for a single record.

        Returns:
            A dict with the keys ``instruction``, ``screenshot``, ``actions``,
            ``bounding_boxes`` and ``metadata`` (``source``, ``difficulty``,
            ``domain``).
        """
        # Screenshot is passed through untouched (can be PIL Image object or path).
        image = row.get("image")

        # Prefer "instruction"; fall back to "task" when it is empty or None,
        # and finally to an empty string so the field is never None.
        instruction = row.get("instruction") or row.get("task") or ""

        # Use identity checks rather than `or` so that array-like values are
        # never evaluated for truthiness.
        actions = row.get("actions")
        if actions is None:
            actions = []
        bbox = row.get("bbox")
        if bbox is None:
            bbox = []

        difficulty = row.get("difficulty")
        if difficulty is None:
            difficulty = "medium"
        domain = row.get("domain")
        if domain is None:
            domain = "web_app"

        return {
            "instruction": instruction,
            "screenshot": image,
            "actions": actions,
            "bounding_boxes": bbox,
            "metadata": {
                "source": "ServiceNow/ui-vision",
                "difficulty": difficulty,
                "domain": domain,
            },
        }

    def stream_split(self, split: str = "train", token: Optional[str] = None) -> Iterator[Dict[str, Any]]:
        """
        Yields structured screen-action-vision records from a specific dataset split (train, validation, or test).
        Can be piped directly to fine-tune local reasoning layers.

        Note: the dataset is obtained through ``load_ui_vision_dataset`` before
        iteration starts; this method only iterates the requested split lazily.

        Args:
            split: Name of the split to iterate.
            token: Optional Hugging Face access token used for loading.

        Yields:
            One parsed record (see ``_parse_dataset_row``) per row. Rows whose
            parsing raises are logged and skipped. Nothing is yielded when the
            dataset cannot be loaded or the split does not exist.
        """
        dataset = self.load_ui_vision_dataset(token=token)
        # Only None signals a failed load; an empty mapping falls through to the
        # split check below, which reports the available splits.
        if dataset is None:
            logger.warning("No active dataset loaded. Aborting stream.")
            return

        if split not in dataset:
            logger.error(
                "Requested split '%s' not found in UI-Vision dataset. Available: %s",
                split,
                list(dataset.keys()),
            )
            return

        logger.info("Streaming UI-Vision dataset records from split: '%s'...", split)
        for row in dataset[split]:
            # Guard only the parsing step. Keeping `yield` outside the try block
            # ensures an exception thrown into the generator by the consumer is
            # not swallowed and misreported as a corrupt row.
            try:
                parsed = self._parse_dataset_row(row)
            except Exception as e:
                logger.error("Skipping corrupt UI-Vision row: %s", e)
                continue
            yield parsed

# Smoke-test entry point: runs only when this file is executed as a script.
def _run_scaffold() -> None:
    """Instantiate the manager with its default cache dir and report readiness."""
    # Constructing the manager creates the default cache directory as a side effect.
    scaffold_manager = UIVisionDatasetManager()
    logger.info("ServiceNow UI-Vision Dataset Manager Scaffolding initialized.")
    print("UI-Vision Dataset Ready for visual computer-use agent training!")


if __name__ == "__main__":
    _run_scaffold()