File size: 6,602 Bytes
ac5551d
8302f42
ac5551d
 
 
 
 
 
8302f42
ac5551d
 
 
 
8302f42
ac5551d
92faea1
ac5551d
 
 
 
 
 
 
92faea1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac5551d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8302f42
ac5551d
 
 
 
 
 
 
 
 
 
8302f42
ac5551d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8302f42
ac5551d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
"""
api/routes/datasets.py — Dataset Discovery REST API (Cloud Registry Version).
"""
from __future__ import annotations

from typing import Optional
from datetime import datetime

from fastapi import APIRouter, HTTPException, Query

from adapters.roboflow_adapter import RoboflowAdapter
from datasets import registry as ds_reg
from models.dataset import (
    Dataset, DatasetSummary, RoboflowSearchRequest,
)
from models.analytics import DatasetAnalytics, SplitAnalytics, QualityIssues, ClassDistributionItem
from observability.logger import audit, get_logger

# Module-level logger for this route file, obtained under the name
# "datasets_route".
log = get_logger("datasets_route")

# Router that every handler in this module registers on; all route paths
# declared here are relative to the "/datasets" prefix and carry the
# "datasets" tag.
router = APIRouter(prefix="/datasets", tags=["datasets"])


# ── Analytics ─────────────────────────────────────────────────────────────────

@router.get("/{dataset_id}/analytics", response_model=DatasetAnalytics)
async def get_dataset_analytics(dataset_id: str):
    """
    Build the analytics payload for one registered dataset.

    Several fields are estimates rather than measured values:

    * ``outliers`` is a placeholder fixed at 0.5% of the image count.
    * ``classDistribution`` assigns every class the same count (images
      divided evenly across classes) and covers at most the first 20
      class names.
    * ``split`` falls back to 70/20/10 when the recorded split total is
      not positive.

    Args:
        dataset_id: Registry identifier of the dataset.

    Returns:
        A ``DatasetAnalytics`` instance for the dataset.

    Raises:
        HTTPException: 404 when the dataset is unknown, or when it has no
            stats recorded from which analytics could be derived.
    """
    ds = await ds_reg.get_dataset(dataset_id)
    if not ds:
        raise HTTPException(404, f"Dataset {dataset_id!r} not found")

    stats = ds.stats
    if stats is None:
        # Without stats every attribute access below would raise
        # AttributeError and reach the client as an opaque 500.
        raise HTTPException(
            404, f"No analytics available for dataset {dataset_id!r}"
        )

    # Evaluate the "is there a real split?" condition once for all three parts.
    split = stats.split
    has_split = split.total > 0

    # Uniform per-class estimate; identical for every class, so compute it
    # once instead of once per class name.
    per_class_count = int(ds.images / ds.classes) if ds.classes > 0 else 0

    return DatasetAnalytics(
        dataset_id=dataset_id,
        healthScore=stats.health_score,
        split=SplitAnalytics(
            train=split.train if has_split else 70,
            val=split.val if has_split else 20,
            test=split.test if has_split else 10,
        ),
        qualityIssues=QualityIssues(
            missingLabels=stats.missing_labels,
            emptyImages=stats.empty_images,
            duplicates=stats.duplicate_count,
            outliers=int(ds.images * 0.005),  # placeholder
        ),
        classDistribution=[
            ClassDistributionItem(name=name, count=per_class_count)
            for name in ds.class_names[:20]
        ],
    )


# ── List / Search datasets ────────────────────────────────────────────────────

@router.get("", response_model=list[DatasetSummary])
async def list_datasets(
    task:    Optional[str]  = Query(None),
    format:  Optional[str]  = Query(None),
    source:  Optional[str]  = Query(None),
    status:  Optional[str]  = Query(None),
    search:  Optional[str]  = Query(None),
    starred: Optional[bool] = Query(None),
    limit:   int            = Query(100, ge=1, le=1000),
    offset:  int            = Query(0,   ge=0),
):
    """
    Return summaries of the registry datasets matching the given filters.

    Every query parameter is forwarded unchanged to the registry lookup;
    ``limit`` and ``offset`` are passed along with the filters. Each record
    returned by the registry is converted with ``_to_summary``.

    Raises:
        HTTPException: 500 when the lookup or the conversion fails; the
            exception text is used as the response detail.
    """
    lookup_args = {
        "task": task,
        "format": format,
        "source": source,
        "status": status,
        "search": search,
        "starred": starred,
        "limit": limit,
        "offset": offset,
    }
    try:
        records = await ds_reg.get_all_datasets(**lookup_args)
        return list(map(_to_summary, records))
    except Exception as exc:
        log.exception("list_datasets_error")
        raise HTTPException(status_code=500, detail=str(exc))


# ── Roboflow Search & Sync ────────────────────────────────────────────────────

@router.post("/search/roboflow", response_model=list[DatasetSummary])
async def search_roboflow(req: RoboflowSearchRequest):
    """
    Run a live Roboflow Universe search and record the hits in the registry.

    The request fields are handed to ``RoboflowAdapter.search_datasets``; the
    datasets it returns are upserted into the registry, an audit entry with
    the query and hit count is written, and the hits are returned as
    summaries.

    NOTE(review): the previous docstring stated that results are cached for
    1 hour. No caching is visible in this function -- presumably it happens
    inside the adapter; confirm.

    Raises:
        HTTPException: 502 when the adapter call fails (the error is logged).
    """
    try:
        adapter_args = dict(
            api_key=req.api_key,
            query=req.query,
            workspace=req.workspace,
            page=req.page,
            page_size=req.page_size,
        )
        hits = await RoboflowAdapter.search_datasets(**adapter_args)
    except Exception as exc:
        log.error("roboflow_search_error", error=str(exc))
        raise HTTPException(502, f"Roboflow API error: {exc}")

    # Persist the hits in the registry DB before reporting them.
    await ds_reg.bulk_upsert_datasets(hits)

    audit_payload = {"query": req.query, "count": len(hits)}
    await audit("roboflow_search", audit_payload)

    return list(map(_to_summary, hits))


@router.post("/sync/roboflow", response_model=dict)
async def sync_roboflow_workspace(
    api_key: str = Query(..., description="Roboflow API key"),
    workspace: str = Query(..., description="Workspace slug"),
):
    """Sync all datasets from a Roboflow workspace into the global registry.

    Lists the workspace's datasets through ``RoboflowAdapter`` and upserts
    them into the registry in one bulk call.

    Args:
        api_key: Roboflow API key, forwarded to the adapter.
        workspace: Slug of the workspace to sync.

    Returns:
        ``{"synced": <value returned by the bulk upsert>, "workspace": <slug>}``.

    Raises:
        HTTPException: 502 when the adapter call fails.
    """
    try:
        datasets = await RoboflowAdapter.list_workspace_datasets(api_key, workspace)
    except Exception as exc:
        # Log the failure the same way the Roboflow search route does, so a
        # failed sync leaves a trace server-side. The API key is deliberately
        # not included in the log call.
        log.error("roboflow_sync_error", error=str(exc))
        # Chain the cause so the original traceback is preserved.
        raise HTTPException(502, f"Roboflow API error: {exc}") from exc
    count = await ds_reg.bulk_upsert_datasets(datasets)
    return {"synced": count, "workspace": workspace}


# ── Dataset detail ────────────────────────────────────────────────────────────

@router.get("/{dataset_id}", response_model=Dataset)
async def get_dataset(dataset_id: str):
    """
    Return the full registry record for ``dataset_id``.

    Raises:
        HTTPException: 404 when the registry lookup yields nothing.
    """
    record = await ds_reg.get_dataset(dataset_id)
    if record:
        return record
    raise HTTPException(404, f"Dataset {dataset_id!r} not found")


# ── Star ─────────────────────────────────────────────────────────────

@router.post("/{dataset_id}/star", response_model=dict)
async def toggle_star(dataset_id: str):
    """
    Ask the registry to toggle the starred flag of ``dataset_id``.

    Returns the dataset id together with whatever the registry call returned
    (presumably the flag's new state -- confirm against ``toggle_starred``).
    """
    starred_now = await ds_reg.toggle_starred(dataset_id)
    return dict(dataset_id=dataset_id, starred=starred_now)


# ── Helper ────────────────────────────────────────────────────────────────────

def _to_summary(d: Dataset) -> DatasetSummary:
    """
    Project a full ``Dataset`` onto the lighter ``DatasetSummary`` model.

    ``task``, ``format``, ``source`` and ``status`` are stringified; the
    remaining fields are copied as-is. ``health_score`` is read from
    ``d.stats`` when stats are present and truthy, otherwise it stays 0.0.
    """
    score = 0.0
    try:
        stats = getattr(d, "stats", None)
        if stats:
            score = getattr(stats, "health_score", 0.0)
    except Exception:
        # Best effort: any failure while reading stats keeps the 0.0 default.
        pass

    summary_fields = {
        "id": d.id,
        "name": d.name,
        "task": str(d.task),
        "format": str(d.format),
        "source": str(d.source),
        "status": str(d.status),
        "images": d.images,
        "classes": d.classes,
        "size_label": d.size_label,
        "tags": d.tags,
        "starred": d.starred,
        "import_progress": d.import_progress,
        "health_score": score,
        "created_at": d.created_at,
        "updated_at": d.updated_at,
    }
    return DatasetSummary(**summary_fields)