File size: 6,005 Bytes
225aa59
 
 
 
 
 
 
 
 
 
8327deb
 
 
225aa59
6f2d89f
b5181f8
bb4e560
b043cfa
5790374
71dcdb9
89dfd64
 
c14446f
529ea75
225aa59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8327deb
 
 
225aa59
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import json
import os
import re
from typing import Dict, List
from urllib.parse import quote, unquote, urlparse
from urllib.error import HTTPError
from urllib.request import Request, urlopen

from fastapi import FastAPI, HTTPException, Query
from fastapi.responses import HTMLResponse, Response

# FastAPI application instance; the route decorators further down register
# their handlers on it.
app = FastAPI()

# Hugging Face "tree" URLs used when the HF_GROUP_LINKS environment variable
# is unset or blank (see configured_group_links). Both Spaces and Datasets
# repositories appear here.
DEFAULT_GROUP_LINKS = [
    "https://huggingface.co/spaces/CyberAl/Traffic-Tracker/tree/main/data",
    "https://huggingface.co/spaces/niangmariame513/traffic-monitor/tree/main/data",
    "https://huggingface.co/spaces/danielle2035/TRAFFIC_ROAD_APP/tree/main/data",
    "https://huggingface.co/spaces/Rafiatou/trafficvision-group10/tree/main/data",
    "https://huggingface.co/spaces/Binta26/computer_vision/tree/main/data",
    "https://huggingface.co/datasets/ccspoet/ProjectCV/tree/main/Data",
    "https://huggingface.co/spaces/AhmedSouley01/traffic-monitoring/tree/main/data",
    "https://huggingface.co/datasets/conde621gmail/dataset/tree/main",
    "https://huggingface.co/datasets/ioget/aims-traffic-cv-data/tree/main",
    "https://huggingface.co/datasets/luclintos/traffic_object_detection/tree/main",
]

# File suffixes treated as video; discover_source matches them against the
# lower-cased file path.
VIDEO_EXTENSIONS = (".mp4", ".webm", ".mov", ".avi", ".mkv", ".m4v")


def configured_group_links() -> List[str]:
    """Return the Hugging Face tree URLs the dashboard should scan.

    The list is read from the ``HF_GROUP_LINKS`` environment variable, which
    may hold either a JSON array (detected by a leading ``[``) or plain text
    with links separated by commas and/or newlines. Entries are stripped and
    blank ones are dropped. When the variable is unset or blank, a copy of
    ``DEFAULT_GROUP_LINKS`` is returned.

    Returns:
        A new list of link strings; mutating it never affects the defaults.

    Raises:
        HTTPException: status 500 when the value starts with ``[`` but is
            not valid JSON.
    """
    raw = os.getenv("HF_GROUP_LINKS", "").strip()
    if not raw:
        # Hand out a copy so callers cannot mutate the module-level default.
        return list(DEFAULT_GROUP_LINKS)

    if raw.startswith("["):
        try:
            links = json.loads(raw)
        except json.JSONDecodeError as exc:
            raise HTTPException(
                status_code=500, detail=f"Invalid HF_GROUP_LINKS JSON: {exc}"
            ) from exc
        # Elements are coerced to str so non-string JSON values do not crash.
        cleaned = (str(link).strip() for link in links)
        return [link for link in cleaned if link]

    return [link.strip() for link in re.split(r"[\n,]+", raw) if link.strip()]


def parse_hf_tree_url(url: str) -> Dict[str, str]:
    """Break a Hugging Face ``/tree/`` URL into its repository components.

    The expected path layout is
    ``/<spaces|datasets>/<owner>/<repo>/tree/<revision>[/<folder...>]``.
    Each path segment is percent-decoded before being returned.

    Args:
        url: The tree URL; surrounding whitespace is ignored.

    Returns:
        A dict with the keys ``repo_kind``, ``owner``, ``repo``,
        ``revision``, ``folder_path`` (empty string for the repo root) and
        ``repo_id`` (``"owner/repo"``).

    Raises:
        HTTPException: status 400 when the host is not ``huggingface.co``,
            the path is not a tree URL, or the repository kind is neither
            ``spaces`` nor ``datasets``.
    """
    location = urlparse(url.strip())
    if location.netloc != "huggingface.co":
        raise HTTPException(status_code=400, detail="Only huggingface.co links are supported.")

    # Empty segments (doubled or trailing slashes) are discarded before decoding.
    segments = [unquote(piece) for piece in location.path.strip("/").split("/") if piece]
    if len(segments) < 5 or segments[3] != "tree":
        raise HTTPException(
            status_code=400,
            detail="Expected a Hugging Face tree URL like https://huggingface.co/spaces/user/repo/tree/main/data",
        )

    repo_kind, owner, repo, _tree, revision, *subfolders = segments
    if repo_kind not in {"spaces", "datasets"}:
        raise HTTPException(status_code=400, detail="Only Hugging Face spaces and datasets are supported.")

    return {
        "repo_kind": repo_kind,
        "owner": owner,
        "repo": repo,
        "revision": revision,
        "folder_path": "/".join(subfolders),
        "repo_id": f"{owner}/{repo}",
    }


def hf_api_prefix(repo_kind: str) -> str:
    """Return the path segment used after ``/api/`` for a repository kind.

    ``"spaces"`` maps to itself; every other value maps to ``"datasets"``.
    """
    if repo_kind == "spaces":
        return "spaces"
    return "datasets"


def hf_raw_prefix(repo_kind: str) -> str:
    """Return the leading path segment of a file-download URL.

    ``"spaces"`` maps to itself; every other value maps to ``"datasets"``.
    """
    if repo_kind == "spaces":
        return "spaces"
    return "datasets"


def fetch_json(url: str):
    """Fetch *url* and return its body parsed as JSON.

    The request carries a ``TrafficSense-dashboard`` User-Agent and uses a
    20-second timeout. The body is decoded as UTF-8 before parsing. Errors
    raised by ``urlopen`` or by JSON decoding propagate to the caller.
    """
    request = Request(url, headers={"User-Agent": "TrafficSense-dashboard"})
    with urlopen(request, timeout=20) as reply:
        payload = reply.read()
    return json.loads(payload.decode("utf-8"))


def discover_source(tree_url: str) -> Dict:
    """List the CSV and video files found under a Hugging Face tree URL.

    The URL is parsed with ``parse_hf_tree_url`` and the repository's tree
    API is queried recursively for the folder. Every entry of type ``file``
    is classified by its lower-cased suffix as ``csv``, ``video`` (one of
    ``VIDEO_EXTENSIONS``) or ``other``; only the first two are returned.

    Args:
        tree_url: A ``huggingface.co`` tree URL for a Space or Dataset.

    Returns:
        A dict with ``link`` (the input URL), ``repo`` (``owner/repo``),
        ``folder``, ``warning`` (``None`` or a message when the listing
        returned 404), ``csv_files`` and ``video_files``. Each file entry
        has ``name``, ``path``, ``size``, ``kind`` and a download ``url``.

    Raises:
        HTTPException: propagated from ``parse_hf_tree_url`` for
            unsupported URLs.
        HTTPError: for any upstream HTTP status other than 404.
    """
    info = parse_hf_tree_url(tree_url)
    # The parsed components were percent-decoded and originate from a
    # caller-supplied URL, so each one is re-encoded before being placed in
    # a URL. The same encoded repo id is used for both the API and the
    # download URL.
    repo_id = quote(info["repo_id"], safe="/")
    revision = quote(info["revision"], safe="")
    api_path = quote(info["folder_path"], safe="/")
    # rstrip drops the trailing slash left behind when the folder is the
    # repository root.
    api_url = (
        f"https://huggingface.co/api/{hf_api_prefix(info['repo_kind'])}/"
        f"{repo_id}/tree/{revision}/{api_path}"
    ).rstrip("/") + "?recursive=1"
    warning = None
    try:
        items = fetch_json(api_url)
    except HTTPError as exc:
        if exc.code != 404:
            raise
        # A missing folder is reported as a warning, not a failure.
        warning = f"Folder '{info['folder_path']}' was not found."
        items = []

    # Identical for every file, so it is built once outside the loop.
    resolve_base = (
        f"https://huggingface.co/{hf_raw_prefix(info['repo_kind'])}/{repo_id}"
        f"/resolve/{revision}/"
    )
    files = []
    for item in items:
        if item.get("type") != "file":
            continue
        file_path = item.get("path", "")
        lower_path = file_path.lower()
        kind = "other"
        if lower_path.endswith(".csv"):
            kind = "csv"
        elif lower_path.endswith(VIDEO_EXTENSIONS):
            kind = "video"
        files.append({
            "name": file_path.split("/")[-1],
            "path": file_path,
            "size": item.get("size"),
            "kind": kind,
            "url": resolve_base + quote(file_path, safe="/"),
        })

    return {
        "link": tree_url,
        "repo": info["repo_id"],
        "folder": info["folder_path"],
        "warning": warning,
        "csv_files": [file for file in files if file["kind"] == "csv"],
        "video_files": [file for file in files if file["kind"] == "video"],
    }

@app.get("/", response_class=HTMLResponse)
def read_root():
    """Serve the dashboard page.

    Returns the UTF-8 text of ``index.html``, opened relative to the
    process's current working directory.
    """
    with open("index.html", "r", encoding="utf-8") as page:
        markup = page.read()
    return markup


@app.get("/api/group-sources")
def group_sources():
    """Discover files for every configured group link.

    Each link is processed independently: a failure on one link is recorded
    under ``errors`` (with the link and the exception text) and does not
    stop the remaining links from being scanned.

    Returns:
        ``{"sources": [...], "errors": [...]}`` where ``sources`` holds the
        ``discover_source`` results in link order.
    """
    discovered = []
    failures = []
    for tree_link in configured_group_links():
        try:
            entry = discover_source(tree_link)
        except Exception as err:
            # Deliberately broad: one bad link must not break the listing.
            failures.append({"link": tree_link, "error": str(err)})
        else:
            discovered.append(entry)
    return {"sources": discovered, "errors": failures}


@app.get("/api/hf-source")
def hf_source(url: str = Query(..., min_length=1)):
    """Discover the CSV and video files behind a single Hugging Face tree URL.

    Args:
        url: Required, non-empty query parameter holding the tree URL.

    Returns:
        The ``discover_source`` result for ``url``.
    """
    listing = discover_source(url)
    return listing


@app.get("/api/proxy")
def proxy(url: str = Query(..., min_length=1)):
    """Fetch a Hugging Face ``/resolve/`` URL server-side and relay its body.

    Args:
        url: Required, non-empty query parameter. It must be an http(s) URL
            whose host is exactly ``huggingface.co`` and whose path contains
            ``/resolve/``.

    Returns:
        A ``Response`` with the upstream body and content type
        (``application/octet-stream`` when upstream sends none).

    Raises:
        HTTPException: status 400 for a URL that fails validation, or the
            upstream status code when Hugging Face answers with an HTTP
            error.
    """
    parsed = urlparse(url)
    # The URL is caller-supplied. The scheme is checked as well as the host
    # so that only web requests are ever issued on the caller's behalf.
    if (
        parsed.scheme not in {"http", "https"}
        or parsed.netloc != "huggingface.co"
        or "/resolve/" not in parsed.path
    ):
        raise HTTPException(status_code=400, detail="Only Hugging Face resolve URLs can be proxied.")

    req = Request(url, headers={"User-Agent": "TrafficSense-dashboard"})
    try:
        with urlopen(req, timeout=30) as response:
            content_type = response.headers.get("content-type", "application/octet-stream")
            # NOTE: the whole body is buffered in memory before being
            # returned, so very large files are costly to proxy.
            body = response.read()
    except HTTPError as exc:
        # Relay the upstream status (e.g. 404) instead of a generic 500.
        raise HTTPException(
            status_code=exc.code, detail=f"Upstream request failed: {exc.reason}"
        ) from exc
    return Response(content=body, media_type=content_type)