File size: 3,670 Bytes
7914ed2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
from __future__ import annotations

"""
Core logic to find Spaces duplicated from a given source within a time window.
Comments are in English (per user preference for code comments).
"""

from datetime import datetime, timedelta, timezone
from typing import Iterable, List, Optional

import requests
from huggingface_hub import HfApi


def iso_to_datetime(value: str) -> datetime:
    """Parse an ISO 8601 timestamp returned by the Hub into an aware UTC datetime.

    Accepted inputs:
      * ``YYYY-MM-DDTHH:MM:SS.ffffffZ`` and ``YYYY-MM-DDTHH:MM:SSZ``;
      * any other layout understood by ``datetime.fromisoformat``, including
        numeric UTC offsets such as ``+00:00`` or ``+02:00``.

    Args:
        value: Timestamp text. Surrounding whitespace is ignored.

    Returns:
        A timezone-aware ``datetime`` expressed in UTC. Timestamps carrying no
        offset at all are assumed to already be in UTC.

    Raises:
        ValueError: If ``value`` cannot be parsed as a timestamp.
    """
    text = value.strip()

    # Fast path: the two "Z"-suffixed layouts.
    for fmt in ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ"):
        try:
            return datetime.strptime(text, fmt).replace(tzinfo=timezone.utc)
        except ValueError:
            continue

    # Fallback: generic ISO 8601. A trailing "Z" is rewritten to "+00:00"
    # because fromisoformat() does not accept "Z" on every Python version.
    if text[-1:] in ("Z", "z"):
        text = text[:-1] + "+00:00"
    parsed = datetime.fromisoformat(text)
    if parsed.tzinfo is None:
        return parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)


def _extract_duplicated_from(text: str) -> Optional[str]:
    """Return the ``duplicated_from`` value from a leading YAML front matter block, if any."""
    lines = text.lstrip("\ufeff").splitlines()

    # Front matter only counts when it opens the document. Skip leading blank
    # lines, then require the opening "---" delimiter; otherwise a Markdown
    # horizontal rule further down would be mistaken for front matter.
    start = 0
    while start < len(lines) and not lines[start].strip():
        start += 1
    if start >= len(lines) or lines[start].strip() != "---":
        return None

    for line in lines[start + 1:]:
        stripped = line.strip()
        if stripped == "---":
            # Closing delimiter: the key was not present in the front matter.
            break
        if stripped.startswith("duplicated_from:"):
            value = stripped.split(":", 1)[1].strip().strip("'\"")
            return value or None
    return None


def readme_frontmatter_duplicated_from(space_id: str) -> Optional[str]:
    """Fetch a Space's raw README and extract ``duplicated_from`` from its front matter.

    Args:
        space_id: Space identifier interpolated into the README URL.

    Returns:
        The ``duplicated_from`` value with surrounding quotes removed, or
        ``None`` when the request fails, the response status is not 200, the
        README has no leading front matter block, or the key is missing/empty.
    """
    url = f"https://huggingface.co/spaces/{space_id}/raw/README.md"
    try:
        resp = requests.get(url, timeout=10)
        if resp.status_code != 200:
            return None
        text = resp.text
    except requests.RequestException:
        # Best-effort lookup: network failures are treated as "unknown".
        return None

    return _extract_duplicated_from(text)


def get_recent_spaces(api: HfApi, days: int) -> Iterable:
    """Yield Spaces created within the last `days` days.

    The listing is requested sorted by creation time, newest first; if the
    client rejects those keyword arguments with ``TypeError`` the plain
    listing is used instead.

    Args:
        api: Hub client exposing ``list_spaces``.
        days: Size of the look-back window, in days, measured from now (UTC).

    Yields:
        Each listed Space whose creation timestamp is at or after the cutoff.
        Spaces that expose no creation timestamp are yielded as well, since
        they cannot be ruled out.
    """
    cutoff = datetime.now(timezone.utc) - timedelta(days=days)
    try:
        spaces_iter = api.list_spaces(full=True, sort="created", direction=-1)
    except TypeError:
        spaces_iter = api.list_spaces(full=True)

    for space in spaces_iter:
        created_at_raw = getattr(space, "created_at", None) or getattr(space, "createdAt", None)
        if not created_at_raw:
            yield space
            continue

        if isinstance(created_at_raw, datetime):
            created_at = created_at_raw
        else:
            created_at = iso_to_datetime(str(created_at_raw))

        # Comparing a naive datetime with the aware cutoff raises TypeError,
        # so treat naive timestamps as UTC.
        if created_at.tzinfo is None:
            created_at = created_at.replace(tzinfo=timezone.utc)

        # Older Spaces are skipped rather than ending the scan: sort order is
        # not guaranteed when the fallback listing is used.
        if created_at >= cutoff:
            yield space


def find_duplicated_spaces(api: HfApi, source: str, days: int, deep_detection: bool) -> List[str]:
    """Return IDs of Spaces that were duplicated from `source` within `days`.

    Args:
        api: Hub client passed through to ``get_recent_spaces``.
        source: ID of the original Space. Surrounding whitespace and slashes
            are ignored and the comparison is case-insensitive.
        days: Size of the look-back window, in days.
        deep_detection: When true, Spaces whose card metadata gives no
            ``duplicated_from`` value are additionally checked by fetching
            their README front matter (one HTTP request per such Space).

    Returns:
        Matching Space IDs in the order they were listed.
    """
    source = source.strip().strip("/ ")
    if not source:
        # An empty source can never equal a non-empty duplicated_from value,
        # so skip the scan entirely.
        return []
    wanted = source.lower()

    results: List[str] = []
    for space in get_recent_spaces(api, days=days):
        space_id = getattr(space, "id", None) or getattr(space, "repo_id", None)
        if not space_id:
            continue

        card = getattr(space, "cardData", None) or getattr(space, "card_data", None)
        if card is not None and not isinstance(card, dict):
            # Card metadata may be an object rather than a dict; use its
            # to_dict() view when it offers one.
            to_dict = getattr(card, "to_dict", None)
            card = to_dict() if callable(to_dict) else None

        duplicated_from_value: Optional[str] = None
        if isinstance(card, dict):
            for key in ("duplicated_from", "duplicatedFrom", "duplicated-from"):
                candidate = card.get(key)
                if isinstance(candidate, str):
                    duplicated_from_value = candidate.strip().strip("/ ")
                    break

        if not duplicated_from_value and deep_detection:
            duplicated_from_value = readme_frontmatter_duplicated_from(space_id)
            if duplicated_from_value:
                duplicated_from_value = duplicated_from_value.strip().strip("/ ")

        if duplicated_from_value and duplicated_from_value.lower() == wanted:
            results.append(space_id)

    return results