Spaces:
Running
Running
File size: 3,670 Bytes
7914ed2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 |
from __future__ import annotations
"""
Core logic to find Spaces duplicated from a given source within a time window.
Comments are in English (per user preference for code comments).
"""
from datetime import datetime, timedelta, timezone
from typing import Iterable, List, Optional
import requests
from huggingface_hub import HfApi
def iso_to_datetime(value: str) -> datetime:
    """Parse an ISO 8601 timestamp returned by the Hub into an aware UTC datetime.

    Timestamps may carry fractional seconds or not. The zone designator may be
    a trailing ``Z`` or an explicit numeric offset such as ``+00:00`` or
    ``+02:00``; in the latter case the instant is converted to UTC.

    Args:
        value: Timestamp text, e.g. ``"2024-01-02T03:04:05.000Z"``.

    Returns:
        The parsed instant as a timezone-aware datetime expressed in UTC.

    Raises:
        ValueError: If ``value`` matches none of the supported layouts
            (this includes timestamps with no zone designator at all).
    """
    # "Z"-suffixed layouts are tried first: the parsed value is naive, so the
    # UTC zone is attached rather than converted.
    for layout in ("%Y-%m-%dT%H:%M:%S.%fZ", "%Y-%m-%dT%H:%M:%SZ"):
        try:
            return datetime.strptime(value, layout).replace(tzinfo=timezone.utc)
        except ValueError:
            pass

    # Fall back to explicit numeric offsets; here the parsed value is already
    # aware, so it must be converted (not relabelled) to UTC.
    for layout in ("%Y-%m-%dT%H:%M:%S.%f%z", "%Y-%m-%dT%H:%M:%S%z"):
        try:
            return datetime.strptime(value, layout).astimezone(timezone.utc)
        except ValueError:
            pass

    raise ValueError(f"Unsupported ISO 8601 timestamp: {value!r}")
def readme_frontmatter_duplicated_from(space_id: str) -> Optional[str]:
    """Fetch a Space's raw README and extract ``duplicated_from`` from its YAML front matter.

    This is a best-effort lookup: any network failure or non-200 response is
    reported as "not found" rather than raised.

    Args:
        space_id: Space identifier; it is interpolated verbatim into the
            Space's Hub URL.

    Returns:
        The ``duplicated_from`` value with surrounding whitespace and quotes
        removed, or ``None`` when the README cannot be fetched, has no front
        matter at the top, or has no non-empty ``duplicated_from`` entry.
    """
    # Raw file URLs on the Hub carry the revision between "raw/" and the file
    # path; "main" is the default branch of a Space repository.
    url = f"https://huggingface.co/spaces/{space_id}/raw/main/README.md"
    try:
        resp = requests.get(url, timeout=10)
        if resp.status_code != 200:
            return None
        text = resp.text
    except requests.RequestException:
        return None

    in_frontmatter = False
    # A byte-order mark would otherwise hide the opening fence.
    for raw_line in text.lstrip("\ufeff").splitlines():
        line = raw_line.strip()
        if not in_frontmatter:
            if not line:
                continue  # tolerate blank lines before the opening fence
            if line != "---":
                # Front matter must open the document; a later "---" is just
                # a Markdown horizontal rule and must not be parsed as YAML.
                return None
            in_frontmatter = True
            continue
        if line == "---":
            break  # closing fence: the key was not present
        if line.startswith("duplicated_from:"):
            value = line.split(":", 1)[1].strip().strip("'\"")
            return value or None
    return None
def get_recent_spaces(api: HfApi, days: int) -> Iterable:
    """Yield Spaces created within the last ``days`` days.

    The listing is first requested sorted newest-first; if the client rejects
    those arguments with ``TypeError`` the plain listing is used instead.
    Every listed Space is inspected in both cases (there is no early stop),
    because ordering is not guaranteed on the fallback path.

    Args:
        api: Hub client whose ``list_spaces`` method is used for the listing.
        days: Size of the look-back window, in days, counted from now (UTC).

    Yields:
        Space objects whose creation time is at or after the cutoff, plus any
        Space that exposes no creation time at all (its age cannot be ruled
        out, so it is passed through).
    """
    cutoff = datetime.now(timezone.utc) - timedelta(days=days)
    try:
        # NOTE(review): confirm that "created" is a sort key the Hub accepts.
        spaces_iter = api.list_spaces(full=True, sort="created", direction=-1)
    except TypeError:
        spaces_iter = api.list_spaces(full=True)

    for space in spaces_iter:
        created_at_raw = getattr(space, "created_at", None) or getattr(space, "createdAt", None)
        if not created_at_raw:
            yield space
            continue

        if isinstance(created_at_raw, datetime):
            created_at = created_at_raw
            if created_at.utcoffset() is None:
                # Comparing a naive datetime with the aware cutoff would raise
                # TypeError; naive values are assumed to be UTC.
                created_at = created_at.replace(tzinfo=timezone.utc)
        else:
            created_at = iso_to_datetime(str(created_at_raw))

        if created_at >= cutoff:
            yield space
def _card_duplicated_from(card: object) -> Optional[str]:
    """Return the normalised ``duplicated_from`` entry of a Space card, if any.

    Accepts either a plain mapping or a card-data object exposing
    ``to_dict()``; anything else yields ``None``.
    """
    if not isinstance(card, dict):
        to_dict = getattr(card, "to_dict", None)
        if not callable(to_dict):
            return None
        card = to_dict()
        if not isinstance(card, dict):
            return None
    for key in ("duplicated_from", "duplicatedFrom", "duplicated-from"):
        value = card.get(key)
        if isinstance(value, str):
            return value.strip().strip("/ ")
    return None


def find_duplicated_spaces(api: HfApi, source: str, days: int, deep_detection: bool) -> List[str]:
    """Return the IDs of Spaces duplicated from ``source`` within the last ``days`` days.

    For every recent Space the ``duplicated_from`` entry is read from the card
    metadata attached to the listing. When that yields nothing and
    ``deep_detection`` is set, the Space's README front matter is fetched as a
    fallback (one extra HTTP request per such Space).

    Args:
        api: Hub client used to list Spaces.
        source: Identifier of the original Space; surrounding whitespace and
            slashes are ignored and the comparison is case-insensitive.
        days: Size of the look-back window, in days.
        deep_detection: Whether to fall back to fetching each README when the
            card metadata does not name a duplication source.

    Returns:
        Space IDs whose duplication source equals ``source``, in listing order.
    """
    source = source.strip().strip("/ ")
    wanted = source.lower()
    results: List[str] = []

    for space in get_recent_spaces(api, days=days):
        space_id = getattr(space, "id", None) or getattr(space, "repo_id", None)
        if not space_id:
            continue

        card = getattr(space, "cardData", None) or getattr(space, "card_data", None)
        origin = _card_duplicated_from(card)

        if not origin and deep_detection:
            origin = readme_frontmatter_duplicated_from(space_id)
            if origin:
                origin = origin.strip().strip("/ ")

        if origin and origin.lower() == wanted:
            results.append(space_id)

    return results
|