from __future__ import annotations """ Core logic to find Spaces duplicated from a given source within a time window. Comments are in English (per user preference for code comments). """ from datetime import datetime, timedelta, timezone from typing import Iterable, List, Optional import requests from huggingface_hub import HfApi def iso_to_datetime(value: str) -> datetime: """Parse ISO 8601 timestamps returned by the Hub to aware datetime in UTC.""" try: dt = datetime.strptime(value, "%Y-%m-%dT%H:%M:%S.%fZ") except ValueError: dt = datetime.strptime(value, "%Y-%m-%dT%H:%M:%SZ") return dt.replace(tzinfo=timezone.utc) def readme_frontmatter_duplicated_from(space_id: str) -> Optional[str]: """Fetch README raw and try to extract duplicated_from from YAML frontmatter.""" url = f"https://huggingface.co/spaces/{space_id}/raw/README.md" try: resp = requests.get(url, timeout=10) if resp.status_code != 200: return None text = resp.text except requests.RequestException: return None lines = text.splitlines() in_frontmatter = False for line in lines: if line.strip() == "---": in_frontmatter = not in_frontmatter if not in_frontmatter: break continue if in_frontmatter and line.strip().startswith("duplicated_from:"): value = line.split(":", 1)[1].strip().strip("'\"") return value or None return None def get_recent_spaces(api: HfApi, days: int) -> Iterable: """Yield Spaces created within the last `days` days, iterating newest first if possible.""" cutoff = datetime.now(timezone.utc) - timedelta(days=days) try: spaces_iter = api.list_spaces(full=True, sort="created", direction=-1) except TypeError: spaces_iter = api.list_spaces(full=True) for space in spaces_iter: created_at_raw = getattr(space, "created_at", None) or getattr(space, "createdAt", None) if not created_at_raw: yield space continue created_at = ( created_at_raw if isinstance(created_at_raw, datetime) else iso_to_datetime(str(created_at_raw)) ) if created_at >= cutoff: yield space else: # We cannot guarantee sort order when falling back; continue to be safe. continue def find_duplicated_spaces(api: HfApi, source: str, days: int, deep_detection: bool) -> List[str]: """Return list of Space IDs that were duplicated from `source` within `days`.""" source = source.strip().strip("/ ") results: List[str] = [] for space in get_recent_spaces(api, days=days): space_id = getattr(space, "id", None) or getattr(space, "repo_id", None) if not space_id: continue card = getattr(space, "cardData", None) or getattr(space, "card_data", None) duplicated_from_value: Optional[str] = None if isinstance(card, dict): for key in ("duplicated_from", "duplicatedFrom", "duplicated-from"): if key in card and isinstance(card[key], str): duplicated_from_value = card[key].strip().strip("/ ") break if not duplicated_from_value and deep_detection: duplicated_from_value = readme_frontmatter_duplicated_from(space_id) if duplicated_from_value: duplicated_from_value = duplicated_from_value.strip().strip("/ ") if duplicated_from_value and duplicated_from_value.lower() == source.lower(): results.append(space_id) return results