Spaces:
Sleeping
Sleeping
| import asyncio | |
| import json | |
| import re | |
| import time | |
| import logging | |
| from pathlib import Path | |
| from contextlib import asynccontextmanager | |
| from fastapi import FastAPI | |
| from fastapi.staticfiles import StaticFiles | |
| from fastapi.responses import JSONResponse | |
| import httpx | |
# Logging is configured once at import time; this module logs via ``logger``.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# On-disk copy of ``cache``; written after a refresh and read back at startup.
CACHE_FILE = Path("/tmp/spaces_cache.json")

# Seconds between background refreshes (30 minutes).
REFRESH_INTERVAL = 30 * 60

# In-memory cache served by the API: the processed space entries and the
# ``time.time()`` value recorded when they were last refreshed.
cache = {
    "spaces": [],
    "last_updated": 0,
}

# Remote endpoints.
HF_API_URL = "https://huggingface.co/api/spaces"
CDN_THUMB_BASE = "https://cdn-thumbnails.huggingface.co/social-thumbnails/spaces"

# Captures the ``content`` value that follows an ``og:image"`` attribute.
OG_IMAGE_RE = re.compile(r'og:image"\s+content="([^"]+)"')

# Captures ``article`` or ``paper`` from a line starting with ``template:``;
# the value may be wrapped in single or double quotes.
TEMPLATE_RE = re.compile(r'^template:\s*["\']?(article|paper)["\']?', re.MULTILINE)

# Repository-relative path of the file inspected for the ``template:`` line.
ARTICLE_MDX_PATH = "app/src/content/article.mdx"
async def scrape_og_image(client: httpx.AsyncClient, space_id: str) -> str:
    """Return the ``og:image`` URL found on a Space's page, or ``""``.

    Requests ``https://huggingface.co/spaces/{space_id}`` (following
    redirects) and applies ``OG_IMAGE_RE`` to the response body. The lookup
    is best-effort: a non-200 response, a page without a match, or any
    exception raised along the way produces an empty string, which lets the
    caller substitute a fallback thumbnail.

    Args:
        client: HTTP client used to issue the request.
        space_id: Space identifier interpolated into the page URL.

    Returns:
        The captured ``content`` value, or ``""`` when none was obtained.
    """
    try:
        resp = await client.get(
            f"https://huggingface.co/spaces/{space_id}",
            follow_redirects=True,
        )
        if resp.status_code == 200:
            match = OG_IMAGE_RE.search(resp.text)
            if match:
                return match.group(1)
    except Exception as exc:
        # Deliberately broad: one unreachable page must not abort the whole
        # batch. The failure is logged instead of being silently dropped;
        # %r is used because some exceptions carry an empty message.
        logger.warning("og:image lookup failed for %s: %r", space_id, exc)
    return ""
async def detect_template(client: httpx.AsyncClient, space_id: str) -> str:
    """Return which template a Space declares: ``"article"`` or ``"paper"``.

    Requests ``ARTICLE_MDX_PATH`` from the Space's ``resolve/main`` URL
    (following redirects) and searches the first 2000 characters of the body
    with ``TEMPLATE_RE``. The lookup is best-effort: a non-200 response, a
    body without a matching ``template:`` line, or any exception raised along
    the way yields the default ``"article"``.

    Args:
        client: HTTP client used to issue the request.
        space_id: Space identifier interpolated into the file URL.

    Returns:
        The captured template name, or ``"article"`` when none was found.
    """
    try:
        resp = await client.get(
            f"https://huggingface.co/spaces/{space_id}/resolve/main/{ARTICLE_MDX_PATH}",
            follow_redirects=True,
        )
        if resp.status_code == 200:
            # Only the head of the file is searched.
            match = TEMPLATE_RE.search(resp.text[:2000])
            if match:
                return match.group(1)
    except Exception as exc:
        # Deliberately broad: one failing request must not abort the whole
        # batch. The failure is logged instead of being silently dropped;
        # %r is used because some exceptions carry an empty message.
        logger.warning("Template detection failed for %s: %r", space_id, exc)
    return "article"
def _space_entry(raw_space: dict, og_image: str, template: str) -> dict:
    """Build the trimmed cache entry for one record of the spaces listing."""
    card = raw_space.get("cardData") or {}
    space_id = raw_space["id"]
    id_parts = space_id.split("/")
    name = id_parts[-1]
    return {
        "id": space_id,
        "author": id_parts[0],
        "name": name,
        "title": card.get("title") or name,
        "description": card.get("short_description") or "",
        "emoji": card.get("emoji") or "",
        # ``or 0`` also covers an explicit null, which would otherwise break
        # the numeric comparison made while de-duplicating.
        "likes": raw_space.get("likes") or 0,
        "createdAt": raw_space.get("createdAt", ""),
        "lastModified": raw_space.get("lastModified", ""),
        # Fall back to the CDN thumbnail URL when the page scrape found nothing.
        "thumbnail": og_image or f"{CDN_THUMB_BASE}/{space_id}.png",
        "sdk": raw_space.get("sdk", ""),
        "template": template,
        "runtime_stage": (raw_space.get("runtime") or {}).get("stage", ""),
    }


def _drop_forks(entries: list) -> list:
    """Keep, per case-insensitive title, only the entry with the most likes."""
    best_by_title = {}
    for entry in entries:
        # ``str`` guards against a card title that is not a string.
        key = str(entry["title"]).strip().lower()
        best = best_by_title.get(key)
        # Strict ``>``: on equal likes the earliest entry wins.
        if best is None or entry["likes"] > best["likes"]:
            best_by_title[key] = entry
    keep_ids = {entry["id"] for entry in best_by_title.values()}
    # Filter the original list so the incoming order is preserved.
    return [entry for entry in entries if entry["id"] in keep_ids]


async def fetch_spaces():
    """Fetch, enrich and de-duplicate the listed Spaces.

    Requests up to 100 Spaces carrying the ``research-article-template``
    filter from ``HF_API_URL`` (sorted by likes, descending). For every
    returned record it then looks up the page's ``og:image`` and the declared
    template, builds one plain dict per Space, and finally drops entries that
    share a title with a more-liked entry (treated as forks).

    Returns:
        A list of dicts with the keys ``id``, ``author``, ``name``,
        ``title``, ``description``, ``emoji``, ``likes``, ``createdAt``,
        ``lastModified``, ``thumbnail``, ``sdk``, ``template`` and
        ``runtime_stage``, in the order returned by the API.

    Raises:
        Whatever the listing request raises, including the error from
        ``raise_for_status()`` on an unsuccessful response. The per-space
        lookups handle their own failures and never raise.
    """
    async with httpx.AsyncClient(
        timeout=30, limits=httpx.Limits(max_connections=10)
    ) as client:
        resp = await client.get(
            HF_API_URL,
            params={
                "filter": "research-article-template",
                "limit": 100,
                "full": "true",
                "sort": "likes",
                "direction": "-1",
            },
        )
        resp.raise_for_status()
        raw = resp.json()

        # Two rounds over the same client: page scrapes first, then template
        # probes. ``gather`` returns results in input order, so both lists
        # line up with ``raw``.
        og_images = await asyncio.gather(
            *(scrape_og_image(client, s["id"]) for s in raw)
        )
        templates = await asyncio.gather(
            *(detect_template(client, s["id"]) for s in raw)
        )

    entries = [
        _space_entry(s, og_img, tpl)
        for s, og_img, tpl in zip(raw, og_images, templates)
    ]
    return _drop_forks(entries)
async def refresh_cache():
    """Refresh the in-memory cache and persist it to ``CACHE_FILE``.

    Never raises: a failed fetch leaves the previous cache contents in place,
    and a failed disk write leaves the freshly updated in-memory cache in
    place. Both failures are logged with their traceback.
    """
    try:
        data = await fetch_spaces()
    except Exception:
        # Keep serving whatever is already cached.
        logger.exception("Cache refresh failed")
        return

    cache["spaces"] = data
    cache["last_updated"] = time.time()
    logger.info("Cache refreshed: %d spaces", len(data))

    # Persisting is handled separately so that a write problem is not
    # reported as a failed refresh: the in-memory data is already current.
    try:
        CACHE_FILE.write_text(json.dumps(cache))
    except Exception:
        logger.exception("Could not write cache file %s", CACHE_FILE)
async def periodic_refresh():
    """Refresh the cache every ``REFRESH_INTERVAL`` seconds, forever.

    The loop has no exit condition of its own; it runs until the task
    executing it is cancelled.
    """
    while True:
        # Sleep first: the first refresh happens one full interval after
        # this coroutine starts.
        await asyncio.sleep(REFRESH_INTERVAL)
        await refresh_cache()
@asynccontextmanager
async def lifespan(_app: FastAPI):
    """Application lifespan: warm the cache, run the refresher, clean up.

    On startup the disk copy of the cache (if any) is loaded, a refresh is
    awaited, and ``periodic_refresh`` is started as a background task. On
    shutdown that task is cancelled and awaited.

    Args:
        _app: The application instance; unused.
    """
    if CACHE_FILE.exists():
        try:
            cache.update(json.loads(CACHE_FILE.read_text()))
            logger.info("Loaded %d spaces from disk cache", len(cache["spaces"]))
        except Exception:
            # An unreadable or malformed file is not fatal, but it should be
            # visible in the logs rather than silently ignored.
            logger.warning(
                "Ignoring unusable disk cache %s", CACHE_FILE, exc_info=True
            )

    await refresh_cache()
    task = asyncio.create_task(periodic_refresh())
    try:
        yield
    finally:
        # ``finally`` guarantees the task is stopped even if an exception is
        # thrown into the generator at the ``yield``.
        task.cancel()
        # Wait for the cancellation to complete; ``return_exceptions`` turns
        # the task's CancelledError into a return value instead of raising.
        await asyncio.gather(task, return_exceptions=True)
# The ASGI application; startup and shutdown work is delegated to ``lifespan``.
app = FastAPI(lifespan=lifespan)


# NOTE(review): the original handler had no route decorator, so it was never
# registered and every request fell through to the static mount below.
# The "/api/spaces" path is an assumption -- confirm it matches the URL the
# static frontend actually requests.
@app.get("/api/spaces")
async def get_spaces():
    """Return the cached spaces as a JSON response.

    Returns:
        A ``JSONResponse`` with ``spaces`` (the cached entries),
        ``last_updated`` (the cached timestamp) and ``count`` (the number of
        cached entries).
    """
    return JSONResponse(
        {
            "spaces": cache["spaces"],
            "last_updated": cache["last_updated"],
            "count": len(cache["spaces"]),
        }
    )


# Mounted last: routes are matched in registration order and a mount at "/"
# matches every path, so API routes must be registered before it.
app.mount("/", StaticFiles(directory="static", html=True), name="static")