"""Small FastAPI service exposing a cached listing of Hugging Face Spaces.

The listing is fetched from the Hugging Face Spaces API (filtered on
``research-article-template``), enriched with a thumbnail and a template
kind per Space, kept in the module-level ``cache`` dict, mirrored to
``CACHE_FILE`` and served at ``/api/spaces``.
"""

import asyncio
import json
import logging
import re
import time
from contextlib import asynccontextmanager
from pathlib import Path

from fastapi import FastAPI
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles
import httpx

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

CACHE_FILE = Path("/tmp/spaces_cache.json")
REFRESH_INTERVAL = 1800  # 30 minutes

# In-memory cache served by /api/spaces; also the exact payload written to
# CACHE_FILE after every successful refresh.
cache = {"spaces": [], "last_updated": 0}

HF_API_URL = "https://huggingface.co/api/spaces"
CDN_THUMB_BASE = "https://cdn-thumbnails.huggingface.co/social-thumbnails/spaces"
OG_IMAGE_RE = re.compile(r'og:image"\s+content="([^"]+)"')
TEMPLATE_RE = re.compile(r'^template:\s*["\']?(article|paper)["\']?', re.MULTILINE)
ARTICLE_MDX_PATH = "app/src/content/article.mdx"


async def scrape_og_image(client: httpx.AsyncClient, space_id: str) -> str:
    """Return the ``og:image`` URL found on a Space's page.

    Best-effort: returns ``""`` when the page does not answer 200, when no
    ``og:image`` tag matches ``OG_IMAGE_RE``, or when the request fails.
    """
    try:
        resp = await client.get(
            f"https://huggingface.co/spaces/{space_id}",
            follow_redirects=True,
        )
        if resp.status_code == 200:
            match = OG_IMAGE_RE.search(resp.text)
            if match:
                return match.group(1)
    except Exception as exc:
        # Deliberately broad: one failing Space must not abort the whole
        # refresh. The failure is recorded instead of silently dropped.
        logger.debug("og:image scrape failed for %s: %s", space_id, exc)
    return ""


async def detect_template(client: httpx.AsyncClient, space_id: str) -> str:
    """Return the template kind (``"article"`` or ``"paper"``) of a Space.

    Looks for a ``template:`` line within the first 2000 characters of the
    Space's ``ARTICLE_MDX_PATH`` file. Best-effort: falls back to
    ``"article"`` when the file does not answer 200, when nothing matches,
    or when the request fails.
    """
    try:
        resp = await client.get(
            f"https://huggingface.co/spaces/{space_id}/resolve/main/{ARTICLE_MDX_PATH}",
            follow_redirects=True,
        )
        if resp.status_code == 200:
            match = TEMPLATE_RE.search(resp.text[:2000])
            if match:
                return match.group(1)
    except Exception as exc:
        # Same best-effort policy as scrape_og_image.
        logger.debug("Template detection failed for %s: %s", space_id, exc)
    return "article"


def _build_entry(space: dict, og_image: str, template: str) -> dict:
    """Build the public record for one Space from its raw API item."""
    card = space.get("cardData") or {}
    space_id = space["id"]
    name = space_id.split("/")[-1]
    return {
        "id": space_id,
        "author": space_id.split("/")[0],
        "name": name,
        "title": card.get("title") or name,
        "description": card.get("short_description") or "",
        "emoji": card.get("emoji") or "",
        "likes": space.get("likes", 0),
        "createdAt": space.get("createdAt", ""),
        "lastModified": space.get("lastModified", ""),
        # Fall back to the CDN thumbnail when no og:image could be scraped.
        "thumbnail": og_image or f"{CDN_THUMB_BASE}/{space_id}.png",
        "sdk": space.get("sdk", ""),
        "template": template,
        "runtime_stage": (space.get("runtime") or {}).get("stage", ""),
    }


def _drop_forks(entries: list) -> list:
    """Keep, per title, only the entry with the most likes.

    Titles are compared stripped and case-insensitively; on a tie in likes
    the first entry seen wins. The input order is preserved.
    """
    best_by_title = {}
    for entry in entries:
        # str() guards against a card title that is not a string, which
        # would otherwise raise on .strip() and fail the whole refresh.
        key = str(entry["title"]).strip().lower()
        prev = best_by_title.get(key)
        if prev is None or entry["likes"] > prev["likes"]:
            best_by_title[key] = entry
    originals = {entry["id"] for entry in best_by_title.values()}
    return [entry for entry in entries if entry["id"] in originals]


async def fetch_spaces():
    """Fetch, enrich and de-duplicate the Space listing.

    Returns:
        A list of dicts shaped by ``_build_entry``, with forks removed by
        ``_drop_forks``.

    Raises:
        httpx.HTTPError: if the listing request fails or returns an error
            status (``raise_for_status``).
    """
    async with httpx.AsyncClient(
        timeout=30, limits=httpx.Limits(max_connections=10)
    ) as client:
        resp = await client.get(
            HF_API_URL,
            params={
                "filter": "research-article-template",
                "limit": 100,
                "full": "true",
                "sort": "likes",
                "direction": "-1",
            },
        )
        resp.raise_for_status()
        raw = resp.json()

        # The two batches are awaited one after the other on purpose: only
        # one batch at a time queues on the 10-connection pool, which keeps
        # waiting requests further away from the client's 30 s timeout.
        og_images = await asyncio.gather(
            *(scrape_og_image(client, s["id"]) for s in raw)
        )
        templates = await asyncio.gather(
            *(detect_template(client, s["id"]) for s in raw)
        )

    results = [
        _build_entry(s, og_img, tpl)
        for s, og_img, tpl in zip(raw, og_images, templates)
    ]

    # Deduplicate: for spaces sharing the same title, keep only the one
    # with the most likes (the original). The rest are forks.
    return _drop_forks(results)


async def refresh_cache():
    """Refresh the in-memory cache and mirror it to ``CACHE_FILE``.

    Never raises: a failed fetch leaves the previous cache untouched, and a
    failed disk write is logged without discarding the fresh in-memory data.
    """
    try:
        data = await fetch_spaces()
    except Exception as e:
        logger.exception("Cache refresh failed: %s", e)
        return

    cache["spaces"] = data
    cache["last_updated"] = time.time()
    try:
        CACHE_FILE.write_text(json.dumps(cache))
    except Exception as e:
        # The in-memory cache is already up to date; only persistence failed.
        logger.warning("Could not persist cache to %s: %s", CACHE_FILE, e)
    logger.info("Cache refreshed: %d spaces", len(data))


async def periodic_refresh():
    """Refresh the cache every ``REFRESH_INTERVAL`` seconds until cancelled."""
    while True:
        await asyncio.sleep(REFRESH_INTERVAL)
        await refresh_cache()


def _load_disk_cache() -> None:
    """Seed ``cache`` from ``CACHE_FILE`` when it holds a usable payload."""
    if not CACHE_FILE.exists():
        return
    try:
        stored = json.loads(CACHE_FILE.read_text())
    except (OSError, ValueError) as exc:
        # ValueError covers both invalid JSON and undecodable bytes.
        logger.warning("Ignoring unreadable disk cache %s: %s", CACHE_FILE, exc)
        return
    if not isinstance(stored, dict) or not isinstance(stored.get("spaces"), list):
        logger.warning("Ignoring disk cache %s: unexpected structure", CACHE_FILE)
        return
    # Copy only the known keys so a stray file cannot inject other entries.
    cache["spaces"] = stored["spaces"]
    cache["last_updated"] = stored.get("last_updated", 0)
    logger.info("Loaded %d spaces from disk cache", len(cache["spaces"]))


@asynccontextmanager
async def lifespan(_app: FastAPI):
    """Application lifespan: warm the cache, then refresh it in the background.

    On startup the disk cache is loaded (if usable), one refresh is awaited,
    and ``periodic_refresh`` is started as a task. On shutdown the task is
    cancelled and awaited so it is fully finished before the app exits.
    """
    _load_disk_cache()
    await refresh_cache()
    task = asyncio.create_task(periodic_refresh())
    try:
        yield
    finally:
        task.cancel()
        try:
            await task
        except asyncio.CancelledError:
            pass


app = FastAPI(lifespan=lifespan)


@app.get("/api/spaces")
async def get_spaces():
    """Return the cached Spaces with the refresh timestamp and their count."""
    return JSONResponse(
        {
            "spaces": cache["spaces"],
            "last_updated": cache["last_updated"],
            "count": len(cache["spaces"]),
        }
    )
# Serve the local "static" directory at the root path. This mount is
# registered after the /api/spaces route.
# NOTE(review): presumably the order matters so the API route is matched
# before this catch-all mount — confirm against Starlette's routing rules.
app.mount("/", StaticFiles(directory="static", html=True), name="static")