# tfrere's picture
# tfrere HF Staff
# feat: detect and display article/paper template type
# 0f277af
import asyncio
import html
import json
import logging
import re
import time
from contextlib import asynccontextmanager
from pathlib import Path

import httpx
from fastapi import FastAPI
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles
# Configure the root logger to emit INFO-level records and above.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# On-disk copy of ``cache``: written by refresh_cache() and read back at
# application startup.
CACHE_FILE = Path("/tmp/spaces_cache.json")
# Seconds the periodic refresh loop sleeps before each refresh.
REFRESH_INTERVAL = 1800 # 30 minutes
# In-memory state served by the API. "spaces" holds the list produced by
# fetch_spaces(); "last_updated" holds the time.time() value recorded by the
# last successful refresh (0 until a refresh succeeds or a disk cache is loaded).
cache = {"spaces": [], "last_updated": 0}
# Listing endpoint queried by fetch_spaces().
HF_API_URL = "https://huggingface.co/api/spaces"
# Base URL used to build a fallback thumbnail when no og:image was scraped.
CDN_THUMB_BASE = "https://cdn-thumbnails.huggingface.co/social-thumbnails/spaces"
# Captures (group 1) the double-quoted ``content`` value that directly follows
# an ``og:image"`` attribute in a page's HTML.
OG_IMAGE_RE = re.compile(r'og:image"\s+content="([^"]+)"')
# Matches a line starting with ``template:`` whose value is ``article`` or
# ``paper``, optionally wrapped in quotes; group 1 is the bare value.
TEMPLATE_RE = re.compile(r'^template:\s*["\']?(article|paper)["\']?', re.MULTILINE)
# Repo-relative path of the file detect_template() downloads and searches.
ARTICLE_MDX_PATH = "app/src/content/article.mdx"
async def scrape_og_image(client: httpx.AsyncClient, space_id: str) -> str:
    """Return the ``og:image`` URL found in a Space's HTML page.

    This is a best-effort lookup: it never raises for request or parsing
    problems, because callers run many lookups through ``asyncio.gather`` and
    a single raised exception there would abort the whole batch.

    Args:
        client: Shared HTTP client used to download the page.
        space_id: Space identifier, interpolated into the page URL.

    Returns:
        The image URL with HTML entities decoded, or ``""`` when the page
        does not answer with status 200, contains no matching ``og:image``
        tag, or the request fails.
    """
    try:
        resp = await client.get(
            f"https://huggingface.co/spaces/{space_id}",
            follow_redirects=True,
        )
        if resp.status_code == 200:
            match = OG_IMAGE_RE.search(resp.text)
            if match:
                # The value comes from an HTML attribute, so characters such
                # as "&" arrive entity-encoded ("&amp;"); decode them to get
                # a URL that can actually be requested.
                return html.unescape(match.group(1))
    except Exception as exc:
        # Deliberately broad (see docstring), but no longer silent: record
        # which Space failed and why so missing thumbnails can be diagnosed.
        logger.warning("og:image lookup failed for %s: %r", space_id, exc)
    return ""
async def detect_template(client: httpx.AsyncClient, space_id: str) -> str:
    """Return the template type declared in a Space's article file.

    Downloads ``ARTICLE_MDX_PATH`` from the Space's ``main`` revision and
    searches the first 2000 characters for a ``template:`` line.

    This is a best-effort lookup: it never raises for request or parsing
    problems, because callers run many lookups through ``asyncio.gather`` and
    a single raised exception there would abort the whole batch.

    Args:
        client: Shared HTTP client used to download the file.
        space_id: Space identifier, interpolated into the file URL.

    Returns:
        ``"article"`` or ``"paper"`` as captured by ``TEMPLATE_RE``. Falls
        back to ``"article"`` when the file does not answer with status 200,
        declares no recognised template, or the request fails.
    """
    try:
        resp = await client.get(
            f"https://huggingface.co/spaces/{space_id}/resolve/main/{ARTICLE_MDX_PATH}",
            follow_redirects=True,
        )
        if resp.status_code == 200:
            # Only the head of the file is searched; the declaration is
            # expected near the top, and this keeps the regex scan cheap.
            match = TEMPLATE_RE.search(resp.text[:2000])
            if match:
                return match.group(1)
    except Exception as exc:
        # Deliberately broad (see docstring), but no longer silent: a failed
        # request is otherwise indistinguishable from the default value.
        logger.warning("Template detection failed for %s: %r", space_id, exc)
    return "article"
def _build_entry(space: dict, og_image: str, template: str) -> dict:
    """Flatten one raw API record into the entry shape stored in the cache."""
    space_id = space["id"]
    id_parts = space_id.split("/")
    name = id_parts[-1]
    card = space.get("cardData") or {}
    runtime = space.get("runtime") or {}
    return {
        "id": space_id,
        "author": id_parts[0],
        "name": name,
        # Card metadata is free-form, so the title may not be a string;
        # coerce it so later string handling (and consumers) can rely on str.
        "title": str(card.get("title") or name),
        "description": card.get("short_description") or "",
        "emoji": card.get("emoji") or "",
        # ``or 0`` also covers an explicit null, which would otherwise break
        # the numeric comparison used for de-duplication.
        "likes": space.get("likes") or 0,
        "createdAt": space.get("createdAt", ""),
        "lastModified": space.get("lastModified", ""),
        # Fall back to the CDN thumbnail when no og:image was scraped.
        "thumbnail": og_image or f"{CDN_THUMB_BASE}/{space_id}.png",
        "sdk": space.get("sdk", ""),
        "template": template,
        "runtime_stage": runtime.get("stage", ""),
    }


def _dedupe_by_title(entries: list) -> list:
    """Keep, per case-insensitive title, only the entry with the most likes."""
    best_by_title = {}
    for entry in entries:
        key = entry["title"].strip().lower()
        best = best_by_title.get(key)
        # Strict ">" means the earliest entry wins a tie.
        if best is None or entry["likes"] > best["likes"]:
            best_by_title[key] = entry
    kept_ids = {entry["id"] for entry in best_by_title.values()}
    # Filter the original list (rather than returning the dict values) so the
    # upstream ordering is preserved.
    return [entry for entry in entries if entry["id"] in kept_ids]


async def fetch_spaces():
    """Fetch, enrich and de-duplicate the Spaces tagged as research articles.

    Queries ``HF_API_URL`` for up to 100 Spaces matching the
    ``research-article-template`` filter, sorted by likes in descending
    order. For each Space it then looks up a thumbnail (``scrape_og_image``)
    and a template type (``detect_template``). Spaces sharing a title are
    treated as forks: only the one with the most likes is kept.

    Returns:
        A list of dict entries, one per retained Space, in API order.

    Raises:
        httpx.HTTPStatusError: If the listing request returns an error status.
        httpx.HTTPError: If the listing request itself fails.
    """
    async with httpx.AsyncClient(timeout=30, limits=httpx.Limits(max_connections=10)) as client:
        resp = await client.get(
            HF_API_URL,
            params={
                "filter": "research-article-template",
                "limit": 100,
                "full": "true",
                "sort": "likes",
                "direction": "-1",
            },
        )
        resp.raise_for_status()
        raw = resp.json()
        space_ids = [space["id"] for space in raw]
        # Each batch of coroutines is created right where it is awaited. If
        # the first gather is cancelled, no already-created coroutines for the
        # second batch are left behind un-awaited.
        og_images = await asyncio.gather(
            *(scrape_og_image(client, space_id) for space_id in space_ids)
        )
        templates = await asyncio.gather(
            *(detect_template(client, space_id) for space_id in space_ids)
        )
    entries = [
        _build_entry(space, og_image, template)
        for space, og_image, template in zip(raw, og_images, templates)
    ]
    # Deduplicate: for spaces sharing the same title, keep only the one
    # with the most likes (the original). The rest are forks.
    return _dedupe_by_title(entries)
async def refresh_cache():
    """Refresh the in-memory cache from the API and persist it to disk.

    Never raises for ordinary errors: this runs at startup and inside the
    periodic background loop, where an escaping exception would end the loop.
    A failed fetch leaves the previous cache contents in place. A failed disk
    write is reported separately and does not undo the in-memory update.
    """
    try:
        data = await fetch_spaces()
    except Exception as e:
        # exc_info keeps the traceback; the message alone rarely says where
        # the fetch broke.
        logger.error("Cache refresh failed: %s", e, exc_info=True)
        return

    cache["spaces"] = data
    cache["last_updated"] = time.time()
    logger.info("Cache refreshed: %d spaces", len(data))

    try:
        # Write to a sibling temp file and rename it over the target so a
        # crash mid-write cannot leave a truncated JSON file behind.
        tmp_path = CACHE_FILE.with_name(CACHE_FILE.name + ".tmp")
        tmp_path.write_text(json.dumps(cache))
        tmp_path.replace(CACHE_FILE)
    except Exception as e:
        logger.error(
            "Could not persist cache to %s: %s", CACHE_FILE, e, exc_info=True
        )
async def periodic_refresh():
    """Run ``refresh_cache`` forever, once every ``REFRESH_INTERVAL`` seconds.

    The loop has no exit condition of its own; it ends only when the task
    running it is cancelled or an exception propagates out of it.
    """
    while True:
        # The pause comes first, so the first refresh performed by this loop
        # happens one full interval after the loop starts.
        await asyncio.sleep(REFRESH_INTERVAL)
        await refresh_cache()
@asynccontextmanager
async def lifespan(_app: FastAPI):
    """Manage cache warm-up and the background refresh task for the app.

    On entry: loads the disk cache if one exists and looks usable, performs
    one refresh, then starts ``periodic_refresh`` as a background task.
    On exit: cancels that task and waits for it to finish, even when the
    application leaves the context because of an exception.

    Args:
        _app: The application instance; unused.
    """
    if CACHE_FILE.exists():
        try:
            loaded = json.loads(CACHE_FILE.read_text())
            # Reject payloads that would break the endpoint, which calls
            # len() on cache["spaces"].
            if isinstance(loaded, dict) and isinstance(loaded.get("spaces", []), list):
                cache.update(loaded)
                logger.info("Loaded %d spaces from disk cache", len(cache["spaces"]))
            else:
                logger.warning("Ignoring disk cache %s: unexpected structure", CACHE_FILE)
        except Exception as exc:
            # A corrupt or unreadable cache file must not prevent startup.
            logger.warning("Ignoring unreadable disk cache %s: %r", CACHE_FILE, exc)
    await refresh_cache()
    task = asyncio.create_task(periodic_refresh())
    try:
        yield
    finally:
        task.cancel()
        # Await the task so the cancellation is actually processed before
        # shutdown continues, instead of leaving a pending task behind.
        try:
            await task
        except asyncio.CancelledError:
            pass
        except Exception:
            logger.exception("Periodic refresh task ended with an error")
# Application instance; ``lifespan`` runs the startup and shutdown logic.
app = FastAPI(lifespan=lifespan)
@app.get("/api/spaces")
async def get_spaces():
    """Return the cached Spaces list with its refresh timestamp and size.

    Answers from the in-memory ``cache`` only; no upstream request is made
    while handling the call.
    """
    spaces = cache["spaces"]
    payload = {
        "spaces": spaces,
        "last_updated": cache["last_updated"],
        "count": len(spaces),
    }
    return JSONResponse(payload)
# Serve the contents of the "static" directory at the site root. This mount
# is registered after the /api/spaces route, so that route is matched first.
app.mount("/", StaticFiles(directory="static", html=True), name="static")