# trustpilot-proxy
# feat: switch to Playwright with 48h refresh
# (commit 702687b, author: XciD)
import asyncio
import json
import logging
from datetime import datetime, timedelta, timezone
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from playwright.async_api import async_playwright
# Root logger config; the named logger below tags proxy-specific messages.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
log = logging.getLogger("trustpilot-proxy")
# Page to scrape, and how long a cached scrape result stays fresh.
TRUSTPILOT_URL = "https://www.trustpilot.com/review/collectionsdarchitectes.fr"
REFRESH_INTERVAL = timedelta(hours=48)
# Desktop-Chrome UA string so Trustpilot serves the regular page markup.
USER_AGENT = (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
)
app = FastAPI()
# Only the production site may read this API; GET-only, any headers.
app.add_middleware(
CORSMiddleware,
allow_origins=["https://collectionsdarchitectes.fr"],
allow_methods=["GET"],
allow_headers=["*"],
)
# In-process cache shared by the refresh loop and the HTTP handlers:
#   data       -- last successful /reviews payload (dict) or None
#   fetched_at -- aware UTC datetime of the last successful scrape, or None
#   error      -- "ExcType: message" string from the last failed scrape, or None
state = {"data": None, "fetched_at": None, "error": None}
# Serializes refresh() runs so overlapping scrapes cannot interleave.
_lock = asyncio.Lock()
async def scrape_once() -> dict:
    """Fetch the Trustpilot review page once and return the reshaped payload.

    Drives headless Chromium via Playwright, extracts the server-rendered
    Next.js bootstrap JSON (``script#__NEXT_DATA__``), and reshapes it into
    the payload served by ``/reviews``.

    Returns:
        dict with keys ``score``, ``total`` and ``reviews`` (list of dicts).

    Raises:
        Playwright timeout/navigation errors or ``json.JSONDecodeError`` on
        failure; ``refresh()`` catches and records these.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            ctx = await browser.new_context(
                user_agent=USER_AGENT,
                locale="fr-FR",
                viewport={"width": 1280, "height": 900},
            )
            page = await ctx.new_page()
            await page.goto(TRUSTPILOT_URL, wait_until="domcontentloaded", timeout=60_000)
            # The review data is server-side rendered into the Next.js
            # bootstrap script, so we only need the tag attached, not visible.
            await page.wait_for_selector("script#__NEXT_DATA__", state="attached", timeout=45_000)
            raw = await page.locator("script#__NEXT_DATA__").inner_text()
        finally:
            # Close even when goto/wait_for_selector raises; the previous
            # version leaked a Chromium process on every failed scrape.
            await browser.close()
    return _parse_next_data(raw)


def _parse_next_data(raw: str) -> dict:
    """Reshape a raw ``__NEXT_DATA__`` JSON string into the /reviews payload."""
    props = json.loads(raw).get("props", {}).get("pageProps", {})
    biz = props.get("businessUnit", {}) or {}
    return {
        "score": biz.get("trustScore"),
        "total": biz.get("numberOfReviews"),
        "reviews": [
            {
                "id": r.get("id"),
                "stars": r.get("rating"),
                "title": r.get("title"),
                "text": r.get("text"),
                "author": (r.get("consumer") or {}).get("displayName"),
                "date": (r.get("dates") or {}).get("publishedDate"),
            }
            for r in props.get("reviews", []) or []
        ],
    }
async def refresh() -> None:
    """Run one scrape and record its outcome in the shared ``state`` cache.

    On success, stores the payload and a UTC timestamp and clears any prior
    error; on failure, logs the traceback and records a short error string.
    The lock keeps concurrent invocations from interleaving.
    """
    async with _lock:
        try:
            payload = await scrape_once()
            state["data"] = payload
            state["fetched_at"] = datetime.now(timezone.utc)
            state["error"] = None
            log.info(
                "refreshed: score=%s total=%s reviews=%d",
                payload.get("score"),
                payload.get("total"),
                len(payload.get("reviews", [])),
            )
        except Exception as exc:
            log.exception("scrape failed")
            state["error"] = f"{type(exc).__name__}: {exc}"
async def refresh_loop() -> None:
    """Scrape immediately, then again every REFRESH_INTERVAL, forever."""
    interval_s = REFRESH_INTERVAL.total_seconds()
    while True:
        await refresh()
        await asyncio.sleep(interval_s)
# Strong reference to the background task. asyncio's event loop keeps only
# weak references to tasks, so a bare create_task() result can be
# garbage-collected mid-flight, silently killing the refresh loop.
_refresh_task: "asyncio.Task | None" = None


@app.on_event("startup")
async def _startup() -> None:
    """Start the background refresh loop when the app boots."""
    global _refresh_task
    _refresh_task = asyncio.create_task(refresh_loop())
@app.get("/reviews")
async def get_reviews():
    """Serve the cached Trustpilot payload, or an error while warming up."""
    cached = state["data"]
    if cached is not None:
        return cached
    # No successful scrape yet: surface the last failure, if any.
    return {"error": state["error"] or "warming up"}
@app.get("/health")
async def health():
    """Report cache presence, freshness timestamp, and last error."""
    fetched = state["fetched_at"]
    return {
        "has_data": state["data"] is not None,
        "fetched_at": None if fetched is None else fetched.isoformat(),
        "error": state["error"],
    }