XciD HF Staff committed on
Commit
702687b
·
unverified ·
1 Parent(s): ec60df8

feat: switch to Playwright with 48h refresh

Browse files

Trustpilot now serves an AWS WAF JS interstitial that blocks plain HTTP
fetches. Use Playwright/Chromium to solve the challenge, refresh in the
background every 48h, and serve cached data from /reviews.

Files changed (3) hide show
  1. Dockerfile +7 -1
  2. app.py +74 -38
  3. requirements.txt +1 -1
Dockerfile CHANGED
@@ -1,6 +1,12 @@
1
- FROM python:3.11-slim
 
2
  WORKDIR /app
3
  COPY requirements.txt .
4
  RUN pip install --no-cache-dir -r requirements.txt
 
5
  COPY app.py .
 
 
 
 
6
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
+ FROM mcr.microsoft.com/playwright/python:v1.49.0-jammy
2
+
3
  WORKDIR /app
4
  COPY requirements.txt .
5
  RUN pip install --no-cache-dir -r requirements.txt
6
+
7
  COPY app.py .
8
+
9
+ ENV HOME=/tmp
10
+ ENV PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
11
+
12
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -1,12 +1,23 @@
 
 
 
 
 
1
  from fastapi import FastAPI
2
  from fastapi.middleware.cors import CORSMiddleware
3
- import httpx
4
- import json
5
- import asyncio
6
- from datetime import datetime, timedelta
7
 
8
- app = FastAPI()
 
9
 
 
 
 
 
 
 
 
 
10
  app.add_middleware(
11
  CORSMiddleware,
12
  allow_origins=["https://collectionsdarchitectes.fr"],
@@ -14,36 +25,29 @@ app.add_middleware(
14
  allow_headers=["*"],
15
  )
16
 
17
- TRUSTPILOT_URL = "https://www.trustpilot.com/review/collectionsdarchitectes.fr"
18
- USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
19
-
20
- cache = {"data": None, "expires": None}
21
 
22
 
23
- async def fetch_reviews():
24
- now = datetime.utcnow()
25
- if cache["data"] and cache["expires"] and now < cache["expires"]:
26
- return cache["data"]
 
 
 
 
 
 
 
 
 
27
 
28
- async with httpx.AsyncClient() as client:
29
- r = await client.get(TRUSTPILOT_URL, headers={"User-Agent": USER_AGENT}, follow_redirects=True)
30
-
31
- html = r.text
32
- marker = '__NEXT_DATA__'
33
- start = html.find(marker)
34
- if start == -1:
35
- return cache.get("data") or {"error": "Could not parse Trustpilot page"}
36
-
37
- start = html.find('>', start) + 1
38
- end = html.find('</script>', start)
39
- raw = html[start:end]
40
-
41
- data = json.loads(raw)
42
- props = data.get("props", {}).get("pageProps", {})
43
- biz = props.get("businessUnit", {})
44
- reviews = props.get("reviews", [])
45
-
46
- result = {
47
  "score": biz.get("trustScore"),
48
  "total": biz.get("numberOfReviews"),
49
  "reviews": [
@@ -52,18 +56,50 @@ async def fetch_reviews():
52
  "stars": r.get("rating"),
53
  "title": r.get("title"),
54
  "text": r.get("text"),
55
- "author": r.get("consumer", {}).get("displayName"),
56
- "date": r.get("dates", {}).get("publishedDate"),
57
  }
58
  for r in reviews
59
  ],
60
  }
61
 
62
- cache["data"] = result
63
- cache["expires"] = now + timedelta(hours=24)
64
- return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
 
66
 
67
  @app.get("/reviews")
68
  async def get_reviews():
69
- return await fetch_reviews()
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import logging
4
+ from datetime import datetime, timedelta, timezone
5
+
6
  from fastapi import FastAPI
7
  from fastapi.middleware.cors import CORSMiddleware
8
+ from playwright.async_api import async_playwright
 
 
 
9
 
10
+ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
11
+ log = logging.getLogger("trustpilot-proxy")
12
 
13
+ TRUSTPILOT_URL = "https://www.trustpilot.com/review/collectionsdarchitectes.fr"
14
+ REFRESH_INTERVAL = timedelta(hours=48)
15
+ USER_AGENT = (
16
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
17
+ "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
18
+ )
19
+
20
+ app = FastAPI()
21
  app.add_middleware(
22
  CORSMiddleware,
23
  allow_origins=["https://collectionsdarchitectes.fr"],
 
25
  allow_headers=["*"],
26
  )
27
 
28
+ state = {"data": None, "fetched_at": None, "error": None}
29
+ _lock = asyncio.Lock()
 
 
30
 
31
 
32
+ async def scrape_once() -> dict:
33
+ async with async_playwright() as p:
34
+ browser = await p.chromium.launch(headless=True)
35
+ ctx = await browser.new_context(
36
+ user_agent=USER_AGENT,
37
+ locale="fr-FR",
38
+ viewport={"width": 1280, "height": 900},
39
+ )
40
+ page = await ctx.new_page()
41
+ await page.goto(TRUSTPILOT_URL, wait_until="domcontentloaded", timeout=60_000)
42
+ await page.wait_for_selector("script#__NEXT_DATA__", state="attached", timeout=45_000)
43
+ raw = await page.locator("script#__NEXT_DATA__").inner_text()
44
+ await browser.close()
45
 
46
+ next_data = json.loads(raw)
47
+ props = next_data.get("props", {}).get("pageProps", {})
48
+ biz = props.get("businessUnit", {}) or {}
49
+ reviews = props.get("reviews", []) or []
50
+ return {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  "score": biz.get("trustScore"),
52
  "total": biz.get("numberOfReviews"),
53
  "reviews": [
 
56
  "stars": r.get("rating"),
57
  "title": r.get("title"),
58
  "text": r.get("text"),
59
+ "author": (r.get("consumer") or {}).get("displayName"),
60
+ "date": (r.get("dates") or {}).get("publishedDate"),
61
  }
62
  for r in reviews
63
  ],
64
  }
65
 
66
+
67
async def refresh() -> None:
    """Run one scrape attempt and record its outcome in the shared ``state``.

    The module-level lock is held for the whole attempt, so overlapping
    calls run one after another. On success the payload, the UTC fetch
    time and a cleared error are stored. On failure only ``state["error"]``
    is overwritten, so any previously fetched data stays in place.
    """
    async with _lock:
        try:
            payload = await scrape_once()
            state.update(
                data=payload,
                fetched_at=datetime.now(timezone.utc),
                error=None,
            )
            log.info(
                "refreshed: score=%s total=%s reviews=%d",
                payload.get("score"),
                payload.get("total"),
                len(payload.get("reviews", [])),
            )
        except Exception as exc:
            # Any failure is logged with its traceback and kept as a short
            # string so it can be reported without raising to the caller.
            log.exception("scrape failed")
            state["error"] = f"{type(exc).__name__}: {exc}"
79
+
80
+
81
async def refresh_loop() -> None:
    """Refresh forever, sleeping ``REFRESH_INTERVAL`` after every attempt."""
    pause_seconds = REFRESH_INTERVAL.total_seconds()
    while True:
        await refresh()
        await asyncio.sleep(pause_seconds)
85
+
86
+
87
@app.on_event("startup")
async def _startup() -> None:
    """Start the background refresh loop when the application starts.

    The task handle is stored on ``app.state.refresh_task``. The event loop
    keeps only weak references to tasks, so a task whose handle is dropped
    may be garbage-collected before it finishes; holding it on the
    application object keeps the loop alive for the life of the app.
    """
    app.state.refresh_task = asyncio.create_task(refresh_loop())
90
 
91
 
92
@app.get("/reviews")
async def get_reviews():
    """Serve the cached review payload.

    When nothing has been fetched yet, return an ``{"error": ...}`` object
    carrying the last recorded scrape error, or "warming up" if there is
    none.
    """
    cached = state["data"]
    if cached is not None:
        return cached
    return {"error": state["error"] or "warming up"}
97
+
98
+
99
+ @app.get("/health")
100
async def health():
    """Report whether data is cached, when it was fetched, and the last error."""
    last_fetch = state["fetched_at"]
    return {
        "has_data": state["data"] is not None,
        # ISO-8601 string when a fetch has succeeded, otherwise None.
        "fetched_at": last_fetch.isoformat() if last_fetch else None,
        "error": state["error"],
    }
requirements.txt CHANGED
@@ -1,3 +1,3 @@
1
  fastapi
2
  uvicorn[standard]
3
- httpx
 
1
  fastapi
2
  uvicorn[standard]
3
+ playwright==1.49.0