XciD HF Staff committed on
Commit
ad7cdc3
·
1 Parent(s): e681fab

fix: scrape public Trustpilot page instead of blocked API

Browse files
Files changed (1) hide show
  1. app.py +50 -24
app.py CHANGED
@@ -1,7 +1,9 @@
1
- from fastapi import FastAPI, Response
2
  from fastapi.middleware.cors import CORSMiddleware
3
  import httpx
4
- import os
 
 
5
 
6
  app = FastAPI()
7
 
@@ -12,31 +14,55 @@ app.add_middleware(
12
  allow_headers=["*"],
13
  )
14
 
15
# Trustpilot API credentials and target business unit, read from the
# environment at import time (each is None when the variable is unset).
API_KEY = os.getenv("TRUSTPILOT_API_KEY")
BUSINESS_UNIT_ID = os.getenv("TRUSTPILOT_BUSINESS_UNIT_ID")
# Root of the Trustpilot REST API used by the routes below.
BASE_URL = "https://api.trustpilot.com/v1"


# GET /business: relay the Trustpilot business-unit resource. The upstream
# body and status code are passed through unchanged, labelled as JSON.
@app.get("/business")
async def get_business():
    endpoint = f"{BASE_URL}/business-units/{BUSINESS_UNIT_ID}"
    query = {"apikey": API_KEY}
    async with httpx.AsyncClient() as http:
        upstream = await http.get(endpoint, params=query)
    return Response(
        content=upstream.content,
        media_type="application/json",
        status_code=upstream.status_code,
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
 
29
 
# GET /reviews: relay the Trustpilot reviews listing for the configured
# business unit. `perPage` and `stars` are forwarded as query parameters and
# the listing is requested newest-first (orderBy=createdat.desc). The upstream
# body and status code are passed through unchanged, labelled as JSON.
@app.get("/reviews")
async def get_reviews(perPage: int = 6, stars: str = "4,5"):
    endpoint = f"{BASE_URL}/business-units/{BUSINESS_UNIT_ID}/reviews"
    query = {
        "apikey": API_KEY,
        "perPage": perPage,
        "orderBy": "createdat.desc",
        "stars": stars,
    }
    async with httpx.AsyncClient() as http:
        upstream = await http.get(endpoint, params=query)
    return Response(
        content=upstream.content,
        media_type="application/json",
        status_code=upstream.status_code,
    )
 
1
+ from fastapi import FastAPI
2
  from fastapi.middleware.cors import CORSMiddleware
3
  import httpx
4
+ import json
5
+ import asyncio
6
+ from datetime import datetime, timedelta
7
 
8
  app = FastAPI()
9
 
 
14
  allow_headers=["*"],
15
  )
16
 
17
# Public Trustpilot review page that is scraped for the score and reviews.
TRUSTPILOT_URL = "https://www.trustpilot.com/review/collectionsdarchitectes.fr"
# Browser-like User-Agent header sent with the page request.
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

# How long a successfully parsed payload is served from memory.
CACHE_TTL = timedelta(hours=1)

# In-process cache: the last successfully parsed payload and the
# timezone-aware UTC instant at which it stops being considered fresh.
cache = {"data": None, "expires": None}


def _as_dict(value):
    """Return *value* when it is a dict, otherwise an empty dict."""
    # JSON `null` (or any unexpected type) must not break chained .get() calls.
    return value if isinstance(value, dict) else {}


def _parse_trustpilot_html(html):
    """Extract the review summary from the page's ``__NEXT_DATA__`` JSON.

    Returns a dict with ``score``, ``total`` and ``reviews`` keys, or ``None``
    when the marker, the enclosing script tag boundaries, or valid JSON cannot
    be found in *html*.
    """
    marker_at = html.find("__NEXT_DATA__")
    if marker_at == -1:
        return None

    # The JSON payload sits between the end of the opening tag that carries
    # the marker and the next closing script tag.
    tag_close = html.find(">", marker_at)
    if tag_close == -1:
        return None
    script_close = html.find("</script>", tag_close + 1)
    if script_close == -1:
        return None

    try:
        data = json.loads(html[tag_close + 1:script_close])
    except json.JSONDecodeError:
        return None

    props = _as_dict(_as_dict(_as_dict(data).get("props")).get("pageProps"))
    biz = _as_dict(props.get("businessUnit"))
    reviews = props.get("reviews")
    if not isinstance(reviews, list):
        reviews = []

    return {
        "score": biz.get("trustScore"),
        "total": biz.get("numberOfReviews"),
        "reviews": [
            {
                "stars": review.get("rating"),
                "title": review.get("title"),
                "text": review.get("text"),
                "author": _as_dict(review.get("consumer")).get("displayName"),
                "date": _as_dict(review.get("dates")).get("publishedDate"),
            }
            for review in map(_as_dict, reviews)
        ],
    }


async def fetch_reviews():
    """Return the Trustpilot score and reviews, using the cache while fresh.

    Returns:
        A dict with ``score``, ``total`` and ``reviews`` (a list of dicts with
        ``stars``, ``title``, ``text``, ``author`` and ``date``). When the page
        cannot be fetched or parsed, the last cached payload is returned if
        one exists; otherwise a dict with a single ``error`` key.

    Failures are never written to the cache, so the next call retries.
    """
    # Local import: the module-level import only brings in datetime/timedelta.
    from datetime import timezone

    # Timezone-aware "now"; datetime.utcnow() is deprecated and naive.
    now = datetime.now(timezone.utc)
    if cache["data"] and cache["expires"] and now < cache["expires"]:
        return cache["data"]

    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(
                TRUSTPILOT_URL,
                headers={"User-Agent": USER_AGENT},
                follow_redirects=True,
            )
            # Treat 4xx/5xx (e.g. a block page) as a failed fetch.
            response.raise_for_status()
    except httpx.HTTPError:
        # Network or HTTP failure: prefer stale data over an error response.
        return cache.get("data") or {"error": "Could not fetch Trustpilot page"}

    result = _parse_trustpilot_html(response.text)
    if result is None:
        return cache.get("data") or {"error": "Could not parse Trustpilot page"}

    cache["data"] = result
    cache["expires"] = now + CACHE_TTL
    return result
64
 
65
 
# GET /reviews: respond with whatever fetch_reviews() produces.
@app.get("/reviews")
async def get_reviews():
    payload = await fetch_reviews()
    return payload