Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
fix: scrape public Trustpilot page instead of blocked API
Browse files
app.py
CHANGED
|
@@ -1,7 +1,9 @@
|
|
| 1 |
-
from fastapi import FastAPI
|
| 2 |
from fastapi.middleware.cors import CORSMiddleware
|
| 3 |
import httpx
|
| 4 |
-
import
|
|
|
|
|
|
|
| 5 |
|
| 6 |
app = FastAPI()
|
| 7 |
|
|
@@ -12,31 +14,55 @@ app.add_middleware(
|
|
| 12 |
allow_headers=["*"],
|
| 13 |
)
|
| 14 |
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
BASE_URL = "https://api.trustpilot.com/v1"
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
-
@app.get("/business")
|
| 21 |
-
async def get_business():
|
| 22 |
async with httpx.AsyncClient() as client:
|
| 23 |
-
r = await client.get(
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
|
| 30 |
@app.get("/reviews")
|
| 31 |
-
async def get_reviews(
|
| 32 |
-
|
| 33 |
-
r = await client.get(
|
| 34 |
-
f"{BASE_URL}/business-units/{BUSINESS_UNIT_ID}/reviews",
|
| 35 |
-
params={
|
| 36 |
-
"apikey": API_KEY,
|
| 37 |
-
"perPage": perPage,
|
| 38 |
-
"orderBy": "createdat.desc",
|
| 39 |
-
"stars": stars,
|
| 40 |
-
},
|
| 41 |
-
)
|
| 42 |
-
return Response(content=r.content, media_type="application/json", status_code=r.status_code)
|
|
|
|
| 1 |
+
from fastapi import FastAPI
|
| 2 |
from fastapi.middleware.cors import CORSMiddleware
|
| 3 |
import httpx
|
| 4 |
+
import json
|
| 5 |
+
import asyncio
|
| 6 |
+
from datetime import datetime, timedelta
|
| 7 |
|
| 8 |
app = FastAPI()
|
| 9 |
|
|
|
|
| 14 |
allow_headers=["*"],
|
| 15 |
)
|
| 16 |
|
| 17 |
+
TRUSTPILOT_URL = "https://www.trustpilot.com/review/collectionsdarchitectes.fr"
USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"

# In-process cache of the last successfully parsed payload.
#   "data":    the dict returned to callers, or None before the first success.
#   "expires": timezone-aware UTC datetime after which a refresh is attempted.
cache = {"data": None, "expires": None}


def _stale_or_error(message):
    """Return the last good payload if there is one, else an ``{"error": ...}`` dict."""
    return cache.get("data") or {"error": message}


def _extract_next_data(html):
    """Return the JSON object embedded in the ``__NEXT_DATA__`` script tag, or None.

    None is returned when the marker, the end of the opening tag, or the
    closing ``</script>`` cannot be found, or when the enclosed text is not
    a JSON object.
    """
    marker_at = html.find("__NEXT_DATA__")
    if marker_at == -1:
        return None
    tag_close = html.find(">", marker_at)
    if tag_close == -1:
        return None
    body_start = tag_close + 1
    body_end = html.find("</script>", body_start)
    if body_end == -1:
        return None
    try:
        payload = json.loads(html[body_start:body_end])
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        return None
    return payload if isinstance(payload, dict) else None


def _summarise(payload):
    """Reduce the decoded page payload to the score/total/reviews dict served by the API.

    ``or {}`` / ``or []`` guard against keys that are present but null, which
    ``dict.get(key, default)`` alone does not cover.
    """
    page_props = (payload.get("props") or {}).get("pageProps") or {}
    business = page_props.get("businessUnit") or {}
    reviews = page_props.get("reviews") or []
    return {
        "score": business.get("trustScore"),
        "total": business.get("numberOfReviews"),
        "reviews": [
            {
                "stars": review.get("rating"),
                "title": review.get("title"),
                "text": review.get("text"),
                "author": (review.get("consumer") or {}).get("displayName"),
                "date": (review.get("dates") or {}).get("publishedDate"),
            }
            for review in reviews
            if isinstance(review, dict)
        ],
    }


async def fetch_reviews():
    """Return the review summary for ``TRUSTPILOT_URL``, refreshing at most hourly.

    A cached payload is served while it is still fresh. Otherwise the public
    page is downloaded and the ``__NEXT_DATA__`` JSON blob is parsed.

    Returns:
        A dict with ``score``, ``total`` and ``reviews`` (each review has
        ``stars``, ``title``, ``text``, ``author``, ``date``). If the page
        cannot be fetched or parsed, the last good payload is returned when
        one exists, otherwise ``{"error": <message>}``.
    """
    # Function-scope import: the module header only imports datetime/timedelta.
    from datetime import timezone

    # Aware UTC timestamp; datetime.utcnow() is deprecated since Python 3.12.
    now = datetime.now(timezone.utc)
    expires = cache["expires"]
    if cache["data"] and expires and now < expires:
        return cache["data"]

    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(
                TRUSTPILOT_URL,
                headers={"User-Agent": USER_AGENT},
                follow_redirects=True,
            )
        # A 4xx/5xx page must not replace good cached data with an empty result.
        response.raise_for_status()
    except httpx.HTTPError:
        # Covers transport failures (timeouts, DNS, resets) and bad statuses.
        return _stale_or_error("Could not fetch Trustpilot page")

    payload = _extract_next_data(response.text)
    if payload is None:
        return _stale_or_error("Could not parse Trustpilot page")

    try:
        result = _summarise(payload)
    except (AttributeError, TypeError):
        # The JSON decoded but does not have the nested-dict shape expected.
        return _stale_or_error("Could not parse Trustpilot page")

    cache["data"] = result
    cache["expires"] = now + timedelta(hours=1)
    return result
|
| 64 |
|
| 65 |
|
| 66 |
@app.get("/reviews")
async def get_reviews():
    """Serve ``GET /reviews`` by returning whatever ``fetch_reviews()`` produces."""
    payload = await fetch_reviews()
    return payload
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|