sghorbal commited on
Commit
aace7de
·
1 Parent(s): 696d285

add flaresolverr support

Browse files
Files changed (3) hide show
  1. .env.example +1 -0
  2. src/main.py +18 -1
  3. src/service/scrapper.py +79 -25
.env.example CHANGED
@@ -1,5 +1,6 @@
1
  DATABASE_URL=
2
  REDIS_URL=
 
3
 
4
  # If set, protects the API from unauthorized calls
5
  FASTAPI_API_KEY=
 
1
  DATABASE_URL=
2
  REDIS_URL=
3
+ FLARESOLVERR_API=
4
 
5
  # If set, protects the API from unauthorized calls
6
  FASTAPI_API_KEY=
src/main.py CHANGED
@@ -310,6 +310,23 @@ async def check_health(session: Annotated[Session, Depends(get_session)]):
310
  session.execute(text("SELECT 1"))
311
  except Exception as e:
312
  logger.error(f"DB check failed: {e}")
313
- return JSONResponse(content={"status": "unhealthy"}, status_code=HTTP_503_SERVICE_UNAVAILABLE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
 
315
  return JSONResponse(content={"status": "healthy"}, status_code=HTTP_200_OK)
 
310
  session.execute(text("SELECT 1"))
311
  except Exception as e:
312
  logger.error(f"DB check failed: {e}")
313
+ return JSONResponse(content={"status": "unhealthy", "detail": "Database not reachable"},
314
+ status_code=HTTP_503_SERVICE_UNAVAILABLE)
315
+
316
+ # Check if the scraper endpoint is reachable
317
+ if FLARESOLVERR_API := os.getenv("FLARESOLVERR_API"):
318
+ import requests
319
+
320
+ try:
321
+ # Ping the scraper endpoint
322
+ response = requests.get(FLARESOLVERR_API + "health", timeout=5)
323
+ if response.status_code != HTTP_200_OK:
324
+ logger.error(f"Scraper check failed: {response.status_code}")
325
+ return JSONResponse(content={"status": "unhealthy", "detail": "Flaresolverr not reachable"},
326
+ status_code=HTTP_503_SERVICE_UNAVAILABLE)
327
+ except requests.RequestException as e:
328
+ logger.error(f"Scraper check failed: {e}")
329
+ return JSONResponse(content={"status": "unhealthy", "detail": "Flaresolverr not reachable"},
330
+ status_code=HTTP_503_SERVICE_UNAVAILABLE)
331
 
332
  return JSONResponse(content={"status": "healthy"}, status_code=HTTP_200_OK)
src/service/scrapper.py CHANGED
@@ -1,5 +1,72 @@
1
- import requests
2
  from typing import List, Dict
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  def search_player(raw_name: str) -> List[Dict]:
5
  """
@@ -16,20 +83,11 @@ def search_player(raw_name: str) -> List[Dict]:
16
  # Construct the URL for the ATP Tour search
17
  url = f"https://www.atptour.com/en/-/www/site-search/{last_name.lower()}/"
18
 
19
- # Ajax request to fetch player data
20
- headers = {
21
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
22
- }
23
-
24
- # Uncomment the following lines to make an actual request
25
- response = requests.get(url, headers=headers)
26
-
27
- # Check if the request was successful
28
- if response.status_code != 200:
29
- raise Exception(f"Failed to fetch data: {response.status_code}")
30
-
31
- # Parse the JSON response
32
- data = response.json()
33
 
34
  # Check if the response contains player data
35
  if 'Players' not in data or not data['Players']:
@@ -65,17 +123,13 @@ def get_personal_details(playerId: str) -> Dict:
65
  details = get_personal_details(playerId)
66
  print(json.dumps(output, indent=4))
67
  """
68
- # AJAX request to fetch player details
69
- headers = {
70
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
71
- }
72
 
73
- # Uncomment the following lines to make an actual request
74
- response = requests.get(f"https://www.atptour.com/en/-/www/players/hero/{playerId}", headers=headers)
75
-
76
- response.raise_for_status()
77
-
78
- data = response.json()
79
 
80
  # Extract personal details
81
  personal_details = {
 
 
1
  from typing import List, Dict
2
+ import requests
3
+ import json
4
+ import os
5
+ import re
6
+ from dotenv import load_dotenv
7
+ import logging
8
+ from starlette.status import HTTP_200_OK
9
+
10
+ # Set up logging
11
+ logger = logging.getLogger(__name__)
12
+
13
+ # Load environment variables from .env file
14
+ load_dotenv()
15
+
16
def get_without_flaresolverr(url: str) -> dict:
    """
    Fetch a JSON endpoint with a plain HTTP GET (no FlareSolverr proxy).

    Sends the request with a browser-like User-Agent header (some endpoints
    reject the default python-requests agent string) and parses the response
    body as JSON. Note: this does NOT bypass Cloudflare challenges — it is
    the fallback path used when no FlareSolverr instance is configured.

    Args:
        url: Full URL of the JSON endpoint to fetch.

    Returns:
        The parsed JSON response (fix: the original annotated `-> str`,
        but `response.json()` returns a dict/list).

    Raises:
        Exception: If the response status code is not 200.
        requests.Timeout: If the upstream does not respond within 30s.
    """
    # Browser-like User-Agent so the upstream does not reject us outright.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    # Fix: the original had no timeout, so a stalled upstream could hang
    # the caller (including the /health endpoint) indefinitely.
    response = requests.get(url, headers=headers, timeout=30)

    # Check if the request was successful
    if response.status_code != HTTP_200_OK:
        raise Exception(f"Failed to fetch data: {response.status_code}")

    # Parse and return the JSON response
    return response.json()
35
+
36
def get_with_flaresolverr(url: str, flare_api: str) -> dict:
    """
    Fetch a Cloudflare-protected JSON endpoint through a FlareSolverr instance.

    Posts a ``request.get`` command to the FlareSolverr ``v1`` API, then
    extracts the JSON payload from the rendered page: FlareSolverr returns
    the solved page's HTML, and a raw-JSON endpoint is rendered by the
    browser inside a ``<pre>`` tag.

    Args:
        url: Target URL to fetch through FlareSolverr.
        flare_api: Base URL of the FlareSolverr service. NOTE(review):
            assumed to end with a trailing slash (e.g. ``http://host:8191/``),
            otherwise the ``+ 'v1'`` join below produces a broken endpoint —
            confirm against the FLARESOLVERR_API env var format.

    Returns:
        The parsed JSON payload from the target URL (fix: the original
        annotated ``-> str`` but returns the ``json.loads`` result).

    Raises:
        requests.HTTPError: If FlareSolverr itself returns an error status.
        ValueError: If no ``<pre>`` tag is found in the solved page.
        json.JSONDecodeError: If the ``<pre>`` content is not valid JSON.
    """
    # FlareSolverr command payload; maxTimeout is in milliseconds.
    payload = {
        "cmd": "request.get",
        "url": url,
        "maxTimeout": 60000,
    }

    # Send request to FlareSolverr. Fix: added an outer timeout slightly
    # above FlareSolverr's own 60s budget so a dead service cannot hang us.
    endpoint = flare_api + 'v1'
    response = requests.post(endpoint,
                             headers={"Content-Type": "application/json"},
                             json=payload,
                             timeout=90)

    response.raise_for_status()  # Raise an error for bad responses

    # The solved page body lives under solution.response.
    page_content = response.json().get('solution', {}).get('response', '')

    # Parse the HTML to extract the JSON inside <pre>
    match = re.search(r"<pre.*?>(.*?)</pre>", page_content, re.DOTALL)
    if not match:
        raise ValueError("No <pre> tag found in the response")

    json_text = match.group(1).strip()
    try:
        return json.loads(json_text)
    except json.JSONDecodeError:
        # Fix: the original called logger.error("Invalid JSON", json_text),
        # passing a %-arg with no placeholder in the format string, which
        # makes the logging module itself emit an error. Use lazy
        # %-formatting instead.
        logger.error("Invalid JSON: %s", json_text)
        raise
70
 
71
  def search_player(raw_name: str) -> List[Dict]:
72
  """
 
83
  # Construct the URL for the ATP Tour search
84
  url = f"https://www.atptour.com/en/-/www/site-search/{last_name.lower()}/"
85
 
86
+ if FLARESOLVERR_API := os.getenv("FLARESOLVERR_API"):
87
+ # Use FlareSolverr to bypass Cloudflare
88
+ data = get_with_flaresolverr(url=url, flare_api=FLARESOLVERR_API)
89
+ else:
90
+ data = get_without_flaresolverr(url)
 
 
 
 
 
 
 
 
 
91
 
92
  # Check if the response contains player data
93
  if 'Players' not in data or not data['Players']:
 
123
  details = get_personal_details(playerId)
124
  print(json.dumps(output, indent=4))
125
  """
126
+ url = f"https://www.atptour.com/en/-/www/players/hero/{playerId}/"
 
 
 
127
 
128
+ if FLARESOLVERR_API := os.getenv("FLARESOLVERR_API"):
129
+ # Use FlareSolverr to bypass Cloudflare
130
+ data = get_with_flaresolverr(url=url, flare_api=FLARESOLVERR_API)
131
+ else:
132
+ data = get_without_flaresolverr(url)
 
133
 
134
  # Extract personal details
135
  personal_details = {