Spaces:
Running
Running
sghorbal
committed on
Commit
·
aace7de
1
Parent(s):
696d285
add flaresolverr support
Browse files- .env.example +1 -0
- src/main.py +18 -1
- src/service/scrapper.py +79 -25
.env.example
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
DATABASE_URL=
|
| 2 |
REDIS_URL=
|
|
|
|
| 3 |
|
| 4 |
# If set, protects the API from unauthorized calls
|
| 5 |
FASTAPI_API_KEY=
|
|
|
|
| 1 |
DATABASE_URL=
|
| 2 |
REDIS_URL=
|
| 3 |
+
FLARESOLVERR_API=
|
| 4 |
|
| 5 |
# If set, protects the API from unauthorized calls
|
| 6 |
FASTAPI_API_KEY=
|
src/main.py
CHANGED
|
@@ -310,6 +310,23 @@ async def check_health(session: Annotated[Session, Depends(get_session)]):
|
|
| 310 |
session.execute(text("SELECT 1"))
|
| 311 |
except Exception as e:
|
| 312 |
logger.error(f"DB check failed: {e}")
|
| 313 |
-
return JSONResponse(content={"status": "unhealthy"},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 314 |
|
| 315 |
return JSONResponse(content={"status": "healthy"}, status_code=HTTP_200_OK)
|
|
|
|
| 310 |
session.execute(text("SELECT 1"))
|
| 311 |
except Exception as e:
|
| 312 |
logger.error(f"DB check failed: {e}")
|
| 313 |
+
return JSONResponse(content={"status": "unhealthy", "detail": "Database not reachable"},
|
| 314 |
+
status_code=HTTP_503_SERVICE_UNAVAILABLE)
|
| 315 |
+
|
| 316 |
+
# Check if the scraper endpoint is reachable
|
| 317 |
+
if FLARESOLVERR_API := os.getenv("FLARESOLVERR_API"):
|
| 318 |
+
import requests
|
| 319 |
+
|
| 320 |
+
try:
|
| 321 |
+
# Ping the scraper endpoint
|
| 322 |
+
response = requests.get(FLARESOLVERR_API + "health", timeout=5)
|
| 323 |
+
if response.status_code != HTTP_200_OK:
|
| 324 |
+
logger.error(f"Scraper check failed: {response.status_code}")
|
| 325 |
+
return JSONResponse(content={"status": "unhealthy", "detail": "Flaresolverr not reachable"},
|
| 326 |
+
status_code=HTTP_503_SERVICE_UNAVAILABLE)
|
| 327 |
+
except requests.RequestException as e:
|
| 328 |
+
logger.error(f"Scraper check failed: {e}")
|
| 329 |
+
return JSONResponse(content={"status": "unhealthy", "detail": "Flaresolverr not reachable"},
|
| 330 |
+
status_code=HTTP_503_SERVICE_UNAVAILABLE)
|
| 331 |
|
| 332 |
return JSONResponse(content={"status": "healthy"}, status_code=HTTP_200_OK)
|
src/service/scrapper.py
CHANGED
|
@@ -1,5 +1,72 @@
|
|
| 1 |
-
import requests
|
| 2 |
from typing import List, Dict
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
def search_player(raw_name: str) -> List[Dict]:
|
| 5 |
"""
|
|
@@ -16,20 +83,11 @@ def search_player(raw_name: str) -> List[Dict]:
|
|
| 16 |
# Construct the URL for the ATP Tour search
|
| 17 |
url = f"https://www.atptour.com/en/-/www/site-search/{last_name.lower()}/"
|
| 18 |
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
# Uncomment the following lines to make an actual request
|
| 25 |
-
response = requests.get(url, headers=headers)
|
| 26 |
-
|
| 27 |
-
# Check if the request was successful
|
| 28 |
-
if response.status_code != 200:
|
| 29 |
-
raise Exception(f"Failed to fetch data: {response.status_code}")
|
| 30 |
-
|
| 31 |
-
# Parse the JSON response
|
| 32 |
-
data = response.json()
|
| 33 |
|
| 34 |
# Check if the response contains player data
|
| 35 |
if 'Players' not in data or not data['Players']:
|
|
@@ -65,17 +123,13 @@ def get_personal_details(playerId: str) -> Dict:
|
|
| 65 |
details = get_personal_details(playerId)
|
| 66 |
print(json.dumps(output, indent=4))
|
| 67 |
"""
|
| 68 |
-
|
| 69 |
-
headers = {
|
| 70 |
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
|
| 71 |
-
}
|
| 72 |
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
data = response.json()
|
| 79 |
|
| 80 |
# Extract personal details
|
| 81 |
personal_details = {
|
|
|
|
|
|
|
| 1 |
from typing import List, Dict
|
| 2 |
+
import requests
|
| 3 |
+
import json
|
| 4 |
+
import os
|
| 5 |
+
import re
|
| 6 |
+
from dotenv import load_dotenv
|
| 7 |
+
import logging
|
| 8 |
+
from starlette.status import HTTP_200_OK
|
| 9 |
+
|
| 10 |
+
# Set up logging
|
| 11 |
+
logger = logging.getLogger(__name__)
|
| 12 |
+
|
| 13 |
+
# Load environment variables from .env file
|
| 14 |
+
load_dotenv()
|
| 15 |
+
|
| 16 |
+
def get_without_flaresolverr(url: str, timeout: float = 30.0) -> Dict:
    """
    Fetch a JSON document with a plain HTTP GET request.

    No Cloudflare bypass is attempted here: the request is sent directly to
    ``url`` with a browser-like ``User-Agent`` header.

    Args:
        url: Address of the JSON endpoint to query.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If the server answers with a status other than 200.
        requests.RequestException: On network failure or timeout.
    """
    # Define request parameters
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    # Without an explicit timeout, requests waits indefinitely on a stalled
    # connection, which would block the caller forever.
    response = requests.get(url, headers=headers, timeout=timeout)

    # Check if the request was successful
    if response.status_code != HTTP_200_OK:
        raise Exception(f"Failed to fetch data: {response.status_code}")

    # Parse and return the JSON response
    return response.json()
|
| 35 |
+
|
| 36 |
+
def get_with_flaresolverr(url: str, flare_api: str, max_timeout_ms: int = 60000) -> Dict:
    """
    Fetch a JSON document through a FlareSolverr instance.

    A ``request.get`` command is posted to the FlareSolverr ``v1`` endpoint.
    The page content it returns is searched for a ``<pre>`` element, whose
    text is decoded as JSON.

    Args:
        url: Address of the JSON endpoint to fetch via FlareSolverr.
        flare_api: Base URL of the FlareSolverr service, with or without a
            trailing slash.
        max_timeout_ms: Value sent as FlareSolverr's ``maxTimeout``, in
            milliseconds.

    Returns:
        The decoded JSON found inside the ``<pre>`` element.

    Raises:
        requests.RequestException: On network failure, timeout, or a non-2xx
            answer from FlareSolverr.
        ValueError: If no ``<pre>`` element is present in the page content.
        json.JSONDecodeError: If the ``<pre>`` text is not valid JSON.
    """
    import html  # function-scope import: only this function needs it

    # Define request parameters
    payload = {
        "cmd": "request.get",
        "url": url,
        "maxTimeout": max_timeout_ms,
    }

    # Accept the base URL with or without a trailing slash.
    endpoint = flare_api.rstrip('/') + '/v1'

    # Give FlareSolverr its full budget plus a margin, but never wait forever.
    response = requests.post(
        endpoint,
        headers={"Content-Type": "application/json"},
        json=payload,
        timeout=max_timeout_ms / 1000 + 10,
    )

    response.raise_for_status()  # Raise an error for bad responses

    # Extract page content; tolerate a null "solution" or "response" field.
    solution = response.json().get('solution') or {}
    page_content = solution.get('response') or ''

    # Parse the HTML to extract the JSON inside <pre>
    match = re.search(r"<pre.*?>(.*?)</pre>", page_content, re.DOTALL)
    if not match:
        raise ValueError("No <pre> tag found in the response")

    # The text comes from serialized HTML, so characters such as "&" and "<"
    # arrive as entities and must be decoded before JSON parsing.
    json_text = html.unescape(match.group(1).strip())
    try:
        return json.loads(json_text)
    except json.JSONDecodeError:
        # Use a placeholder so the offending payload is actually logged.
        logger.error("Invalid JSON: %s", json_text)
        raise
|
| 70 |
|
| 71 |
def search_player(raw_name: str) -> List[Dict]:
|
| 72 |
"""
|
|
|
|
| 83 |
# Construct the URL for the ATP Tour search
|
| 84 |
url = f"https://www.atptour.com/en/-/www/site-search/{last_name.lower()}/"
|
| 85 |
|
| 86 |
+
if FLARESOLVERR_API := os.getenv("FLARESOLVERR_API"):
|
| 87 |
+
# Use FlareSolverr to bypass Cloudflare
|
| 88 |
+
data = get_with_flaresolverr(url=url, flare_api=FLARESOLVERR_API)
|
| 89 |
+
else:
|
| 90 |
+
data = get_without_flaresolverr(url)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
# Check if the response contains player data
|
| 93 |
if 'Players' not in data or not data['Players']:
|
|
|
|
| 123 |
details = get_personal_details(playerId)
|
| 124 |
print(json.dumps(output, indent=4))
|
| 125 |
"""
|
| 126 |
+
url = f"https://www.atptour.com/en/-/www/players/hero/{playerId}/"
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
+
if FLARESOLVERR_API := os.getenv("FLARESOLVERR_API"):
|
| 129 |
+
# Use FlareSolverr to bypass Cloudflare
|
| 130 |
+
data = get_with_flaresolverr(url=url, flare_api=FLARESOLVERR_API)
|
| 131 |
+
else:
|
| 132 |
+
data = get_without_flaresolverr(url)
|
|
|
|
| 133 |
|
| 134 |
# Extract personal details
|
| 135 |
personal_details = {
|