sghorbal commited on
Commit
aace7de
·
1 Parent(s): 696d285

add flaresolverr support

Browse files
Files changed (3) hide show
  1. .env.example +1 -0
  2. src/main.py +18 -1
  3. src/service/scrapper.py +79 -25
.env.example CHANGED
@@ -1,5 +1,6 @@
1
  DATABASE_URL=
2
  REDIS_URL=
 
3
 
4
  # If set, protects the API from unauthorized calls
5
  FASTAPI_API_KEY=
 
1
  DATABASE_URL=
2
  REDIS_URL=
3
+ FLARESOLVERR_API=
4
 
5
  # If set, protects the API from unauthorized calls
6
  FASTAPI_API_KEY=
src/main.py CHANGED
@@ -310,6 +310,23 @@ async def check_health(session: Annotated[Session, Depends(get_session)]):
310
  session.execute(text("SELECT 1"))
311
  except Exception as e:
312
  logger.error(f"DB check failed: {e}")
313
- return JSONResponse(content={"status": "unhealthy"}, status_code=HTTP_503_SERVICE_UNAVAILABLE)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
314
 
315
  return JSONResponse(content={"status": "healthy"}, status_code=HTTP_200_OK)
 
310
  session.execute(text("SELECT 1"))
311
  except Exception as e:
312
  logger.error(f"DB check failed: {e}")
313
+ return JSONResponse(content={"status": "unhealthy", "detail": "Database not reachable"},
314
+ status_code=HTTP_503_SERVICE_UNAVAILABLE)
315
+
316
+ # Check if the scraper endpoint is reachable
317
+ if FLARESOLVERR_API := os.getenv("FLARESOLVERR_API"):
318
+ import requests
319
+
320
+ try:
321
+ # Ping the scraper endpoint
322
+ response = requests.get(FLARESOLVERR_API + "health", timeout=5)
323
+ if response.status_code != HTTP_200_OK:
324
+ logger.error(f"Scraper check failed: {response.status_code}")
325
+ return JSONResponse(content={"status": "unhealthy", "detail": "Flaresolverr not reachable"},
326
+ status_code=HTTP_503_SERVICE_UNAVAILABLE)
327
+ except requests.RequestException as e:
328
+ logger.error(f"Scraper check failed: {e}")
329
+ return JSONResponse(content={"status": "unhealthy", "detail": "Flaresolverr not reachable"},
330
+ status_code=HTTP_503_SERVICE_UNAVAILABLE)
331
 
332
  return JSONResponse(content={"status": "healthy"}, status_code=HTTP_200_OK)
src/service/scrapper.py CHANGED
@@ -1,5 +1,72 @@
1
- import requests
2
  from typing import List, Dict
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
4
  def search_player(raw_name: str) -> List[Dict]:
5
  """
@@ -16,20 +83,11 @@ def search_player(raw_name: str) -> List[Dict]:
16
  # Construct the URL for the ATP Tour search
17
  url = f"https://www.atptour.com/en/-/www/site-search/{last_name.lower()}/"
18
 
19
- # Ajax request to fetch player data
20
- headers = {
21
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
22
- }
23
-
24
- # Uncomment the following lines to make an actual request
25
- response = requests.get(url, headers=headers)
26
-
27
- # Check if the request was successful
28
- if response.status_code != 200:
29
- raise Exception(f"Failed to fetch data: {response.status_code}")
30
-
31
- # Parse the JSON response
32
- data = response.json()
33
 
34
  # Check if the response contains player data
35
  if 'Players' not in data or not data['Players']:
@@ -65,17 +123,13 @@ def get_personal_details(playerId: str) -> Dict:
65
  details = get_personal_details(playerId)
66
  print(json.dumps(output, indent=4))
67
  """
68
- # AJAX request to fetch player details
69
- headers = {
70
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
71
- }
72
 
73
- # Uncomment the following lines to make an actual request
74
- response = requests.get(f"https://www.atptour.com/en/-/www/players/hero/{playerId}", headers=headers)
75
-
76
- response.raise_for_status()
77
-
78
- data = response.json()
79
 
80
  # Extract personal details
81
  personal_details = {
 
 
1
  from typing import List, Dict
2
+ import requests
3
+ import json
4
+ import os
5
+ import re
6
+ from dotenv import load_dotenv
7
+ import logging
8
+ from starlette.status import HTTP_200_OK
9
+
10
+ # Set up logging
11
+ logger = logging.getLogger(__name__)
12
+
13
+ # Load environment variables from .env file
14
+ load_dotenv()
15
+
16
def get_without_flaresolverr(url: str) -> dict:
    """
    Fetch a JSON endpoint with a plain HTTP GET (no FlareSolverr proxy).

    Sends the request with a browser-like User-Agent header (some endpoints
    reject the default python-requests agent string) and parses the response
    body as JSON. Note: this does NOT bypass Cloudflare challenges — it is
    the fallback path used when no FlareSolverr instance is configured.

    Args:
        url: Full URL of the JSON endpoint to fetch.

    Returns:
        The parsed JSON response (fix: the original annotated `-> str`,
        but `response.json()` returns a dict/list).

    Raises:
        Exception: If the response status code is not 200.
        requests.Timeout: If the upstream does not respond within 30s.
    """
    # Browser-like User-Agent so the upstream does not reject us outright.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }

    # Fix: the original had no timeout, so a stalled upstream could hang
    # the caller (including the /health endpoint) indefinitely.
    response = requests.get(url, headers=headers, timeout=30)

    # Check if the request was successful
    if response.status_code != HTTP_200_OK:
        raise Exception(f"Failed to fetch data: {response.status_code}")

    # Parse and return the JSON response
    return response.json()
35
+
36
def get_with_flaresolverr(url: str, flare_api: str) -> dict:
    """
    Fetch a Cloudflare-protected JSON endpoint through a FlareSolverr instance.

    Posts a ``request.get`` command to the FlareSolverr ``v1`` API, then
    extracts the JSON payload from the rendered page: FlareSolverr returns
    the solved page's HTML, and a raw-JSON endpoint is rendered by the
    browser inside a ``<pre>`` tag.

    Args:
        url: Target URL to fetch through FlareSolverr.
        flare_api: Base URL of the FlareSolverr service. NOTE(review):
            assumed to end with a trailing slash (e.g. ``http://host:8191/``),
            otherwise the ``+ 'v1'`` join below produces a broken endpoint —
            confirm against the FLARESOLVERR_API env var format.

    Returns:
        The parsed JSON payload from the target URL (fix: the original
        annotated ``-> str`` but returns the ``json.loads`` result).

    Raises:
        requests.HTTPError: If FlareSolverr itself returns an error status.
        ValueError: If no ``<pre>`` tag is found in the solved page.
        json.JSONDecodeError: If the ``<pre>`` content is not valid JSON.
    """
    # FlareSolverr command payload; maxTimeout is in milliseconds.
    payload = {
        "cmd": "request.get",
        "url": url,
        "maxTimeout": 60000,
    }

    # Send request to FlareSolverr. Fix: added an outer timeout slightly
    # above FlareSolverr's own 60s budget so a dead service cannot hang us.
    endpoint = flare_api + 'v1'
    response = requests.post(endpoint,
                             headers={"Content-Type": "application/json"},
                             json=payload,
                             timeout=90)

    response.raise_for_status()  # Raise an error for bad responses

    # The solved page body lives under solution.response.
    page_content = response.json().get('solution', {}).get('response', '')

    # Parse the HTML to extract the JSON inside <pre>
    match = re.search(r"<pre.*?>(.*?)</pre>", page_content, re.DOTALL)
    if not match:
        raise ValueError("No <pre> tag found in the response")

    json_text = match.group(1).strip()
    try:
        return json.loads(json_text)
    except json.JSONDecodeError:
        # Fix: the original called logger.error("Invalid JSON", json_text),
        # passing a %-arg with no placeholder in the format string, which
        # makes the logging module itself emit an error. Use lazy
        # %-formatting instead.
        logger.error("Invalid JSON: %s", json_text)
        raise
70
 
71
  def search_player(raw_name: str) -> List[Dict]:
72
  """
 
83
  # Construct the URL for the ATP Tour search
84
  url = f"https://www.atptour.com/en/-/www/site-search/{last_name.lower()}/"
85
 
86
+ if FLARESOLVERR_API := os.getenv("FLARESOLVERR_API"):
87
+ # Use FlareSolverr to bypass Cloudflare
88
+ data = get_with_flaresolverr(url=url, flare_api=FLARESOLVERR_API)
89
+ else:
90
+ data = get_without_flaresolverr(url)
 
 
 
 
 
 
 
 
 
91
 
92
  # Check if the response contains player data
93
  if 'Players' not in data or not data['Players']:
 
123
  details = get_personal_details(playerId)
124
  print(json.dumps(output, indent=4))
125
  """
126
+ url = f"https://www.atptour.com/en/-/www/players/hero/{playerId}/"
 
 
 
127
 
128
+ if FLARESOLVERR_API := os.getenv("FLARESOLVERR_API"):
129
+ # Use FlareSolverr to bypass Cloudflare
130
+ data = get_with_flaresolverr(url=url, flare_api=FLARESOLVERR_API)
131
+ else:
132
+ data = get_without_flaresolverr(url)
 
133
 
134
  # Extract personal details
135
  personal_details = {