Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -55,6 +55,26 @@ def validate_url(url: str) -> Tuple[bool, str]:
|
|
| 55 |
return False, f"URL validation error: {str(e)}"
|
| 56 |
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
# -----------------------------
|
| 59 |
# Enhanced Content Extraction
|
| 60 |
# -----------------------------
|
|
@@ -116,21 +136,25 @@ def fetch_website_text(url: str) -> Tuple[str, bool]:
|
|
| 116 |
|
| 117 |
for attempt in range(MAX_RETRIES):
|
| 118 |
try:
|
| 119 |
-
|
| 120 |
-
url
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
except requests.exceptions.RequestException as e:
|
| 128 |
if attempt == MAX_RETRIES - 1:
|
| 129 |
raise
|
| 130 |
time.sleep(1)
|
| 131 |
|
| 132 |
-
soup = BeautifulSoup(response.text, "html.parser")
|
| 133 |
-
|
| 134 |
# Remove noisy tags
|
| 135 |
for tag in soup(["script", "style", "noscript", "iframe", "nav", "footer"]):
|
| 136 |
tag.decompose()
|
|
|
|
| 55 |
return False, f"URL validation error: {str(e)}"
|
| 56 |
|
| 57 |
|
| 58 |
+
# -----------------------------
# Proxy Option (if AFC blocks direct requests)
# -----------------------------
USE_PROXY = False  # Set to True if you need to use a proxy service


def fetch_via_proxy(url: str) -> str:
    """Fetch a page's HTML through a scraping-proxy service.

    Used when direct ``requests.get`` calls are blocked (e.g. by AFC /
    anti-bot restrictions). Requires a valid API key in place of
    ``YOUR_KEY`` below.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The raw HTML body returned by the proxy service.

    Raises:
        requests.exceptions.RequestException: on network failure or a
            non-2xx proxy response (via ``raise_for_status``).
    """
    # Local import keeps this optional feature self-contained; the
    # file's import header is not guaranteed to include urllib.parse.
    from urllib.parse import quote

    # Option 1: ScraperAPI (free tier available)
    # proxy_url = f"http://api.scraperapi.com?api_key=YOUR_KEY&url={quote(url, safe='')}"

    # Option 2: WebScraping.AI (free tier available)
    # proxy_url = f"https://api.webscraping.ai/html?api_key=YOUR_KEY&url={quote(url, safe='')}"

    # Option 3: ScrapingBee (free tier available)
    # The target URL MUST be percent-encoded: a raw '&', '?' or '#' in it
    # would otherwise be parsed as part of the proxy's own query string
    # and silently truncate the requested URL.
    proxy_url = (
        "https://app.scrapingbee.com/api/v1/"
        f"?api_key=YOUR_KEY&url={quote(url, safe='')}"
    )

    response = requests.get(proxy_url, timeout=30)
    response.raise_for_status()
    return response.text
|
| 77 |
+
|
| 78 |
# -----------------------------
|
| 79 |
# Enhanced Content Extraction
|
| 80 |
# -----------------------------
|
|
|
|
| 136 |
|
| 137 |
for attempt in range(MAX_RETRIES):
|
| 138 |
try:
|
| 139 |
+
if USE_PROXY:
|
| 140 |
+
html_content = fetch_via_proxy(url)
|
| 141 |
+
soup = BeautifulSoup(html_content, "html.parser")
|
| 142 |
+
break
|
| 143 |
+
else:
|
| 144 |
+
response = requests.get(
|
| 145 |
+
url,
|
| 146 |
+
headers=headers,
|
| 147 |
+
timeout=TIMEOUT,
|
| 148 |
+
allow_redirects=True,
|
| 149 |
+
)
|
| 150 |
+
response.raise_for_status()
|
| 151 |
+
soup = BeautifulSoup(response.text, "html.parser")
|
| 152 |
+
break
|
| 153 |
except requests.exceptions.RequestException as e:
|
| 154 |
if attempt == MAX_RETRIES - 1:
|
| 155 |
raise
|
| 156 |
time.sleep(1)
|
| 157 |
|
|
|
|
|
|
|
| 158 |
# Remove noisy tags
|
| 159 |
for tag in soup(["script", "style", "noscript", "iframe", "nav", "footer"]):
|
| 160 |
tag.decompose()
|