Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -55,6 +55,26 @@ def validate_url(url: str) -> Tuple[bool, str]:
|
|
| 55 |
return False, f"URL validation error: {str(e)}"
|
| 56 |
|
| 57 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 58 |
# -----------------------------
|
| 59 |
# Enhanced Content Extraction
|
| 60 |
# -----------------------------
|
|
@@ -116,21 +136,25 @@ def fetch_website_text(url: str) -> Tuple[str, bool]:
|
|
| 116 |
|
| 117 |
for attempt in range(MAX_RETRIES):
|
| 118 |
try:
|
| 119 |
-
|
| 120 |
-
url
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
except requests.exceptions.RequestException as e:
|
| 128 |
if attempt == MAX_RETRIES - 1:
|
| 129 |
raise
|
| 130 |
time.sleep(1)
|
| 131 |
|
| 132 |
-
soup = BeautifulSoup(response.text, "html.parser")
|
| 133 |
-
|
| 134 |
# Remove noisy tags
|
| 135 |
for tag in soup(["script", "style", "noscript", "iframe", "nav", "footer"]):
|
| 136 |
tag.decompose()
|
|
|
|
| 55 |
return False, f"URL validation error: {str(e)}"
|
| 56 |
|
| 57 |
|
| 58 |
+
# -----------------------------
# Proxy Option (if AFC blocks direct requests)
# -----------------------------
USE_PROXY = False  # Set to True if you need to use a proxy service


def fetch_via_proxy(url: str) -> str:
    """Fetch a page's HTML through a scraping-proxy service.

    Used when direct ``requests.get`` calls are blocked (e.g. by AFC /
    anti-bot restrictions). Requires a valid API key in place of
    ``YOUR_KEY`` below.

    Args:
        url: Absolute URL of the page to fetch.

    Returns:
        The raw HTML body returned by the proxy service.

    Raises:
        requests.exceptions.RequestException: on network failure or a
            non-2xx proxy response (via ``raise_for_status``).
    """
    # Local import keeps this optional feature self-contained; the
    # file's import header is not guaranteed to include urllib.parse.
    from urllib.parse import quote

    # Option 1: ScraperAPI (free tier available)
    # proxy_url = f"http://api.scraperapi.com?api_key=YOUR_KEY&url={quote(url, safe='')}"

    # Option 2: WebScraping.AI (free tier available)
    # proxy_url = f"https://api.webscraping.ai/html?api_key=YOUR_KEY&url={quote(url, safe='')}"

    # Option 3: ScrapingBee (free tier available)
    # The target URL MUST be percent-encoded: a raw '&', '?' or '#' in it
    # would otherwise be parsed as part of the proxy's own query string
    # and silently truncate the requested URL.
    proxy_url = (
        "https://app.scrapingbee.com/api/v1/"
        f"?api_key=YOUR_KEY&url={quote(url, safe='')}"
    )

    response = requests.get(proxy_url, timeout=30)
    response.raise_for_status()
    return response.text
|
| 77 |
+
|
| 78 |
# -----------------------------
|
| 79 |
# Enhanced Content Extraction
|
| 80 |
# -----------------------------
|
|
|
|
| 136 |
|
| 137 |
for attempt in range(MAX_RETRIES):
|
| 138 |
try:
|
| 139 |
+
if USE_PROXY:
|
| 140 |
+
html_content = fetch_via_proxy(url)
|
| 141 |
+
soup = BeautifulSoup(html_content, "html.parser")
|
| 142 |
+
break
|
| 143 |
+
else:
|
| 144 |
+
response = requests.get(
|
| 145 |
+
url,
|
| 146 |
+
headers=headers,
|
| 147 |
+
timeout=TIMEOUT,
|
| 148 |
+
allow_redirects=True,
|
| 149 |
+
)
|
| 150 |
+
response.raise_for_status()
|
| 151 |
+
soup = BeautifulSoup(response.text, "html.parser")
|
| 152 |
+
break
|
| 153 |
except requests.exceptions.RequestException as e:
|
| 154 |
if attempt == MAX_RETRIES - 1:
|
| 155 |
raise
|
| 156 |
time.sleep(1)
|
| 157 |
|
|
|
|
|
|
|
| 158 |
# Remove noisy tags
|
| 159 |
for tag in soup(["script", "style", "noscript", "iframe", "nav", "footer"]):
|
| 160 |
tag.decompose()
|