bparekh99 committed on
Commit
30ed6f9
·
verified ·
1 Parent(s): 3fa7a1b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -10
app.py CHANGED
@@ -55,6 +55,26 @@ def validate_url(url: str) -> Tuple[bool, str]:
55
  return False, f"URL validation error: {str(e)}"
56
 
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  # -----------------------------
59
  # Enhanced Content Extraction
60
  # -----------------------------
@@ -116,21 +136,25 @@ def fetch_website_text(url: str) -> Tuple[str, bool]:
116
 
117
  for attempt in range(MAX_RETRIES):
118
  try:
119
- response = requests.get(
120
- url,
121
- headers=headers,
122
- timeout=TIMEOUT,
123
- allow_redirects=True,
124
- )
125
- response.raise_for_status()
126
- break
 
 
 
 
 
 
127
  except requests.exceptions.RequestException as e:
128
  if attempt == MAX_RETRIES - 1:
129
  raise
130
  time.sleep(1)
131
 
132
- soup = BeautifulSoup(response.text, "html.parser")
133
-
134
  # Remove noisy tags
135
  for tag in soup(["script", "style", "noscript", "iframe", "nav", "footer"]):
136
  tag.decompose()
 
55
  return False, f"URL validation error: {str(e)}"
56
 
57
 
58
+ # -----------------------------
59
+ # Proxy Option (if AFC blocks direct requests)
60
+ # -----------------------------
61
+ USE_PROXY = False # Set to True if you need to use a proxy service
62
+
63
+ def fetch_via_proxy(url: str) -> str:
64
+ """Fetch content via a proxy service (for AFC restrictions)."""
65
+ # Option 1: ScraperAPI (free tier available)
66
+ # proxy_url = f"http://api.scraperapi.com?api_key=YOUR_KEY&url={url}"
67
+
68
+ # Option 2: WebScraping.AI (free tier available)
69
+ # proxy_url = f"https://api.webscraping.ai/html?api_key=YOUR_KEY&url={url}"
70
+
71
+ # Option 3: ScrapingBee (free tier available)
72
+ proxy_url = f"https://app.scrapingbee.com/api/v1/?api_key=YOUR_KEY&url={url}"
73
+
74
+ response = requests.get(proxy_url, timeout=30)
75
+ response.raise_for_status()
76
+ return response.text
77
+
78
  # -----------------------------
79
  # Enhanced Content Extraction
80
  # -----------------------------
 
136
 
137
  for attempt in range(MAX_RETRIES):
138
  try:
139
+ if USE_PROXY:
140
+ html_content = fetch_via_proxy(url)
141
+ soup = BeautifulSoup(html_content, "html.parser")
142
+ break
143
+ else:
144
+ response = requests.get(
145
+ url,
146
+ headers=headers,
147
+ timeout=TIMEOUT,
148
+ allow_redirects=True,
149
+ )
150
+ response.raise_for_status()
151
+ soup = BeautifulSoup(response.text, "html.parser")
152
+ break
153
  except requests.exceptions.RequestException as e:
154
  if attempt == MAX_RETRIES - 1:
155
  raise
156
  time.sleep(1)
157
 
 
 
158
  # Remove noisy tags
159
  for tag in soup(["script", "style", "noscript", "iframe", "nav", "footer"]):
160
  tag.decompose()