Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -69,7 +69,7 @@ def is_valid_url(url):
|
|
| 69 |
except ValueError:
|
| 70 |
return False
|
| 71 |
|
| 72 |
-
def scrape_with_bs4(url, session):
|
| 73 |
try:
|
| 74 |
response = session.get(url, timeout=5)
|
| 75 |
response.raise_for_status()
|
|
@@ -78,20 +78,20 @@ def scrape_with_bs4(url, session):
|
|
| 78 |
main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
|
| 79 |
|
| 80 |
if main_content:
|
| 81 |
-
content = main_content.get_text(strip=True)
|
| 82 |
else:
|
| 83 |
-
content = soup.get_text(strip=True)
|
| 84 |
|
| 85 |
-
return content
|
| 86 |
except Exception as e:
|
| 87 |
logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
|
| 88 |
return ""
|
| 89 |
|
| 90 |
-
def scrape_with_trafilatura(url):
    """Fetch ``url`` with ``fetch_url`` and return the text ``extract`` produces.

    Args:
        url: Address of the page to scrape.

    Returns:
        The extracted text, or an empty string when ``extract`` yields a
        falsy result or when any exception is raised along the way.
    """
    try:
        text = extract(fetch_url(url))
    except Exception as e:
        # Best-effort scraper: log the failure and hand back an empty result.
        logger.error(f"Error scraping {url} with Trafilatura: {e}")
        return ""
    # Normalise a falsy extraction result (e.g. None) to an empty string.
    return text if text else ""
|
|
@@ -371,72 +371,71 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
|
|
| 371 |
break
|
| 372 |
|
| 373 |
for result in results:
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
|
| 377 |
-
|
| 378 |
-
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
|
| 384 |
-
|
| 385 |
-
|
| 386 |
-
|
| 387 |
-
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
|
| 391 |
-
|
| 392 |
-
|
| 393 |
-
|
| 394 |
-
|
| 395 |
-
|
| 396 |
-
|
| 397 |
-
|
| 398 |
-
|
| 399 |
-
|
| 400 |
-
|
| 401 |
-
|
| 402 |
-
|
| 403 |
-
|
| 404 |
-
|
| 405 |
-
|
| 406 |
-
# Configure trafilatura to use a specific user agent
|
| 407 |
-
config = use_config()
|
| 408 |
-
config.set("DEFAULT", "USER_AGENT", ua)
|
| 409 |
-
|
| 410 |
-
content = extract(downloaded, config=config)
|
| 411 |
|
| 412 |
-
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
-
|
| 421 |
-
|
|
|
|
| 422 |
continue
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
|
|
|
| 426 |
continue
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
| 436 |
-
|
| 437 |
-
|
| 438 |
-
|
| 439 |
-
|
|
|
|
|
|
|
|
|
|
| 440 |
|
| 441 |
page += 1
|
| 442 |
|
|
|
|
| 69 |
except ValueError:
|
| 70 |
return False
|
| 71 |
|
| 72 |
+
def scrape_with_bs4(url, session, max_chars=None):
|
| 73 |
try:
|
| 74 |
response = session.get(url, timeout=5)
|
| 75 |
response.raise_for_status()
|
|
|
|
| 78 |
main_content = soup.find('main') or soup.find('article') or soup.find('div', class_='content')
|
| 79 |
|
| 80 |
if main_content:
|
| 81 |
+
content = main_content.get_text(strip=True, separator='\n')
|
| 82 |
else:
|
| 83 |
+
content = soup.get_text(strip=True, separator='\n')
|
| 84 |
|
| 85 |
+
return content[:max_chars] if max_chars else content
|
| 86 |
except Exception as e:
|
| 87 |
logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
|
| 88 |
return ""
|
| 89 |
|
| 90 |
+
def scrape_with_trafilatura(url, max_chars=None):
    """Fetch ``url`` with ``fetch_url`` and return the text ``extract`` produces.

    ``extract`` is called with ``include_comments=False``,
    ``include_tables=True`` and ``no_fallback=False``.

    Args:
        url: Address of the page to scrape.
        max_chars: When truthy, the result is sliced to ``[:max_chars]``.
            A falsy value (``None`` or ``0``) leaves the text untruncated.

    Returns:
        The extracted (and possibly truncated) text, or an empty string when
        ``extract`` yields a falsy result or when any exception is raised.
    """
    try:
        page = fetch_url(url)
        text = extract(
            page,
            include_comments=False,
            include_tables=True,
            no_fallback=False,
        )
        # Normalise a falsy extraction result (e.g. None) to an empty string.
        if not text:
            text = ""
        # The slice stays inside the try so an unusable max_chars value is
        # logged and mapped to "" like every other failure.
        if max_chars:
            return text[:max_chars]
        return text
    except Exception as e:
        # Best-effort scraper: log the failure and hand back an empty result.
        logger.error(f"Error scraping {url} with Trafilatura: {e}")
        return ""
|
|
|
|
| 371 |
break
|
| 372 |
|
| 373 |
for result in results:
|
| 374 |
+
if len(scraped_content) >= num_results:
|
| 375 |
+
break
|
| 376 |
+
|
| 377 |
+
url = result.get('url', '')
|
| 378 |
+
title = result.get('title', 'No title')
|
| 379 |
+
|
| 380 |
+
if not is_valid_url(url):
|
| 381 |
+
logger.warning(f"Invalid URL: {url}")
|
| 382 |
+
continue
|
| 383 |
+
|
| 384 |
+
try:
|
| 385 |
+
logger.info(f"Scraping content from: {url}")
|
| 386 |
+
|
| 387 |
+
# Implement a retry mechanism with different user agents
|
| 388 |
+
user_agents = [
|
| 389 |
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
| 390 |
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15',
|
| 391 |
+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
| 392 |
+
]
|
| 393 |
+
|
| 394 |
+
content = ""
|
| 395 |
+
for ua in user_agents:
|
| 396 |
+
try:
|
| 397 |
+
if scraper == "bs4":
|
| 398 |
+
session.headers.update({'User-Agent': ua})
|
| 399 |
+
content = scrape_with_bs4(url, session, max_chars)
|
| 400 |
+
else: # trafilatura
|
| 401 |
+
# Use urllib to handle custom headers for trafilatura
|
| 402 |
+
req = Request(url, headers={'User-Agent': ua})
|
| 403 |
+
with urlopen(req) as response:
|
| 404 |
+
downloaded = response.read()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 405 |
|
| 406 |
+
# Configure trafilatura to use a specific user agent
|
| 407 |
+
config = use_config()
|
| 408 |
+
config.set("DEFAULT", "USER_AGENT", ua)
|
| 409 |
+
|
| 410 |
+
content = scrape_with_trafilatura(url, max_chars)
|
| 411 |
+
|
| 412 |
+
if content:
|
| 413 |
+
break
|
| 414 |
+
except requests.exceptions.HTTPError as e:
|
| 415 |
+
if e.response.status_code == 403:
|
| 416 |
+
logger.warning(f"403 Forbidden error with User-Agent: {ua}. Trying next...")
|
| 417 |
continue
|
| 418 |
+
else:
|
| 419 |
+
raise
|
| 420 |
+
except Exception as e:
|
| 421 |
+
logger.error(f"Error scraping {url} with User-Agent {ua}: {str(e)}")
|
| 422 |
continue
|
| 423 |
+
|
| 424 |
+
if not content:
|
| 425 |
+
logger.warning(f"Failed to scrape content from {url} after trying multiple User-Agents")
|
| 426 |
+
continue
|
| 427 |
+
|
| 428 |
+
scraped_content.append({
|
| 429 |
+
"title": title,
|
| 430 |
+
"url": url,
|
| 431 |
+
"content": content, # No need to slice here as it's already limited
|
| 432 |
+
"scraper": scraper
|
| 433 |
+
})
|
| 434 |
+
logger.info(f"Successfully scraped content from {url}. Total scraped: {len(scraped_content)}")
|
| 435 |
+
except requests.exceptions.RequestException as e:
|
| 436 |
+
logger.error(f"Error scraping {url}: {e}")
|
| 437 |
+
except Exception as e:
|
| 438 |
+
logger.error(f"Unexpected error while scraping {url}: {e}")
|
| 439 |
|
| 440 |
page += 1
|
| 441 |
|