Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -123,7 +123,7 @@ def scrape_with_bs4(url, session):
|
|
| 123 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
|
| 124 |
def scrape_with_trafilatura(url):
|
| 125 |
try:
|
| 126 |
-
downloaded = fetch_url(url
|
| 127 |
if downloaded is None:
|
| 128 |
raise ScrapingError("Failed to download content")
|
| 129 |
content = extract(downloaded)
|
|
@@ -433,11 +433,19 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
|
|
| 433 |
else: # trafilatura
|
| 434 |
content = scrape_with_trafilatura(url)
|
| 435 |
|
| 436 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 437 |
scraped_content.append({
|
| 438 |
"title": title,
|
| 439 |
"url": url,
|
| 440 |
-
"content": content
|
| 441 |
"scraper": scraper
|
| 442 |
})
|
| 443 |
except requests.exceptions.RequestException as e:
|
|
|
|
| 123 |
@retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
|
| 124 |
def scrape_with_trafilatura(url):
|
| 125 |
try:
|
| 126 |
+
downloaded = fetch_url(url) # No timeout argument is passed here; the previous revision's timeout parameter was removed
|
| 127 |
if downloaded is None:
|
| 128 |
raise ScrapingError("Failed to download content")
|
| 129 |
content = extract(downloaded)
|
|
|
|
| 433 |
else: # trafilatura
|
| 434 |
content = scrape_with_trafilatura(url)
|
| 435 |
|
| 436 |
+
# Handle different types of content and limit to max_chars
|
| 437 |
+
if isinstance(content, dict) and 'content' in content:
|
| 438 |
+
content['content'] = content['content'][:max_chars]
|
| 439 |
+
elif isinstance(content, str):
|
| 440 |
+
content = content[:max_chars]
|
| 441 |
+
else:
|
| 442 |
+
logger.warning(f"Unexpected content type for URL: {url}")
|
| 443 |
+
content = str(content)[:max_chars]
|
| 444 |
+
|
| 445 |
scraped_content.append({
|
| 446 |
"title": title,
|
| 447 |
"url": url,
|
| 448 |
+
"content": content,
|
| 449 |
"scraper": scraper
|
| 450 |
})
|
| 451 |
except requests.exceptions.RequestException as e:
|