Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -20,6 +20,9 @@ from datetime import datetime
|
|
| 20 |
import os
|
| 21 |
from dotenv import load_dotenv
|
| 22 |
import certifi
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
# Load environment variables from a .env file
|
| 25 |
load_dotenv()
|
|
@@ -88,12 +91,24 @@ def scrape_with_bs4(url, session, max_chars=None):
|
|
| 88 |
logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
|
| 89 |
return ""
|
| 90 |
|
| 91 |
-
def scrape_with_trafilatura(url, max_chars=None, timeout=5):
|
| 92 |
try:
|
| 93 |
response = requests.get(url, timeout=timeout)
|
| 94 |
response.raise_for_status()
|
| 95 |
downloaded = response.text
|
| 96 |
-
content =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
return (content or "")[:max_chars] if max_chars else (content or "")
|
| 98 |
except Timeout:
|
| 99 |
logger.error(f"Timeout error while scraping {url} with Trafilatura")
|
|
@@ -252,7 +267,7 @@ def scrape_full_content(url, scraper="trafilatura", max_chars=3000, timeout=5):
|
|
| 252 |
else:
|
| 253 |
content = soup.get_text(strip=True, separator='\n')
|
| 254 |
else: # trafilatura
|
| 255 |
-
content = scrape_with_trafilatura(url, max_chars, timeout)
|
| 256 |
|
| 257 |
# Limit the content to max_chars
|
| 258 |
return content[:max_chars] if content else ""
|
|
@@ -378,7 +393,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
|
|
| 378 |
logger.warning(f"No more results returned from SearXNG on page {page}.")
|
| 379 |
break
|
| 380 |
|
| 381 |
-
|
| 382 |
if len(scraped_content) >= num_results:
|
| 383 |
break
|
| 384 |
|
|
@@ -415,7 +430,7 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
|
|
| 415 |
config = use_config()
|
| 416 |
config.set("DEFAULT", "USER_AGENT", ua)
|
| 417 |
|
| 418 |
-
content = scrape_with_trafilatura(url, max_chars, timeout=timeout)
|
| 419 |
|
| 420 |
if content:
|
| 421 |
break
|
|
|
|
| 20 |
import os
|
| 21 |
from dotenv import load_dotenv
|
| 22 |
import certifi
|
| 23 |
+
from bs4 import BeautifulSoup
|
| 24 |
+
from trafilatura import extract
|
| 25 |
+
from trafilatura.htmlprocessing import convert_tree
|
| 26 |
|
| 27 |
# Load environment variables from a .env file
|
| 28 |
load_dotenv()
|
|
|
|
| 91 |
logger.error(f"Error scraping {url} with BeautifulSoup: {e}")
|
| 92 |
return ""
|
| 93 |
|
| 94 |
+
def scrape_with_trafilatura(url, max_chars=None, timeout=5, use_beautifulsoup=False):
|
| 95 |
try:
|
| 96 |
response = requests.get(url, timeout=timeout)
|
| 97 |
response.raise_for_status()
|
| 98 |
downloaded = response.text
|
| 99 |
+
content = ""
|
| 100 |
+
if use_beautifulsoup:
|
| 101 |
+
soup = BeautifulSoup(downloaded, "lxml")
|
| 102 |
+
lxml_tree = convert_tree(soup)[0]
|
| 103 |
+
content = extract(lxml_tree, include_comments=False, include_tables=True, no_fallback=False)
|
| 104 |
+
|
| 105 |
+
# Fallback mechanism: if BeautifulSoup didn't yield results, try without it
|
| 106 |
+
if not content and use_beautifulsoup:
|
| 107 |
+
logger.info("BeautifulSoup method failed to extract content. Trying without BeautifulSoup.")
|
| 108 |
+
content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
|
| 109 |
+
# If still no content, use the direct method
|
| 110 |
+
if not content:
|
| 111 |
+
content = extract(downloaded, include_comments=False, include_tables=True, no_fallback=False)
|
| 112 |
return (content or "")[:max_chars] if max_chars else (content or "")
|
| 113 |
except Timeout:
|
| 114 |
logger.error(f"Timeout error while scraping {url} with Trafilatura")
|
|
|
|
| 267 |
else:
|
| 268 |
content = soup.get_text(strip=True, separator='\n')
|
| 269 |
else: # trafilatura
|
| 270 |
+
content = scrape_with_trafilatura(url, max_chars, timeout, use_beautifulsoup=True)
|
| 271 |
|
| 272 |
# Limit the content to max_chars
|
| 273 |
return content[:max_chars] if content else ""
|
|
|
|
| 393 |
logger.warning(f"No more results returned from SearXNG on page {page}.")
|
| 394 |
break
|
| 395 |
|
| 396 |
+
for result in results:
|
| 397 |
if len(scraped_content) >= num_results:
|
| 398 |
break
|
| 399 |
|
|
|
|
| 430 |
config = use_config()
|
| 431 |
config.set("DEFAULT", "USER_AGENT", ua)
|
| 432 |
|
| 433 |
+
content = scrape_with_trafilatura(url, max_chars, timeout=timeout, use_beautifulsoup=True)
|
| 434 |
|
| 435 |
if content:
|
| 436 |
break
|