Sentinel-AI-2.0

Sleeping

Shreyas094 committed on Oct 1, 2024

Commit

ef24902

verified ·

1 Parent(s): 84b4903

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -123,7 +123,7 @@ def scrape_with_bs4(url, session):
 @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
 def scrape_with_trafilatura(url):
     try:
-        downloaded = fetch_url(url, timeout=10)
         if downloaded is None:
             raise ScrapingError("Failed to download content")
         content = extract(downloaded)
@@ -433,11 +433,19 @@ def search_and_scrape(query, chat_history, num_results=5, scraper="trafilatura",
                 else:  # trafilatura
                     content = scrape_with_trafilatura(url)
-                # Limit content to max_chars
                 scraped_content.append({
                     "title": title,
                     "url": url,
-                    "content": content[:max_chars],
                     "scraper": scraper
                 })
             except requests.exceptions.RequestException as e:

 @retry(stop=stop_after_attempt(3), wait=wait_exponential(multiplier=1, min=4, max=10))
 def scrape_with_trafilatura(url):
     try:
+        downloaded = fetch_url(url)  # Remove the timeout parameter
         if downloaded is None:
             raise ScrapingError("Failed to download content")
         content = extract(downloaded)
                 else:  # trafilatura
                     content = scrape_with_trafilatura(url)
+                # Handle different types of content and limit to max_chars
+                if isinstance(content, dict) and 'content' in content:
+                    content['content'] = content['content'][:max_chars]
+                elif isinstance(content, str):
+                    content = content[:max_chars]
+                else:
+                    logger.warning(f"Unexpected content type for URL: {url}")
+                    content = str(content)[:max_chars]
                 scraped_content.append({
                     "title": title,
                     "url": url,
+                    "content": content,
                     "scraper": scraper
                 })
             except requests.exceptions.RequestException as e: