# import requests # from bs4 import BeautifulSoup # from requests_html import HTMLSession # import asyncio import subprocess import json import os import sys # def scrape_website(url: str) -> str: # """Scrape visible text content from a company webpage.""" # headers = { # "User-Agent": ( # "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " # "AppleWebKit/537.36 (KHTML, like Gecko) " # "Chrome/123.0.0.0 Safari/537.36" # ), # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8", # "Accept-Language": "en-US,en;q=0.9", # "Accept-Encoding": "gzip, deflate, br", # "Connection": "keep-alive", # "Upgrade-Insecure-Requests": "1", # "Sec-Fetch-Dest": "document", # "Sec-Fetch-Mode": "navigate", # "Sec-Fetch-Site": "none", # "Sec-Fetch-User": "?1", # } # try: # res = requests.get(url, headers=headers, timeout=10) # res.raise_for_status() # soup = BeautifulSoup(res.text, 'html.parser') # for tag in soup(["script", "style", "noscript"]): # tag.extract() # text = soup.get_text(separator="\n", strip=True) # return text[:5000] # except Exception as e: # return f"Error scraping the URL {url}: {str(e)}" def scrape_website(url: str) -> str: """ Calls scrape_worker.py as a subprocess to safely scrape a URL. This avoids async and event loop conflicts in Streamlit. """ worker_path = os.path.join(os.path.dirname(__file__), "scrape_worker.py") try: result = subprocess.run( [sys.executable, worker_path, url], capture_output=True, text=True, timeout=60 ) # print("This is the print block: ", result.returncode, result.stderr, result.stdout) if result.returncode != 0: return f"Error scraping the URL {url}: {result.stderr.strip()}" output = json.loads(result.stdout) return output.get("text", "") except subprocess.TimeoutExpired: return f"Error scraping the URL {url}: Timeout" except Exception as e: return f"Error scraping the URL {url}: {e}"