# SmartLead — src/services/scraper.py
# Author: Subhajit Chakraborty (commit 9cdbd5b, "update files(2)")
# import requests
# from bs4 import BeautifulSoup
# from requests_html import HTMLSession
# import asyncio
import subprocess
import json
import os
import sys
# def scrape_website(url: str) -> str:
# """Scrape visible text content from a company webpage."""
# headers = {
# "User-Agent": (
# "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
# "AppleWebKit/537.36 (KHTML, like Gecko) "
# "Chrome/123.0.0.0 Safari/537.36"
# ),
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
# "Accept-Language": "en-US,en;q=0.9",
# "Accept-Encoding": "gzip, deflate, br",
# "Connection": "keep-alive",
# "Upgrade-Insecure-Requests": "1",
# "Sec-Fetch-Dest": "document",
# "Sec-Fetch-Mode": "navigate",
# "Sec-Fetch-Site": "none",
# "Sec-Fetch-User": "?1",
# }
# try:
# res = requests.get(url, headers=headers, timeout=10)
# res.raise_for_status()
# soup = BeautifulSoup(res.text, 'html.parser')
# for tag in soup(["script", "style", "noscript"]):
# tag.extract()
# text = soup.get_text(separator="\n", strip=True)
# return text[:5000]
# except Exception as e:
# return f"Error scraping the URL {url}: {str(e)}"
def scrape_website(url: str) -> str:
    """Scrape a webpage by delegating to ``scrape_worker.py`` in a subprocess.

    Running the scraper in a separate process avoids asyncio event-loop
    conflicts inside Streamlit.  The function never raises: every failure
    mode is reported as an ``"Error scraping the URL ..."`` string so
    callers can surface it directly in the UI.

    Args:
        url: The page URL to scrape.

    Returns:
        The scraped page text (possibly empty), or an error message string
        on any failure (non-zero worker exit, timeout, bad worker output).
    """
    # The worker script lives next to this module.
    worker_path = os.path.join(os.path.dirname(__file__), "scrape_worker.py")
    try:
        result = subprocess.run(
            [sys.executable, worker_path, url],  # list form: no shell injection
            capture_output=True,
            text=True,
            timeout=60,
        )
        if result.returncode != 0:
            return f"Error scraping the URL {url}: {result.stderr.strip()}"
        # The worker is expected to print a single JSON object on stdout.
        # Guard against empty or malformed output explicitly so the caller
        # sees a clear message instead of a raw JSONDecodeError string.
        try:
            output = json.loads(result.stdout)
        except json.JSONDecodeError:
            return f"Error scraping the URL {url}: worker returned invalid JSON"
        if not isinstance(output, dict):
            return f"Error scraping the URL {url}: unexpected worker output"
        return output.get("text", "")
    except subprocess.TimeoutExpired:
        return f"Error scraping the URL {url}: Timeout"
    except Exception as e:  # defensive: never propagate into the Streamlit UI
        return f"Error scraping the URL {url}: {e}"