# SmartLead — src/services/scraper.py
# Author: Subhajit Chakraborty (commit 9cdbd5b, "update files(2)")
# import requests
# from bs4 import BeautifulSoup
# from requests_html import HTMLSession
# import asyncio
import subprocess
import json
import os
import sys
# def scrape_website(url: str) -> str:
# """Scrape visible text content from a company webpage."""
# headers = {
# "User-Agent": (
# "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
# "AppleWebKit/537.36 (KHTML, like Gecko) "
# "Chrome/123.0.0.0 Safari/537.36"
# ),
# "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
# "Accept-Language": "en-US,en;q=0.9",
# "Accept-Encoding": "gzip, deflate, br",
# "Connection": "keep-alive",
# "Upgrade-Insecure-Requests": "1",
# "Sec-Fetch-Dest": "document",
# "Sec-Fetch-Mode": "navigate",
# "Sec-Fetch-Site": "none",
# "Sec-Fetch-User": "?1",
# }
# try:
# res = requests.get(url, headers=headers, timeout=10)
# res.raise_for_status()
# soup = BeautifulSoup(res.text, 'html.parser')
# for tag in soup(["script", "style", "noscript"]):
# tag.extract()
# text = soup.get_text(separator="\n", strip=True)
# return text[:5000]
# except Exception as e:
# return f"Error scraping the URL {url}: {str(e)}"
def scrape_website(url: str) -> str:
    """Scrape a webpage by delegating to ``scrape_worker.py`` in a subprocess.

    Running the scraper in a separate process avoids asyncio event-loop
    conflicts inside Streamlit.  The function never raises: every failure
    mode is reported as an ``"Error scraping the URL ..."`` string so
    callers can surface it directly in the UI.

    Args:
        url: The page URL to scrape.

    Returns:
        The scraped page text (possibly empty), or an error message string
        on any failure (non-zero worker exit, timeout, bad worker output).
    """
    # The worker script lives next to this module.
    worker_path = os.path.join(os.path.dirname(__file__), "scrape_worker.py")
    try:
        result = subprocess.run(
            [sys.executable, worker_path, url],  # list form: no shell injection
            capture_output=True,
            text=True,
            timeout=60,
        )
        if result.returncode != 0:
            return f"Error scraping the URL {url}: {result.stderr.strip()}"
        # The worker is expected to print a single JSON object on stdout.
        # Guard against empty or malformed output explicitly so the caller
        # sees a clear message instead of a raw JSONDecodeError string.
        try:
            output = json.loads(result.stdout)
        except json.JSONDecodeError:
            return f"Error scraping the URL {url}: worker returned invalid JSON"
        if not isinstance(output, dict):
            return f"Error scraping the URL {url}: unexpected worker output"
        return output.get("text", "")
    except subprocess.TimeoutExpired:
        return f"Error scraping the URL {url}: Timeout"
    except Exception as e:  # defensive: never propagate into the Streamlit UI
        return f"Error scraping the URL {url}: {e}"