Spaces:

Subhajit01
/

SmartLead

Running

SmartLead / src /services /scrape_worker.py

Subhajit Chakraborty

update files(4)

925c4eb 4 months ago

895 Bytes

	import sys
	import json
	import os

	from requests_html import HTMLSession
	# Presumably meant to point pyppeteer (used by requests_html for rendering)
	# at a system-installed Chromium when one is present.
	# NOTE(review): pyppeteer documents PYPPETEER_CHROMIUM_REVISION as a Chromium
	# revision identifier rather than an executable path, and this assignment runs
	# after requests_html has already been imported -- confirm it has the
	# intended effect.
	_SYSTEM_CHROMIUM = "/usr/bin/chromium"
	if os.path.exists(_SYSTEM_CHROMIUM):
	    os.environ['PYPPETEER_CHROMIUM_REVISION'] = _SYSTEM_CHROMIUM

	def scrape_website(
	    url: str,
	    *,
	    request_timeout: float = 15,
	    render_timeout: float = 20,
	    max_chars: int = 8000,
	) -> dict:
	    """Fetch *url*, render the page, and return its whitespace-normalised text.

	    Args:
	        url: Address of the page to scrape.
	        request_timeout: Timeout passed to ``session.get``.
	        render_timeout: Timeout passed to ``res.html.render``.
	        max_chars: Maximum number of characters of text to return.

	    Returns:
	        ``{"text": <page text>}`` on success, or ``{"error": <message>}``
	        if fetching or rendering raised. Failures are reported through the
	        return value instead of being raised.
	    """
	    # The session is closed when the ``with`` block exits, on both the
	    # success and the failure path.
	    with HTMLSession() as session:
	        try:
	            res = session.get(url, timeout=request_timeout)
	            res.html.render(timeout=render_timeout)
	            # Collapse every run of whitespace into a single space.
	            text = " ".join(res.html.text.split())
	            return {"text": text[:max_chars]}
	        except Exception as e:
	            # Deliberate catch-all: any failure is converted into an
	            # "error" entry so the caller can branch on the returned keys.
	            return {"error": f"Scraping failed for {url}: {e}"}

	if __name__ == "__main__":
	    # Command-line entry point: expects the target URL as the first argument.
	    cli_args = sys.argv[1:]
	    if not cli_args:
	        # Missing argument: emit a JSON error object on stdout and fail.
	        print(json.dumps({"error": "No URL provided"}))
	        sys.exit(1)

	    outcome = scrape_website(cli_args[0])
	    if "error" in outcome:
	        # Scrape failure: plain-text message on stderr, non-zero exit status.
	        print(outcome["error"], file=sys.stderr)
	        sys.exit(1)

	    # Success: the result dict is written to stdout as JSON.
	    print(json.dumps(outcome))
	    sys.exit(0)