SmartLead / src /services /scrape_worker.py
Subhajit Chakraborty
update files(4)
925c4eb
raw
history blame contribute delete
895 Bytes
import sys
import json
import os
from requests_html import HTMLSession
# Point pyppeteer at the system-installed Chromium when one is present.
# NOTE(review): the variable is named "..._REVISION" yet receives a filesystem
# path, and it is assigned only after requests_html has been imported above;
# confirm that pyppeteer actually honours the value in this form.
_SYSTEM_CHROMIUM = "/usr/bin/chromium"
if os.path.exists(_SYSTEM_CHROMIUM):
    os.environ["PYPPETEER_CHROMIUM_REVISION"] = _SYSTEM_CHROMIUM
def scrape_website(url: str) -> dict:
    """Fetch *url*, render it, and return the page text as a result dict.

    Args:
        url: Address of the page to scrape.

    Returns:
        ``{"text": ...}`` on success, holding the page text with every run of
        whitespace collapsed to a single space and cut to at most 8000
        characters; ``{"error": ...}`` with a human-readable message when
        fetching or rendering raises. Failures are reported through the
        returned dict rather than propagated to the caller.
    """
    session = HTMLSession()
    try:
        res = session.get(url, timeout=15)
        # Render step (per requests_html: executes the page's JavaScript in a
        # browser) so dynamically inserted content ends up in the text.
        res.html.render(timeout=20)
        # split() + join normalises newlines, tabs and repeated spaces.
        text = " ".join(res.html.text.split())
        return {"text": text[:8000]}
    except Exception as e:
        # Deliberate catch-all boundary: the script entry point only inspects
        # the returned dict, so every failure is folded into an error entry.
        return {"error": f"Scraping failed for {url}: {e}"}
    finally:
        # Release the session on both the success and the failure path.
        session.close()
if __name__ == "__main__":
    # Command-line entry point: expects the target URL as the first argument.
    cli_args = sys.argv[1:]
    if not cli_args:
        # Missing argument: report as JSON on stdout and signal failure.
        print(json.dumps({"error": "No URL provided"}))
        sys.exit(1)

    outcome = scrape_website(cli_args[0])

    if "error" not in outcome:
        # Success: emit the result dict as JSON on stdout.
        print(json.dumps(outcome))
        sys.exit(0)

    # Failure: plain-text message on stderr, non-zero exit status.
    print(outcome["error"], file=sys.stderr)
    sys.exit(1)