Spaces:
Running
Running
File size: 895 Bytes
723bbe6 925c4eb 723bbe6 925c4eb 723bbe6 91df0bf 723bbe6 91df0bf 723bbe6 9cdbd5b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
import sys
import json
import os
from requests_html import HTMLSession
# When a system-wide Chromium binary is present, advertise its location to
# pyppeteer (the headless-browser backend used for rendering) through the
# environment before any browser is launched.
# NOTE(review): PYPPETEER_CHROMIUM_REVISION is normally a revision identifier,
# not an executable path -- confirm this actually makes pyppeteer use the
# system binary.
_system_chromium = '/usr/bin/chromium'
if os.path.exists(_system_chromium):
    os.environ['PYPPETEER_CHROMIUM_REVISION'] = _system_chromium
def scrape_website(url: str) -> dict:
    """Fetch *url*, render the page, and return its text content.

    The page is requested through a ``requests_html`` session, rendered via
    ``res.html.render`` and its text is normalised so that every run of
    whitespace becomes a single space. Only the first 8000 characters of
    the normalised text are returned.

    Args:
        url: Address of the page to scrape.

    Returns:
        A dict, never a bare string: ``{"text": <str>}`` on success, or
        ``{"error": <message>}`` if fetching, rendering or text extraction
        raised an exception.
    """
    # Created outside the ``try`` so a failure to build the session itself
    # propagates to the caller instead of hitting ``session.close()``.
    session = HTMLSession()
    try:
        res = session.get(url, timeout=15)
        res.html.render(timeout=20)
        # split()/join collapses newlines, tabs and repeated spaces.
        text = " ".join(res.html.text.split())
        return {"text": text[:8000]}
    except Exception as e:
        # Top-level boundary: report any failure as data rather than raising.
        return {"error": f"Scraping failed for {url}: {e}"}
    finally:
        # Always release the session, on success and on failure.
        session.close()
if __name__ == "__main__":
    # Command-line entry point: expects the target URL as the first argument.
    if len(sys.argv) < 2:
        # Missing argument: report as a JSON object on stdout, exit non-zero.
        print(json.dumps({"error": "No URL provided"}))
        sys.exit(1)

    outcome = scrape_website(sys.argv[1])

    # Success: emit the result as JSON on stdout.
    if "error" not in outcome:
        print(json.dumps(outcome))
        sys.exit(0)

    # Failure: the plain error message goes to stderr with a non-zero status.
    print(outcome["error"], file=sys.stderr)
    sys.exit(1)
|