File size: 2,371 Bytes
723bbe6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9cdbd5b
723bbe6
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# import requests
# from bs4 import BeautifulSoup   
# from requests_html import HTMLSession
# import asyncio
import subprocess
import json
import os
import sys

# def scrape_website(url: str) -> str:
#         """Scrape visible text content from a company webpage."""
#         headers = {
#             "User-Agent": (
#                 "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
#                 "AppleWebKit/537.36 (KHTML, like Gecko) "
#                 "Chrome/123.0.0.0 Safari/537.36"
#             ),
#             "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
#             "Accept-Language": "en-US,en;q=0.9",
#             "Accept-Encoding": "gzip, deflate, br",
#             "Connection": "keep-alive",
#             "Upgrade-Insecure-Requests": "1",
#             "Sec-Fetch-Dest": "document",
#             "Sec-Fetch-Mode": "navigate",
#             "Sec-Fetch-Site": "none",
#             "Sec-Fetch-User": "?1",
#         }
#         try:
#             res = requests.get(url, headers=headers, timeout=10)
#             res.raise_for_status()
#             soup = BeautifulSoup(res.text, 'html.parser')
#             for tag in soup(["script", "style", "noscript"]):
#                 tag.extract()
#             text = soup.get_text(separator="\n", strip=True)
#             return text[:5000]
#         except Exception as e:
#             return f"Error scraping the URL {url}: {str(e)}"

def scrape_website(url: str) -> str:
    """
    Scrape a URL by running scrape_worker.py in a separate Python process.

    Running the worker as a subprocess avoids async and event loop conflicts
    in Streamlit. The worker's stdout is parsed as JSON and the value stored
    under its "text" key is returned.

    Args:
        url: Address of the page to scrape; passed to the worker as its only
            command-line argument.

    Returns:
        The scraped text, or "" when the worker output has no "text" value.
        On any failure (non-zero exit, timeout, malformed output, or any
        other exception) a message starting with
        "Error scraping the URL <url>: " is returned instead of raising.
    """
    worker_path = os.path.join(os.path.dirname(__file__), "scrape_worker.py")
    try:
        # Argument list (no shell) so the URL is never interpreted by a shell.
        result = subprocess.run(
            [sys.executable, worker_path, url],
            capture_output=True,
            text=True,
            timeout=60,
        )
        if result.returncode != 0:
            # stderr can be empty on failure; fall back to stdout and then to
            # the exit code so the message always carries some detail.
            detail = (
                result.stderr.strip()
                or result.stdout.strip()
                or f"worker exited with code {result.returncode}"
            )
            return f"Error scraping the URL {url}: {detail}"

        output = json.loads(result.stdout)
        if not isinstance(output, dict):
            # Valid JSON that is not an object has no "text" key to read.
            return (
                f"Error scraping the URL {url}: "
                f"unexpected worker output of type {type(output).__name__}"
            )

        text = output.get("text")
        if text is None:
            # Missing key or JSON null: honour the declared str return type.
            return ""
        return text if isinstance(text, str) else str(text)
    except subprocess.TimeoutExpired:
        return f"Error scraping the URL {url}: Timeout"
    except json.JSONDecodeError as e:
        return f"Error scraping the URL {url}: invalid JSON from worker: {e}"
    except Exception as e:
        # Deliberate catch-all: keep the original best-effort contract of
        # reporting any failure as a string rather than raising.
        return f"Error scraping the URL {url}: {e}"