usmanyousaf commited on
Commit
2a0bf8d
·
verified ·
1 Parent(s): 8d0e46c

Update scrape.py

Browse files
Files changed (1) hide show
  1. scrape.py +34 -14
scrape.py CHANGED
@@ -1,21 +1,43 @@
1
- import requests
2
- from bs4 import BeautifulSoup
3
- from dotenv import load_dotenv
 
 
4
  import os
 
5
 
 
6
  load_dotenv()
7
 
 
 
8
def scrape_website(website):
    """Fetch the raw HTML of *website* with a plain HTTP GET.

    Args:
        website: URL to fetch.

    Returns:
        The response body as text on HTTP 200, otherwise "" (best-effort:
        failures are reported on stdout, never raised).
    """
    print(f"Fetching website content from {website}...")
    headers = {"User-Agent": "Mozilla/5.0"}
    # A timeout is required: without one, requests.get can block forever
    # on an unresponsive host.
    response = requests.get(website, headers=headers, timeout=30)

    if response.status_code == 200:
        print("Website content fetched successfully!")
        return response.text
    else:
        print(f"Failed to fetch website: Status code {response.status_code}")
        return ""
19
 
20
  def extract_body_content(html_content):
21
  soup = BeautifulSoup(html_content, "html.parser")
@@ -27,11 +49,9 @@ def extract_body_content(html_content):
27
  def clean_body_content(body_content):
28
  soup = BeautifulSoup(body_content, "html.parser")
29
 
30
- # Remove script and style elements
31
  for script_or_style in soup(["script", "style"]):
32
  script_or_style.extract()
33
 
34
- # Extract text and clean up the content
35
  cleaned_content = soup.get_text(separator="\n")
36
  cleaned_content = "\n".join(
37
  line.strip() for line in cleaned_content.splitlines() if line.strip()
 
1
+ from selenium import webdriver # type: ignore
2
+ from selenium.webdriver.chrome.service import Service # type: ignore
3
+ from selenium.webdriver.chrome.options import Options # type: ignore
4
+ from bs4 import BeautifulSoup # type: ignore
5
+ from dotenv import load_dotenv # type: ignore
6
  import os
7
+ import time
8
 
9
+ # Load environment variables
10
  load_dotenv()
11
 
12
+ CHROME_DRIVER_PATH = os.getenv("CHROME_DRIVER_PATH", "./chrome")
13
+
14
def scrape_website(website):
    """Fetch page HTML via a headless Chrome WebDriver.

    Args:
        website: URL to load in the browser.

    Returns:
        The page source (str) once no CAPTCHA marker is present, or
        whatever is loaded when the bounded CAPTCHA wait expires.
    """
    print("Connecting to Chrome Browser...")

    # Setup ChromeDriver service and options for headless scraping
    service = Service(CHROME_DRIVER_PATH)
    options = Options()
    options.add_argument("--headless")  # Run Chrome in headless mode
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Chrome(service=service, options=options)

    try:
        driver.get(website)
        print("Waiting for CAPTCHA to be solved manually (if present)...")

        # NOTE(review): Chrome runs headless here, so a CAPTCHA cannot
        # actually be solved "manually"; the original unbounded loop would
        # spin forever whenever a CAPTCHA page is served. Bound the wait.
        deadline = time.monotonic() + 300  # give up after 5 minutes
        while "captcha" in driver.page_source.lower():
            if time.monotonic() >= deadline:
                print("CAPTCHA wait timed out; returning current page source.")
                break
            print("CAPTCHA detected, waiting...")
            time.sleep(5)

        print("CAPTCHA solved or not present. Scraping page content...")
        html = driver.page_source
        return html

    finally:
        # Always release the browser, even if navigation raised.
        driver.quit()
41
 
42
  def extract_body_content(html_content):
43
  soup = BeautifulSoup(html_content, "html.parser")
 
49
  def clean_body_content(body_content):
50
  soup = BeautifulSoup(body_content, "html.parser")
51
 
 
52
  for script_or_style in soup(["script", "style"]):
53
  script_or_style.extract()
54
 
 
55
  cleaned_content = soup.get_text(separator="\n")
56
  cleaned_content = "\n".join(
57
  line.strip() for line in cleaned_content.splitlines() if line.strip()