usmanyousaf committed
Commit 6a10786 · verified · 1 Parent(s): 54cd769

Update scrape.py

Files changed (1)
  1. scrape.py +17 -35
scrape.py CHANGED
@@ -1,38 +1,21 @@
-from selenium import webdriver
-from webdriver_manager.chrome import ChromeDriverManager
-from selenium.webdriver.chrome.service import Service
-from selenium.webdriver.chrome.options import Options
-from bs4 import BeautifulSoup
-import time
-
-# No need for explicit CHROME_DRIVER_PATH or .env usage, WebDriverManager handles it.
-
-options = Options()
-options.add_argument("--headless")
-options.add_argument("--no-sandbox")
-options.add_argument("--disable-dev-shm-usage")
-
-# Use WebDriverManager to automatically download and install the correct version
-driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
+import requests
+from bs4 import BeautifulSoup
+from dotenv import load_dotenv
+import os
+
+load_dotenv()
 
 def scrape_website(website):
-    print("Connecting to Chrome Browser...")
-
-    try:
-        driver.get(website)
-        print("Waiting for CAPTCHA to be solved manually (if present)...")
-
-        # Optional waiting loop for manual CAPTCHA solving
-        while "captcha" in driver.page_source.lower():
-            print("CAPTCHA detected, waiting...")
-            time.sleep(5)
-
-        print("CAPTCHA solved or not present. Scraping page content...")
-        html = driver.page_source
-        return html
-
-    finally:
-        driver.quit()
+    print(f"Fetching website content from {website}...")
+    headers = {"User-Agent": "Mozilla/5.0"}
+    response = requests.get(website, headers=headers)
+
+    if response.status_code == 200:
+        print("Website content fetched successfully!")
+        return response.text
+    else:
+        print(f"Failed to fetch website: Status code {response.status_code}")
+        return ""
 
 def extract_body_content(html_content):
     soup = BeautifulSoup(html_content, "html.parser")
@@ -44,11 +27,11 @@ def extract_body_content(html_content):
 def clean_body_content(body_content):
     soup = BeautifulSoup(body_content, "html.parser")
 
-    # Remove all <script> and <style> elements
+    # Remove script and style elements
     for script_or_style in soup(["script", "style"]):
         script_or_style.extract()
 
-    # Extract and clean text
+    # Extract text and clean up the content
     cleaned_content = soup.get_text(separator="\n")
     cleaned_content = "\n".join(
         line.strip() for line in cleaned_content.splitlines() if line.strip()
@@ -57,7 +40,6 @@ def clean_body_content(body_content):
     return cleaned_content
 
 def split_dom_content(dom_content, max_length=6000):
-    # Split the content into chunks of max_length characters
    return [
        dom_content[i : i + max_length] for i in range(0, len(dom_content), max_length)
    ]
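
For context, this change swaps the Selenium-driven headless Chrome session for a plain requests GET: the module now returns only server-rendered HTML and no longer blocks on manual CAPTCHA solving. Below is a minimal end-to-end sketch of how the updated module could be used; the example URL and the assumption that the file is importable as scrape are illustrative, not part of the commit.

# Hypothetical usage of the updated scrape.py; the URL is a placeholder.
from scrape import scrape_website, extract_body_content, clean_body_content, split_dom_content

html = scrape_website("https://example.com")  # plain GET with a Mozilla User-Agent
if html:  # scrape_website returns "" on a non-200 response
    body = extract_body_content(html)   # pull the page body out of the full HTML
    text = clean_body_content(body)     # drop <script>/<style> and blank lines
    chunks = split_dom_content(text)    # split into 6000-character chunks
    print(f"Extracted {len(chunks)} chunk(s)")

One trade-off to keep in mind: pages that assemble their content with client-side JavaScript, which the removed Selenium path could render, will come back incomplete over a bare GET.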