khatri-indra committed on
Commit
b4f27a3
·
verified ·
1 Parent(s): f320d2a

Update scapping.py

Browse files
Files changed (1) hide show
  1. scapping.py +46 -46
scapping.py CHANGED
@@ -1,46 +1,46 @@
1
- import selenium.webdriver as webdriver
2
- from selenium.webdriver.chrome.service import Service
3
- import time
4
- from bs4 import BeautifulSoup
5
-
6
- def scrape_website(website):
7
- print("launching chrome browser...")
8
-
9
- chrome_driver_path = "chromedriver.exe"
10
- options = webdriver.ChromeOptions()
11
- driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)
12
-
13
- try:
14
- driver.get(website)
15
- print('page loaded..,')
16
- html = driver.page_source
17
- time.sleep(10)
18
-
19
- return html
20
- finally:
21
- driver.quit()
22
-
23
- def extract_body_content(html_content):
24
- soup = BeautifulSoup(html_content, "html.parser")
25
- body_content = soup.body
26
- if body_content:
27
- return str(body_content)
28
- return ""
29
-
30
- def clean_body_content(body_content):
31
- soup = BeautifulSoup(body_content, "html.parser")
32
-
33
- for script_or_style in soup({"script", "style"}):
34
- script_or_style.extract()
35
-
36
- cleaned_content = soup.get_text(separator="\n")
37
- cleaned_content = "\n".join(
38
- line.strip() for line in cleaned_content.splitlines() if line.strip()
39
- )
40
-
41
- return cleaned_content
42
-
43
- def split_dom_content(dom_content,max_length=6000):
44
- return [
45
- dom_content[i: i+max_length] for i in range(0, len(dom_content), max_length)
46
- ]
 
1
+ import selenium.webdriver as webdriver
2
+ from selenium.webdriver.chrome.service import Service
3
+ import time
4
+ from bs4 import BeautifulSoup
5
+
6
def scrape_website(website, wait_seconds=10):
    """Load *website* in a local Chrome instance and return its rendered HTML.

    Args:
        website: URL to open in the browser.
        wait_seconds: seconds to wait after navigation so dynamically
            rendered content has a chance to load before capture
            (default 10, matching the original hard-coded delay).

    Returns:
        The page source as a string.

    The driver is always quit, even if navigation raises.
    """
    print("launching chrome browser...")

    # NOTE(review): assumes chromedriver.exe sits next to the script — confirm.
    chrome_driver_path = "./chromedriver.exe"
    options = webdriver.ChromeOptions()
    driver = webdriver.Chrome(service=Service(chrome_driver_path), options=options)

    try:
        driver.get(website)
        print('page loaded..,')
        # Bug fix: wait BEFORE grabbing page_source. The original captured
        # the HTML first and then slept, so the delay gave dynamic content
        # no time to render into the captured source.
        time.sleep(wait_seconds)
        html = driver.page_source
        return html
    finally:
        driver.quit()
22
+
23
def extract_body_content(html_content):
    """Return the <body> element of *html_content* as an HTML string.

    Returns the empty string when the document has no <body>.
    """
    parsed = BeautifulSoup(html_content, "html.parser")
    body = parsed.body
    return str(body) if body else ""
29
+
30
def clean_body_content(body_content):
    """Extract readable text from an HTML fragment.

    Removes <script> and <style> elements, then returns the remaining
    text with every line stripped and blank lines dropped, joined by
    newlines.
    """
    parsed = BeautifulSoup(body_content, "html.parser")

    # Remove script/style nodes so their raw contents don't leak into the text.
    for noise_tag in parsed(["script", "style"]):
        noise_tag.extract()

    raw_text = parsed.get_text(separator="\n")
    stripped_lines = (line.strip() for line in raw_text.splitlines())
    return "\n".join(line for line in stripped_lines if line)
42
+
43
def split_dom_content(dom_content, max_length=6000):
    """Split *dom_content* into consecutive chunks of at most *max_length* chars.

    Returns a list of substrings in order; the last chunk may be shorter.
    An empty input yields an empty list.
    """
    chunks = []
    for start in range(0, len(dom_content), max_length):
        chunks.append(dom_content[start:start + max_length])
    return chunks