kgout committed on
Commit
d2690bc
·
1 Parent(s): 75c3c8f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -25
app.py CHANGED
@@ -1,21 +1,27 @@
1
  import os
2
- import re
3
- import time
4
  import platform
 
5
  import subprocess
 
6
  from collections import deque
7
  from html.parser import HTMLParser
8
  from urllib.parse import urljoin, urlparse
9
 
10
- import bs4
11
  import mdformat
12
  import requests
13
  import streamlit as st
14
  from bs4 import BeautifulSoup
15
  from markdownify import MarkdownConverter
 
16
  from selenium.webdriver.common.by import By
 
 
 
17
 
18
- from statwords import StatusWordItem, Items
 
 
 
19
 
20
 
21
  st.set_page_config("Minor Scrapes", "🔪", "wide")
@@ -23,8 +29,11 @@ STATE = st.session_state
23
 
24
  st.title("Minor Scrapes")
25
 
 
26
  NOTIFICATION = st.empty()
 
27
  COLUMNS = st.columns([0.618, 0.01, 0.372])
 
28
  LEFT_TABLE = COLUMNS[0].empty()
29
 
30
 
@@ -50,7 +59,7 @@ def get_matching_tags(soup, tags_plus_atrtibutes):
50
  attrs = tag_attr["attrs"]
51
  for attr in attrs:
52
  # get all tags with those attributes
53
- tags = [t for t in tags if t.has_attr(attr) or t.has_attr(f"{attr}s")]
54
  for t in tags:
55
  if not t.find_parents(tag):
56
  yield t
@@ -96,35 +105,44 @@ def chromecheck():
96
  if 'CHECKED' not in st.session_state:
97
  st.session_state['CHECKED'] = chromecheck()
98
 
99
- from selenium import webdriver
100
- from selenium.webdriver.chrome.options import Options
101
- from bs4 import BeautifulSoup
102
 
103
  class RenderedPage:
104
  def __init__(self):
105
  # Set up the headless browser
106
- chrome_options = Options()
107
- chrome_options.add_argument("--headless") # Run the browser in headless mode
108
- self.driver = webdriver.Chrome(options=chrome_options)
 
 
109
 
110
  def get_rendered_page(self, url):
111
-
112
  # Load the webpage in the headless browser
113
  self.driver.get(url)
114
 
115
- # Wait for JavaScript to execute and render the page
116
- # You can use explicit waits to wait for specific elements to appear on the page
 
 
 
 
 
 
 
 
 
117
 
118
  # Get the fully rendered HTML
119
- full_html = self.driver.page_source
120
-
 
 
121
  # Close the browser
122
  self.driver.quit()
123
-
124
  # Create a Beautiful Soup object of the fully rendered page
125
- soup = BeautifulSoup(full_html, "html5lib")
126
- return soup
127
 
 
 
128
 
129
  def convert_to_markdown(soup):
130
  """
@@ -139,12 +157,10 @@ def convert_to_markdown(soup):
139
  """
140
 
141
  converter = MarkdownConverter(
142
- code_language="python",
143
- default_title=False,
144
- escape_asterisks=False,
145
  escape_underscores=False,
146
  )
147
- return converter.convert_soup(soup)
148
 
149
 
150
  def convert_to_safe_url(text):
@@ -329,7 +345,7 @@ def crawl_website(url, tags_to_save=[], do_save=False, up_level=False):
329
  continue
330
 
331
  base_filename = f"{f'{convert_to_safe_url(parent_path)}/' if parent_path else '/'}{convert_to_safe_url(path_tail)}"
332
- soup = BeautifulSoup(content, "html5lib")
333
  tag_items = list(get_matching_tags(soup, tags_to_save))
334
  # remove duplicates starting from the last item towards the first..
335
 
@@ -469,7 +485,7 @@ def main():
469
  COLUMNS[2].multiselect(
470
  "",
471
  STATE.HTML_TAGS_LIST,
472
- ["h1", "h2", "h3", "a", "p", "pre"],
473
  key="htmltags",
474
  label_visibility="collapsed",
475
  )
 
1
  import os
 
 
2
  import platform
3
+ import re
4
  import subprocess
5
+ import time
6
  from collections import deque
7
  from html.parser import HTMLParser
8
  from urllib.parse import urljoin, urlparse
9
 
 
10
  import mdformat
11
  import requests
12
  import streamlit as st
13
  from bs4 import BeautifulSoup
14
  from markdownify import MarkdownConverter
15
+ from selenium.webdriver import Chrome, ChromeOptions
16
  from selenium.webdriver.common.by import By
17
+ from selenium.webdriver.chrome.service import Service
18
+ from webdriver_manager.chrome import ChromeDriverManager
19
+
20
 
21
+ # Title of page
22
+ from statwords import Items, StatusWordItem
23
+
24
+ # Title of page
25
 
26
 
27
# Page chrome: configure the Streamlit page before any other st.* call.
st.set_page_config("Minor Scrapes", "🔪", "wide")

st.title("Minor Scrapes")

# Placeholders filled in later by the scraping workflow.
NOTIFICATION = st.empty()
NOTIFICATION2 = st.empty()
COLUMNS = st.columns([0.618, 0.01, 0.372])

LEFT_TABLE = COLUMNS[0].empty()
38
 
39
 
 
59
  attrs = tag_attr["attrs"]
60
  for attr in attrs:
61
  # get all tags with those attributes
62
+ tags = [t for t in tags if t.has_attr(attr) or t.has_attr(attr + "s")]
63
  for t in tags:
64
  if not t.find_parents(tag):
65
  yield t
 
105
  if 'CHECKED' not in st.session_state:
106
  st.session_state['CHECKED'] = chromecheck()
107
 
 
 
 
108
 
109
class RenderedPage:
    """Fetch a URL in headless Chrome and return the fully rendered DOM.

    Single-use: ``get_rendered_page`` quits the browser when done, so
    construct a fresh instance per page.
    """

    def __init__(self):
        # Set up the headless browser.
        chrome_options = ChromeOptions()
        chrome_options.add_argument("--headless=new")  # run without a visible window
        chrome_options.add_argument("--no-sandbox")
        # BUG FIX: the original passed "--no-gpu", which Chrome does not
        # recognize; the real switch is "--disable-gpu".
        chrome_options.add_argument("--disable-gpu")
        self.driver = Chrome(
            service=Service(ChromeDriverManager().install()),
            options=chrome_options,
        )

    def get_rendered_page(self, url, timeout=30.0):
        """Load *url*, wait for rendering to finish, and return its soup.

        Parameters
        ----------
        url : str
            Page to fetch.
        timeout : float, optional
            Maximum seconds to wait for ``document.readyState`` to reach
            "complete" before giving up and using whatever has rendered.
            (New parameter with a default, so existing callers are
            unaffected; the original could spin forever.)

        Returns
        -------
        bs4.BeautifulSoup
            Soup of the fully rendered HTML, parsed with "html.parser".
        """
        prog = st.progress(0.0, f'grabbing {url}')
        try:
            # Load the webpage in the headless browser.
            self.driver.get(url)

            # Scroll to the bottom repeatedly so lazy-loaded content renders,
            # polling until the document reports it is complete (or we time out).
            deadline = time.monotonic() + timeout
            while self.driver.execute_script("return document.readyState") != "complete":
                self.driver.execute_script(
                    "window.scrollTo(0, document.body.scrollHeight);"
                )
                if time.monotonic() > deadline:
                    break  # best effort: proceed with the partially rendered page
                time.sleep(0.1)
                # Advance the bar proportionally to elapsed time (the original
                # wrote a constant 0.1, so the bar never moved).
                prog.progress(min(1.0, 1.0 - (deadline - time.monotonic()) / timeout))

            # Grab the fully rendered HTML before tearing the browser down.
            full_html = self.driver.page_source
        finally:
            # Always release the browser process, even if loading failed
            # (the original leaked Chrome on any exception).
            self.driver.quit()

        prog.empty()
        # Create a Beautiful Soup object of the fully rendered page.
        return BeautifulSoup(full_html, 'html.parser')
146
 
147
  def convert_to_markdown(soup):
148
  """
 
157
  """
158
 
159
  converter = MarkdownConverter(
160
+ code_language ="python",
 
 
161
  escape_underscores=False,
162
  )
163
+ return converter.convert(soup.prettify())
164
 
165
 
166
  def convert_to_safe_url(text):
 
345
  continue
346
 
347
  base_filename = f"{f'{convert_to_safe_url(parent_path)}/' if parent_path else '/'}{convert_to_safe_url(path_tail)}"
348
+ soup = BeautifulSoup(content, "html.parser")
349
  tag_items = list(get_matching_tags(soup, tags_to_save))
350
  # remove duplicates starting from the last item towards the first..
351
 
 
485
  COLUMNS[2].multiselect(
486
  "",
487
  STATE.HTML_TAGS_LIST,
488
+ ["h1", "h2", "h3", "p", "pre"],
489
  key="htmltags",
490
  label_visibility="collapsed",
491
  )