Spaces: Build error

Update app.py

app.py CHANGED
@@ -1,21 +1,27 @@
 import os
-import re
-import time
 import platform
+import re
 import subprocess
+import time
 from collections import deque
 from html.parser import HTMLParser
 from urllib.parse import urljoin, urlparse
 
-import bs4
 import mdformat
 import requests
 import streamlit as st
 from bs4 import BeautifulSoup
 from markdownify import MarkdownConverter
+from selenium.webdriver import Chrome, ChromeOptions
 from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.service import Service
+from webdriver_manager.chrome import ChromeDriverManager
+
 
-
+# Title of page
+from statwords import Items, StatusWordItem
+
+# Title of page
 
 
 st.set_page_config("Minor Scrapes", "🔪", "wide")
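The import block is reordered alphabetically and switches to Selenium 4's top-level Chrome and ChromeOptions classes plus webdriver-manager, which downloads a matching chromedriver at runtime. A minimal standalone sketch of the bootstrap pattern these imports enable, assuming selenium>=4 and webdriver-manager are installed in the Space (not the committed code):

    from selenium.webdriver import Chrome, ChromeOptions
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager

    options = ChromeOptions()
    options.add_argument("--headless=new")  # new-style headless mode (Chrome 109+)

    # install() fetches a chromedriver matching the local Chrome and returns
    # its path, so no driver binary needs to be vendored into the repo.
    driver = Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.get("https://example.com")
    print(driver.title)
    driver.quit()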
@@ -23,8 +29,11 @@ STATE = st.session_state
 
 st.title("Minor Scrapes")
 
+
 NOTIFICATION = st.empty()
+NOTIFICATION2 = st.empty()
 COLUMNS = st.columns([0.618, 0.01, 0.372])
+
 LEFT_TABLE = COLUMNS[0].empty()
 
 
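This hunk adds a second st.empty() placeholder (NOTIFICATION2). For context, a small sketch of the Streamlit placeholder pattern the layout relies on; the labels and strings here are made up:

    import streamlit as st

    note = st.empty()               # reserves a slot in the page
    note.info("starting crawl...")  # fills the slot later without relayout
    note.empty()                    # clears it again

    left, gap, right = st.columns([0.618, 0.01, 0.372])  # golden-ratio-ish split
    with left:
        st.write("results table goes here")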
@@ -50,7 +59,7 @@ def get_matching_tags(soup, tags_plus_atrtibutes):
         attrs = tag_attr["attrs"]
         for attr in attrs:
             # get all tags with those attributes
-            tags = [t for t in tags if t.has_attr(attr) or t.has_attr(
+            tags = [t for t in tags if t.has_attr(attr) or t.has_attr(attr + "s")]
         for t in tags:
             if not t.find_parents(tag):
                 yield t
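The repaired filter now also matches the pluralised attribute name (attr + "s"), and the find_parents() check skips tags nested inside another tag of the same name. A quick illustration of the two Beautiful Soup calls involved:

    from bs4 import BeautifulSoup

    html = '<div data-role="a"><div data-role="b">inner</div></div>'
    soup = BeautifulSoup(html, "html.parser")

    inner = soup.find("div", attrs={"data-role": "b"})
    print(inner.has_attr("data-role"))      # True: the attribute is present
    print(bool(inner.find_parents("div")))  # True: nested inside another <div>,
                                            # so get_matching_tags would skip it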
@@ -96,35 +105,44 @@ def chromecheck():
 if 'CHECKED' not in st.session_state:
     st.session_state['CHECKED'] = chromecheck()
 
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from bs4 import BeautifulSoup
 
 class RenderedPage:
     def __init__(self):
         # Set up the headless browser
-        chrome_options =
-        chrome_options.add_argument("--headless") # Run the browser in headless mode
-
+        chrome_options = ChromeOptions()
+        chrome_options.add_argument("--headless=new") # Run the browser in headless mode
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--no-gpu")
+        self.driver = Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
 
     def get_rendered_page(self, url):
-
         # Load the webpage in the headless browser
         self.driver.get(url)
 
-
-
+        prog = st.progress(0.0, f'grabbing{url}')
+
+        full_html = ""
+        status = ""
+        while status != "complete":
+            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+            status = self.driver.execute_script("return document.readyState")
+            with prog:
+                time.sleep(0.1)
+                prog.progress(0.1)
+
 
         # Get the fully rendered HTML
-        full_html =
-
+        full_html = self.driver.page_source
+
+
+
         # Close the browser
         self.driver.quit()
-
         # Create a Beautiful Soup object of the fully rendered page
-        soup = BeautifulSoup(full_html,
-        return soup
+        soup = BeautifulSoup(full_html, 'html.parser')
 
+        prog.empty()
+        return soup
 
 
 def convert_to_markdown(soup):
     """
@@ -139,12 +157,10 @@ def convert_to_markdown(soup):
     """
 
     converter = MarkdownConverter(
-        code_language="python",
-        default_title=False,
-        escape_asterisks=False,
+        code_language ="python",
         escape_underscores=False,
     )
-    return converter.
+    return converter.convert(soup.prettify())
 
 
 def convert_to_safe_url(text):
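MarkdownConverter.convert() takes an HTML string, which is why the soup is serialised with prettify() first. A small usage sketch of markdownify with the two options this commit keeps:

    from markdownify import MarkdownConverter

    converter = MarkdownConverter(code_language="python", escape_underscores=False)
    print(converter.convert("<h1>Title</h1><p>snake_case stays readable</p>"))
    # escape_underscores=False emits snake_case rather than snake\_case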
@@ -329,7 +345,7 @@ def crawl_website(url, tags_to_save=[], do_save=False, up_level=False):
             continue
 
         base_filename = f"{f'{convert_to_safe_url(parent_path)}/' if parent_path else '/'}{convert_to_safe_url(path_tail)}"
-        soup = BeautifulSoup(content, "
+        soup = BeautifulSoup(content, "html.parser")
         tag_items = list(get_matching_tags(soup, tags_to_save))
         # remove duplicates starting from the last item towards the first..
 
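"html.parser" is the parser bundled with Python's standard library, so it adds no extra dependency to the Space image and copes with sloppy markup:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>unclosed paragraph", "html.parser")
    print(soup.p.text)  # "unclosed paragraph": the parser repairs the open tag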
@@ -469,7 +485,7 @@ def main():
     COLUMNS[2].multiselect(
         "",
         STATE.HTML_TAGS_LIST,
-        ["h1", "h2", "h3", "
+        ["h1", "h2", "h3", "p", "pre"],
         key="htmltags",
         label_visibility="collapsed",
     )
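The third positional argument to st.multiselect() is the default selection, so the tag picker now pre-selects paragraph and preformatted-code tags alongside the headings. A standalone sketch with a hypothetical options list:

    import streamlit as st

    chosen = st.multiselect(
        "Tags to save",                                 # label (blank in the app)
        ["h1", "h2", "h3", "p", "pre", "ul", "table"],  # hypothetical options
        ["h1", "h2", "h3", "p", "pre"],                 # default selection
        key="htmltags",
    )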