Spaces: Build error

Update app.py

app.py CHANGED
@@ -1,21 +1,27 @@
 import os
-import re
-import time
 import platform
+import re
 import subprocess
+import time
 from collections import deque
 from html.parser import HTMLParser
 from urllib.parse import urljoin, urlparse
 
-import bs4
 import mdformat
 import requests
 import streamlit as st
 from bs4 import BeautifulSoup
 from markdownify import MarkdownConverter
+from selenium.webdriver import Chrome, ChromeOptions
 from selenium.webdriver.common.by import By
+from selenium.webdriver.chrome.service import Service
+from webdriver_manager.chrome import ChromeDriverManager
+
 
-
+# Title of page
+from statwords import Items, StatusWordItem
+
+# Title of page
 
 
 st.set_page_config("Minor Scrapes", "🔪", "wide")
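The import block is reordered alphabetically and switches to Selenium 4's top-level Chrome and ChromeOptions classes plus webdriver-manager, which downloads a matching chromedriver at runtime. A minimal standalone sketch of the bootstrap pattern these imports enable, assuming selenium>=4 and webdriver-manager are installed in the Space (not the committed code):

    from selenium.webdriver import Chrome, ChromeOptions
    from selenium.webdriver.chrome.service import Service
    from webdriver_manager.chrome import ChromeDriverManager

    options = ChromeOptions()
    options.add_argument("--headless=new")  # new-style headless mode (Chrome 109+)

    # install() fetches a chromedriver matching the local Chrome and returns
    # its path, so no driver binary needs to be vendored into the repo.
    driver = Chrome(service=Service(ChromeDriverManager().install()), options=options)
    driver.get("https://example.com")
    print(driver.title)
    driver.quit()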
@@ -23,8 +29,11 @@ STATE = st.session_state
 
 st.title("Minor Scrapes")
 
+
 NOTIFICATION = st.empty()
+NOTIFICATION2 = st.empty()
 COLUMNS = st.columns([0.618, 0.01, 0.372])
+
 LEFT_TABLE = COLUMNS[0].empty()
 
 
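This hunk adds a second st.empty() placeholder (NOTIFICATION2). For context, a small sketch of the Streamlit placeholder pattern the layout relies on; the labels and strings here are made up:

    import streamlit as st

    note = st.empty()               # reserves a slot in the page
    note.info("starting crawl...")  # fills the slot later without relayout
    note.empty()                    # clears it again

    left, gap, right = st.columns([0.618, 0.01, 0.372])  # golden-ratio-ish split
    with left:
        st.write("results table goes here")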
@@ -50,7 +59,7 @@ def get_matching_tags(soup, tags_plus_atrtibutes):
         attrs = tag_attr["attrs"]
         for attr in attrs:
             # get all tags with those attributes
-            tags = [t for t in tags if t.has_attr(attr) or t.has_attr(
+            tags = [t for t in tags if t.has_attr(attr) or t.has_attr(attr + "s")]
         for t in tags:
             if not t.find_parents(tag):
                 yield t
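The repaired filter now also matches the pluralised attribute name (attr + "s"), and the find_parents() check skips tags nested inside another tag of the same name. A quick illustration of the two Beautiful Soup calls involved:

    from bs4 import BeautifulSoup

    html = '<div data-role="a"><div data-role="b">inner</div></div>'
    soup = BeautifulSoup(html, "html.parser")

    inner = soup.find("div", attrs={"data-role": "b"})
    print(inner.has_attr("data-role"))      # True: the attribute is present
    print(bool(inner.find_parents("div")))  # True: nested inside another <div>,
                                            # so get_matching_tags would skip it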
@@ -96,35 +105,44 @@ def chromecheck():
 if 'CHECKED' not in st.session_state:
     st.session_state['CHECKED'] = chromecheck()
 
-from selenium import webdriver
-from selenium.webdriver.chrome.options import Options
-from bs4 import BeautifulSoup
 
 class RenderedPage:
     def __init__(self):
         # Set up the headless browser
-        chrome_options =
-        chrome_options.add_argument("--headless") # Run the browser in headless mode
-
+        chrome_options = ChromeOptions()
+        chrome_options.add_argument("--headless=new") # Run the browser in headless mode
+        chrome_options.add_argument("--no-sandbox")
+        chrome_options.add_argument("--no-gpu")
+        self.driver = Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options)
 
     def get_rendered_page(self, url):
-
         # Load the webpage in the headless browser
         self.driver.get(url)
 
-
-
+        prog = st.progress(0.0, f'grabbing{url}')
+
+        full_html = ""
+        status = ""
+        while status != "complete":
+            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+            status = self.driver.execute_script("return document.readyState")
+            with prog:
+                time.sleep(0.1)
+                prog.progress(0.1)
+
 
         # Get the fully rendered HTML
-        full_html =
-
+        full_html = self.driver.page_source
+
+
+
         # Close the browser
         self.driver.quit()
-
         # Create a Beautiful Soup object of the fully rendered page
-        soup = BeautifulSoup(full_html,
-        return soup
+        soup = BeautifulSoup(full_html, 'html.parser')
 
+        prog.empty()
+        return soup
 
 
 def convert_to_markdown(soup):
     """
@@ -139,12 +157,10 @@ def convert_to_markdown(soup):
     """
 
     converter = MarkdownConverter(
-        code_language="python",
-        default_title=False,
-        escape_asterisks=False,
+        code_language ="python",
         escape_underscores=False,
     )
-    return converter.
+    return converter.convert(soup.prettify())
 
 
 def convert_to_safe_url(text):
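MarkdownConverter.convert() takes an HTML string, which is why the soup is serialised with prettify() first. A small usage sketch of markdownify with the two options this commit keeps:

    from markdownify import MarkdownConverter

    converter = MarkdownConverter(code_language="python", escape_underscores=False)
    print(converter.convert("<h1>Title</h1><p>snake_case stays readable</p>"))
    # escape_underscores=False emits snake_case rather than snake\_case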
@@ -329,7 +345,7 @@ def crawl_website(url, tags_to_save=[], do_save=False, up_level=False):
             continue
 
         base_filename = f"{f'{convert_to_safe_url(parent_path)}/' if parent_path else '/'}{convert_to_safe_url(path_tail)}"
-        soup = BeautifulSoup(content, "
+        soup = BeautifulSoup(content, "html.parser")
         tag_items = list(get_matching_tags(soup, tags_to_save))
         # remove duplicates starting from the last item towards the first..
 
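"html.parser" is the parser bundled with Python's standard library, so it adds no extra dependency to the Space image and copes with sloppy markup:

    from bs4 import BeautifulSoup

    soup = BeautifulSoup("<p>unclosed paragraph", "html.parser")
    print(soup.p.text)  # "unclosed paragraph": the parser repairs the open tag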
@@ -469,7 +485,7 @@ def main():
     COLUMNS[2].multiselect(
         "",
         STATE.HTML_TAGS_LIST,
-        ["h1", "h2", "h3", "
+        ["h1", "h2", "h3", "p", "pre"],
         key="htmltags",
         label_visibility="collapsed",
     )
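The third positional argument to st.multiselect() is the default selection, so the tag picker now pre-selects paragraph and preformatted-code tags alongside the headings. A standalone sketch with a hypothetical options list:

    import streamlit as st

    chosen = st.multiselect(
        "Tags to save",                                 # label (blank in the app)
        ["h1", "h2", "h3", "p", "pre", "ul", "table"],  # hypothetical options
        ["h1", "h2", "h3", "p", "pre"],                 # default selection
        key="htmltags",
    )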