Spaces:
Sleeping
Sleeping
File size: 3,155 Bytes
b6e91ad 1bbce87 e2ca888 7c0ed84 89a2174 54ae26b 7c33c47 7c0ed84 b6e91ad 8c1f8f7 b6e91ad cfac583 abe5773 cfac583 bfcf1e8 31bbe73 cfac583 b6e91ad 7c0ed84 cfac583 7c0ed84 cfac583 7c0ed84 cfac583 a9466f5 cb9f3f3 a9466f5 cfac583 9822f39 92c2e4d 31bbe73 7c0ed84 b6e91ad cb9f3f3 b6e91ad |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 |
import streamlit as st
from io import BytesIO
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException
from io import BytesIO
from bs4 import BeautifulSoup
def main():
    """Streamlit entry point: prompt for a URL and scrape it on demand."""
    st.title("Website Content Extractor")  # fixed typo: was "Exctractor"
    # Get website URL from user input
    url = st.text_input("Enter a URL:", "")
    if st.button("Proceed"):
        if not url:
            st.warning("URL is empty.")
        else:
            visualize(url)
def visualize(url):
    """Fetch *url* via take_webdata() and render the scraped pieces.

    Shows the dropdown HTML and, nested inside it, the KLASEMEN link href
    when found; otherwise a "tidak ditemukan." warning. Any exception is
    surfaced as a Streamlit error box.
    """
    try:
        # Fetch and display the website content
        with st.spinner("loading website data ..."):
            html_content, xtarget_dropdown, xurl = take_webdata(url)
        if xtarget_dropdown:
            st.code(xtarget_dropdown, language='html')
            # The href check only makes sense once the dropdown itself was
            # found, so it is nested under the outer branch (the flattened
            # original had two sibling else: clauses, which cannot parse).
            if xurl:
                st.code(xurl, language='html')
            else:
                st.warning("tidak ditemukan.")
        else:
            st.warning("tidak ditemukan.")
    except Exception as e:
        st.error(f"Error: {e}")
def take_webdata(url):
    """Load *url* in headless Chrome and scrape the navbar-match dropdown.

    Returns a 3-tuple ``(html, target_dropdown, klasemen_href)``:
      - html: the page's outerHTML string, or None if loading failed,
      - target_dropdown: the <div class="dropdown-menu" aria-labelledby=
        "navbar-match"> Tag, or None if absent,
      - klasemen_href: the href of the 'KLASEMEN' dropdown item, or None.

    The original returned a bare page title from the except branch (and
    left ``wd``/``urlx``/``html`` unbound on some paths); every path now
    yields the same 3-tuple shape so callers can unpack safely.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    # Pre-initialize everything referenced in finally / the final return so
    # no path can raise NameError or change the return arity.
    wd = None
    html = None
    target_dropdown = None
    urlx = None
    try:
        wd = webdriver.Chrome(options=options)
        wd.set_window_size(1080, 720)  # Adjust the window size here
        wd.get(url)
        wd.implicitly_wait(15)
        html = wd.execute_script("return document.documentElement.outerHTML;")
        soup = BeautifulSoup(html, "html.parser")
        target_dropdown = soup.find('div', class_='dropdown-menu',
                                    attrs={'aria-labelledby': 'navbar-match'})
        if target_dropdown:
            klasemenlink = target_dropdown.find('a', class_='dropdown-item',
                                                string='KLASEMEN')
            if klasemenlink:
                urlx = klasemenlink.get('href')
        else:
            print("Dropdown menu tidak ditemukan")
    except WebDriverException as e:
        # Best-effort: log and fall through to the uniform 3-tuple return.
        print(f"WebDriverException: {e}")
    finally:
        if wd:
            wd.quit()
    return html, target_dropdown, urlx
# Run the Streamlit app only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|