File size: 3,155 Bytes
b6e91ad
1bbce87
e2ca888
7c0ed84
89a2174
54ae26b
7c33c47
7c0ed84
 
 
b6e91ad
 
8c1f8f7
b6e91ad
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cfac583
abe5773
cfac583
 
bfcf1e8
 
 
 
31bbe73
cfac583
b6e91ad
 
 
 
7c0ed84
 
 
 
 
 
 
 
 
 
cfac583
7c0ed84
 
 
cfac583
7c0ed84
 
cfac583
 
 
 
a9466f5
cb9f3f3
 
a9466f5
 
 
cfac583
 
 
 
 
 
 
 
9822f39
92c2e4d
31bbe73
7c0ed84
 
 
 
 
 
 
b6e91ad
cb9f3f3
b6e91ad
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import streamlit as st
from io import BytesIO
import pandas as pd
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import WebDriverException
from io import BytesIO
from bs4 import BeautifulSoup

def main():
    """Streamlit entry point: ask the user for a URL and scrape it on demand.

    Renders a text input plus a "Proceed" button; when pressed with a
    non-empty URL, delegates to visualize() to fetch and display the page.
    """
    # Fixed typo in the page title: "Exctractor" -> "Extractor".
    st.title("Website Content Extractor")

    # Get website URL from user input
    url = st.text_input("Enter a URL:", "")
    if st.button("Proceed"):
        if not url:
            st.warning("URL is empty.")
        else:
            visualize(url)
  

def visualize(url):
    """Fetch the page behind *url* and render the scraped pieces in Streamlit.

    Shows the dropdown markup and, when present, the extracted link; emits a
    warning when either piece could not be found, and an error banner if the
    fetch itself fails.
    """
    try:
        with st.spinner("loading website data ..."):
            page_html, dropdown_markup, klasemen_url = take_webdata(url)

        # Guard-clause style: bail out with a warning as soon as a piece
        # of the scraped data is missing.
        if not dropdown_markup:
            st.warning("tidak ditemukan.")
            return

        st.code(dropdown_markup, language='html')
        if not klasemen_url:
            st.warning("tidak ditemukan.")
            return

        st.code(klasemen_url, language='html')

    except Exception as e:
        st.error(f"Error: {e}")

def take_webdata(url):
    """Load *url* in headless Chrome and scrape the 'navbar-match' dropdown.

    Returns a 3-tuple ``(html, target_dropdown, urlx)``:
      - html: full rendered page HTML (outerHTML of the document element),
      - target_dropdown: the BeautifulSoup tag for the dropdown menu with
        aria-labelledby="navbar-match", or None if absent,
      - urlx: the href of the 'KLASEMEN' dropdown item, or None if absent.

    On a WebDriverException every element is None, so callers that unpack
    three values never crash.  (The original returned a single value on
    error — and a possibly-unbound one — which broke the caller's unpack.)
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    # Pre-initialize everything that is returned or touched in finally,
    # so no code path can hit an unbound local (NameError).
    wd = None
    html = None
    target_dropdown = None
    urlx = None

    try:
        wd = webdriver.Chrome(options=options)
        wd.set_window_size(1080, 720)  # Adjust the window size here
        wd.get(url)
        wd.implicitly_wait(15)

        html = wd.execute_script("return document.documentElement.outerHTML;")
        soup = BeautifulSoup(html, "html.parser")

        target_dropdown = soup.find(
            'div', class_='dropdown-menu', attrs={'aria-labelledby': 'navbar-match'}
        )

        if target_dropdown:
            # Exact-text match on the dropdown item labelled 'KLASEMEN'.
            klasemenlink = target_dropdown.find(
                'a', class_='dropdown-item', string='KLASEMEN'
            )
            if klasemenlink:
                urlx = klasemenlink.get('href')
        else:
            print("Dropdown menu tidak ditemukan")

    except WebDriverException:
        # Consistent shape on failure: the caller always unpacks 3 values.
        return None, None, None
    finally:
        # wd is None if webdriver.Chrome() itself raised; only quit a
        # driver that actually started.
        if wd:
            wd.quit()

    return html, target_dropdown, urlx

# Script entry point: run the Streamlit app when executed directly.
if __name__ == "__main__":
    main()