selenium_exam / extract.py
sintamar's picture
Update extract.py
fd9b833 verified
raw
history blame
2.38 kB
from io import BytesIO

import pandas as pd
from bs4 import BeautifulSoup
from PIL import Image
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
# Output column name -> CSS selector, evaluated relative to one standings row.
# Order here is the column order of the resulting DataFrame.
_STANDINGS_FIELDS = (
    ("Peringkat", ".tableCellRank"),
    ("Tim", ".tableCellParticipant__name"),
    ("Main", "span.table__cell:nth-of-type(3)"),
    ("Menang", "span.table__cell:nth-of-type(4)"),
    ("Seri", "span.table__cell:nth-of-type(5)"),
    ("Kalah", "span.table__cell:nth-of-type(6)"),
    ("Gol", ".table__cell--score"),
    ("Selisih Gol", ".table__cell--goalsForAgainstDiff"),
    ("Poin", ".table__cell--points"),
)
_STANDINGS_COLUMNS = [column for column, _ in _STANDINGS_FIELDS]


def _parse_standings(html):
    """Build a DataFrame with one record per ``div.ui-table__row`` in *html*."""
    soup = BeautifulSoup(html, "html.parser")
    records = []
    # select() returns every matching row (an empty list when there are none);
    # find() would return a single Tag or None, which cannot be iterated as a
    # list of rows.
    for row in soup.select("div.ui-table__row"):
        record = {}
        for column, selector in _STANDINGS_FIELDS:
            cell = row.select_one(selector)
            # A missing cell becomes an empty string instead of failing.
            record[column] = cell.text.strip() if cell is not None else ""
        records.append(record)
    # Passing columns explicitly keeps the schema stable even with zero rows.
    return pd.DataFrame(records, columns=_STANDINGS_COLUMNS)


def take_webdata(url):
    """Load *url* in headless Chrome and extract the standings table.

    Args:
        url: Address of the page to open.

    Returns:
        A ``(html, df)`` tuple: ``html`` is the rendered document's outer
        HTML and ``df`` is a pandas DataFrame with one row per standings
        entry. If the browser cannot be started or the page cannot be
        loaded (``WebDriverException``), returns ``("", df)`` where ``df``
        is empty but has the same columns.
    """
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    # Bound before the try so the finally clause is safe even when the
    # driver constructor itself raises.
    wd = None
    try:
        wd = webdriver.Chrome(options=options)
        wd.set_window_size(1080, 720)  # Adjust the window size here
        wd.get(url)
        # NOTE(review): an implicit wait only applies to element lookups, so
        # it does not delay this function; content rendered late by
        # JavaScript may still be missing. Consider an explicit wait on the
        # table rows if that turns out to matter.
        wd.implicitly_wait(5)
        html = wd.execute_script("return document.documentElement.outerHTML;")
    except WebDriverException:
        # Keep the return shape identical to the success path so callers can
        # always unpack two values.
        return "", pd.DataFrame(columns=_STANDINGS_COLUMNS)
    finally:
        if wd is not None:
            wd.quit()

    return html, _parse_standings(html)