Update extract.py
Browse files- extract.py +38 -17
extract.py
CHANGED
|
@@ -8,6 +8,7 @@ import time
|
|
| 8 |
from selenium.webdriver.common.by import By
|
| 9 |
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
from selenium.webdriver.support import expected_conditions as EC
|
|
|
|
| 11 |
|
| 12 |
def take_webdata(url):
|
| 13 |
options = webdriver.ChromeOptions()
|
|
@@ -33,9 +34,40 @@ def take_webdata(url):
|
|
| 33 |
return Image.open(BytesIO(screenshot)) , page_title
|
| 34 |
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
def get_vehicle_info(plate_number: str):
|
| 37 |
# Configure headless Chrome
|
| 38 |
-
# options = Options()
|
| 39 |
options = webdriver.ChromeOptions()
|
| 40 |
options.add_argument("--headless")
|
| 41 |
options.add_argument("--disable-gpu")
|
|
@@ -44,23 +76,18 @@ def get_vehicle_info(plate_number: str):
|
|
| 44 |
# Path to chromedriver (adjust if needed)
|
| 45 |
driver = webdriver.Chrome(options=options)
|
| 46 |
|
| 47 |
-
try:
|
| 48 |
-
|
| 49 |
driver.get("https://www.jambisamsat.net/infopkb.html")
|
| 50 |
time.sleep(1)
|
| 51 |
|
| 52 |
-
|
| 53 |
-
# Wait until input box is present
|
| 54 |
WebDriverWait(driver, 10).until(
|
| 55 |
EC.presence_of_element_located((By.ID, "no_polisi"))
|
| 56 |
)
|
| 57 |
|
| 58 |
-
# Fill in the plate number
|
| 59 |
input_field = driver.find_element(By.ID, "no_polisi")
|
| 60 |
input_field.clear()
|
| 61 |
input_field.send_keys(plate_number)
|
| 62 |
|
| 63 |
-
# Click the submit button by class name
|
| 64 |
submit_button = driver.find_element(By.CSS_SELECTOR, 'button.btn.btn-primary[type="submit"]')
|
| 65 |
submit_button.click()
|
| 66 |
|
|
@@ -69,22 +96,16 @@ def get_vehicle_info(plate_number: str):
|
|
| 69 |
EC.url_contains("infopkb.php")
|
| 70 |
)
|
| 71 |
|
| 72 |
-
# # Step 2: Find the input and enter plate number
|
| 73 |
-
# input_element = driver.find_element(By.NAME, "nopol")
|
| 74 |
-
# input_element.send_keys(plate_number)
|
| 75 |
-
|
| 76 |
-
# # Step 3: Submit the form
|
| 77 |
-
# submit_button = driver.find_element(By.CSS_SELECTOR, 'input[type="submit"]')
|
| 78 |
-
# submit_button.click()
|
| 79 |
-
# time.sleep(2)
|
| 80 |
-
|
| 81 |
driver.implicitly_wait(3)
|
| 82 |
|
| 83 |
-
|
| 84 |
scroll_height = driver.execute_script("return document.body.scrollHeight")
|
| 85 |
driver.set_window_size(1920, scroll_height + 200) # force full-page height
|
| 86 |
time.sleep(1)
|
| 87 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
page_title = driver.title
|
| 89 |
screenshot = driver.get_screenshot_as_png()
|
| 90 |
|
|
|
|
| 8 |
from selenium.webdriver.common.by import By
|
| 9 |
from selenium.webdriver.support.ui import WebDriverWait
|
| 10 |
from selenium.webdriver.support import expected_conditions as EC
|
| 11 |
+
from bs4 import BeautifulSoup
|
| 12 |
|
| 13 |
def take_webdata(url):
|
| 14 |
options = webdriver.ChromeOptions()
|
|
|
|
| 34 |
return Image.open(BytesIO(screenshot)) , page_title
|
| 35 |
|
| 36 |
|
| 37 |
+
def scrape_vehicle(page_source):
    """Extract vehicle data and tax detail rows from a result page.

    Args:
        page_source: HTML text of the page to parse.

    Returns:
        A ``(data_kendaraan, rincians)`` tuple.

        ``data_kendaraan`` is a dict built from the first ``<table>`` in the
        document: for every row with at least three ``<td>`` cells, the
        first cell's text (lower-cased, dots removed, spaces replaced by
        underscores) is the key and the third cell's text is the value.
        It is empty when the page contains no table.

        ``rincians`` is a list of dicts built from the ``div.row`` elements
        inside ``div#det_pkb`` (the first one is skipped as a header). Each
        dict may hold the keys ``"pokok"``, ``"denda"``, ``"total"`` and
        ``"jenis"`` (upper-cased); keys whose text is empty are omitted.
        It is empty when ``div#det_pkb`` is absent.
    """
    soup = BeautifulSoup(page_source, "html.parser")

    data_kendaraan = {}
    table = soup.find("table")
    # find() returns None when no <table> exists (e.g. an error or
    # "not found" page); return empty data instead of raising
    # AttributeError, mirroring the guard used for div#det_pkb below.
    if table is not None:
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            # Value is read from the third cell; presumably the second
            # cell is a separator column -- TODO confirm against the page.
            if len(cells) >= 3:
                key = (
                    cells[0]
                    .get_text(strip=True)
                    .lower()
                    .replace(".", "")
                    .replace(" ", "_")
                )
                data_kendaraan[key] = cells[2].get_text(strip=True)

    rincians = []
    rincian_div = soup.find("div", id="det_pkb")
    if rincian_div:
        rows = rincian_div.find_all("div", class_="row")
        for row in rows[1:]:  # the first row is the header
            cols = row.find_all("p")
            if len(cols) >= 3:
                jenis = cols[3].get_text(strip=True) if len(cols) > 3 else ""
                rincian = {
                    "pokok": cols[0].get_text(strip=True),
                    "denda": cols[1].get_text(strip=True),
                    "total": cols[2].get_text(strip=True),
                    "jenis": jenis.upper(),
                }
                # Drop fields whose text is empty; skip rows left with nothing.
                rincian = {k: v for k, v in rincian.items() if v}
                if rincian:
                    rincians.append(rincian)

    return data_kendaraan, rincians
|
| 67 |
+
|
| 68 |
+
|
| 69 |
def get_vehicle_info(plate_number: str):
|
| 70 |
# Configure headless Chrome
|
|
|
|
| 71 |
options = webdriver.ChromeOptions()
|
| 72 |
options.add_argument("--headless")
|
| 73 |
options.add_argument("--disable-gpu")
|
|
|
|
| 76 |
# Path to chromedriver (adjust if needed)
|
| 77 |
driver = webdriver.Chrome(options=options)
|
| 78 |
|
| 79 |
+
try:
|
|
|
|
| 80 |
driver.get("https://www.jambisamsat.net/infopkb.html")
|
| 81 |
time.sleep(1)
|
| 82 |
|
|
|
|
|
|
|
| 83 |
WebDriverWait(driver, 10).until(
|
| 84 |
EC.presence_of_element_located((By.ID, "no_polisi"))
|
| 85 |
)
|
| 86 |
|
|
|
|
| 87 |
input_field = driver.find_element(By.ID, "no_polisi")
|
| 88 |
input_field.clear()
|
| 89 |
input_field.send_keys(plate_number)
|
| 90 |
|
|
|
|
| 91 |
submit_button = driver.find_element(By.CSS_SELECTOR, 'button.btn.btn-primary[type="submit"]')
|
| 92 |
submit_button.click()
|
| 93 |
|
|
|
|
| 96 |
EC.url_contains("infopkb.php")
|
| 97 |
)
|
| 98 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
driver.implicitly_wait(3)
|
| 100 |
|
|
|
|
| 101 |
scroll_height = driver.execute_script("return document.body.scrollHeight")
|
| 102 |
driver.set_window_size(1920, scroll_height + 200) # force full-page height
|
| 103 |
time.sleep(1)
|
| 104 |
|
| 105 |
+
data_kendaraan, rincian = scrape_vehicle(driver.page_source)
|
| 106 |
+
|
| 107 |
+
print(data_kendaraan, rincian)
|
| 108 |
+
|
| 109 |
page_title = driver.title
|
| 110 |
screenshot = driver.get_screenshot_as_png()
|
| 111 |
|