dina1 committed on
Commit
0f401d0
·
verified ·
1 Parent(s): b25bb1f

Create shoalhaven_da_scraper.py

Browse files
Files changed (1) hide show
  1. shoalhaven_da_scraper.py +134 -0
shoalhaven_da_scraper.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import pandas as pd
3
+ from playwright.sync_api import sync_playwright
4
+
5
# Entry page of Shoalhaven City Council's MasterView application tracker.
BASE_URL = "https://www3.shoalhaven.nsw.gov.au/masterviewUI/modules/ApplicationMaster/Default.aspx"
# Default path of the CSV file the scraper writes.
OUTPUT_FILE = "results.csv"

# Column order for the output CSV; keys of each scraped record match these.
HEADERS = [
    "DA_Number",
    "Detail_URL",
    "Description",
    "Submitted_Date",
    "Decision",
    "Categories",
    "Property_Address",
    "Applicant",
    "Progress",
    "Fees",
    "Documents",
    "Contact_Council",
]
22
+
23
def clean_fees(text):
    """Normalize the fees text scraped from a DA detail page.

    Returns "Not required" when council recorded no fees against the
    application; otherwise returns the stripped text unchanged.
    """
    stripped = text.strip()
    no_fees_msg = "No fees recorded against this application."
    return "Not required" if stripped == no_fees_msg else stripped
28
+
29
def clean_contact(text):
    """Normalize the contact-council text scraped from a DA detail page.

    Returns "Not required" for the boilerplate not-on-exhibition notice;
    otherwise returns the stripped text unchanged.
    """
    stripped = text.strip()
    not_on_exhibition = (
        "Application Is Not on exhibition, please call Council on "
        "1300 293 111 if you require assistance."
    )
    if stripped != not_on_exhibition:
        return stripped
    return "Not required"
36
+
37
def scrape(from_date="01/10/2025", to_date="31/10/2025", output_file=OUTPUT_FILE):
    """Scrape Shoalhaven Council DA applications lodged in a date range.

    Navigates the MasterView tracker, accepts the terms page, runs an
    advanced search over the given date window, walks every results page,
    opens each application's detail page, and writes one CSV row per DA.

    Args:
        from_date: Search window start, dd/mm/yyyy (defaults preserve the
            original hard-coded range).
        to_date: Search window end, dd/mm/yyyy.
        output_file: Path of the CSV file to write.
    """
    records = []
    seen_da = set()  # DA numbers already captured, so repeated rows are skipped

    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=True,
            # Flags commonly required to run Chromium inside containers/CI.
            args=["--no-sandbox", "--disable-dev-shm-usage"],
        )
        context = browser.new_context()
        page = context.new_page()

        # Step 1: Navigate & accept the terms-of-use interstitial.
        page.goto(BASE_URL, timeout=60000)
        page.click("text=Agree")
        time.sleep(3)

        # Step 2: Open the DA Tracking module.
        page.click("text=DA Tracking")
        time.sleep(4)

        # Step 3: Advanced Search (robust selector).
        page.locator("a:has-text('Advanced')").click()
        time.sleep(3)

        # Step 4: Fill the date range and run the search.
        page.fill("input[name='ctl00$ContentPlaceHolder1$txtFromDate']", from_date)
        page.fill("input[name='ctl00$ContentPlaceHolder1$txtToDate']", to_date)
        page.click("text=Search")
        time.sleep(4)

        # Show results.
        page.click("text=Show")
        time.sleep(4)

        while True:
            rows = page.query_selector_all("table tr")[1:]  # skip header row

            for row in rows:
                cols = row.query_selector_all("td")
                if not cols:
                    continue

                da_number = cols[0].inner_text().strip()
                if not da_number or da_number in seen_da:
                    continue
                seen_da.add(da_number)

                # BUG FIX: the first cell may contain no <a> (e.g. pager or
                # filler rows); the original crashed on None.get_attribute.
                anchor = cols[0].query_selector("a")
                if anchor is None:
                    continue
                link = anchor.get_attribute("href")
                detail_url = f"https://www3.shoalhaven.nsw.gov.au{link}"

                detail_page = context.new_page()
                detail_page.goto(detail_url, timeout=60000)
                time.sleep(3)

                def get_value(label):
                    """Return the text adjacent to *label* on the detail page, '' if absent."""
                    try:
                        el = detail_page.query_selector(f"text={label}")
                        return el.evaluate(
                            "node => node.parentElement.nextElementSibling.innerText"
                        ).strip()
                    # Narrowed from a bare `except:` so KeyboardInterrupt /
                    # SystemExit are no longer swallowed.
                    except Exception:
                        return ""

                record = {
                    "DA_Number": da_number,
                    "Detail_URL": detail_url,
                    "Description": get_value("Description"),
                    "Submitted_Date": get_value("Submitted"),
                    "Decision": get_value("Decision"),
                    "Categories": get_value("Category"),
                    "Property_Address": get_value("Property Address"),
                    "Applicant": get_value("Applicant"),
                    "Progress": get_value("Progress"),
                    "Fees": clean_fees(get_value("Fees")),
                    "Documents": get_value("Documents"),
                    "Contact_Council": clean_contact(get_value("Contact Council")),
                }

                records.append(record)
                detail_page.close()

            # Pagination: follow "Next" until it is missing or disabled.
            next_btn = page.query_selector("text=Next")
            if next_btn and next_btn.is_enabled():
                next_btn.click()
                time.sleep(4)
            else:
                break

        browser.close()

    df = pd.DataFrame(records, columns=HEADERS)
    df.to_csv(output_file, index=False)
    print(f"Saved {len(records)} records to {output_file}")
132
+
133
# Script entry point: run the scraper only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    scrape()