dina1 committed on
Commit
0f401d0
·
verified ·
1 Parent(s): b25bb1f

Create shoalhaven_da_scraper.py

Browse files
Files changed (1) hide show
  1. shoalhaven_da_scraper.py +134 -0
shoalhaven_da_scraper.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import pandas as pd
3
+ from playwright.sync_api import sync_playwright
4
+
5
# Entry page of Shoalhaven City Council's MasterView application tracker.
BASE_URL = "https://www3.shoalhaven.nsw.gov.au/masterviewUI/modules/ApplicationMaster/Default.aspx"
# Default path of the CSV file the scraper writes.
OUTPUT_FILE = "results.csv"

# Column order for the output CSV; keys of each scraped record match these.
HEADERS = [
    "DA_Number",
    "Detail_URL",
    "Description",
    "Submitted_Date",
    "Decision",
    "Categories",
    "Property_Address",
    "Applicant",
    "Progress",
    "Fees",
    "Documents",
    "Contact_Council",
]
22
+
23
def clean_fees(text):
    """Normalize the fees text scraped from a DA detail page.

    Returns "Not required" when council recorded no fees against the
    application; otherwise returns the stripped text unchanged.
    """
    stripped = text.strip()
    no_fees_msg = "No fees recorded against this application."
    return "Not required" if stripped == no_fees_msg else stripped
28
+
29
def clean_contact(text):
    """Normalize the contact-council text scraped from a DA detail page.

    Returns "Not required" for the boilerplate not-on-exhibition notice;
    otherwise returns the stripped text unchanged.
    """
    stripped = text.strip()
    not_on_exhibition = (
        "Application Is Not on exhibition, please call Council on "
        "1300 293 111 if you require assistance."
    )
    if stripped != not_on_exhibition:
        return stripped
    return "Not required"
36
+
37
def scrape(from_date="01/10/2025", to_date="31/10/2025", output_file=OUTPUT_FILE):
    """Scrape Shoalhaven Council DA applications lodged in a date range.

    Navigates the MasterView tracker, accepts the terms page, runs an
    advanced search over the given date window, walks every results page,
    opens each application's detail page, and writes one CSV row per DA.

    Args:
        from_date: Search window start, dd/mm/yyyy (defaults preserve the
            original hard-coded range).
        to_date: Search window end, dd/mm/yyyy.
        output_file: Path of the CSV file to write.
    """
    records = []
    seen_da = set()  # DA numbers already captured, so repeated rows are skipped

    with sync_playwright() as p:
        browser = p.chromium.launch(
            headless=True,
            # Flags commonly required to run Chromium inside containers/CI.
            args=["--no-sandbox", "--disable-dev-shm-usage"],
        )
        context = browser.new_context()
        page = context.new_page()

        # Step 1: Navigate & accept the terms-of-use interstitial.
        page.goto(BASE_URL, timeout=60000)
        page.click("text=Agree")
        time.sleep(3)

        # Step 2: Open the DA Tracking module.
        page.click("text=DA Tracking")
        time.sleep(4)

        # Step 3: Advanced Search (robust selector).
        page.locator("a:has-text('Advanced')").click()
        time.sleep(3)

        # Step 4: Fill the date range and run the search.
        page.fill("input[name='ctl00$ContentPlaceHolder1$txtFromDate']", from_date)
        page.fill("input[name='ctl00$ContentPlaceHolder1$txtToDate']", to_date)
        page.click("text=Search")
        time.sleep(4)

        # Show results.
        page.click("text=Show")
        time.sleep(4)

        while True:
            rows = page.query_selector_all("table tr")[1:]  # skip header row

            for row in rows:
                cols = row.query_selector_all("td")
                if not cols:
                    continue

                da_number = cols[0].inner_text().strip()
                if not da_number or da_number in seen_da:
                    continue
                seen_da.add(da_number)

                # BUG FIX: the first cell may contain no <a> (e.g. pager or
                # filler rows); the original crashed on None.get_attribute.
                anchor = cols[0].query_selector("a")
                if anchor is None:
                    continue
                link = anchor.get_attribute("href")
                detail_url = f"https://www3.shoalhaven.nsw.gov.au{link}"

                detail_page = context.new_page()
                detail_page.goto(detail_url, timeout=60000)
                time.sleep(3)

                def get_value(label):
                    """Return the text adjacent to *label* on the detail page, '' if absent."""
                    try:
                        el = detail_page.query_selector(f"text={label}")
                        return el.evaluate(
                            "node => node.parentElement.nextElementSibling.innerText"
                        ).strip()
                    # Narrowed from a bare `except:` so KeyboardInterrupt /
                    # SystemExit are no longer swallowed.
                    except Exception:
                        return ""

                record = {
                    "DA_Number": da_number,
                    "Detail_URL": detail_url,
                    "Description": get_value("Description"),
                    "Submitted_Date": get_value("Submitted"),
                    "Decision": get_value("Decision"),
                    "Categories": get_value("Category"),
                    "Property_Address": get_value("Property Address"),
                    "Applicant": get_value("Applicant"),
                    "Progress": get_value("Progress"),
                    "Fees": clean_fees(get_value("Fees")),
                    "Documents": get_value("Documents"),
                    "Contact_Council": clean_contact(get_value("Contact Council")),
                }

                records.append(record)
                detail_page.close()

            # Pagination: follow "Next" until it is missing or disabled.
            next_btn = page.query_selector("text=Next")
            if next_btn and next_btn.is_enabled():
                next_btn.click()
                time.sleep(4)
            else:
                break

        browser.close()

    df = pd.DataFrame(records, columns=HEADERS)
    df.to_csv(output_file, index=False)
    print(f"Saved {len(records)} records to {output_file}")
132
+
133
# Script entry point: run the scraper only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    scrape()