salmanmapkar committed on
Commit
8fd8293
·
verified ·
1 Parent(s): fd6caa1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +444 -2
app.py CHANGED
@@ -1,4 +1,446 @@
1
  import streamlit as st
 
 
 
 
 
 
 
 
2
 
3
- x = st.slider('Select a value')
4
- st.write(x, 'squared is', x * x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ import pandas as pd
3
+ import requests
4
+ import usaddress
5
+ import concurrent.futures
6
+ import re
7
+ from bs4 import BeautifulSoup
8
+ from datetime import datetime
9
+ import io
10
 
11
+ # --- Scraper functions (adapted from your scraper.py) ---
12
+
13
def fetch_detail(cert_number, main_data, headers):
    """
    Fetch site-level rows for one URAC certification number.

    Calls the URAC detail API for *cert_number* and returns a list of row
    dicts (copies of *main_data* merged with site fields), one per site
    address.  If the API returns no site records, a single row with blank
    site fields is returned so the organization still appears in the
    output.  On any request/parse error the error is reported via
    ``st.write`` and whatever rows were collected so far are returned.
    """
    detail_rows = []
    url = f"https://accreditnet.urac.org/api/urac/rest/directoryInfo/{cert_number}/certificationEntityInfo/type/Accreditation"
    try:
        # Timeout keeps one hung connection from blocking its thread-pool
        # worker indefinitely (this runs inside a ThreadPoolExecutor).
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        detail_data = response.json()
        entities = detail_data.get("certificationEntities", [])
        if not entities:
            # No site records: return row with blank site fields.
            row = main_data.copy()
            row.update({
                "Site Name": None,
                "Site Address": None,
                "Site Street": None,
                "Site City": None,
                "Site State": None,
                "Site ZipCode": None
            })
            detail_rows.append(row)
        else:
            for entity_item in entities:
                site_entity = entity_item.get("entity", {})
                site_name = site_entity.get("name", None)
                # Combine the non-empty site address parts in order.
                site_address_parts = []
                for key in ['line1', 'line2', 'city', 'stateName', 'zipcode']:
                    part = site_entity.get(key)
                    if part:
                        site_address_parts.append(part)
                site_address = ', '.join(site_address_parts)
                # Parse the combined address into components with usaddress.
                parsed_site = usaddress.parse(site_address)
                site_street, site_city, site_state, site_zipcode = '', '', '', ''
                for value, label in parsed_site:
                    if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
                        site_street += f' {value}'
                    elif label == 'PlaceName':
                        site_city = value
                    elif label == 'StateName':
                        site_state = value
                    elif label == 'ZipCode':
                        site_zipcode = value
                row = main_data.copy()
                row.update({
                    "Site Name": site_name,
                    "Site Address": site_address,
                    "Site Street": site_street.strip(),
                    "Site City": site_city,
                    "Site State": site_state,
                    "Site ZipCode": site_zipcode
                })
                detail_rows.append(row)
    except Exception as e:
        # Best-effort: surface the error in the UI and keep going.
        st.write(f"Error fetching detail for cert_number {cert_number}: {e}")
    return detail_rows
73
+
74
def scrape_urac():
    """
    Scrape URAC accreditation data.

    1. Call the main filter API (Specialty Pharmacy accreditation programs).
    2. Parse organization details, splitting each address with usaddress.
    3. For each organization, call the detail API in parallel (via
       ``fetch_detail``) to get one row per site address.

    Returns a pandas DataFrame of all site rows; an empty DataFrame if the
    main API call fails.
    """
    organizations = []
    all_rows = []
    headers = {
        'accept': '*/*',
        'accept-language': 'en-US,en;q=0.9',
        'content-type': 'application/json',
        'customerid': 'A20B3F2F-3426-41FA-8217-D3870E672D0C',
        'origin': 'https://accreditnet.urac.org',
        'priority': 'u=1, i',
        'referer': 'https://accreditnet.urac.org/directory/',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Brave";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'sec-gpc': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    # Filter payload: accreditation-program decisions for Specialty Pharmacy.
    json_data = {
        'filter': {
            'allParts': [
                {
                    'name': 'completedApplicationDecisionItem.typeDisplay.value',
                    'comparator': 0,
                    'valueType': 0,
                    'textValue': 'Accreditation Program',
                    'integerValue': None,
                    'decimalValue': None,
                    'dateTimeValue': None,
                    'booleanValue': None,
                    'innerFilter': None,
                },
                {
                    'name': 'certificateType.programName',
                    'comparator': 0,
                    'valueType': 0,
                    'textValue': 'Specialty Pharmacy',
                    'integerValue': None,
                    'decimalValue': None,
                    'dateTimeValue': None,
                    'booleanValue': None,
                    'innerFilter': None,
                },
            ],
            'anyParts': [],
            'notParts': [],
        },
        'orderBy': 'certificationNumber',
        'pageSize': 15,
        'limit': 100,
    }
    try:
        # Timeout keeps the app from hanging on an unresponsive endpoint.
        response = requests.post(
            'https://accreditnet.urac.org/api/urac/rest/directoryInfo/filter',
            headers=headers,
            json=json_data,
            timeout=30
        )
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        st.write("Error processing URAC main API:", e)
        return pd.DataFrame()

    # Parse each organization item.
    for item in data.get('items', []):
        entity = item.get('entity', {})
        org_name = entity.get('name', None)
        decision = item.get('completedApplicationDecisionItem', {})
        outcome = decision.get('outcomeDisplay', {}).get('default', {}).get('value')
        # Prefer the decision outcome; fall back to the effective status.
        status = outcome if outcome is not None else item.get('effectiveStatusName', None)
        srt_date = item.get('issuedDate', None)
        exp_date = item.get('expirationDate', None)
        program = item.get('certificateType', {}).get('displayName', None)
        address_parts = []
        for key in ['line1', 'line2', 'city', 'stateName', 'zipcode']:
            part = entity.get(key)
            if part:
                address_parts.append(part)
        address = ', '.join(address_parts)
        # Split the combined address into components with usaddress.
        parsed_address = usaddress.parse(address)
        street, city, state, zipcode = '', '', '', ''
        for value, label in parsed_address:
            if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
                street += f' {value}'
            elif label == 'PlaceName':
                city = value
            elif label == 'StateName':
                state = value
            elif label == 'ZipCode':
                zipcode = value
        # Get certification number (primary certification wins if present).
        cert_number = item.get("primaryCertification", {}).get("certificationNumber")
        if not cert_number:
            cert_number = item.get("certificationNumber")
        org_data = {
            "Organization Name": org_name,
            "Accreditation Status": status,
            "Start Date": srt_date,
            "Expiration Date": exp_date,
            "Program": program,
            "Address": address,
            "Street": street.strip(),
            "City": city,
            "State": state,
            "ZipCode": zipcode,
            "Certification Number": cert_number
        }
        organizations.append(org_data)

    # Use a thread pool to fetch details in parallel; skip orgs with no
    # certification number since the detail URL requires one.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_org = {
            executor.submit(fetch_detail, org["Certification Number"], org, headers): org
            for org in organizations if org["Certification Number"]
        }
        for future in concurrent.futures.as_completed(future_to_org):
            try:
                detail_rows = future.result()
                all_rows.extend(detail_rows)
            except Exception as exc:
                org = future_to_org[future]
                st.write(f"Error fetching detail for {org['Organization Name']}: {exc}")
    return pd.DataFrame(all_rows)
209
+
210
def _parse_accreditation_blocks(detail_soup):
    """
    Parse accreditation blocks (<div class="main_cont_det">) out of the
    detail soup and return a list of dicts with date, program, and
    service fields (blank strings when a field is absent).
    """
    parsed = []
    for block in detail_soup.find_all('div', class_='main_cont_det'):
        fields = {
            "Start Date": '',
            "Expiration Date": '',
            "SiteProgram": '',
            "SiteService": '',
        }
        for paragraph in block.find_all('p'):
            text = paragraph.get_text(strip=True)
            if 'Date:' in text:
                # Dates appear as "Date: mm/dd/yyyy Through mm/dd/yyyy".
                match = re.search(r'Date:\s*([\d/]+)\s*Through\s*([\d/]+)', text)
                if match:
                    fields["Start Date"], fields["Expiration Date"] = match.groups()
            elif 'Program:' in text:
                fields["SiteProgram"] = text.split('Program:')[-1].strip()
            elif 'Service:' in text:
                fields["SiteService"] = text.split('Service:')[-1].strip()
        parsed.append(fields)
    return parsed
237
+
238
def _extract_original_program(detail_soup):
    """
    Return the value following 'Program:' in the first matching <p> of the
    detail soup, or an empty string when none is found.
    """
    for paragraph in detail_soup.find_all('p'):
        if 'Program:' not in paragraph.get_text():
            continue
        return paragraph.get_text(strip=True).split('Program:')[-1].strip()
    return ''
248
+
249
def _fetch_detail_for_company(company, base_url, headers, cookies):
    """
    Fetch and parse the ACHC detail view for one company.

    Posts to the admin-ajax detail endpoint, parses the returned HTML
    fragment, and returns one row dict per accreditation block.  When no
    blocks are present, a single placeholder row with blank date/program/
    service fields is returned so the company still appears in the output.
    Errors are reported via ``st.write`` and an empty list is returned.
    """
    rows = []
    company_id = company["company_id"]
    detail_payload = f'action=view_provider_details&data_company_id={company_id}'
    try:
        # Timeout keeps one hung request from blocking its thread-pool worker.
        detail_resp = requests.post(base_url, headers=headers, cookies=cookies,
                                    data=detail_payload, timeout=30)
        detail_resp.raise_for_status()
        detail_json = detail_resp.json()
        detail_html = detail_json.get('response_html', '')
        detail_soup = BeautifulSoup(detail_html, 'html.parser')

        original_program = _extract_original_program(detail_soup)
        acc_blocks = _parse_accreditation_blocks(detail_soup)
        if not acc_blocks:
            # No accreditation blocks: substitute one blank block so the
            # loop below emits the same placeholder row the old two-branch
            # code produced.
            acc_blocks = [{
                "Start Date": '',
                "Expiration Date": '',
                "SiteProgram": '',
                "SiteService": ''
            }]
        for block in acc_blocks:
            rows.append({
                "Organization Name": company["org_name"],
                "Start Date": block["Start Date"],
                "Expiration Date": block["Expiration Date"],
                "Accreditation Status": "N/A",
                "Program": original_program,
                "SiteProgram": block["SiteProgram"],
                "SiteService": block["SiteService"],
                "Address": company["address"],
                "Street": company["street"],
                "City": company["city"],
                "State": company["state"],
                "ZipCode": company["zipcode"]
            })
    except Exception as e:
        st.write(f"Error fetching ACHC detail for company ID {company_id}: {e}")
    return rows
300
+
301
def scrape_achc():
    """
    Scrape ACHC provider data.

    1. Call the main admin-ajax API to get the provider-list HTML.
    2. Parse each company's name and address (split with usaddress).
    3. In parallel, call the detail API (via ``_fetch_detail_for_company``)
       to get accreditation details, one row per accreditation block.

    Returns a pandas DataFrame with a fixed column order; an empty
    DataFrame if the main API call fails.
    """
    headers = {
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'accept-language': 'en-US,en;q=0.8',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'origin': 'https://www.achc.org',
        'priority': 'u=1, i',
        'referer': 'https://www.achc.org/find-a-provider/',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Brave";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'sec-gpc': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    cookies = {
        'SGPBShowingLimitationDomain18418': '{"openingCount":1,"openingPage":""}'
    }
    base_url = 'https://www.achc.org/wp-admin/admin-ajax.php'
    main_payload = 'action=filter_provider_data&provider_id=6&service_id=&country_id=&state_id=&quick_search='

    try:
        # Timeout keeps the app from hanging on an unresponsive endpoint.
        main_resp = requests.post(base_url, headers=headers, cookies=cookies,
                                  data=main_payload, timeout=30)
        main_resp.raise_for_status()
        main_json = main_resp.json()
    except Exception as e:
        st.write(f"Error fetching ACHC main API: {e}")
        return pd.DataFrame()

    main_html = main_json.get('response_html', '')
    main_soup = BeautifulSoup(main_html, 'html.parser')
    company_items = main_soup.find_all('li')
    companies = []
    for item in company_items:
        list_box = item.find('div', class_='list_cont_box')
        if not list_box:
            continue
        org_tag = list_box.find('b', class_='company_name')
        org_name = org_tag.get_text(strip=True) if org_tag else ''
        # Join all <p> texts for the address.
        address_parts = [p.get_text(strip=True) for p in list_box.find_all('p')]
        address = ' '.join(address_parts)
        parsed = usaddress.parse(address)
        street, city, state, zipcode = '', '', '', ''
        for value, label in parsed:
            if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
                street += f' {value}'
            elif label == 'PlaceName':
                city = value
            elif label == 'StateName':
                state = value
            elif label == 'ZipCode':
                zipcode = value
        # The detail API needs the data-company-id from the "view more" link;
        # skip entries without one.
        view_more = item.find('p', class_='view_more_eye')
        if not view_more or not view_more.has_attr('data-company-id'):
            continue
        company_id = view_more['data-company-id']
        companies.append({
            "company_id": company_id,
            "org_name": org_name,
            "address": address,
            "street": street.strip(),
            "city": city,
            "state": state,
            "zipcode": zipcode
        })

    # Fetch details for all companies in parallel.
    detail_rows_all = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(_fetch_detail_for_company, comp, base_url, headers, cookies)
            for comp in companies
        ]
        for future in concurrent.futures.as_completed(futures):
            try:
                rows = future.result()
                detail_rows_all.extend(rows)
            except Exception as exc:
                st.write(f"Error fetching ACHC detail: {exc}")
    # Explicit column list pins the output order even when no rows came back.
    df = pd.DataFrame(detail_rows_all, columns=[
        "Organization Name",
        "Start Date",
        "Expiration Date",
        "Accreditation Status",
        "Program",
        "SiteProgram",
        "SiteService",
        "Address",
        "Street",
        "City",
        "State",
        "ZipCode"
    ])
    return df
407
+
408
# --- Streamlit UI ---

# Page header and one-line instructions shown before the button.
st.title("Accreditation Data Scraper")
st.write("Click the button below to start scraping and generate an Excel file.")
412
+
413
def run_scraper():
    """
    Run both scrapers, merge their results on "Organization Name", and
    return an in-memory Excel workbook (io.BytesIO, positioned at 0) with
    three sheets: URAC, ACHC, and Merged.  A progress bar tracks the
    three stages.
    """
    progress_bar = st.progress(0)

    with st.spinner("Scraping URAC data..."):
        urac_df = scrape_urac()
        progress_bar.progress(33)

    with st.spinner("Scraping ACHC data..."):
        achc_df = scrape_achc()
        progress_bar.progress(66)

    with st.spinner("Merging data and generating Excel..."):
        # Outer merge keeps organizations that appear in only one source.
        merged_df = pd.merge(urac_df, achc_df, on="Organization Name", how="outer",
                             suffixes=("_URAC", "_ACHC"))
        # Write to an in-memory bytes buffer.
        output = io.BytesIO()
        # The context manager saves and closes the workbook on exit.
        # (An explicit writer.save() here would raise AttributeError on
        # pandas >= 2.0, where the method was removed.)
        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
            urac_df.to_excel(writer, sheet_name="URAC", index=False)
            achc_df.to_excel(writer, sheet_name="ACHC", index=False)
            merged_df.to_excel(writer, sheet_name="Merged", index=False)
        output.seek(0)
        progress_bar.progress(100)
    return output
437
+
438
# Button-triggered entry point: scrape, then offer the workbook for download.
if st.button("Start Scraping"):
    excel_data = run_scraper()
    st.success("Scraping completed!")
    # File name carries a timestamp so repeated runs don't collide.
    st.download_button(
        label="Download Excel File",
        data=excel_data,
        file_name=f"combined_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )