# Hugging Face Spaces page residue (status: Sleeping) — kept as a comment so the file stays valid Python.
| import streamlit as st | |
| import pandas as pd | |
| import requests | |
| import usaddress | |
| import concurrent.futures | |
| import re | |
| from bs4 import BeautifulSoup | |
| from datetime import datetime | |
| import io | |
| # --- Scraper functions (adapted from your scraper.py) --- | |
def fetch_detail(cert_number, main_data, headers):
    """
    Fetch site-level detail rows for one URAC certification number.

    Calls the URAC detail API and returns a list of row dicts, each a copy of
    *main_data* extended with site fields.  If the API returns no site
    entities, a single row with blank site fields is returned so the
    organization is still represented in the output.

    Parameters:
        cert_number: URAC certification number interpolated into the API URL.
        main_data: dict of organization-level fields copied into each row.
        headers: HTTP headers to send with the request.

    Returns:
        list[dict]: one row per site; may be empty if the request fails.
    """
    detail_rows = []
    url = (
        "https://accreditnet.urac.org/api/urac/rest/directoryInfo/"
        f"{cert_number}/certificationEntityInfo/type/Accreditation"
    )
    try:
        # Timeout so one stalled request cannot hang the thread pool forever.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        detail_data = response.json()
        entities = detail_data.get("certificationEntities", [])
        if not entities:
            # No site records: emit one row with blank site fields.
            row = main_data.copy()
            row.update({
                "Site Name": None,
                "Site Address": None,
                "Site Street": None,
                "Site City": None,
                "Site State": None,
                "Site ZipCode": None
            })
            detail_rows.append(row)
        else:
            for entity_item in entities:
                site_entity = entity_item.get("entity", {})
                site_name = site_entity.get("name", None)
                # Combine the non-empty site address parts.
                site_address_parts = [
                    part
                    for part in (
                        site_entity.get(key)
                        for key in ('line1', 'line2', 'city', 'stateName', 'zipcode')
                    )
                    if part
                ]
                site_address = ', '.join(site_address_parts)
                # Split the combined address back into components via usaddress.
                parsed_site = usaddress.parse(site_address)
                site_street, site_city, site_state, site_zipcode = '', '', '', ''
                for value, label in parsed_site:
                    if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
                        site_street += f' {value}'
                    elif label == 'PlaceName':
                        site_city = value
                    elif label == 'StateName':
                        site_state = value
                    elif label == 'ZipCode':
                        site_zipcode = value
                row = main_data.copy()
                row.update({
                    "Site Name": site_name,
                    "Site Address": site_address,
                    "Site Street": site_street.strip(),
                    "Site City": site_city,
                    "Site State": site_state,
                    "Site ZipCode": site_zipcode
                })
                detail_rows.append(row)
    except Exception as e:
        # Best-effort: log and return whatever was collected so one bad
        # record does not abort the whole scrape.
        st.write(f"Error fetching detail for cert_number {cert_number}: {e}")
    return detail_rows
def scrape_urac(progress_bar=None):
    """
    Scrape URAC accreditation data.

    1. Call the main directory filter API (Specialty Pharmacy accreditations).
    2. Parse organization-level details from the response items.
    3. For each organization, call the detail API in parallel to get one row
       per site address.

    Parameters:
        progress_bar: optional Streamlit progress bar; updated 0-100 as
            detail requests complete.

    Returns:
        pandas.DataFrame: one row per organization site; empty on API failure.
    """
    organizations = []
    all_rows = []
    headers = {
        'accept': '*/*',
        'accept-language': 'en-US,en;q=0.9',
        'content-type': 'application/json',
        'customerid': 'A20B3F2F-3426-41FA-8217-D3870E672D0C',
        'origin': 'https://accreditnet.urac.org',
        'priority': 'u=1, i',
        'referer': 'https://accreditnet.urac.org/directory/',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Brave";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'sec-gpc': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    # Filter payload: Accreditation Program + Specialty Pharmacy only.
    json_data = {
        'filter': {
            'allParts': [
                {
                    'name': 'completedApplicationDecisionItem.typeDisplay.value',
                    'comparator': 0,
                    'valueType': 0,
                    'textValue': 'Accreditation Program',
                    'integerValue': None,
                    'decimalValue': None,
                    'dateTimeValue': None,
                    'booleanValue': None,
                    'innerFilter': None,
                },
                {
                    'name': 'certificateType.programName',
                    'comparator': 0,
                    'valueType': 0,
                    'textValue': 'Specialty Pharmacy',
                    'integerValue': None,
                    'decimalValue': None,
                    'dateTimeValue': None,
                    'booleanValue': None,
                    'innerFilter': None,
                },
            ],
            'anyParts': [],
            'notParts': [],
        },
        'orderBy': 'certificationNumber',
        'pageSize': 15,
        'limit': 100,
    }
    try:
        # Timeout so a stalled API call does not freeze the Streamlit UI.
        response = requests.post(
            'https://accreditnet.urac.org/api/urac/rest/directoryInfo/filter',
            headers=headers,
            json=json_data,
            timeout=30
        )
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        st.write("Error processing URAC main API:", e)
        return pd.DataFrame()
    # Parse organization items.
    for item in data.get('items', []):
        entity = item.get('entity', {})
        org_name = entity.get('name', None)
        decision = item.get('completedApplicationDecisionItem', {})
        outcome = decision.get('outcomeDisplay', {}).get('default', {}).get('value')
        # Prefer the decision outcome; fall back to the effective status name.
        status = outcome if outcome is not None else item.get('effectiveStatusName', None)
        srt_date = item.get('issuedDate', None)
        exp_date = item.get('expirationDate', None)
        program = item.get('certificateType', {}).get('displayName', None)
        # Combine the non-empty address parts, then re-split via usaddress.
        address_parts = []
        for key in ['line1', 'line2', 'city', 'stateName', 'zipcode']:
            part = entity.get(key)
            if part:
                address_parts.append(part)
        address = ', '.join(address_parts)
        parsed_address = usaddress.parse(address)
        street, city, state, zipcode = '', '', '', ''
        for value, label in parsed_address:
            if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
                street += f' {value}'
            elif label == 'PlaceName':
                city = value
            elif label == 'StateName':
                state = value
            elif label == 'ZipCode':
                zipcode = value
        # Primary certification number preferred; fall back to the item's own.
        cert_number = item.get("primaryCertification", {}).get("certificationNumber")
        if not cert_number:
            cert_number = item.get("certificationNumber")
        org_data = {
            "Organization Name": org_name,
            "Accreditation Status": status,
            "Start Date": srt_date,
            "Expiration Date": exp_date,
            "Program": program,
            "Address": address,
            "Street": street.strip(),
            "City": city,
            "State": state,
            "ZipCode": zipcode,
            "Certification Number": cert_number
        }
        organizations.append(org_data)
    # Fetch detail API calls in parallel and update the progress bar.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_org = {
            executor.submit(fetch_detail, org["Certification Number"], org, headers): org
            for org in organizations if org["Certification Number"]
        }
        total = len(future_to_org)
        completed = 0
        for future in concurrent.futures.as_completed(future_to_org):
            try:
                detail_rows = future.result()
                all_rows.extend(detail_rows)
            except Exception as exc:
                org = future_to_org[future]
                st.write(f"Error fetching detail for {org['Organization Name']}: {exc}")
            completed += 1
            if progress_bar is not None and total > 0:
                progress_bar.progress(min(100, int(100 * completed / total)))
    return pd.DataFrame(all_rows)
| def _parse_accreditation_blocks(detail_soup): | |
| """ | |
| Parse accreditation blocks (<div class="main_cont_det">) and return a list of dicts. | |
| """ | |
| results = [] | |
| blocks = detail_soup.find_all('div', class_='main_cont_det') | |
| for block in blocks: | |
| start_date, expiration_date = '', '' | |
| site_program, site_service = '', '' | |
| for p in block.find_all('p'): | |
| text = p.get_text(strip=True) | |
| if 'Date:' in text: | |
| m = re.search(r'Date:\s*([\d/]+)\s*Through\s*([\d/]+)', text) | |
| if m: | |
| start_date = m.group(1) | |
| expiration_date = m.group(2) | |
| elif 'Program:' in text: | |
| site_program = text.split('Program:')[-1].strip() | |
| elif 'Service:' in text: | |
| site_service = text.split('Service:')[-1].strip() | |
| results.append({ | |
| "Start Date": start_date, | |
| "Expiration Date": expiration_date, | |
| "SiteProgram": site_program, | |
| "SiteService": site_service | |
| }) | |
| return results | |
| def _extract_original_program(detail_soup): | |
| """ | |
| Extract the original Program value from the detail soup. | |
| """ | |
| program = '' | |
| for p in detail_soup.find_all('p'): | |
| if 'Program:' in p.get_text(): | |
| program = p.get_text(strip=True).split('Program:')[-1].strip() | |
| break | |
| return program | |
def _fetch_detail_for_company(company, base_url, headers, cookies):
    """
    Fetch and parse the ACHC detail page for one company.

    Calls the ACHC detail AJAX endpoint, parses the returned HTML fragment,
    and returns one row per accreditation block (or a single blank-detail row
    when the page has no accreditation blocks).

    Parameters:
        company: dict with keys 'company_id', 'org_name', 'address',
            'street', 'city', 'state', 'zipcode' (built by scrape_achc).
        base_url: admin-ajax endpoint URL.
        headers: HTTP headers to send.
        cookies: cookies to send.

    Returns:
        list[dict]: result rows; may be empty if the request fails.
    """
    rows = []
    company_id = company["company_id"]
    detail_payload = f'action=view_provider_details&data_company_id={company_id}'
    try:
        # Timeout so one stalled request cannot hang the thread pool forever.
        detail_resp = requests.post(base_url, headers=headers, cookies=cookies,
                                    data=detail_payload, timeout=30)
        detail_resp.raise_for_status()
        detail_json = detail_resp.json()
        detail_html = detail_json.get('response_html', '')
        detail_soup = BeautifulSoup(detail_html, 'html.parser')
        original_program = _extract_original_program(detail_soup)
        acc_blocks = _parse_accreditation_blocks(detail_soup)
        if not acc_blocks:
            # No accreditation blocks: still emit one row for the company.
            rows.append({
                "Organization Name": company["org_name"],
                "Start Date": '',
                "Expiration Date": '',
                "Accreditation Status": "N/A",
                "Program": original_program,
                "SiteProgram": '',
                "SiteService": '',
                "Address": company["address"],
                "Street": company["street"],
                "City": company["city"],
                "State": company["state"],
                "ZipCode": company["zipcode"]
            })
        else:
            for block in acc_blocks:
                rows.append({
                    "Organization Name": company["org_name"],
                    "Start Date": block["Start Date"],
                    "Expiration Date": block["Expiration Date"],
                    "Accreditation Status": "N/A",
                    "Program": original_program,
                    "SiteProgram": block["SiteProgram"],
                    "SiteService": block["SiteService"],
                    "Address": company["address"],
                    "Street": company["street"],
                    "City": company["city"],
                    "State": company["state"],
                    "ZipCode": company["zipcode"]
                })
    except Exception as e:
        # Best-effort: log and return whatever was collected.
        st.write(f"Error fetching ACHC detail for company ID {company_id}: {e}")
    return rows
def scrape_achc(progress_bar=None):
    """
    Scrape ACHC provider data.

    1. Call the main AJAX API to get the provider-list HTML.
    2. Parse each company's name, address, and company id.
    3. In parallel, call the detail API to get accreditation details.

    Parameters:
        progress_bar: optional Streamlit progress bar; updated 0-100 as
            detail requests complete.

    Returns:
        pandas.DataFrame: one row per accreditation block; on failure an
        empty frame with the same columns as the success path.
    """
    columns = [
        "Organization Name",
        "Start Date",
        "Expiration Date",
        "Accreditation Status",
        "Program",
        "SiteProgram",
        "SiteService",
        "Address",
        "Street",
        "City",
        "State",
        "ZipCode"
    ]
    headers = {
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'accept-language': 'en-US,en;q=0.8',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'origin': 'https://www.achc.org',
        'priority': 'u=1, i',
        'referer': 'https://www.achc.org/find-a-provider/',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Brave";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'sec-gpc': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    cookies = {
        'SGPBShowingLimitationDomain18418': '{"openingCount":1,"openingPage":""}'
    }
    base_url = 'https://www.achc.org/wp-admin/admin-ajax.php'
    main_payload = 'action=filter_provider_data&provider_id=6&service_id=&country_id=&state_id=&quick_search='
    try:
        # Timeout so a stalled API call does not freeze the Streamlit UI.
        main_resp = requests.post(base_url, headers=headers, cookies=cookies,
                                  data=main_payload, timeout=30)
        main_resp.raise_for_status()
        main_json = main_resp.json()
    except Exception as e:
        st.write(f"Error fetching ACHC main API: {e}")
        # Return the full column set so downstream merge/to_excel code sees
        # a consistent schema even on failure.
        return pd.DataFrame(columns=columns)
    main_html = main_json.get('response_html', '')
    main_soup = BeautifulSoup(main_html, 'html.parser')
    company_items = main_soup.find_all('li')
    companies = []
    for item in company_items:
        list_box = item.find('div', class_='list_cont_box')
        if not list_box:
            continue
        org_tag = list_box.find('b', class_='company_name')
        org_name = org_tag.get_text(strip=True) if org_tag else ''
        # Combine the <p> address fragments, then re-split via usaddress.
        address_parts = [p.get_text(strip=True) for p in list_box.find_all('p')]
        address = ' '.join(address_parts)
        parsed = usaddress.parse(address)
        street, city, state, zipcode = '', '', '', ''
        for value, label in parsed:
            if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
                street += f' {value}'
            elif label == 'PlaceName':
                city = value
            elif label == 'StateName':
                state = value
            elif label == 'ZipCode':
                zipcode = value
        # The company id lives on the "view more" element; skip if missing.
        view_more = item.find('p', class_='view_more_eye')
        if not view_more or not view_more.has_attr('data-company-id'):
            continue
        company_id = view_more['data-company-id']
        companies.append({
            "company_id": company_id,
            "org_name": org_name,
            "address": address,
            "street": street.strip(),
            "city": city,
            "state": state,
            "zipcode": zipcode
        })
    detail_rows_all = []
    # Fetch detail pages in parallel and update the progress bar.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(_fetch_detail_for_company, comp, base_url, headers, cookies)
            for comp in companies
        ]
        total = len(futures)
        completed = 0
        for future in concurrent.futures.as_completed(futures):
            try:
                rows = future.result()
                detail_rows_all.extend(rows)
            except Exception as exc:
                st.write(f"Error fetching ACHC detail: {exc}")
            completed += 1
            if progress_bar is not None and total > 0:
                progress_bar.progress(min(100, int(100 * completed / total)))
    return pd.DataFrame(detail_rows_all, columns=columns)
# --- Streamlit UI ---
# Page header shown before the user starts a scrape.
st.title("Accreditation Data Scraper")
st.write("Click the button below to start scraping and generate an Excel file.")
def run_scraper():
    """
    Run both scrapers and build an in-memory Excel workbook.

    Scrapes URAC and ACHC (each with its own progress bar), writes each
    non-empty result to its own sheet, and — when both are present — an
    outer merge on "Organization Name" to a "Merged" sheet.

    Returns:
        io.BytesIO: the finished .xlsx file, positioned at offset 0.
    """
    # Scrape URAC data with its own progress bar.
    with st.spinner("Scraping URAC data..."):
        urac_progress = st.progress(0)
        urac_df = scrape_urac(progress_bar=urac_progress)
    # Scrape ACHC data with its own progress bar.
    with st.spinner("Scraping ACHC data..."):
        achc_progress = st.progress(0)
        achc_df = scrape_achc(progress_bar=achc_progress)
    # Merge data and write to an in-memory Excel file.
    with st.spinner("Merging data and generating Excel..."):
        output = io.BytesIO()
        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
            if not urac_df.empty:
                urac_df.to_excel(writer, sheet_name="URAC", index=False)
            if not achc_df.empty:
                achc_df.to_excel(writer, sheet_name="ACHC", index=False)
            if not urac_df.empty and not achc_df.empty:
                # Only merge when both sources returned data; an outer join
                # keeps organizations found by just one source.
                merged_df = pd.merge(urac_df, achc_df, on="Organization Name",
                                     how="outer", suffixes=("_URAC", "_ACHC"))
                merged_df.to_excel(writer, sheet_name="Merged", index=False)
            elif urac_df.empty and achc_df.empty:
                # xlsxwriter refuses to save a workbook with zero sheets;
                # write a placeholder so the download never fails outright.
                pd.DataFrame({"Note": ["No data scraped"]}).to_excel(
                    writer, sheet_name="Empty", index=False)
        output.seek(0)
        return output
# Kick off the scrape on demand and offer the finished workbook for download.
if st.button("Start Scraping"):
    excel_data = run_scraper()
    st.success("Scraping completed!")
    st.download_button(
        label="Download Excel File",
        data=excel_data,
        # Timestamped filename so repeated runs do not collide.
        file_name=f"combined_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )