salmanmapkar committed on
Commit
8fd8293
·
verified ·
1 Parent(s): fd6caa1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +444 -2
app.py CHANGED
@@ -1,4 +1,446 @@
1
  import streamlit as st
 
 
 
 
 
 
 
 
2
 
3
- x = st.slider('Select a value')
4
- st.write(x, 'squared is', x * x)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import streamlit as st
2
+ import pandas as pd
3
+ import requests
4
+ import usaddress
5
+ import concurrent.futures
6
+ import re
7
+ from bs4 import BeautifulSoup
8
+ from datetime import datetime
9
+ import io
10
 
11
+ # --- Scraper functions (adapted from your scraper.py) ---
12
+
13
def fetch_detail(cert_number, main_data, headers):
    """
    Fetch site-level rows for one URAC certification number.

    Calls the URAC detail API for *cert_number* and returns a list of row
    dicts (copies of *main_data* merged with site fields), one per site
    address.  If the API returns no site records, a single row with blank
    site fields is returned so the organization still appears in the
    output.  On any request/parse error the error is reported via
    ``st.write`` and whatever rows were collected so far are returned.
    """
    detail_rows = []
    url = f"https://accreditnet.urac.org/api/urac/rest/directoryInfo/{cert_number}/certificationEntityInfo/type/Accreditation"
    try:
        # Timeout keeps one hung connection from blocking its thread-pool
        # worker indefinitely (this runs inside a ThreadPoolExecutor).
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()
        detail_data = response.json()
        entities = detail_data.get("certificationEntities", [])
        if not entities:
            # No site records: return row with blank site fields.
            row = main_data.copy()
            row.update({
                "Site Name": None,
                "Site Address": None,
                "Site Street": None,
                "Site City": None,
                "Site State": None,
                "Site ZipCode": None
            })
            detail_rows.append(row)
        else:
            for entity_item in entities:
                site_entity = entity_item.get("entity", {})
                site_name = site_entity.get("name", None)
                # Combine the non-empty site address parts in order.
                site_address_parts = []
                for key in ['line1', 'line2', 'city', 'stateName', 'zipcode']:
                    part = site_entity.get(key)
                    if part:
                        site_address_parts.append(part)
                site_address = ', '.join(site_address_parts)
                # Parse the combined address into components with usaddress.
                parsed_site = usaddress.parse(site_address)
                site_street, site_city, site_state, site_zipcode = '', '', '', ''
                for value, label in parsed_site:
                    if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
                        site_street += f' {value}'
                    elif label == 'PlaceName':
                        site_city = value
                    elif label == 'StateName':
                        site_state = value
                    elif label == 'ZipCode':
                        site_zipcode = value
                row = main_data.copy()
                row.update({
                    "Site Name": site_name,
                    "Site Address": site_address,
                    "Site Street": site_street.strip(),
                    "Site City": site_city,
                    "Site State": site_state,
                    "Site ZipCode": site_zipcode
                })
                detail_rows.append(row)
    except Exception as e:
        # Best-effort: surface the error in the UI and keep going.
        st.write(f"Error fetching detail for cert_number {cert_number}: {e}")
    return detail_rows
73
+
74
def scrape_urac():
    """
    Scrape URAC accreditation data.

    1. Call the main filter API (Specialty Pharmacy accreditation programs).
    2. Parse organization details, splitting each address with usaddress.
    3. For each organization, call the detail API in parallel (via
       ``fetch_detail``) to get one row per site address.

    Returns a pandas DataFrame of all site rows; an empty DataFrame if the
    main API call fails.
    """
    organizations = []
    all_rows = []
    headers = {
        'accept': '*/*',
        'accept-language': 'en-US,en;q=0.9',
        'content-type': 'application/json',
        'customerid': 'A20B3F2F-3426-41FA-8217-D3870E672D0C',
        'origin': 'https://accreditnet.urac.org',
        'priority': 'u=1, i',
        'referer': 'https://accreditnet.urac.org/directory/',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Brave";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'sec-gpc': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    # Filter payload: accreditation-program decisions for Specialty Pharmacy.
    json_data = {
        'filter': {
            'allParts': [
                {
                    'name': 'completedApplicationDecisionItem.typeDisplay.value',
                    'comparator': 0,
                    'valueType': 0,
                    'textValue': 'Accreditation Program',
                    'integerValue': None,
                    'decimalValue': None,
                    'dateTimeValue': None,
                    'booleanValue': None,
                    'innerFilter': None,
                },
                {
                    'name': 'certificateType.programName',
                    'comparator': 0,
                    'valueType': 0,
                    'textValue': 'Specialty Pharmacy',
                    'integerValue': None,
                    'decimalValue': None,
                    'dateTimeValue': None,
                    'booleanValue': None,
                    'innerFilter': None,
                },
            ],
            'anyParts': [],
            'notParts': [],
        },
        'orderBy': 'certificationNumber',
        'pageSize': 15,
        'limit': 100,
    }
    try:
        # Timeout keeps the app from hanging on an unresponsive endpoint.
        response = requests.post(
            'https://accreditnet.urac.org/api/urac/rest/directoryInfo/filter',
            headers=headers,
            json=json_data,
            timeout=30
        )
        response.raise_for_status()
        data = response.json()
    except Exception as e:
        st.write("Error processing URAC main API:", e)
        return pd.DataFrame()

    # Parse each organization item.
    for item in data.get('items', []):
        entity = item.get('entity', {})
        org_name = entity.get('name', None)
        decision = item.get('completedApplicationDecisionItem', {})
        outcome = decision.get('outcomeDisplay', {}).get('default', {}).get('value')
        # Prefer the decision outcome; fall back to the effective status.
        status = outcome if outcome is not None else item.get('effectiveStatusName', None)
        srt_date = item.get('issuedDate', None)
        exp_date = item.get('expirationDate', None)
        program = item.get('certificateType', {}).get('displayName', None)
        address_parts = []
        for key in ['line1', 'line2', 'city', 'stateName', 'zipcode']:
            part = entity.get(key)
            if part:
                address_parts.append(part)
        address = ', '.join(address_parts)
        # Split the combined address into components with usaddress.
        parsed_address = usaddress.parse(address)
        street, city, state, zipcode = '', '', '', ''
        for value, label in parsed_address:
            if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
                street += f' {value}'
            elif label == 'PlaceName':
                city = value
            elif label == 'StateName':
                state = value
            elif label == 'ZipCode':
                zipcode = value
        # Get certification number (primary certification wins if present).
        cert_number = item.get("primaryCertification", {}).get("certificationNumber")
        if not cert_number:
            cert_number = item.get("certificationNumber")
        org_data = {
            "Organization Name": org_name,
            "Accreditation Status": status,
            "Start Date": srt_date,
            "Expiration Date": exp_date,
            "Program": program,
            "Address": address,
            "Street": street.strip(),
            "City": city,
            "State": state,
            "ZipCode": zipcode,
            "Certification Number": cert_number
        }
        organizations.append(org_data)

    # Use a thread pool to fetch details in parallel; skip orgs with no
    # certification number since the detail URL requires one.
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        future_to_org = {
            executor.submit(fetch_detail, org["Certification Number"], org, headers): org
            for org in organizations if org["Certification Number"]
        }
        for future in concurrent.futures.as_completed(future_to_org):
            try:
                detail_rows = future.result()
                all_rows.extend(detail_rows)
            except Exception as exc:
                org = future_to_org[future]
                st.write(f"Error fetching detail for {org['Organization Name']}: {exc}")
    return pd.DataFrame(all_rows)
209
+
210
def _parse_accreditation_blocks(detail_soup):
    """
    Parse accreditation blocks (<div class="main_cont_det">) out of the
    detail soup and return a list of dicts with date, program, and
    service fields (blank strings when a field is absent).
    """
    parsed = []
    for block in detail_soup.find_all('div', class_='main_cont_det'):
        fields = {
            "Start Date": '',
            "Expiration Date": '',
            "SiteProgram": '',
            "SiteService": '',
        }
        for paragraph in block.find_all('p'):
            text = paragraph.get_text(strip=True)
            if 'Date:' in text:
                # Dates appear as "Date: mm/dd/yyyy Through mm/dd/yyyy".
                match = re.search(r'Date:\s*([\d/]+)\s*Through\s*([\d/]+)', text)
                if match:
                    fields["Start Date"], fields["Expiration Date"] = match.groups()
            elif 'Program:' in text:
                fields["SiteProgram"] = text.split('Program:')[-1].strip()
            elif 'Service:' in text:
                fields["SiteService"] = text.split('Service:')[-1].strip()
        parsed.append(fields)
    return parsed
237
+
238
def _extract_original_program(detail_soup):
    """
    Return the value following 'Program:' in the first matching <p> of the
    detail soup, or an empty string when none is found.
    """
    for paragraph in detail_soup.find_all('p'):
        if 'Program:' not in paragraph.get_text():
            continue
        return paragraph.get_text(strip=True).split('Program:')[-1].strip()
    return ''
248
+
249
def _fetch_detail_for_company(company, base_url, headers, cookies):
    """
    Fetch and parse the ACHC detail view for one company.

    Posts to the admin-ajax detail endpoint, parses the returned HTML
    fragment, and returns one row dict per accreditation block.  When no
    blocks are present, a single placeholder row with blank date/program/
    service fields is returned so the company still appears in the output.
    Errors are reported via ``st.write`` and an empty list is returned.
    """
    rows = []
    company_id = company["company_id"]
    detail_payload = f'action=view_provider_details&data_company_id={company_id}'
    try:
        # Timeout keeps one hung request from blocking its thread-pool worker.
        detail_resp = requests.post(base_url, headers=headers, cookies=cookies,
                                    data=detail_payload, timeout=30)
        detail_resp.raise_for_status()
        detail_json = detail_resp.json()
        detail_html = detail_json.get('response_html', '')
        detail_soup = BeautifulSoup(detail_html, 'html.parser')

        original_program = _extract_original_program(detail_soup)
        acc_blocks = _parse_accreditation_blocks(detail_soup)
        if not acc_blocks:
            # No accreditation blocks: substitute one blank block so the
            # loop below emits the same placeholder row the old two-branch
            # code produced.
            acc_blocks = [{
                "Start Date": '',
                "Expiration Date": '',
                "SiteProgram": '',
                "SiteService": ''
            }]
        for block in acc_blocks:
            rows.append({
                "Organization Name": company["org_name"],
                "Start Date": block["Start Date"],
                "Expiration Date": block["Expiration Date"],
                "Accreditation Status": "N/A",
                "Program": original_program,
                "SiteProgram": block["SiteProgram"],
                "SiteService": block["SiteService"],
                "Address": company["address"],
                "Street": company["street"],
                "City": company["city"],
                "State": company["state"],
                "ZipCode": company["zipcode"]
            })
    except Exception as e:
        st.write(f"Error fetching ACHC detail for company ID {company_id}: {e}")
    return rows
300
+
301
def scrape_achc():
    """
    Scrape ACHC provider data.

    1. Call the main admin-ajax API to get the provider-list HTML.
    2. Parse each company's name and address (split with usaddress).
    3. In parallel, call the detail API (via ``_fetch_detail_for_company``)
       to get accreditation details, one row per accreditation block.

    Returns a pandas DataFrame with a fixed column order; an empty
    DataFrame if the main API call fails.
    """
    headers = {
        'accept': 'application/json, text/javascript, */*; q=0.01',
        'accept-language': 'en-US,en;q=0.8',
        'content-type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'origin': 'https://www.achc.org',
        'priority': 'u=1, i',
        'referer': 'https://www.achc.org/find-a-provider/',
        'sec-ch-ua': '"Chromium";v="134", "Not:A-Brand";v="24", "Brave";v="134"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'empty',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'same-origin',
        'sec-gpc': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
    }
    cookies = {
        'SGPBShowingLimitationDomain18418': '{"openingCount":1,"openingPage":""}'
    }
    base_url = 'https://www.achc.org/wp-admin/admin-ajax.php'
    main_payload = 'action=filter_provider_data&provider_id=6&service_id=&country_id=&state_id=&quick_search='

    try:
        # Timeout keeps the app from hanging on an unresponsive endpoint.
        main_resp = requests.post(base_url, headers=headers, cookies=cookies,
                                  data=main_payload, timeout=30)
        main_resp.raise_for_status()
        main_json = main_resp.json()
    except Exception as e:
        st.write(f"Error fetching ACHC main API: {e}")
        return pd.DataFrame()

    main_html = main_json.get('response_html', '')
    main_soup = BeautifulSoup(main_html, 'html.parser')
    company_items = main_soup.find_all('li')
    companies = []
    for item in company_items:
        list_box = item.find('div', class_='list_cont_box')
        if not list_box:
            continue
        org_tag = list_box.find('b', class_='company_name')
        org_name = org_tag.get_text(strip=True) if org_tag else ''
        # Join all <p> texts for the address.
        address_parts = [p.get_text(strip=True) for p in list_box.find_all('p')]
        address = ' '.join(address_parts)
        parsed = usaddress.parse(address)
        street, city, state, zipcode = '', '', '', ''
        for value, label in parsed:
            if label in ('AddressNumber', 'StreetName', 'StreetNamePostType'):
                street += f' {value}'
            elif label == 'PlaceName':
                city = value
            elif label == 'StateName':
                state = value
            elif label == 'ZipCode':
                zipcode = value
        # The detail API needs the data-company-id from the "view more" link;
        # skip entries without one.
        view_more = item.find('p', class_='view_more_eye')
        if not view_more or not view_more.has_attr('data-company-id'):
            continue
        company_id = view_more['data-company-id']
        companies.append({
            "company_id": company_id,
            "org_name": org_name,
            "address": address,
            "street": street.strip(),
            "city": city,
            "state": state,
            "zipcode": zipcode
        })

    # Fetch details for all companies in parallel.
    detail_rows_all = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        futures = [
            executor.submit(_fetch_detail_for_company, comp, base_url, headers, cookies)
            for comp in companies
        ]
        for future in concurrent.futures.as_completed(futures):
            try:
                rows = future.result()
                detail_rows_all.extend(rows)
            except Exception as exc:
                st.write(f"Error fetching ACHC detail: {exc}")
    # Explicit column list pins the output order even when no rows came back.
    df = pd.DataFrame(detail_rows_all, columns=[
        "Organization Name",
        "Start Date",
        "Expiration Date",
        "Accreditation Status",
        "Program",
        "SiteProgram",
        "SiteService",
        "Address",
        "Street",
        "City",
        "State",
        "ZipCode"
    ])
    return df
407
+
408
# --- Streamlit UI ---

# Page header and one-line instructions shown before the button.
st.title("Accreditation Data Scraper")
st.write("Click the button below to start scraping and generate an Excel file.")
412
+
413
def run_scraper():
    """
    Run both scrapers, merge their results on "Organization Name", and
    return an in-memory Excel workbook (io.BytesIO, positioned at 0) with
    three sheets: URAC, ACHC, and Merged.  A progress bar tracks the
    three stages.
    """
    progress_bar = st.progress(0)

    with st.spinner("Scraping URAC data..."):
        urac_df = scrape_urac()
        progress_bar.progress(33)

    with st.spinner("Scraping ACHC data..."):
        achc_df = scrape_achc()
        progress_bar.progress(66)

    with st.spinner("Merging data and generating Excel..."):
        # Outer merge keeps organizations that appear in only one source.
        merged_df = pd.merge(urac_df, achc_df, on="Organization Name", how="outer",
                             suffixes=("_URAC", "_ACHC"))
        # Write to an in-memory bytes buffer.
        output = io.BytesIO()
        # The context manager saves and closes the workbook on exit.
        # (An explicit writer.save() here would raise AttributeError on
        # pandas >= 2.0, where the method was removed.)
        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
            urac_df.to_excel(writer, sheet_name="URAC", index=False)
            achc_df.to_excel(writer, sheet_name="ACHC", index=False)
            merged_df.to_excel(writer, sheet_name="Merged", index=False)
        output.seek(0)
        progress_bar.progress(100)
    return output
437
+
438
# Button-triggered entry point: scrape, then offer the workbook for download.
if st.button("Start Scraping"):
    excel_data = run_scraper()
    st.success("Scraping completed!")
    # File name carries a timestamp so repeated runs don't collide.
    st.download_button(
        label="Download Excel File",
        data=excel_data,
        file_name=f"combined_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx",
        mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )