Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -12,8 +12,8 @@ import io
|
|
| 12 |
|
| 13 |
def fetch_detail(cert_number, main_data, headers):
|
| 14 |
"""
|
| 15 |
-
For a given certification number, call the URAC detail API and return a list of rows
|
| 16 |
-
|
| 17 |
"""
|
| 18 |
detail_rows = []
|
| 19 |
url = f"https://accreditnet.urac.org/api/urac/rest/directoryInfo/{cert_number}/certificationEntityInfo/type/Accreditation"
|
|
@@ -23,7 +23,6 @@ def fetch_detail(cert_number, main_data, headers):
|
|
| 23 |
detail_data = response.json()
|
| 24 |
entities = detail_data.get("certificationEntities", [])
|
| 25 |
if not entities:
|
| 26 |
-
# No site records: return row with blank site fields.
|
| 27 |
row = main_data.copy()
|
| 28 |
row.update({
|
| 29 |
"Site Name": None,
|
|
@@ -71,12 +70,12 @@ def fetch_detail(cert_number, main_data, headers):
|
|
| 71 |
st.write(f"Error fetching detail for cert_number {cert_number}: {e}")
|
| 72 |
return detail_rows
|
| 73 |
|
| 74 |
-
def scrape_urac():
|
| 75 |
"""
|
| 76 |
Scrape URAC accreditation data:
|
| 77 |
1. Call the main filter API.
|
| 78 |
2. Parse organization details.
|
| 79 |
-
3. For each organization, call the detail API
|
| 80 |
Returns a pandas DataFrame.
|
| 81 |
"""
|
| 82 |
organizations = []
|
|
@@ -143,7 +142,7 @@ def scrape_urac():
|
|
| 143 |
st.write("Error processing URAC main API:", e)
|
| 144 |
return pd.DataFrame()
|
| 145 |
|
| 146 |
-
# Parse
|
| 147 |
for item in data.get('items', []):
|
| 148 |
entity = item.get('entity', {})
|
| 149 |
org_name = entity.get('name', None)
|
|
@@ -170,7 +169,6 @@ def scrape_urac():
|
|
| 170 |
state = value
|
| 171 |
elif label == 'ZipCode':
|
| 172 |
zipcode = value
|
| 173 |
-
# Get certification number.
|
| 174 |
cert_number = item.get("primaryCertification", {}).get("certificationNumber")
|
| 175 |
if not cert_number:
|
| 176 |
cert_number = item.get("certificationNumber")
|
|
@@ -189,7 +187,7 @@ def scrape_urac():
|
|
| 189 |
}
|
| 190 |
organizations.append(org_data)
|
| 191 |
|
| 192 |
-
#
|
| 193 |
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
|
| 194 |
future_to_org = {
|
| 195 |
executor.submit(fetch_detail, org["Certification Number"], org, headers): org
|
|
@@ -205,6 +203,8 @@ def scrape_urac():
|
|
| 205 |
org = future_to_org[future]
|
| 206 |
st.write(f"Error fetching detail for {org['Organization Name']}: {exc}")
|
| 207 |
completed += 1
|
|
|
|
|
|
|
| 208 |
return pd.DataFrame(all_rows)
|
| 209 |
|
| 210 |
def _parse_accreditation_blocks(detail_soup):
|
|
@@ -298,7 +298,7 @@ def _fetch_detail_for_company(company, base_url, headers, cookies):
|
|
| 298 |
st.write(f"Error fetching ACHC detail for company ID {company_id}: {e}")
|
| 299 |
return rows
|
| 300 |
|
| 301 |
-
def scrape_achc():
|
| 302 |
"""
|
| 303 |
Scrape ACHC data:
|
| 304 |
1. Call the main API to get HTML.
|
|
@@ -334,7 +334,7 @@ def scrape_achc():
|
|
| 334 |
main_json = main_resp.json()
|
| 335 |
except Exception as e:
|
| 336 |
st.write(f"Error fetching ACHC main API: {e}")
|
| 337 |
-
return pd.DataFrame()
|
| 338 |
|
| 339 |
main_html = main_json.get('response_html', '')
|
| 340 |
main_soup = BeautifulSoup(main_html, 'html.parser')
|
|
@@ -346,7 +346,6 @@ def scrape_achc():
|
|
| 346 |
continue
|
| 347 |
org_tag = list_box.find('b', class_='company_name')
|
| 348 |
org_name = org_tag.get_text(strip=True) if org_tag else ''
|
| 349 |
-
# Join all <p> texts for the address.
|
| 350 |
address_parts = [p.get_text(strip=True) for p in list_box.find_all('p')]
|
| 351 |
address = ' '.join(address_parts)
|
| 352 |
parsed = usaddress.parse(address)
|
|
@@ -389,6 +388,8 @@ def scrape_achc():
|
|
| 389 |
except Exception as exc:
|
| 390 |
st.write(f"Error fetching ACHC detail: {exc}")
|
| 391 |
completed += 1
|
|
|
|
|
|
|
| 392 |
df = pd.DataFrame(detail_rows_all, columns=[
|
| 393 |
"Organization Name",
|
| 394 |
"Start Date",
|
|
@@ -411,28 +412,28 @@ st.title("Accreditation Data Scraper")
|
|
| 411 |
st.write("Click the button below to start scraping and generate an Excel file.")
|
| 412 |
|
| 413 |
def run_scraper():
|
| 414 |
-
|
| 415 |
-
|
| 416 |
with st.spinner("Scraping URAC data..."):
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
|
| 420 |
with st.spinner("Scraping ACHC data..."):
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
with st.spinner("Merging data and generating Excel..."):
|
| 425 |
-
merged_df = pd.merge(urac_df, achc_df, on="Organization Name", how="outer",
|
| 426 |
suffixes=("_URAC", "_ACHC"))
|
| 427 |
-
# Write to an in-memory bytes buffer.
|
| 428 |
output = io.BytesIO()
|
| 429 |
with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
|
| 430 |
-
urac_df.
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
|
|
|
|
|
|
|
|
|
| 434 |
output.seek(0)
|
| 435 |
-
progress_bar.progress(100)
|
| 436 |
return output
|
| 437 |
|
| 438 |
if st.button("Start Scraping"):
|
|
|
|
| 12 |
|
| 13 |
def fetch_detail(cert_number, main_data, headers):
|
| 14 |
"""
|
| 15 |
+
For a given certification number, call the URAC detail API and return a list of rows.
|
| 16 |
+
If no site records are returned, a row with blank site fields is returned.
|
| 17 |
"""
|
| 18 |
detail_rows = []
|
| 19 |
url = f"https://accreditnet.urac.org/api/urac/rest/directoryInfo/{cert_number}/certificationEntityInfo/type/Accreditation"
|
|
|
|
| 23 |
detail_data = response.json()
|
| 24 |
entities = detail_data.get("certificationEntities", [])
|
| 25 |
if not entities:
|
|
|
|
| 26 |
row = main_data.copy()
|
| 27 |
row.update({
|
| 28 |
"Site Name": None,
|
|
|
|
| 70 |
st.write(f"Error fetching detail for cert_number {cert_number}: {e}")
|
| 71 |
return detail_rows
|
| 72 |
|
| 73 |
+
def scrape_urac(progress_bar=None):
|
| 74 |
"""
|
| 75 |
Scrape URAC accreditation data:
|
| 76 |
1. Call the main filter API.
|
| 77 |
2. Parse organization details.
|
| 78 |
+
3. For each organization, call the detail API in parallel to get one row per site address.
|
| 79 |
Returns a pandas DataFrame.
|
| 80 |
"""
|
| 81 |
organizations = []
|
|
|
|
| 142 |
st.write("Error processing URAC main API:", e)
|
| 143 |
return pd.DataFrame()
|
| 144 |
|
| 145 |
+
# Parse organization items.
|
| 146 |
for item in data.get('items', []):
|
| 147 |
entity = item.get('entity', {})
|
| 148 |
org_name = entity.get('name', None)
|
|
|
|
| 169 |
state = value
|
| 170 |
elif label == 'ZipCode':
|
| 171 |
zipcode = value
|
|
|
|
| 172 |
cert_number = item.get("primaryCertification", {}).get("certificationNumber")
|
| 173 |
if not cert_number:
|
| 174 |
cert_number = item.get("certificationNumber")
|
|
|
|
| 187 |
}
|
| 188 |
organizations.append(org_data)
|
| 189 |
|
| 190 |
+
# Fetch detail API calls in parallel and update the progress bar.
|
| 191 |
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
|
| 192 |
future_to_org = {
|
| 193 |
executor.submit(fetch_detail, org["Certification Number"], org, headers): org
|
|
|
|
| 203 |
org = future_to_org[future]
|
| 204 |
st.write(f"Error fetching detail for {org['Organization Name']}: {exc}")
|
| 205 |
completed += 1
|
| 206 |
+
if progress_bar is not None and total > 0:
|
| 207 |
+
progress_bar.progress(min(100, int(100 * completed / total)))
|
| 208 |
return pd.DataFrame(all_rows)
|
| 209 |
|
| 210 |
def _parse_accreditation_blocks(detail_soup):
|
|
|
|
| 298 |
st.write(f"Error fetching ACHC detail for company ID {company_id}: {e}")
|
| 299 |
return rows
|
| 300 |
|
| 301 |
+
def scrape_achc(progress_bar=None):
|
| 302 |
"""
|
| 303 |
Scrape ACHC data:
|
| 304 |
1. Call the main API to get HTML.
|
|
|
|
| 334 |
main_json = main_resp.json()
|
| 335 |
except Exception as e:
|
| 336 |
st.write(f"Error fetching ACHC main API: {e}")
|
| 337 |
+
return pd.DataFrame({"Organization Name":[]}, columns=['Organization Name'])
|
| 338 |
|
| 339 |
main_html = main_json.get('response_html', '')
|
| 340 |
main_soup = BeautifulSoup(main_html, 'html.parser')
|
|
|
|
| 346 |
continue
|
| 347 |
org_tag = list_box.find('b', class_='company_name')
|
| 348 |
org_name = org_tag.get_text(strip=True) if org_tag else ''
|
|
|
|
| 349 |
address_parts = [p.get_text(strip=True) for p in list_box.find_all('p')]
|
| 350 |
address = ' '.join(address_parts)
|
| 351 |
parsed = usaddress.parse(address)
|
|
|
|
| 388 |
except Exception as exc:
|
| 389 |
st.write(f"Error fetching ACHC detail: {exc}")
|
| 390 |
completed += 1
|
| 391 |
+
if progress_bar is not None and total > 0:
|
| 392 |
+
progress_bar.progress(min(100, int(100 * completed / total)))
|
| 393 |
df = pd.DataFrame(detail_rows_all, columns=[
|
| 394 |
"Organization Name",
|
| 395 |
"Start Date",
|
|
|
|
| 412 |
st.write("Click the button below to start scraping and generate an Excel file.")
|
| 413 |
|
| 414 |
def run_scraper():
    """
    Scrape both accreditation sources and build an in-memory Excel workbook.

    Runs scrape_urac and scrape_achc, each under its own spinner and
    progress bar, then writes one sheet per non-empty result ("URAC",
    "ACHC") plus a "Merged" sheet (outer join on "Organization Name") when
    both results contain rows.

    Returns:
        io.BytesIO: buffer holding the workbook, rewound to offset 0.
    """
    # Scrape URAC data with its own progress bar.
    with st.spinner("Scraping URAC data..."):
        urac_progress = st.progress(0)
        urac_df = scrape_urac(progress_bar=urac_progress)

    # Scrape ACHC data with its own progress bar.
    with st.spinner("Scraping ACHC data..."):
        achc_progress = st.progress(0)
        achc_df = scrape_achc(progress_bar=achc_progress)

    # Write the available data to an in-memory Excel file.
    with st.spinner("Merging data and generating Excel..."):
        has_urac = not urac_df.empty
        has_achc = not achc_df.empty

        output = io.BytesIO()
        # Leaving the ExcelWriter context saves and closes the workbook.
        with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
            if has_urac:
                urac_df.to_excel(writer, sheet_name="URAC", index=False)
            if has_achc:
                achc_df.to_excel(writer, sheet_name="ACHC", index=False)
            if has_urac and has_achc:
                # Merge only when both frames have rows: a failed scrape can
                # hand back a column-less DataFrame, and merging on a key
                # that is missing from either side raises KeyError.
                merged_df = pd.merge(
                    urac_df,
                    achc_df,
                    on="Organization Name",
                    how="outer",
                    suffixes=("_URAC", "_ACHC"),
                )
                merged_df.to_excel(writer, sheet_name="Merged", index=False)
        output.seek(0)
    return output
|
| 438 |
|
| 439 |
if st.button("Start Scraping"):
|