# Healthmodels/src/web_scraper.py
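"""Scrape health-related datasets from the UCI Machine Learning Repository.

Renders the JavaScript-driven dataset index with headless Chrome, filters
dataset links by health-related keywords, and downloads any recognised data
files from each dataset page into the local ``data/`` directory.
"""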
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import requests
import os
import time
import re
from urllib.parse import urljoin
DATA_DIR = 'data'
BASE_URL = 'https://archive.ics.uci.edu'
DATASETS_URL = f'{BASE_URL}/datasets'
ALLOWED_EXTENSIONS = ['.csv', '.data', '.txt', '.xls']
KEYWORDS = ['medical', 'health', 'disease', 'patient']
def sanitize_filename(name):
    """Replace characters that are unsafe in filenames with underscores."""
    return re.sub(r'[^\w\-_\. ]', '_', name)
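# Example: sanitize_filename("Heart Disease (Cleveland)") -> "Heart Disease _Cleveland_"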
def fetch_datasets():
    """Render the UCI datasets index and return health-related dataset links."""
    print("[Web Scraper] Launching headless browser...")
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(DATASETS_URL)
        print("[Web Scraper] Waiting for JavaScript to load...")
        time.sleep(5)  # allow time for JS to render the dataset list
        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        driver.quit()  # always release the browser, even on error
    print("[Web Scraper] Extracting dataset links...")
    datasets = []
    seen = set()
    for link in soup.find_all("a", href=True):
        href = link["href"]
        text = link.text.strip().lower()
        # Keep only dataset pages whose link text mentions a health keyword;
        # track hrefs already seen so duplicates are not queued twice.
        if href.startswith("/datasets/") and href not in seen:
            if any(kw in text for kw in KEYWORDS):
                seen.add(href)
                datasets.append({
                    "name": link.text.strip(),
                    "url": BASE_URL + href,
                })
    print(f"  Found {len(datasets)} relevant datasets.")
    return datasets
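# Note: the fixed time.sleep(5) above is fragile on slow connections. A more
# robust variant would poll for the dataset links with Selenium's explicit
# waits; a minimal sketch, assuming the rendered page exposes anchors
# matching "a[href^='/datasets/']":
#
#     from selenium.webdriver.common.by import By
#     from selenium.webdriver.support.ui import WebDriverWait
#     from selenium.webdriver.support import expected_conditions as EC
#
#     WebDriverWait(driver, 15).until(
#         EC.presence_of_element_located((By.CSS_SELECTOR, "a[href^='/datasets/']"))
#     )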
def download_dataset_files(dataset):
    """Download any recognised data files linked from a dataset's page."""
    resp = requests.get(dataset['url'], timeout=30)
    soup = BeautifulSoup(resp.text, 'html.parser')
    title = sanitize_filename(dataset['name'])
    found = False
    for i, a in enumerate(soup.find_all('a', href=True)):
        href = a['href']
        if any(href.lower().endswith(ext) for ext in ALLOWED_EXTENSIONS):
            # urljoin handles absolute, root-relative, and relative hrefs.
            file_url = urljoin(dataset['url'], href)
            ext = os.path.splitext(href)[1]
            # Suffix with the link index so multiple files of the same
            # type on one page do not overwrite one another.
            filename = f"{title}_{i}{ext}"
            filepath = os.path.join(DATA_DIR, filename)
            print(f'  Downloading: {file_url} -> {filepath}')
            r = requests.get(file_url, timeout=60)
            if r.ok:
                with open(filepath, 'wb') as f:
                    f.write(r.content)
                found = True
            else:
                print(f'  Failed to download: {file_url}')
    if not found:
        print(f'  No data files found for {title}')
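# For large files, buffering the whole response in memory via r.content can
# be wasteful. A minimal streaming alternative (a sketch, not part of the
# original flow; the chunk size is an arbitrary choice):
#
#     with requests.get(file_url, timeout=60, stream=True) as r:
#         r.raise_for_status()
#         with open(filepath, 'wb') as f:
#             for chunk in r.iter_content(chunk_size=8192):
#                 f.write(chunk)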
def main_loop():
    """Scrape and download on a fixed two-hour cycle."""
    while True:
        os.makedirs(DATA_DIR, exist_ok=True)
        datasets = fetch_datasets()
        for dataset in datasets:
            try:
                download_dataset_files(dataset)
            except Exception as e:
                print(f'  Error processing {dataset["url"]}: {e}')
        print('[Web Scraper] Scraping complete. Waiting 2 hours...')
        time.sleep(7200)  # 2 hours between scraping passes

if __name__ == "__main__":
    main_loop()