from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import requests
import os
import time
import re
from urllib.parse import urljoin

DATA_DIR = 'data'
BASE_URL = 'https://archive.ics.uci.edu'
DATASETS_URL = f'{BASE_URL}/datasets'
ALLOWED_EXTENSIONS = ['.csv', '.data', '.txt', '.xls']
KEYWORDS = ['medical', 'health', 'disease', 'patient']
REQUEST_TIMEOUT = 30  # seconds; keeps a dead link from hanging the loop forever


def sanitize_filename(name):
    """Replace any character that is unsafe in a filename with an underscore."""
    return re.sub(r'[^\w\-_\. ]', '_', name)


def fetch_datasets():
    """Render the JavaScript-driven dataset listing in headless Chrome and
    collect links whose anchor text matches one of the KEYWORDS."""
    print("[Web Scraper] Launching headless browser...")
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(DATASETS_URL)
        print("[Web Scraper] Waiting for JavaScript to load...")
        time.sleep(5)  # crude but simple: allow time for JS to render
        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # Always release the browser, even if page load raises
        driver.quit()

    print("[Web Scraper] Extracting dataset links...")
    datasets = []
    seen_urls = set()
    for link in soup.find_all("a", href=True):
        href = link['href']
        text = link.text.strip().lower()
        if not href.startswith("/datasets/"):
            continue
        # Filter for relevant keywords in the dataset name
        if any(kw in text for kw in KEYWORDS):
            url = urljoin(BASE_URL, href)
            if url in seen_urls:  # the listing can repeat a link
                continue
            seen_urls.add(url)
            datasets.append({"name": link.text.strip(), "url": url})
    print(f"  Found {len(datasets)} relevant datasets.")
    return datasets


def download_dataset_files(dataset):
    """Fetch a dataset's detail page and download any linked data files."""
    resp = requests.get(dataset['url'], timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    title = sanitize_filename(dataset['name'])
    found = False
    for a in soup.find_all('a', href=True):
        href = a['href']
        if not any(href.lower().endswith(ext) for ext in ALLOWED_EXTENSIONS):
            continue
        # urljoin handles absolute, root-relative, and page-relative hrefs
        file_url = urljoin(dataset['url'], href)
        # Include the remote file name so that multiple files with the same
        # extension don't overwrite each other
        filename = f"{title}_{sanitize_filename(os.path.basename(href))}"
        filepath = os.path.join(DATA_DIR, filename)
        print(f'  Downloading: {file_url} -> {filepath}')
        r = requests.get(file_url, timeout=REQUEST_TIMEOUT)
        if r.ok:
            with open(filepath, 'wb') as f:
                f.write(r.content)
            found = True
        else:
            print(f'  Failed to download: {file_url}')
    if not found:
        print(f'  No data files found for {title}')


def main_loop():
    """Scrape on a fixed schedule: one full pass, then sleep for two hours."""
    while True:
        os.makedirs(DATA_DIR, exist_ok=True)
        datasets = fetch_datasets()
        for dataset in datasets:
            try:
                download_dataset_files(dataset)
            except Exception as e:
                # Keep going: one bad dataset page shouldn't abort the pass
                print(f'  Error processing {dataset["url"]}: {e}')
        print('[Web Scraper] Scraping complete. Waiting 2 hours...')
        time.sleep(7200)


if __name__ == "__main__":
    main_loop()