import os
import re
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
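
# Scrapes the UCI Machine Learning Repository dataset listing (rendered
# client-side, hence the headless browser), keeps dataset links whose text
# matches medical-related keywords, and downloads their raw data files
# into DATA_DIR on a two-hour cycle.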

DATA_DIR = 'data'
BASE_URL = 'https://archive.ics.uci.edu'
DATASETS_URL = f'{BASE_URL}/datasets'
# File extensions treated as downloadable raw data.
ALLOWED_EXTENSIONS = ['.csv', '.data', '.txt', '.xls']
# Compared against lower-cased link text, so matching is case-insensitive.
KEYWORDS = ['medical', 'health', 'disease', 'patient']

def sanitize_filename(name):
    # Replace anything other than word characters, dashes, dots, and spaces.
    return re.sub(r'[^\w\-. ]', '_', name)

def fetch_datasets():
    print("[Web Scraper] Launching headless browser...")
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(DATASETS_URL)
        print("[Web Scraper] Waiting for JavaScript to load...")
        time.sleep(5)  # crude fixed wait; the listing is rendered client-side
        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        driver.quit()  # always release the browser, even if the page load fails
    print("[Web Scraper] Extracting dataset links...")
    datasets = []
    seen = set()  # dataset pages can be linked more than once; dedupe by href
    for link in soup.find_all("a", href=True):
        href = link['href']
        text = link.text.strip().lower()
        if href.startswith("/datasets/") and href not in seen:
            # Filter for relevant keywords in the link text
            if any(kw in text for kw in KEYWORDS):
                seen.add(href)
                datasets.append({
                    "name": link.text.strip(),
                    "url": BASE_URL + href
                })
    print(f"  Found {len(datasets)} relevant datasets.")
    return datasets

def download_dataset_files(dataset):
    # Timeouts are a defensive choice so a dead link can't hang the loop.
    resp = requests.get(dataset['url'], timeout=30)
    soup = BeautifulSoup(resp.text, 'html.parser')
    title = sanitize_filename(dataset['name'])
    found = False
    for a in soup.find_all('a', href=True):
        href = a['href']
        if any(href.lower().endswith(ext) for ext in ALLOWED_EXTENSIONS):
            file_url = href if href.startswith('http') else BASE_URL + href
            # Keep the linked file's own name so datasets with several data
            # files don't overwrite each other under a single "{title}{ext}".
            basename = sanitize_filename(os.path.basename(href))
            filepath = os.path.join(DATA_DIR, f"{title}_{basename}")
            print(f'  Downloading: {file_url} -> {filepath}')
            r = requests.get(file_url, timeout=60)
            if r.ok:
                with open(filepath, 'wb') as f:
                    f.write(r.content)
                found = True
            else:
                print(f'    Failed to download: {file_url} (HTTP {r.status_code})')
    if not found:
        print(f'  No data files found for {title}')

def main_loop():
    os.makedirs(DATA_DIR, exist_ok=True)  # only needs to exist once, not per pass
    while True:
        datasets = fetch_datasets()
        for dataset in datasets:
            try:
                download_dataset_files(dataset)
            except Exception as e:
                print(f'  Error processing {dataset["url"]}: {e}')
        print('[Web Scraper] Scraping complete. Waiting 2 hours...')
        time.sleep(7200)  # 7200 seconds = 2 hours between passes

if __name__ == "__main__":
    main_loop()
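
# Requires Chrome and a matching driver; recent Selenium releases fetch the
# driver automatically via Selenium Manager.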