from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import requests
import os
import time
import re

DATA_DIR = 'data'
BASE_URL = 'https://archive.ics.uci.edu'
DATASETS_URL = f'{BASE_URL}/datasets'
ALLOWED_EXTENSIONS = ['.csv', '.data', '.txt', '.xls']
KEYWORDS = ['medical', 'health', 'disease', 'patient']


def sanitize_filename(name):
    # Replace anything that is not a word character, hyphen, underscore,
    # dot, or space so the dataset name is safe to use as a filename.
    return re.sub(r'[^\w\-_\. ]', '_', name)

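# A hedged usage sketch (the dataset name below is illustrative only):
#   sanitize_filename("Heart Disease (Cleveland)")  ->  "Heart Disease _Cleveland_"
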

def fetch_datasets():
    print("[Web Scraper] Launching headless browser...")
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    driver = webdriver.Chrome(options=options)
    driver.get(DATASETS_URL)

    print("[Web Scraper] Waiting for JavaScript to load...")
    time.sleep(5)  # allow time for JS to render
    soup = BeautifulSoup(driver.page_source, "html.parser")
    driver.quit()

    print("[Web Scraper] Extracting dataset links...")
    datasets = []
    for link in soup.find_all("a", href=True):
        href = link['href']
        text = link.text.strip().lower()
        if href.startswith("/datasets/"):
            # Filter for relevant keywords in name
            if any(kw in text for kw in KEYWORDS):
                datasets.append({
                    "name": link.text.strip(),
                    "url": BASE_URL + href
                })
    print(f" Found {len(datasets)} relevant datasets.")
    return datasets

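# Note: the fixed time.sleep(5) in fetch_datasets() is a simple but brittle
# wait. A sketch of a sturdier alternative using Selenium's explicit waits,
# assuming at least one /datasets/ link eventually renders on the page:
#
#   from selenium.webdriver.common.by import By
#   from selenium.webdriver.support.ui import WebDriverWait
#   from selenium.webdriver.support import expected_conditions as EC
#
#   WebDriverWait(driver, 15).until(
#       EC.presence_of_element_located((By.CSS_SELECTOR, 'a[href^="/datasets/"]'))
#   )
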

def download_dataset_files(dataset):
    resp = requests.get(dataset['url'])
    soup = BeautifulSoup(resp.text, 'html.parser')
    title = sanitize_filename(dataset['name'])
    found = False
    for a in soup.find_all('a', href=True):
        href = a['href']
        if any(href.lower().endswith(ext) for ext in ALLOWED_EXTENSIONS):
            file_url = href if href.startswith('http') else BASE_URL + href
            ext = os.path.splitext(href)[1]
            # Note: files sharing an extension reuse the same name, so a later
            # file from the same dataset page overwrites an earlier one.
            filename = f"{title}{ext}"
            filepath = os.path.join(DATA_DIR, filename)
            print(f' Downloading: {file_url} -> {filepath}')
            r = requests.get(file_url)
            if r.ok:
                with open(filepath, 'wb') as f:
                    f.write(r.content)
                found = True
            else:
                print(f' Failed to download: {file_url}')
    if not found:
        print(f' No data files found for {title}')

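# Note: requests.get(file_url) above reads each file fully into memory before
# writing it out. For large files, a streamed download (standard requests API)
# is easier on memory; a minimal sketch:
#
#   with requests.get(file_url, stream=True, timeout=60) as r:
#       r.raise_for_status()
#       with open(filepath, 'wb') as f:
#           for chunk in r.iter_content(chunk_size=8192):
#               f.write(chunk)
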

def main_loop():
    while True:
        os.makedirs(DATA_DIR, exist_ok=True)
        datasets = fetch_datasets()
        for dataset in datasets:
            try:
                download_dataset_files(dataset)
            except Exception as e:
                print(f' Error processing {dataset["url"]}: {e}')
        print('[Web Scraper] Scraping complete. Waiting 2 hours...')
        time.sleep(7200)


if __name__ == "__main__":
    main_loop()