import os
import re
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
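
# Scrapes the UCI Machine Learning Repository dataset listing (rendered
# client-side, hence the headless browser), keeps dataset links whose text
# matches medical-related keywords, and downloads their raw data files
# into DATA_DIR on a two-hour cycle.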

DATA_DIR = 'data'
BASE_URL = 'https://archive.ics.uci.edu'
DATASETS_URL = f'{BASE_URL}/datasets'
# File extensions treated as downloadable raw data.
ALLOWED_EXTENSIONS = ['.csv', '.data', '.txt', '.xls']
# Compared against lower-cased link text, so matching is case-insensitive.
KEYWORDS = ['medical', 'health', 'disease', 'patient']

def sanitize_filename(name):
    # Replace anything other than word characters, dashes, dots, and spaces.
    return re.sub(r'[^\w\-. ]', '_', name)

def fetch_datasets():
    print("[Web Scraper] Launching headless browser...")
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(DATASETS_URL)
        print("[Web Scraper] Waiting for JavaScript to load...")
        time.sleep(5)  # crude fixed wait; the listing is rendered client-side
        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        driver.quit()  # always release the browser, even if the page load fails
    print("[Web Scraper] Extracting dataset links...")
    datasets = []
    seen = set()  # dataset pages can be linked more than once; dedupe by href
    for link in soup.find_all("a", href=True):
        href = link['href']
        text = link.text.strip().lower()
        if href.startswith("/datasets/") and href not in seen:
            # Filter for relevant keywords in the link text
            if any(kw in text for kw in KEYWORDS):
                seen.add(href)
                datasets.append({
                    "name": link.text.strip(),
                    "url": BASE_URL + href
                })
    print(f"  Found {len(datasets)} relevant datasets.")
    return datasets

def download_dataset_files(dataset):
    # Timeouts are a defensive choice so a dead link can't hang the loop.
    resp = requests.get(dataset['url'], timeout=30)
    soup = BeautifulSoup(resp.text, 'html.parser')
    title = sanitize_filename(dataset['name'])
    found = False
    for a in soup.find_all('a', href=True):
        href = a['href']
        if any(href.lower().endswith(ext) for ext in ALLOWED_EXTENSIONS):
            file_url = href if href.startswith('http') else BASE_URL + href
            # Keep the linked file's own name so datasets with several data
            # files don't overwrite each other under a single "{title}{ext}".
            basename = sanitize_filename(os.path.basename(href))
            filepath = os.path.join(DATA_DIR, f"{title}_{basename}")
            print(f'  Downloading: {file_url} -> {filepath}')
            r = requests.get(file_url, timeout=60)
            if r.ok:
                with open(filepath, 'wb') as f:
                    f.write(r.content)
                found = True
            else:
                print(f'    Failed to download: {file_url} (HTTP {r.status_code})')
    if not found:
        print(f'  No data files found for {title}')

def main_loop():
    os.makedirs(DATA_DIR, exist_ok=True)  # only needs to exist once, not per pass
    while True:
        datasets = fetch_datasets()
        for dataset in datasets:
            try:
                download_dataset_files(dataset)
            except Exception as e:
                print(f'  Error processing {dataset["url"]}: {e}')
        print('[Web Scraper] Scraping complete. Waiting 2 hours...')
        time.sleep(7200)  # 7200 seconds = 2 hours between passes

if __name__ == "__main__":
    main_loop()
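
# Requires Chrome and a matching driver; recent Selenium releases fetch the
# driver automatically via Selenium Manager.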