# Healthmodels/src/web_scraper.py
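"""Scrape health-related datasets from the UCI Machine Learning Repository.

Renders the JavaScript-driven dataset index with headless Chrome, filters
dataset links by health-related keywords, and downloads any recognised data
files from each dataset page into the local ``data/`` directory.
"""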
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import requests
import os
import time
import re
from urllib.parse import urljoin
DATA_DIR = 'data'
BASE_URL = 'https://archive.ics.uci.edu'
DATASETS_URL = f'{BASE_URL}/datasets'
ALLOWED_EXTENSIONS = ['.csv', '.data', '.txt', '.xls']
KEYWORDS = ['medical', 'health', 'disease', 'patient']
def sanitize_filename(name):
    """Replace characters that are unsafe in filenames with underscores."""
    return re.sub(r'[^\w\-_\. ]', '_', name)
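# Example: sanitize_filename("Heart Disease (Cleveland)") -> "Heart Disease _Cleveland_"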
def fetch_datasets():
    """Render the UCI datasets index and return health-related dataset links."""
    print("[Web Scraper] Launching headless browser...")
    options = Options()
    options.add_argument("--headless")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(DATASETS_URL)
        print("[Web Scraper] Waiting for JavaScript to load...")
        time.sleep(5)  # allow time for JS to render the dataset list
        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        driver.quit()  # always release the browser, even on error
    print("[Web Scraper] Extracting dataset links...")
    datasets = []
    seen = set()
    for link in soup.find_all("a", href=True):
        href = link["href"]
        text = link.text.strip().lower()
        # Keep only dataset pages whose link text mentions a health keyword;
        # track hrefs already seen so duplicates are not queued twice.
        if href.startswith("/datasets/") and href not in seen:
            if any(kw in text for kw in KEYWORDS):
                seen.add(href)
                datasets.append({
                    "name": link.text.strip(),
                    "url": BASE_URL + href,
                })
    print(f"  Found {len(datasets)} relevant datasets.")
    return datasets
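# Note: the fixed time.sleep(5) above is fragile on slow connections. A more
# robust variant would poll for the dataset links with Selenium's explicit
# waits; a minimal sketch, assuming the rendered page exposes anchors
# matching "a[href^='/datasets/']":
#
#     from selenium.webdriver.common.by import By
#     from selenium.webdriver.support.ui import WebDriverWait
#     from selenium.webdriver.support import expected_conditions as EC
#
#     WebDriverWait(driver, 15).until(
#         EC.presence_of_element_located((By.CSS_SELECTOR, "a[href^='/datasets/']"))
#     )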
def download_dataset_files(dataset):
    """Download any recognised data files linked from a dataset's page."""
    resp = requests.get(dataset['url'], timeout=30)
    soup = BeautifulSoup(resp.text, 'html.parser')
    title = sanitize_filename(dataset['name'])
    found = False
    for i, a in enumerate(soup.find_all('a', href=True)):
        href = a['href']
        if any(href.lower().endswith(ext) for ext in ALLOWED_EXTENSIONS):
            # urljoin handles absolute, root-relative, and relative hrefs.
            file_url = urljoin(dataset['url'], href)
            ext = os.path.splitext(href)[1]
            # Suffix with the link index so multiple files of the same
            # type on one page do not overwrite one another.
            filename = f"{title}_{i}{ext}"
            filepath = os.path.join(DATA_DIR, filename)
            print(f'  Downloading: {file_url} -> {filepath}')
            r = requests.get(file_url, timeout=60)
            if r.ok:
                with open(filepath, 'wb') as f:
                    f.write(r.content)
                found = True
            else:
                print(f'  Failed to download: {file_url}')
    if not found:
        print(f'  No data files found for {title}')
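# For large files, buffering the whole response in memory via r.content can
# be wasteful. A minimal streaming alternative (a sketch, not part of the
# original flow; the chunk size is an arbitrary choice):
#
#     with requests.get(file_url, timeout=60, stream=True) as r:
#         r.raise_for_status()
#         with open(filepath, 'wb') as f:
#             for chunk in r.iter_content(chunk_size=8192):
#                 f.write(chunk)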
def main_loop():
    """Scrape and download on a fixed two-hour cycle."""
    while True:
        os.makedirs(DATA_DIR, exist_ok=True)
        datasets = fetch_datasets()
        for dataset in datasets:
            try:
                download_dataset_files(dataset)
            except Exception as e:
                print(f'  Error processing {dataset["url"]}: {e}')
        print('[Web Scraper] Scraping complete. Waiting 2 hours...')
        time.sleep(7200)  # 2 hours between scraping passes

if __name__ == "__main__":
    main_loop()