# File size: 3,348 Bytes
#
# dcd2bd2

import os
import requests
import zipfile
import tarfile
from tqdm import tqdm

# Root directory under which every archive is downloaded and extracted
BASE_DIR = "./datasets"

# Mapping of dataset name -> archive URL. The name doubles as the archive's
# local file name (plus extension) and as its extraction sub-directory.
DATASETS = {
    # High-resolution training & validation data (for H100s) -- currently disabled

    #"Flickr2K": "https://cv.snu.ac.kr/research/EDSR/Flickr2K.tar",

    # Standard benchmark test sets: a zip snapshot of the master branch of
    # the cszn/FFDNet GitHub repository
    "Test_Datasets": "https://github.com/cszn/FFDNet/archive/refs/heads/master.zip"
}


def download_file(url, dest_path):
    """Download ``url`` to ``dest_path`` while showing a tqdm progress bar.

    The response body is streamed into ``<dest_path>.part`` and only renamed
    to ``dest_path`` once it has been written completely. An interrupted run
    (network error, Ctrl-C, full disk, ...) therefore never leaves a
    truncated file behind that a later run would mistake for a finished
    download and skip.

    Args:
        url: HTTP(S) address of the file to fetch.
        dest_path: Where the finished file is stored. If this path already
            exists, nothing is downloaded.

    Returns:
        True if ``dest_path`` exists when the function returns (it was
        already present or was downloaded now), False if the request failed.
        Request errors are reported on stdout rather than raised.
    """
    if os.path.exists(dest_path):
        print(f"[*] {os.path.basename(dest_path)} already exists. Skipping download.")
        return True

    print(f"[*] Downloading {url}...")

    # Disguise the script as a standard web browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    block_size = 1024 * 1024  # 1 MB chunks
    part_path = f"{dest_path}.part"

    try:
        # Using the response as a context manager releases the streamed
        # connection even when the loop below is interrupted.
        with requests.get(url, stream=True, headers=headers, timeout=30) as response:
            response.raise_for_status()

            # A missing Content-Length means the size is unknown; tqdm
            # expects None (not 0) in that case.
            total_size = int(response.headers.get('content-length', 0)) or None

            with open(part_path, 'wb') as file, tqdm(
                total=total_size, unit='B', unit_scale=True, desc=os.path.basename(dest_path)
            ) as bar:
                for data in response.iter_content(block_size):
                    if data:  # skip empty keep-alive chunks
                        file.write(data)
                        bar.update(len(data))

        # Publish the file only after the whole body has been written.
        os.replace(part_path, dest_path)
        return True

    except requests.exceptions.RequestException as e:
        print(f"\n[!] The server rejected the connection: {e}")
        print(f"[!] Skipping {os.path.basename(dest_path)}. You can proceed without it.")
        return False

    finally:
        # Remove the partial file on every failure path, including
        # exceptions that are not RequestException and propagate to the caller.
        if os.path.exists(part_path):
            os.remove(part_path)

def _extract_tar_safely(tar_ref, extract_to):
    """Extract an open tar archive without letting members escape ``extract_to``."""
    if hasattr(tarfile, "data_filter"):
        # Interpreters with extraction filters: the "data" filter rejects
        # absolute paths, ".." traversal, escaping links and device files.
        tar_ref.extractall(extract_to, filter="data")
        return

    # Older interpreters: best-effort path check before extracting anything.
    root = os.path.realpath(extract_to)
    for member in tar_ref.getmembers():
        target = os.path.realpath(os.path.join(root, member.name))
        try:
            inside = os.path.commonpath([root, target]) == root
        except ValueError:  # e.g. paths on different drives on Windows
            inside = False
        if not inside:
            raise tarfile.TarError(f"Unsafe path in archive: {member.name!r}")
    tar_ref.extractall(extract_to)


def extract_file(file_path, extract_to):
    """Extract a zip or tar archive into ``extract_to``.

    The format is chosen from the file name: ``.zip`` is handled by
    ``zipfile``; ``.tar`` and its common compressed variants are handled by
    ``tarfile`` (compression is auto-detected). Any other suffix only prints
    a warning and extracts nothing.

    Tar members are extracted through a safety check because the archives
    come from the network and must not be able to write outside
    ``extract_to``.

    Args:
        file_path: Path of the archive on disk.
        extract_to: Directory that receives the archive's contents.

    Raises:
        zipfile.BadZipFile, tarfile.TarError, OSError: if the archive is
            missing, corrupt, or contains an unsafe member.
    """
    print(f"[*] Extracting {os.path.basename(file_path)}...")

    if file_path.endswith(".zip"):
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
    elif file_path.endswith((".tar", ".tar.gz", ".tgz", ".tar.bz2", ".tar.xz")):
        with tarfile.open(file_path, 'r:*') as tar_ref:
            _extract_tar_safely(tar_ref, extract_to)
    else:
        print(f"[!] Unknown file format for {file_path}")

def main():
    """Download, extract and clean up every archive listed in ``DATASETS``.

    Each dataset is downloaded into ``BASE_DIR`` as ``<name>.<ext>``,
    extracted into ``BASE_DIR/<name>`` and the archive is then deleted. A
    dataset whose download or extraction fails is reported and skipped so
    the remaining datasets are still processed; the closing message lists
    any failures instead of claiming overall success.
    """
    os.makedirs(BASE_DIR, exist_ok=True)
    failed = []

    for name, url in DATASETS.items():
        print(f"\n--- Processing {name} ---")

        # Determine file extension and destination paths
        ext = ".tar" if ".tar" in url else ".zip"
        file_name = f"{name}{ext}"
        download_path = os.path.join(BASE_DIR, file_name)

        # Download
        download_file(url, download_path)

        # download_file prints its own error and leaves no file behind when
        # the request fails, so a missing archive means "skip this dataset".
        if not os.path.exists(download_path):
            failed.append(name)
            continue

        # Extract
        extract_dir = os.path.join(BASE_DIR, name)
        os.makedirs(extract_dir, exist_ok=True)
        try:
            extract_file(download_path, extract_dir)
        except (zipfile.BadZipFile, tarfile.TarError, EOFError, OSError) as e:
            print(f"[!] Could not extract {file_name}: {e}")
            failed.append(name)

        # Clean up the archive to save disk space. This also runs after a
        # failed extraction so a corrupt archive is re-downloaded next time
        # instead of being skipped as "already exists".
        print(f"[*] Cleaning up archive {file_name}...")
        os.remove(download_path)

    if failed:
        print(f"\n[!] Finished with {len(failed)} failed dataset(s): {', '.join(failed)}")
    else:
        print("\n[+] All datasets downloaded and extracted successfully!")
    print(f"[+] Look inside the '{BASE_DIR}' folder.")

# Run the download/extract pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()