import os
import shutil
from datasets import load_dataset
from tqdm import tqdm # You may need to run: pip install tqdm
from PIL import Image # You may need to run: pip install Pillow 

# Map the dataset's fine-grained class labels onto the 4 project classes.
def get_project_class(dataset_class_name):
    """Map a detailed dataset class name to one of the 4 project categories.

    Matching is a case-insensitive substring search; keywords are tried in a
    fixed order and the first hit wins. The separators '_', '-' and ' ' are
    treated the same as '.', so 'large.cell', 'large_cell', 'large cell' and
    'large-cell' are all recognised.

    Args:
        dataset_class_name: Label name as a string, e.g.
            'adenocarcinoma_left.lower.lobe_T2_N0_M0_Ib'.

    Returns:
        'Adenocarcinoma', 'Large_cell_carcinoma', 'Squamous_cell_carcinoma'
        or 'Normal'; None when no keyword is found, so the caller can skip
        that class.
    """
    # Normalise case and separators up front. Without this, only the
    # dot-separated spelling matched, and the project's own folder name
    # 'Large_cell_carcinoma' fell through to None.
    dcn = dataset_class_name.lower().translate(str.maketrans('_- ', '...'))

    # Ordered (keyword, project class) pairs; order is the match priority.
    keyword_to_class = (
        ('adenocarcinoma', 'Adenocarcinoma'),
        ('large.cell', 'Large_cell_carcinoma'),
        ('squamous', 'Squamous_cell_carcinoma'),
        ('normal', 'Normal'),
    )
    for keyword, project_class in keyword_to_class:
        if keyword in dcn:
            return project_class
    return None  # Any class that doesn't fit is ignored by the caller

def setup_dataset():
    """Download 'dorsar/lung-cancer' and export it as per-class JPEG folders.

    Every split of the dataset is written to
    ``Processed_Data/<split>/<project class>/<split>_<project class>_<i>.jpg``,
    where the project class comes from ``get_project_class``. Images whose
    dataset class maps to no project class (or that carry no usable label)
    are skipped and counted. Any existing ``Processed_Data`` directory is
    replaced, but only after the dataset has loaded successfully.

    Returns:
        None. Progress and a final summary are printed to stdout.
    """
    base_dir = "Processed_Data"

    # 1. Load the dataset BEFORE touching the disk, so a failed download
    #    does not destroy the output of a previous successful run.
    print("Loading dataset 'dorsar/lung-cancer' from Hugging Face...")
    # NOTE(review): trust_remote_code=True presumably lets the hub repository
    # run its own loading script on this machine -- confirm the repo is trusted.
    ds = load_dataset("dorsar/lung-cancer", "default", trust_remote_code=True)

    # Clean up the directory if it already exists from an earlier run
    if os.path.exists(base_dir):
        print(f"Removing old '{base_dir}' directory...")
        shutil.rmtree(base_dir)
        print("Done.")

    # 2. Get the *dataset's* class names, per split. Reading them from each
    #    split avoids a KeyError when there is no 'train' split and keeps the
    #    index -> name lookup correct if the splits' label tables differ.
    names_by_split = {split: ds[split].features['label'].names for split in ds}
    # Order-preserving union; identical to a single split's list when all
    # splits share the same label table.
    dataset_class_names = list(dict.fromkeys(
        name for names in names_by_split.values() for name in names
    ))
    print(f"Found {len(dataset_class_names)} dataset classes: {dataset_class_names}")

    # 3. Define the *project's* 4 classes (must agree with get_project_class)
    project_class_names = ['Adenocarcinoma', 'Large_cell_carcinoma', 'Normal', 'Squamous_cell_carcinoma']
    print(f"Mapping them into 4 project classes: {project_class_names}")

    # Resolve the mapping once per class instead of once per image, and say
    # up front which dataset classes will be dropped.
    class_map = {name: get_project_class(name) for name in dataset_class_names}
    ignored = [name for name, target in class_map.items() if target is None]
    if ignored:
        print(f"Ignoring {len(ignored)} unmapped dataset classes: {ignored}")

    saved = 0
    skipped = 0
    created_dirs = set()  # directories already created during this run

    # 4. Loop through each split (train, validation, test)
    for split in ds:
        print(f"\nProcessing '{split}' split...")
        split_data = ds[split]
        split_names = names_by_split[split]

        for i, item in enumerate(tqdm(split_data, desc=f"Saving {split} images")):
            label_index = item['label']

            # A missing or negative label must not be used as an index:
            # a negative value would silently wrap to the last class name.
            if label_index is None or label_index < 0:
                skipped += 1
                continue

            # Dataset class name (e.g. 'adenocarcinoma_left.lower.lobe...')
            # -> project class name (e.g. 'Adenocarcinoma')
            project_class_name = class_map[split_names[label_index]]

            # If it's not one of our 4 classes, skip this image
            if project_class_name is None:
                skipped += 1
                continue

            # e.g. "Processed_Data/train/Adenocarcinoma"
            target_dir = os.path.join(base_dir, split, project_class_name)
            if target_dir not in created_dirs:
                os.makedirs(target_dir, exist_ok=True)
                created_dirs.add(target_dir)

            # Convert to RGB to avoid errors with PNG/grayscale images
            image = item['image']
            if image.mode != 'RGB':
                image = image.convert('RGB')

            # The split-wide index keeps filenames unique within the split
            filename = f"{split}_{project_class_name}_{i}.jpg"
            save_path = os.path.join(target_dir, filename)
            image.save(save_path, "JPEG")
            saved += 1

    print("\n✅ All data downloaded and RE-MAPPED successfully!")
    print(f"Saved {saved} images ({skipped} skipped).")
    print(f"Your data is ready in the '{base_dir}' folder with the correct 4 classes.")

if __name__ == "__main__":
    # Run the export only when this file is executed as a script;
    # importing it as a module does not call setup_dataset().
    setup_dataset()