Spaces:

abcdefghipri
/

lung-ai

Sleeping

lung-ai / preprocess.py

Shushitrashhhh

Deploy LUNG-AI to Hugging Face Space

e5b9a0f 14 days ago

3.47 kB

	import os
	import shutil
	from datasets import load_dataset
	from tqdm import tqdm # You may need to run: pip install tqdm
	from PIL import Image # You may need to run: pip install Pillow

	# THIS IS THE FIX: Map the 7+ dataset classes to the 4 project classes
	def get_project_class(dataset_class_name):
	"""Maps a detailed class name to one of the 4 project categories."""
	dcn = dataset_class_name.lower() # dcn = dataset class name
	if 'adenocarcinoma' in dcn:
	return 'Adenocarcinoma'
	if 'large.cell' in dcn:
	return 'Large_cell_carcinoma'
	if 'squamous' in dcn:
	return 'Squamous_cell_carcinoma'
	if 'normal' in dcn:
	return 'Normal'
	return None # We will ignore any class that doesn't fit

	def setup_dataset():
	base_dir = "Processed_Data"

	# Clean up the directory if it already exists from a failed run
	if os.path.exists(base_dir):
	print(f"Removing old '{base_dir}' directory...")
	shutil.rmtree(base_dir)
	print("Done.")

	# 1. Load the dataset
	print("Loading dataset 'dorsar/lung-cancer' from Hugging Face...")
	ds = load_dataset("dorsar/lung-cancer", "default", trust_remote_code=True)

	# 2. Get the dataset's class names
	dataset_class_names = ds['train'].features['label'].names
	print(f"Found {len(dataset_class_names)} dataset classes: {dataset_class_names}")

	# 3. Define the project's 4 classes
	project_class_names = ['Adenocarcinoma', 'Large_cell_carcinoma', 'Normal', 'Squamous_cell_carcinoma']
	print(f"Mapping them into 4 project classes: {project_class_names}")

	# 4. Loop through each split (train, validation, test)
	for split in ds.keys():
	print(f"\nProcessing '{split}' split...")
	split_data = ds[split]

	for i, item in enumerate(tqdm(split_data, desc=f"Saving {split} images")):
	image = item['image']
	label_index = item['label']

	# Get the dataset's class name (e.g., 'adenocarcinoma_left.lower.lobe...')
	dataset_class_name = dataset_class_names[label_index]

	# --- THIS IS THE NEW LOGIC ---
	# Convert it to the project's class name (e.g., 'Adenocarcinoma')
	project_class_name = get_project_class(dataset_class_name)

	# If it's not one of our 4 classes, skip this image
	if project_class_name is None:
	continue
	# --- END OF NEW LOGIC ---

	# Create the full directory path
	# e.g., "Processed_Data/train/Adenocarcinoma"
	target_dir = os.path.join(base_dir, split, project_class_name)

	# Create the directories if they don't already exist
	os.makedirs(target_dir, exist_ok=True)

	# Convert to RGB to avoid errors with PNG/grayscale images
	if image.mode != 'RGB':
	image = image.convert('RGB')

	# Create a unique filename and save as JPEG
	filename = f"{split}_{project_class_name}_{i}.jpg"
	save_path = os.path.join(target_dir, filename)
	image.save(save_path, "JPEG")

	print(f"\n✅ All data downloaded and RE-MAPPED successfully!")
	print(f"Your data is ready in the '{base_dir}' folder with the correct 4 classes.")

	if __name__ == "__main__":
	# This makes the script runnable from the command line
	setup_dataset()