import os
import shutil

import pandas as pd
from sklearn.model_selection import train_test_split

# --- Configuration ---------------------------------------------------------
# Label metadata read by this script; its 'image' and 'grade' columns are used.
CSV_PATH = 'outputs/csv/labels.csv'

# Root directory that the relative paths in the 'image' column are joined to.
SOURCE_DATA_DIR = 'data/Nuclear Cataract Database for Biomedical and Machine Learning Applications/Nuclear Cataract Dataset'

# Output root that receives the 'train' and 'valid' sub-trees.
SPLIT_DATA_DIR = 'data/split data'

# Fraction of *patients* (not images) handed to the validation subset.
VALIDATION_SPLIT_RATIO = 0.2

# --- Load metadata and split by patient ------------------------------------
print("Membaca file metadata labels.csv...")
df = pd.read_csv(CSV_PATH)

# The first '/'-separated component of each 'image' path is used as the
# patient identifier.
df['patient_id'] = df['image'].apply(lambda image_path: image_path.split('/', 1)[0])

unique_patients = df['patient_id'].unique()
print(f"Total pasien ditemukan: {len(unique_patients)}")

# Split the patient ids rather than the image rows, so every image of a given
# patient ends up in exactly one subset. The fixed random_state keeps the
# split reproducible between runs.
train_patients, valid_patients = train_test_split(
    unique_patients,
    test_size=VALIDATION_SPLIT_RATIO,
    random_state=42,
)

print(f"Jumlah pasien untuk training: {len(train_patients)}")
print(f"Jumlah pasien untuk validasi: {len(valid_patients)}")


def copy_files(patient_list, target_folder):
    """Copy the images of the given patients into per-grade class folders.

    Rows of the module-level ``df`` whose ``patient_id`` is in *patient_list*
    are selected, and each row's image is copied from ``SOURCE_DATA_DIR`` to
    ``SPLIT_DATA_DIR/<target_folder>/<grade>/<file name>``.

    Args:
        patient_list: Collection of patient ids to match against
            ``df['patient_id']``.
        target_folder: Name of the sub-folder of ``SPLIT_DATA_DIR`` to fill
            (the script uses ``'train'`` and ``'valid'``).

    Returns:
        None. The function works through file-system side effects and prints
        progress / warning messages.

    Notes:
        * A missing source file is reported with a warning and skipped.
        * Only the base name of the image path is kept in the destination.
          When two *different* source files of the same grade share a base
          name, the later one is stored under its full relative path with
          ``/`` replaced by ``_`` instead of overwriting the earlier copy.
    """
    target_path = os.path.join(SPLIT_DATA_DIR, target_folder)
    subset_df = df[df['patient_id'].isin(patient_list)]

    print(f"\nMemproses {len(subset_df)} gambar untuk set '{target_folder}'...")

    # destination path -> source path that was written there during this call
    claimed = {}

    for grade, image_path in zip(subset_df['grade'], subset_df['image']):
        class_folder = os.path.join(target_path, str(grade))
        os.makedirs(class_folder, exist_ok=True)

        source_file = os.path.join(SOURCE_DATA_DIR, image_path)
        if not os.path.exists(source_file):
            print(f" Peringatan: File sumber tidak ditemukan -> {source_file}")
            continue

        destination_file = os.path.join(class_folder, os.path.basename(image_path))

        # A repeated row for the same source may overwrite itself harmlessly;
        # a different source with the same base name must not clobber it.
        if claimed.get(destination_file, source_file) != source_file:
            flattened_name = image_path.replace('/', '_')
            destination_file = os.path.join(class_folder, flattened_name)

        claimed[destination_file] = source_file
        shutil.copyfile(source_file, destination_file)


# --- Build the split dataset on disk ---------------------------------------
# Remove any existing output tree first so the new split starts from an empty
# directory.
if os.path.exists(SPLIT_DATA_DIR):
    shutil.rmtree(SPLIT_DATA_DIR)
os.makedirs(SPLIT_DATA_DIR)

copy_files(train_patients, 'train')
copy_files(valid_patients, 'valid')

print("\n--- Proses splitting per pasien selesai! ---")
print(f"Dataset baru siap di folder '{SPLIT_DATA_DIR}' dengan struktur yang benar.")