""" A collection of dataset (DS) loading and subsetting functions. """ import os import random import numpy as np from datasets import load_dataset ''' Load a DS from HuggingFace Link & randomly subset it - upload subset to ClearML Subset indicies are uploaded to ClearML for reproducibility REPRODUCE: Load full DS, then load indicies from ClearML to get same subset ''' def make_subset(dataset_link, subset_ratio, clearml_task): # Load dataset try: ds = load_dataset(dataset_link) except Exception as e: raise RuntimeError(f"Error loading the dataset: {e}") data_plants = ds['train'] data_length = len(data_plants) features = data_plants.features # Calculate amount of samples we use subset_size = int(data_length * subset_ratio) # Creating a subset of random data (by their indicies) indices = list(range(data_length)) random.shuffle(indices) subset_indices = indices[:subset_size] subset_dataset = data_plants.select(subset_indices) # -------- Upload the subset indices as a ClearML artifact -------- subset_path = "subset_indices.npy" np.save(subset_path, subset_indices) clearml_task.upload_artifact( name="subset_indices", artifact_object=subset_path ) clearml_task.get_logger().report_text(f"Uploaded subset indices as artifact: {subset_path}") return data_plants, subset_dataset, features