smallGroupProject / dataPrep /helpers /create_dataset.py
Yusuf
fix: replace clearml datasets with artifacts
03021e1
"""
A collection of dataset (DS) loading and subsetting functions.
"""
import os
import random
import numpy as np
from datasets import load_dataset
'''
Load a DS from a HuggingFace link & randomly subset it - upload the subset indices to ClearML
Subset indices are uploaded to ClearML (as a task artifact) for reproducibility
REPRODUCE: Load full DS, then load indices from ClearML to get same subset
'''
def make_subset(dataset_link, subset_ratio, clearml_task):
    """Load a dataset and draw a random subset of its 'train' split.

    The chosen row indices are written to ``subset_indices.npy`` in the
    current working directory and uploaded to ``clearml_task`` as the
    ``subset_indices`` artifact, so the same subset can be rebuilt later by
    loading the full dataset and selecting those indices.

    Shuffling uses the module-level ``random`` state; seed it beforehand if a
    deterministic selection is required.

    Args:
        dataset_link: Identifier passed unchanged to ``load_dataset``.
        subset_ratio: Fraction of the 'train' split to keep; must be in [0, 1].
        clearml_task: Object providing ``upload_artifact`` and ``get_logger``
            (presumably a ClearML Task - confirm against the caller).

    Returns:
        A tuple ``(data_plants, subset_dataset, features)``: the full 'train'
        split, the selected subset, and the split's ``features``.

    Raises:
        ValueError: If ``subset_ratio`` is outside [0, 1].
        RuntimeError: If ``load_dataset`` fails for any reason.
    """
    # Validate first: a negative ratio would turn the slice below into
    # ``indices[:-k]`` and silently return almost the whole dataset.
    if not 0 <= subset_ratio <= 1:
        raise ValueError(
            f"subset_ratio must be between 0 and 1 (inclusive), got {subset_ratio!r}"
        )

    # Load dataset
    try:
        ds = load_dataset(dataset_link)
    except Exception as e:
        # Chain explicitly so the original traceback stays attached.
        raise RuntimeError(f"Error loading the dataset: {e}") from e

    data_plants = ds['train']
    data_length = len(data_plants)
    features = data_plants.features

    # Calculate amount of samples we use (truncates towards zero)
    subset_size = int(data_length * subset_ratio)

    # Creating a subset of random data (by their indices).
    # Shuffle-then-slice is kept so seeded runs pick the same rows as before.
    indices = list(range(data_length))
    random.shuffle(indices)
    subset_indices = indices[:subset_size]
    subset_dataset = data_plants.select(subset_indices)

    # -------- Upload the subset indices as a ClearML artifact --------
    subset_path = "subset_indices.npy"
    np.save(subset_path, subset_indices)
    clearml_task.upload_artifact(
        name="subset_indices",
        artifact_object=subset_path,
    )
    clearml_task.get_logger().report_text(
        f"Uploaded subset indices as artifact: {subset_path}"
    )

    return data_plants, subset_dataset, features