Spaces:
Sleeping
Sleeping
File size: 1,421 Bytes
04cb886 2ace27a 04cb886 2fd4542 03021e1 04cb886 2ace27a 04cb886 03021e1 04cb886 03021e1 04cb886 03021e1 2ace27a 03021e1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
"""
A collection of dataset (DS) loading and subsetting functions.
"""
import os
import random
import numpy as np
from datasets import load_dataset
'''
Load a dataset (DS) from a HuggingFace link and randomly subset it.
The subset indices are uploaded to ClearML as an artifact for reproducibility.
REPRODUCE: load the full DS, then load the indices from ClearML to get the same subset.
'''
def make_subset(dataset_link, subset_ratio, clearml_task):
    """Load a dataset, draw a random subset, and record the subset indices.

    The ``'train'`` split of the loaded dataset is used. A random selection of
    ``int(len(train) * subset_ratio)`` rows is taken, and the chosen row
    indices are written to ``subset_indices.npy`` in the current working
    directory and uploaded through ``clearml_task`` as the artifact
    ``"subset_indices"`` so the same subset can be rebuilt later.

    Args:
        dataset_link: Dataset identifier passed to ``load_dataset``.
        subset_ratio: Fraction of the train split to keep; must lie in
            ``[0, 1]``.
        clearml_task: Object exposing ``upload_artifact`` and ``get_logger``
            (presumably a ClearML ``Task`` -- confirm against the caller).

    Returns:
        A tuple ``(data_plants, subset_dataset, features)``: the full train
        split, the randomly selected subset of it, and the train split's
        ``features``.

    Raises:
        ValueError: If ``subset_ratio`` is outside ``[0, 1]``.
        RuntimeError: If loading the dataset fails; the original exception is
            attached as the cause.
    """
    # Validate before the (potentially slow) dataset load. A negative ratio
    # would otherwise produce a negative slice bound and silently return the
    # wrong number of rows; a ratio above 1 would silently clamp to everything.
    if not 0 <= subset_ratio <= 1:
        raise ValueError(
            f"subset_ratio must be between 0 and 1, got {subset_ratio!r}"
        )

    # Load dataset
    try:
        ds = load_dataset(dataset_link)
    except Exception as e:
        # Chain the cause so the underlying traceback is preserved.
        raise RuntimeError(f"Error loading the dataset: {e}") from e

    data_plants = ds['train']
    data_length = len(data_plants)
    features = data_plants.features

    # Calculate amount of samples we use (truncated to a whole number)
    subset_size = int(data_length * subset_ratio)

    # Creating a subset of random data (by their indices)
    indices = list(range(data_length))
    random.shuffle(indices)
    subset_indices = indices[:subset_size]
    subset_dataset = data_plants.select(subset_indices)

    # -------- Upload the subset indices as a ClearML artifact --------
    subset_path = "subset_indices.npy"
    np.save(subset_path, subset_indices)
    clearml_task.upload_artifact(
        name="subset_indices",
        artifact_object=subset_path,
    )
    clearml_task.get_logger().report_text(
        f"Uploaded subset indices as artifact: {subset_path}"
    )

    return data_plants, subset_dataset, features
|