Spaces:
Sleeping
Sleeping
| """ | |
| A collection of dataset (DS) loading and subsetting functions. | |
| """ | |
| import os | |
| import random | |
| import numpy as np | |
| from datasets import load_dataset | |
| ''' | |
| Load a DS from HuggingFace Link & randomly subset it - upload subset to ClearML | |
| Subset indices are uploaded to ClearML for reproducibility | |
| REPRODUCE: Load full DS, then load indices from ClearML to get same subset | |
| ''' | |
def make_subset(dataset_link, subset_ratio, clearml_task):
    """Load a HuggingFace dataset and draw a random subset of its 'train' split.

    The chosen row indices are saved to ``subset_indices.npy`` (a path relative
    to the current working directory) and uploaded to ClearML as the artifact
    ``subset_indices``, so the same subset can be rebuilt later by loading the
    full dataset and selecting those indices.

    Args:
        dataset_link: Dataset identifier passed unchanged to ``load_dataset``.
        subset_ratio: Fraction of the 'train' split to keep, within ``[0, 1]``.
            The subset size is ``int(len(train) * subset_ratio)``, i.e. the
            product truncated to an integer.
        clearml_task: Object exposing ``upload_artifact(name=...,
            artifact_object=...)`` and ``get_logger().report_text(...)``
            (presumably a ``clearml.Task`` -- confirm against the caller).

    Returns:
        A tuple ``(train_split, subset_dataset, features)``: the full 'train'
        split, the randomly selected subset of it, and the split's
        ``features`` attribute.

    Raises:
        ValueError: If ``subset_ratio`` is outside ``[0, 1]``.
        RuntimeError: If ``load_dataset`` fails; the original exception is
            chained as the cause.
    """
    # Validate before the (potentially slow) dataset load. A negative ratio
    # would otherwise turn into a negative slice bound and silently select
    # almost the whole dataset; a ratio above 1 would silently return all rows.
    if not 0 <= subset_ratio <= 1:
        raise ValueError(
            f"subset_ratio must be between 0 and 1 (inclusive), got {subset_ratio!r}"
        )

    # Load dataset
    try:
        ds = load_dataset(dataset_link)
    except Exception as e:
        # Chain the cause so the underlying traceback is not lost.
        raise RuntimeError(f"Error loading the dataset: {e}") from e

    data_plants = ds['train']
    data_length = len(data_plants)
    features = data_plants.features

    # Calculate amount of samples we use
    subset_size = int(data_length * subset_ratio)

    # Creating a subset of random data (by their indices). This draws from the
    # module-level `random` generator, so seeding it beforehand fixes the draw.
    indices = list(range(data_length))
    random.shuffle(indices)
    subset_indices = indices[:subset_size]
    subset_dataset = data_plants.select(subset_indices)

    # -------- Upload the subset indices as a ClearML artifact --------
    subset_path = "subset_indices.npy"
    np.save(subset_path, subset_indices)
    clearml_task.upload_artifact(
        name="subset_indices",
        artifact_object=subset_path
    )
    clearml_task.get_logger().report_text(f"Uploaded subset indices as artifact: {subset_path}")

    return data_plants, subset_dataset, features