File size: 1,421 Bytes
04cb886
 
 
 
2ace27a
04cb886
 
 
 
 
2fd4542
 
 
 
 
03021e1
04cb886
 
 
2ace27a
04cb886
 
 
 
 
 
 
 
 
 
 
 
 
 
 
03021e1
04cb886
03021e1
04cb886
 
 
03021e1
 
 
 
 
2ace27a
03021e1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
"""
A collection of dataset (DS) loading and subsetting functions.
"""

import os
import random
import numpy as np
from datasets import load_dataset


'''
Load a DS from a HuggingFace link & randomly subset it - upload the subset indices to ClearML
Subset indices are uploaded to ClearML for reproducibility
REPRODUCE: Load the full DS, then load the indices from ClearML to get the same subset
'''
def make_subset(dataset_link: str, subset_ratio: float, clearml_task):
    """Load a dataset, draw a random subset, and upload the subset indices.

    The dataset is loaded with ``load_dataset(dataset_link)`` and its
    ``'train'`` split is used. ``int(len(split) * subset_ratio)`` row indices
    are picked by shuffling all indices with the module-level ``random``
    generator, so the selection follows whatever seed the caller has set via
    ``random.seed``. The chosen indices are written to ``subset_indices.npy``
    in the current working directory and that file is uploaded through
    ``clearml_task.upload_artifact`` under the name ``"subset_indices"``.

    Args:
        dataset_link: Identifier passed straight to ``load_dataset``.
        subset_ratio: Fraction of the train split to keep, in ``[0, 1]``.
        clearml_task: Object providing ``upload_artifact(name=...,
            artifact_object=...)`` and ``get_logger().report_text(...)``
            (presumably a ClearML ``Task`` - confirm against the caller).

    Returns:
        A tuple ``(data_plants, subset_dataset, features)``: the full train
        split, the selected subset, and the ``features`` attribute of the
        train split.

    Raises:
        ValueError: If ``subset_ratio`` is not within ``[0, 1]``.
        RuntimeError: If ``load_dataset`` fails; the original exception is
            attached as ``__cause__``.
    """
    # Validate before the dataset load so bad input fails fast. A negative
    # ratio would otherwise turn into a negative slice bound and silently
    # return a subset of the wrong size.
    if not 0 <= subset_ratio <= 1:
        raise ValueError(
            f"subset_ratio must be between 0 and 1 (inclusive), got {subset_ratio!r}"
        )

    # Load dataset
    try:
        ds = load_dataset(dataset_link)
    except Exception as e:
        raise RuntimeError(f"Error loading the dataset: {e}") from e

    data_plants = ds['train']
    data_length = len(data_plants)
    features = data_plants.features

    # Calculate amount of samples we use
    subset_size = int(data_length * subset_ratio)

    # Creating a subset of random data (by their indices)
    indices = list(range(data_length))
    random.shuffle(indices)
    subset_indices = indices[:subset_size]

    subset_dataset = data_plants.select(subset_indices)

    # -------- Upload the subset indices as a ClearML artifact --------
    subset_path = "subset_indices.npy"
    # Force an integer dtype: an empty list would otherwise be stored as a
    # float64 array, which is not a valid index array when reloaded.
    np.save(subset_path, np.asarray(subset_indices, dtype=np.int64))

    clearml_task.upload_artifact(
        name="subset_indices",
        artifact_object=subset_path
    )
    clearml_task.get_logger().report_text(f"Uploaded subset indices as artifact: {subset_path}")

    return data_plants, subset_dataset, features