ra1425 committed on
Commit
c0dc8ab
·
1 Parent(s): 71353f6

FEAT: Completed EDA, subsetting, and logged all artifacts to ClearML

Browse files
Files changed (1) hide show
  1. data_preparation.py +96 -1
data_preparation.py CHANGED
@@ -1,6 +1,12 @@
1
  import os
2
  import random
3
  import numpy as np
 
 
 
 
 
 
4
  import torch
5
  from clearml import Task, Logger
6
  from datasets import load_dataset
@@ -16,6 +22,8 @@ if torch.cuda.is_available():
16
  # Initialising a task on ClearML
17
  task = Task.init(project_name= 'smallGroupProject', task_name = 'data_prep')
18
  task.set_random_seed(SEED)
 
 
19
 
20
  # Loading dataset from Hugging Face
21
  try:
@@ -44,4 +52,91 @@ else:
44
  # Verifying single sample
45
  sample = data_plants[0]
46
  print(f"Sample image type: {type(sample['image'])}")
47
- print(f"Sample label: {sample['label']}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import random
3
  import numpy as np
4
+ import pandas as pd
5
+
6
+ # Visualisation
7
+ #import seaborn as sns
8
+ import matplotlib.pyplot as plt
9
+
10
  import torch
11
  from clearml import Task, Logger
12
  from datasets import load_dataset
 
22
  # Initialising a task on ClearML
23
  task = Task.init(project_name= 'smallGroupProject', task_name = 'data_prep')
24
  task.set_random_seed(SEED)
25
+ clearml_logger = task.get_logger()
26
+
27
 
28
  # Loading dataset from Hugging Face
29
  try:
 
52
  # Verifying single sample
53
  sample = data_plants[0]
54
  print(f"Sample image type: {type(sample['image'])}")
55
+ print(f"Sample label: {sample['label']}")
56
+
57
+ # -----------------------------------------------------------
58
+ # Creating the prototyping dataset
59
+ SUBSET_RATIO = 0.25 # 25% for prototyping
60
+
61
+ # Logging it to ClearML
62
+ task.connect_configuration(
63
+ {"subset_ratio": SUBSET_RATIO},
64
+ name="Data subsetting"
65
+ )
66
+ # Calculate the number of samples we use
67
+ subset_size = int(data_length * SUBSET_RATIO)
68
+
69
+ # Creating a subset of random data (by their indices)
70
+ indices = list(range(data_length))
71
+ random.shuffle(indices)
72
+ subset_indices = indices[:subset_size]
73
+
74
+ prototyping_dataset = data_plants.select(subset_indices)
75
+
76
+ #Verifying
77
+ print(f"Prototyping dataset size: {len(prototyping_dataset)}")
78
+
79
+ # -----------------------------------------------------------
80
+ # Exploratory data analysis (EDA)
81
+
82
+ #sns.set(color_codes = True)
83
+
84
+ # Reformatting the label feature to understand bias
85
+ labels_list = prototyping_dataset['label']
86
+ df_labels = pd.Series(labels_list)
87
+ label_count = df_labels.value_counts(sort = False)
88
+
89
+ # Checking the number of samples in each class and logging it to ClearML
90
+
91
+ min_count = label_count.min()
92
+ clearml_logger.report_scalar(
93
+ title="Classes Counts",
94
+ series="Min Class Count",
95
+ value=min_count,
96
+ iteration=1
97
+ )
98
+
99
+ max_count = label_count.max()
100
+ clearml_logger.report_scalar(
101
+ title="Classes Counts",
102
+ series="Max Class Count",
103
+ value=max_count,
104
+ iteration=1
105
+ )
106
+
107
+ mean_count = label_count.mean()
108
+ clearml_logger.report_scalar(
109
+ title="Classes Counts",
110
+ series="Imbalance Ratio (Max/Min)",
111
+ value=(max_count / min_count),
112
+ iteration=1
113
+ )
114
+
115
+ print("Class imbalance analysis: ")
116
+ print(f"Max labels in a class: {max_count}")
117
+ print(f"Min labels in a class: {min_count}")
118
+ print(f"Mean labels in a class: {mean_count}")
119
+ print(f"Imbalance ratio: {max_count/min_count:.2f}")
120
+
121
+ # Mapping indices to class names
122
+ class_names = features['label'].names
123
+ formatted_class_names = [" ".join(name.replace('_', ' ').split()) for name in class_names]
124
+ label_count.index = formatted_class_names
125
+
126
+
127
+ # Creating bar chart with labels distribution
128
+ label_count.plot(kind='bar', figsize=(15,6))
129
+ plt.xlabel('Labels')
130
+ plt.ylabel('Sample count')
131
+ plt.title('Class distribution among chosen samples')
132
+
133
+ plot_file = 'class_distribution.png'
134
+ plt.savefig(plot_file)
135
+ clearml_logger.report_image(
136
+ title="EDA", # The title for the plot section in ClearML
137
+ series="Class Distribution", # The name of this specific plot
138
+ iteration=1, # The experiment step
139
+ local_path=plot_file # The path to the file you just saved
140
+ )
141
+
142
+ plt.show()