Ranjit committed on
Commit
2f98415
·
verified ·
1 Parent(s): 30424fa

Training in progress, step 1000

Browse files
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a5e9b7db77734b6587da1e2c4dde29bd33582495050802c96ca806ba17478b0a
3
  size 151061672
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0c4b7fd8785c039942c4e5e66255da6cfbc756c8d9f1ea43ec233b7e27b666e
3
  size 151061672
runs/Jan15_18-56-45_DESKTOP-NIMM8ON/events.out.tfevents.1736947607.DESKTOP-NIMM8ON ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:38e4644230bf0fca8d9507d6ecbb08e5075eeaa72f7722fd0b8db58818d0fe4f
3
+ size 14198
train.py CHANGED
@@ -29,7 +29,7 @@ print(f"\n\n Loading {model_name} for {language} to {task}...this might take a w
29
  ## 3. Setting Up Training Args
30
  output_dir = "./"
31
  overwrite_output_dir = True
32
- max_steps = 45000
33
  # max_steps = 5
34
  per_device_train_batch_size = 8
35
  # per_device_train_batch_size = 1
@@ -79,47 +79,47 @@ print("\n\n Loading Datasets...this might take a while..\n\n")
79
 
80
  from datasets import load_dataset, DatasetDict, Features, Value, Audio
81
 
82
- common_voice = DatasetDict()
83
- google_fleurs = DatasetDict()
84
  openslr = DatasetDict()
85
  ## commonvoice_11.0 + google_fleurs + openslr53
86
  my_dataset = DatasetDict()
87
 
88
- common_voice["train"] = load_dataset("mozilla-foundation/common_voice_13_0", "or", split="train+validation+other", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
89
 
90
  #####################
91
  # google_fleurs["train"] = load_dataset("google/fleurs", "or_in", split="train+validation", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
92
- # openslr["train"] = load_dataset("Ranjit/or_in_dataset", split="train+validation", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
93
 
94
- common_voice["test"] = load_dataset("mozilla-foundation/common_voice_13_0", "or", split="test", cache_dir=os.path.join(base_dir, 'datasets_cache'))
95
  #####################
96
  # google_fleurs["test"] = load_dataset("google/fleurs", "or_in", split="test", cache_dir=os.path.join(base_dir, 'datasets_cache'))
97
- # openslr["test"] = load_dataset("Ranjit/or_in_dataset", split="test", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
98
 
99
 
100
  # see count of samples in each dataset
101
  print("\n\n Datasets Loaded \n\n")
102
- print(common_voice)
103
  #####################
104
  # print(google_fleurs)
105
- # print(openslr)
106
 
107
  ## Removing bad samples from common_voice based on upvotes and downvotes
108
- print("\n BEFORE Filtering by Upvotes (Common Voice): \n")
109
- print(common_voice["train"])
110
- # FILTERING!!! Will get 37k Data if >0 and will get 201k Data if >=0 out of 207k
111
- common_voice["train"] = common_voice["train"].filter(lambda x: (x["up_votes"] - x["down_votes"]) >= 0, num_proc=None)
112
- print("\n AFTER Filtering by Upvotes (Common Voice): \n")
113
- print(common_voice["train"])
114
-
115
- print("\n\n So, the datasets to be trained are: \n\n")
116
- print("\n Common Voice 11.0 - Bangla\n")
117
- print(common_voice)
118
  #####################
119
  # print("\n Google Fleurs - Bangla \n")
120
  # print(google_fleurs)
121
- # print("\n OpenSLR-53 - Bangla \n")
122
- # print(openslr)
123
  # print("\n")
124
 
125
 
@@ -130,39 +130,39 @@ from datasets import concatenate_datasets, Audio
130
  sampling_rate = 16000
131
 
132
  ## resample to specified sampling rate
133
- common_voice = common_voice.cast_column("audio", Audio(sampling_rate))
134
  #####################
135
  # google_fleurs = google_fleurs.cast_column("audio", Audio(sampling_rate))
136
- # openslr = openslr.cast_column("audio", Audio(sampling_rate))
137
 
138
  ## normalise columns to ["audio", "sentence"]
139
- common_voice = common_voice.remove_columns(
140
- set(common_voice['test'].features.keys()) - {"audio", "sentence"}
141
- )
142
 
143
  #####################
144
- # google_fleurs = google_fleurs.rename_column("raw_transcription", "sentence")
145
  # google_fleurs = google_fleurs.remove_columns(
146
  # set(google_fleurs['test'].features.keys()) - {"audio", "sentence"}
147
  # )
148
 
149
- # openslr = openslr.remove_columns(
150
- # set(openslr['train'].features.keys()) - {"audio", "sentence"}
151
- # )
152
 
153
 
154
 
155
  ## check if all audio are in float32 dtype or not.
156
  ## a fix is: https://github.com/huggingface/datasets/issues/5345
157
- print("\n Checking all audio dtype is float32 or not... \n")
158
- print(f'Common Voice Train: {common_voice["train"][0]["audio"]["array"].dtype}')
159
- print(f'Common Voice Test: {common_voice["test"][0]["audio"]["array"].dtype}')
160
 
161
  #####################
162
  # print(f'Google Fleurs Train: {google_fleurs["train"][0]["audio"]["array"].dtype}')
163
  # print(f'Google Fleurs Test: {google_fleurs["test"][0]["audio"]["array"].dtype}')
164
- # print(f'OpenSlR: {openslr["train"][0]["audio"]["array"].dtype}')
165
- # print("\n")
166
 
167
 
168
  ## merge the three datasets
@@ -171,8 +171,8 @@ print(f'Common Voice Test: {common_voice["test"][0]["audio"]["array"].dtype}')
171
  # my_dataset['train'] = concatenate_datasets([common_voice['train'], google_fleurs['train'], openslr['train']]) #for linux
172
 
173
  #####################
174
- my_dataset['train'] = concatenate_datasets([common_voice['train']])
175
- my_dataset['test'] = concatenate_datasets([common_voice['test']])
176
 
177
  # my_dataset['test'] = concatenate_datasets([common_voice['test'], google_fleurs['test'], openslr['test']]) #for linux
178
 
@@ -287,9 +287,11 @@ print("\n")
287
  ## Removes unused cached files & returns the number of removed cache files
288
  print("\n Removing UNUSED Cache Files: \n")
289
  try:
290
- print(f"{common_voice.cleanup_cache_files()} for common_voice")
291
  # print(f"{google_fleurs.cleanup_cache_files()} for google_fleurs")
292
- # print(f"{openslr.cleanup_cache_files()} for openslr")
 
 
293
  # print(f"{crblp.cleanup_cache_files()} for crblp")
294
  print(f"{my_dataset.cleanup_cache_files()} for my_dataset")
295
 
 
29
  ## 3. Setting Up Training Args
30
  output_dir = "./"
31
  overwrite_output_dir = True
32
+ max_steps = 16000
33
  # max_steps = 5
34
  per_device_train_batch_size = 8
35
  # per_device_train_batch_size = 1
 
79
 
80
  from datasets import load_dataset, DatasetDict, Features, Value, Audio
81
 
82
+ # common_voice = DatasetDict()
83
+ # google_fleurs = DatasetDict()
84
  openslr = DatasetDict()
85
  ## commonvoice_11.0 + google_fleurs + openslr53
86
  my_dataset = DatasetDict()
87
 
88
+ # common_voice["train"] = load_dataset("mozilla-foundation/common_voice_13_0", "or", split="train+validation+other", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
89
 
90
  #####################
91
  # google_fleurs["train"] = load_dataset("google/fleurs", "or_in", split="train+validation", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
92
+ openslr["train"] = load_dataset("Ranjit/or_in_dataset", split="train+validation", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
93
 
94
+ # common_voice["test"] = load_dataset("mozilla-foundation/common_voice_13_0", "or", split="test", cache_dir=os.path.join(base_dir, 'datasets_cache'))
95
  #####################
96
  # google_fleurs["test"] = load_dataset("google/fleurs", "or_in", split="test", cache_dir=os.path.join(base_dir, 'datasets_cache'))
97
+ openslr["test"] = load_dataset("Ranjit/or_in_dataset", split="test", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
98
 
99
 
100
  # see count of samples in each dataset
101
  print("\n\n Datasets Loaded \n\n")
102
+ # print(common_voice)
103
  #####################
104
  # print(google_fleurs)
105
+ print(openslr)
106
 
107
  ## Removing bad samples from common_voice based on upvotes and downvotes
108
+ # print("\n BEFORE Filtering by Upvotes (Common Voice): \n")
109
+ # print(common_voice["train"])
110
+ # # FILTERING!!! Will get 37k Data if >0 and will get 201k Data if >=0 out of 207k
111
+ # common_voice["train"] = common_voice["train"].filter(lambda x: (x["up_votes"] - x["down_votes"]) >= 0, num_proc=None)
112
+ # print("\n AFTER Filtering by Upvotes (Common Voice): \n")
113
+ # print(common_voice["train"])
114
+
115
+ # print("\n\n So, the datasets to be trained are: \n\n")
116
+ # print("\n Common Voice 11.0 - Bangla\n")
117
+ # print(common_voice)
118
  #####################
119
  # print("\n Google Fleurs - Bangla \n")
120
  # print(google_fleurs)
121
+ print("\n OpenSLR-53 - Odia \n")
122
+ print(openslr)
123
  # print("\n")
124
 
125
 
 
130
  sampling_rate = 16000
131
 
132
  ## resample to specified sampling rate
133
+ # common_voice = common_voice.cast_column("audio", Audio(sampling_rate))
134
  #####################
135
  # google_fleurs = google_fleurs.cast_column("audio", Audio(sampling_rate))
136
+ openslr = openslr.cast_column("audio", Audio(sampling_rate))
137
 
138
  ## normalise columns to ["audio", "sentence"]
139
+ # common_voice = common_voice.remove_columns(
140
+ # set(common_voice['test'].features.keys()) - {"audio", "sentence"}
141
+ # )
142
 
143
  #####################
144
+ openslr = openslr.rename_column("transcription", "sentence")
145
  # google_fleurs = google_fleurs.remove_columns(
146
  # set(google_fleurs['test'].features.keys()) - {"audio", "sentence"}
147
  # )
148
 
149
+ openslr = openslr.remove_columns(
150
+ set(openslr['train'].features.keys()) - {"audio", "sentence"}
151
+ )
152
 
153
 
154
 
155
  ## check if all audio are in float32 dtype or not.
156
  ## a fix is: https://github.com/huggingface/datasets/issues/5345
157
+ # print("\n Checking all audio dtype is float32 or not... \n")
158
+ # print(f'Common Voice Train: {common_voice["train"][0]["audio"]["array"].dtype}')
159
+ # print(f'Common Voice Test: {common_voice["test"][0]["audio"]["array"].dtype}')
160
 
161
  #####################
162
  # print(f'Google Fleurs Train: {google_fleurs["train"][0]["audio"]["array"].dtype}')
163
  # print(f'Google Fleurs Test: {google_fleurs["test"][0]["audio"]["array"].dtype}')
164
+ print(f'OpenSlR: {openslr["train"][0]["audio"]["array"].dtype}')
165
+ print("\n")
166
 
167
 
168
  ## merge the three datasets
 
171
  # my_dataset['train'] = concatenate_datasets([common_voice['train'], google_fleurs['train'], openslr['train']]) #for linux
172
 
173
  #####################
174
+ my_dataset['train'] = concatenate_datasets([openslr['train']])
175
+ my_dataset['test'] = concatenate_datasets([openslr['test']])
176
 
177
  # my_dataset['test'] = concatenate_datasets([common_voice['test'], google_fleurs['test'], openslr['test']]) #for linux
178
 
 
287
  ## Removes unused cached files & returns the number of removed cache files
288
  print("\n Removing UNUSED Cache Files: \n")
289
  try:
290
+ # print(f"{common_voice.cleanup_cache_files()} for common_voice")
291
  # print(f"{google_fleurs.cleanup_cache_files()} for google_fleurs")
292
+ print(f"{openslr.cleanup_cache_files()} for openslr")
293
+
294
+
295
  # print(f"{crblp.cleanup_cache_files()} for crblp")
296
  print(f"{my_dataset.cleanup_cache_files()} for my_dataset")
297
 
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d1e8a9af8dd247775125a6f6fba77bf404e06dc3739cd7f5f285cc570de628b0
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d15624a789abc926420d80f49a8125d726268cd886dcc520fda26a4eb609ff5
3
  size 5432