Training in progress, step 1000

Browse files

Files changed (4) hide show

model.safetensors +1 -1
runs/Jan15_18-56-45_DESKTOP-NIMM8ON/events.out.tfevents.1736947607.DESKTOP-NIMM8ON +3 -0
train.py +41 -39
training_args.bin +1 -1

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a5e9b7db77734b6587da1e2c4dde29bd33582495050802c96ca806ba17478b0a
 size 151061672

 version https://git-lfs.github.com/spec/v1
+oid sha256:d0c4b7fd8785c039942c4e5e66255da6cfbc756c8d9f1ea43ec233b7e27b666e
 size 151061672

runs/Jan15_18-56-45_DESKTOP-NIMM8ON/events.out.tfevents.1736947607.DESKTOP-NIMM8ON ADDED Viewed

@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:38e4644230bf0fca8d9507d6ecbb08e5075eeaa72f7722fd0b8db58818d0fe4f
+size 14198

train.py CHANGED Viewed

@@ -29,7 +29,7 @@ print(f"\n\n Loading {model_name} for {language} to {task}...this might take a w
 ## 3. Setting Up Training Args
 output_dir = "./"
 overwrite_output_dir = True
-max_steps = 45000
 # max_steps = 5
 per_device_train_batch_size = 8
 # per_device_train_batch_size = 1
@@ -79,47 +79,47 @@ print("\n\n Loading Datasets...this might take a while..\n\n")
 from datasets import load_dataset, DatasetDict, Features, Value, Audio
-common_voice = DatasetDict()
-google_fleurs = DatasetDict()
 openslr = DatasetDict()
 ## commonvoice_11.0 + google_fleurs + openslr53
 my_dataset = DatasetDict()
-common_voice["train"] = load_dataset("mozilla-foundation/common_voice_13_0", "or", split="train+validation+other", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
 #####################
 # google_fleurs["train"] = load_dataset("google/fleurs", "or_in", split="train+validation", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
-# openslr["train"] = load_dataset("Ranjit/or_in_dataset", split="train+validation", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
-common_voice["test"] = load_dataset("mozilla-foundation/common_voice_13_0", "or", split="test", cache_dir=os.path.join(base_dir, 'datasets_cache'))
 #####################
 # google_fleurs["test"] = load_dataset("google/fleurs", "or_in", split="test", cache_dir=os.path.join(base_dir, 'datasets_cache'))
-# openslr["test"] = load_dataset("Ranjit/or_in_dataset", split="test", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
 # see count of samples in each dataset
 print("\n\n Datasets Loaded \n\n")
-print(common_voice)
 #####################
 # print(google_fleurs)
-# print(openslr)
 ## Removing bad samples from common_voice based on upvotes and downvotes
-print("\n BEFORE Filtering by Upvotes (Common Voice): \n")
-print(common_voice["train"])
-# FILTERING!!! Will get 37k Data if >0 and will get 201k Data if >=0 out of 207k
-common_voice["train"] = common_voice["train"].filter(lambda x: (x["up_votes"] - x["down_votes"]) >= 0, num_proc=None)
-print("\n AFTER Filtering by Upvotes (Common Voice): \n")
-print(common_voice["train"])
-print("\n\n So, the datasets to be trained are: \n\n")
-print("\n Common Voice 11.0 - Bangla\n")
-print(common_voice)
 #####################
 # print("\n Google Fleurs - Bangla \n")
 # print(google_fleurs)
-# print("\n OpenSLR-53 - Bangla \n")
-# print(openslr)
 # print("\n")
@@ -130,39 +130,39 @@ from datasets import concatenate_datasets, Audio
 sampling_rate = 16000
 ## resample to specified sampling rate
-common_voice = common_voice.cast_column("audio", Audio(sampling_rate))
 #####################
 # google_fleurs = google_fleurs.cast_column("audio", Audio(sampling_rate))
-# openslr = openslr.cast_column("audio", Audio(sampling_rate))
 ## normalise columns to ["audio", "sentence"]
-common_voice = common_voice.remove_columns(
-    set(common_voice['test'].features.keys()) - {"audio", "sentence"}
-)
 #####################
-# google_fleurs = google_fleurs.rename_column("raw_transcription", "sentence")
 # google_fleurs = google_fleurs.remove_columns(
 #     set(google_fleurs['test'].features.keys()) - {"audio", "sentence"}
 # )
-# openslr = openslr.remove_columns(
-#     set(openslr['train'].features.keys()) - {"audio", "sentence"}
-# )
 ## check if all audio are in float32 dtype or not.
 ## a fix is: https://github.com/huggingface/datasets/issues/5345
-print("\n Checking all audio dtype is float32 or not... \n")
-print(f'Common Voice Train: {common_voice["train"][0]["audio"]["array"].dtype}')
-print(f'Common Voice Test: {common_voice["test"][0]["audio"]["array"].dtype}')
 #####################
 # print(f'Google Fleurs Train: {google_fleurs["train"][0]["audio"]["array"].dtype}')
 # print(f'Google Fleurs Test: {google_fleurs["test"][0]["audio"]["array"].dtype}')
-# print(f'OpenSlR: {openslr["train"][0]["audio"]["array"].dtype}')
-# print("\n")
 ## merge the three datasets
@@ -171,8 +171,8 @@ print(f'Common Voice Test: {common_voice["test"][0]["audio"]["array"].dtype}')
 # my_dataset['train'] = concatenate_datasets([common_voice['train'], google_fleurs['train'], openslr['train']]) #for linux
 #####################
-my_dataset['train'] = concatenate_datasets([common_voice['train']])
-my_dataset['test'] = concatenate_datasets([common_voice['test']])
 # my_dataset['test'] = concatenate_datasets([common_voice['test'], google_fleurs['test'], openslr['test']]) #for linux
@@ -287,9 +287,11 @@ print("\n")
 ## Removes unused cached files & returns the number of removed cache files
 print("\n Removing UNUSED Cache Files: \n")
 try:
-    print(f"{common_voice.cleanup_cache_files()} for common_voice")
     # print(f"{google_fleurs.cleanup_cache_files()} for google_fleurs")
-    # print(f"{openslr.cleanup_cache_files()} for openslr")
     # print(f"{crblp.cleanup_cache_files()} for crblp")
     print(f"{my_dataset.cleanup_cache_files()} for my_dataset")

 ## 3. Setting Up Training Args
 output_dir = "./"
 overwrite_output_dir = True
+max_steps = 16000
 # max_steps = 5
 per_device_train_batch_size = 8
 # per_device_train_batch_size = 1
 from datasets import load_dataset, DatasetDict, Features, Value, Audio
+# common_voice = DatasetDict()
+# google_fleurs = DatasetDict()
 openslr = DatasetDict()
 ## commonvoice_11.0 + google_fleurs + openslr53
 my_dataset = DatasetDict()
+# common_voice["train"] = load_dataset("mozilla-foundation/common_voice_13_0", "or", split="train+validation+other", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
 #####################
 # google_fleurs["train"] = load_dataset("google/fleurs", "or_in", split="train+validation", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
+openslr["train"] = load_dataset("Ranjit/or_in_dataset", split="train+validation", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
+# common_voice["test"] = load_dataset("mozilla-foundation/common_voice_13_0", "or", split="test", cache_dir=os.path.join(base_dir, 'datasets_cache'))
 #####################
 # google_fleurs["test"] = load_dataset("google/fleurs", "or_in", split="test", cache_dir=os.path.join(base_dir, 'datasets_cache'))
+openslr["test"] = load_dataset("Ranjit/or_in_dataset", split="test", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
 # see count of samples in each dataset
 print("\n\n Datasets Loaded \n\n")
+# print(common_voice)
 #####################
 # print(google_fleurs)
+print(openslr)
 ## Removing bad samples from common_voice based on upvotes and downvotes
+# print("\n BEFORE Filtering by Upvotes (Common Voice): \n")
+# print(common_voice["train"])
+# # FILTERING!!! Will get 37k Data if >0 and will get 201k Data if >=0 out of 207k
+# common_voice["train"] = common_voice["train"].filter(lambda x: (x["up_votes"] - x["down_votes"]) >= 0, num_proc=None)
+# print("\n AFTER Filtering by Upvotes (Common Voice): \n")
+# print(common_voice["train"])
+# print("\n\n So, the datasets to be trained are: \n\n")
+# print("\n Common Voice 11.0 - Bangla\n")
+# print(common_voice)
 #####################
 # print("\n Google Fleurs - Bangla \n")
 # print(google_fleurs)
+print("\n OpenSLR-53 - Odia \n")
+print(openslr)
 # print("\n")
 sampling_rate = 16000
 ## resample to specified sampling rate
+# common_voice = common_voice.cast_column("audio", Audio(sampling_rate))
 #####################
 # google_fleurs = google_fleurs.cast_column("audio", Audio(sampling_rate))
+openslr = openslr.cast_column("audio", Audio(sampling_rate))
 ## normalise columns to ["audio", "sentence"]
+# common_voice = common_voice.remove_columns(
+#     set(common_voice['test'].features.keys()) - {"audio", "sentence"}
+# )
 #####################
+openslr = openslr.rename_column("transcription", "sentence")
 # google_fleurs = google_fleurs.remove_columns(
 #     set(google_fleurs['test'].features.keys()) - {"audio", "sentence"}
 # )
+openslr = openslr.remove_columns(
+    set(openslr['train'].features.keys()) - {"audio", "sentence"}
+)
 ## check if all audio are in float32 dtype or not.
 ## a fix is: https://github.com/huggingface/datasets/issues/5345
+# print("\n Checking all audio dtype is float32 or not... \n")
+# print(f'Common Voice Train: {common_voice["train"][0]["audio"]["array"].dtype}')
+# print(f'Common Voice Test: {common_voice["test"][0]["audio"]["array"].dtype}')
 #####################
 # print(f'Google Fleurs Train: {google_fleurs["train"][0]["audio"]["array"].dtype}')
 # print(f'Google Fleurs Test: {google_fleurs["test"][0]["audio"]["array"].dtype}')
+print(f'OpenSlR: {openslr["train"][0]["audio"]["array"].dtype}')
+print("\n")
 ## merge the three datasets
 # my_dataset['train'] = concatenate_datasets([common_voice['train'], google_fleurs['train'], openslr['train']]) #for linux
 #####################
+my_dataset['train'] = concatenate_datasets([openslr['train']])
+my_dataset['test'] = concatenate_datasets([openslr['test']])
 # my_dataset['test'] = concatenate_datasets([common_voice['test'], google_fleurs['test'], openslr['test']]) #for linux
 ## Removes unused cached files & returns the number of removed cache files
 print("\n Removing UNUSED Cache Files: \n")
 try:
+    # print(f"{common_voice.cleanup_cache_files()} for common_voice")
     # print(f"{google_fleurs.cleanup_cache_files()} for google_fleurs")
+    print(f"{openslr.cleanup_cache_files()} for openslr")
     # print(f"{crblp.cleanup_cache_files()} for crblp")
     print(f"{my_dataset.cleanup_cache_files()} for my_dataset")

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d1e8a9af8dd247775125a6f6fba77bf404e06dc3739cd7f5f285cc570de628b0
 size 5432

 version https://git-lfs.github.com/spec/v1
+oid sha256:2d15624a789abc926420d80f49a8125d726268cd886dcc520fda26a4eb609ff5
 size 5432