Training in progress, step 1000
Browse files
model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 151061672
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d0c4b7fd8785c039942c4e5e66255da6cfbc756c8d9f1ea43ec233b7e27b666e
|
| 3 |
size 151061672
|
runs/Jan15_18-56-45_DESKTOP-NIMM8ON/events.out.tfevents.1736947607.DESKTOP-NIMM8ON
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:38e4644230bf0fca8d9507d6ecbb08e5075eeaa72f7722fd0b8db58818d0fe4f
|
| 3 |
+
size 14198
|
train.py
CHANGED
|
@@ -29,7 +29,7 @@ print(f"\n\n Loading {model_name} for {language} to {task}...this might take a w
|
|
| 29 |
## 3. Setting Up Training Args
|
| 30 |
output_dir = "./"
|
| 31 |
overwrite_output_dir = True
|
| 32 |
-
max_steps =
|
| 33 |
# max_steps = 5
|
| 34 |
per_device_train_batch_size = 8
|
| 35 |
# per_device_train_batch_size = 1
|
|
@@ -79,47 +79,47 @@ print("\n\n Loading Datasets...this might take a while..\n\n")
|
|
| 79 |
|
| 80 |
from datasets import load_dataset, DatasetDict, Features, Value, Audio
|
| 81 |
|
| 82 |
-
common_voice = DatasetDict()
|
| 83 |
-
google_fleurs = DatasetDict()
|
| 84 |
openslr = DatasetDict()
|
| 85 |
## commonvoice_11.0 + google_fleurs + openslr53
|
| 86 |
my_dataset = DatasetDict()
|
| 87 |
|
| 88 |
-
common_voice["train"] = load_dataset("mozilla-foundation/common_voice_13_0", "or", split="train+validation+other", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
|
| 89 |
|
| 90 |
#####################
|
| 91 |
# google_fleurs["train"] = load_dataset("google/fleurs", "or_in", split="train+validation", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
|
| 92 |
-
|
| 93 |
|
| 94 |
-
common_voice["test"] = load_dataset("mozilla-foundation/common_voice_13_0", "or", split="test", cache_dir=os.path.join(base_dir, 'datasets_cache'))
|
| 95 |
#####################
|
| 96 |
# google_fleurs["test"] = load_dataset("google/fleurs", "or_in", split="test", cache_dir=os.path.join(base_dir, 'datasets_cache'))
|
| 97 |
-
|
| 98 |
|
| 99 |
|
| 100 |
# see count of samples in each dataset
|
| 101 |
print("\n\n Datasets Loaded \n\n")
|
| 102 |
-
print(common_voice)
|
| 103 |
#####################
|
| 104 |
# print(google_fleurs)
|
| 105 |
-
|
| 106 |
|
| 107 |
## Removing bad samples from common_voice based on upvotes and downvotes
|
| 108 |
-
print("\n BEFORE Filtering by Upvotes (Common Voice): \n")
|
| 109 |
-
print(common_voice["train"])
|
| 110 |
-
# FILTERING!!! Will get 37k Data if >0 and will get 201k Data if >=0 out of 207k
|
| 111 |
-
common_voice["train"] = common_voice["train"].filter(lambda x: (x["up_votes"] - x["down_votes"]) >= 0, num_proc=None)
|
| 112 |
-
print("\n AFTER Filtering by Upvotes (Common Voice): \n")
|
| 113 |
-
print(common_voice["train"])
|
| 114 |
-
|
| 115 |
-
print("\n\n So, the datasets to be trained are: \n\n")
|
| 116 |
-
print("\n Common Voice 11.0 - Bangla\n")
|
| 117 |
-
print(common_voice)
|
| 118 |
#####################
|
| 119 |
# print("\n Google Fleurs - Bangla \n")
|
| 120 |
# print(google_fleurs)
|
| 121 |
-
|
| 122 |
-
|
| 123 |
# print("\n")
|
| 124 |
|
| 125 |
|
|
@@ -130,39 +130,39 @@ from datasets import concatenate_datasets, Audio
|
|
| 130 |
sampling_rate = 16000
|
| 131 |
|
| 132 |
## resample to specified sampling rate
|
| 133 |
-
common_voice = common_voice.cast_column("audio", Audio(sampling_rate))
|
| 134 |
#####################
|
| 135 |
# google_fleurs = google_fleurs.cast_column("audio", Audio(sampling_rate))
|
| 136 |
-
|
| 137 |
|
| 138 |
## normalise columns to ["audio", "sentence"]
|
| 139 |
-
common_voice = common_voice.remove_columns(
|
| 140 |
-
|
| 141 |
-
)
|
| 142 |
|
| 143 |
#####################
|
| 144 |
-
|
| 145 |
# google_fleurs = google_fleurs.remove_columns(
|
| 146 |
# set(google_fleurs['test'].features.keys()) - {"audio", "sentence"}
|
| 147 |
# )
|
| 148 |
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
|
| 153 |
|
| 154 |
|
| 155 |
## check if all audio are in float32 dtype or not.
|
| 156 |
## a fix is: https://github.com/huggingface/datasets/issues/5345
|
| 157 |
-
print("\n Checking all audio dtype is float32 or not... \n")
|
| 158 |
-
print(f'Common Voice Train: {common_voice["train"][0]["audio"]["array"].dtype}')
|
| 159 |
-
print(f'Common Voice Test: {common_voice["test"][0]["audio"]["array"].dtype}')
|
| 160 |
|
| 161 |
#####################
|
| 162 |
# print(f'Google Fleurs Train: {google_fleurs["train"][0]["audio"]["array"].dtype}')
|
| 163 |
# print(f'Google Fleurs Test: {google_fleurs["test"][0]["audio"]["array"].dtype}')
|
| 164 |
-
|
| 165 |
-
|
| 166 |
|
| 167 |
|
| 168 |
## merge the three datasets
|
|
@@ -171,8 +171,8 @@ print(f'Common Voice Test: {common_voice["test"][0]["audio"]["array"].dtype}')
|
|
| 171 |
# my_dataset['train'] = concatenate_datasets([common_voice['train'], google_fleurs['train'], openslr['train']]) #for linux
|
| 172 |
|
| 173 |
#####################
|
| 174 |
-
my_dataset['train'] = concatenate_datasets([
|
| 175 |
-
my_dataset['test'] = concatenate_datasets([
|
| 176 |
|
| 177 |
# my_dataset['test'] = concatenate_datasets([common_voice['test'], google_fleurs['test'], openslr['test']]) #for linux
|
| 178 |
|
|
@@ -287,9 +287,11 @@ print("\n")
|
|
| 287 |
## Removes unused cached files & returns the number of removed cache files
|
| 288 |
print("\n Removing UNUSED Cache Files: \n")
|
| 289 |
try:
|
| 290 |
-
print(f"{common_voice.cleanup_cache_files()} for common_voice")
|
| 291 |
# print(f"{google_fleurs.cleanup_cache_files()} for google_fleurs")
|
| 292 |
-
|
|
|
|
|
|
|
| 293 |
# print(f"{crblp.cleanup_cache_files()} for crblp")
|
| 294 |
print(f"{my_dataset.cleanup_cache_files()} for my_dataset")
|
| 295 |
|
|
|
|
| 29 |
## 3. Setting Up Training Args
|
| 30 |
output_dir = "./"
|
| 31 |
overwrite_output_dir = True
|
| 32 |
+
max_steps = 16000
|
| 33 |
# max_steps = 5
|
| 34 |
per_device_train_batch_size = 8
|
| 35 |
# per_device_train_batch_size = 1
|
|
|
|
| 79 |
|
| 80 |
from datasets import load_dataset, DatasetDict, Features, Value, Audio
|
| 81 |
|
| 82 |
+
# common_voice = DatasetDict()
|
| 83 |
+
# google_fleurs = DatasetDict()
|
| 84 |
openslr = DatasetDict()
|
| 85 |
## commonvoice_11.0 + google_fleurs + openslr53
|
| 86 |
my_dataset = DatasetDict()
|
| 87 |
|
| 88 |
+
# common_voice["train"] = load_dataset("mozilla-foundation/common_voice_13_0", "or", split="train+validation+other", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
|
| 89 |
|
| 90 |
#####################
|
| 91 |
# google_fleurs["train"] = load_dataset("google/fleurs", "or_in", split="train+validation", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
|
| 92 |
+
openslr["train"] = load_dataset("Ranjit/or_in_dataset", split="train+validation", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
|
| 93 |
|
| 94 |
+
# common_voice["test"] = load_dataset("mozilla-foundation/common_voice_13_0", "or", split="test", cache_dir=os.path.join(base_dir, 'datasets_cache'))
|
| 95 |
#####################
|
| 96 |
# google_fleurs["test"] = load_dataset("google/fleurs", "or_in", split="test", cache_dir=os.path.join(base_dir, 'datasets_cache'))
|
| 97 |
+
openslr["test"] = load_dataset("Ranjit/or_in_dataset", split="test", cache_dir=os.path.join(base_dir, 'datasets_cache'), trust_remote_code=True)
|
| 98 |
|
| 99 |
|
| 100 |
# see count of samples in each dataset
|
| 101 |
print("\n\n Datasets Loaded \n\n")
|
| 102 |
+
# print(common_voice)
|
| 103 |
#####################
|
| 104 |
# print(google_fleurs)
|
| 105 |
+
print(openslr)
|
| 106 |
|
| 107 |
## Removing bad samples from common_voice based on upvotes and downvotes
|
| 108 |
+
# print("\n BEFORE Filtering by Upvotes (Common Voice): \n")
|
| 109 |
+
# print(common_voice["train"])
|
| 110 |
+
# # FILTERING!!! Will get 37k Data if >0 and will get 201k Data if >=0 out of 207k
|
| 111 |
+
# common_voice["train"] = common_voice["train"].filter(lambda x: (x["up_votes"] - x["down_votes"]) >= 0, num_proc=None)
|
| 112 |
+
# print("\n AFTER Filtering by Upvotes (Common Voice): \n")
|
| 113 |
+
# print(common_voice["train"])
|
| 114 |
+
|
| 115 |
+
# print("\n\n So, the datasets to be trained are: \n\n")
|
| 116 |
+
# print("\n Common Voice 11.0 - Bangla\n")
|
| 117 |
+
# print(common_voice)
|
| 118 |
#####################
|
| 119 |
# print("\n Google Fleurs - Bangla \n")
|
| 120 |
# print(google_fleurs)
|
| 121 |
+
print("\n OpenSLR-53 - Odia \n")
|
| 122 |
+
print(openslr)
|
| 123 |
# print("\n")
|
| 124 |
|
| 125 |
|
|
|
|
| 130 |
sampling_rate = 16000
|
| 131 |
|
| 132 |
## resample to specified sampling rate
|
| 133 |
+
# common_voice = common_voice.cast_column("audio", Audio(sampling_rate))
|
| 134 |
#####################
|
| 135 |
# google_fleurs = google_fleurs.cast_column("audio", Audio(sampling_rate))
|
| 136 |
+
openslr = openslr.cast_column("audio", Audio(sampling_rate))
|
| 137 |
|
| 138 |
## normalise columns to ["audio", "sentence"]
|
| 139 |
+
# common_voice = common_voice.remove_columns(
|
| 140 |
+
# set(common_voice['test'].features.keys()) - {"audio", "sentence"}
|
| 141 |
+
# )
|
| 142 |
|
| 143 |
#####################
|
| 144 |
+
openslr = openslr.rename_column("transcription", "sentence")
|
| 145 |
# google_fleurs = google_fleurs.remove_columns(
|
| 146 |
# set(google_fleurs['test'].features.keys()) - {"audio", "sentence"}
|
| 147 |
# )
|
| 148 |
|
| 149 |
+
openslr = openslr.remove_columns(
|
| 150 |
+
set(openslr['train'].features.keys()) - {"audio", "sentence"}
|
| 151 |
+
)
|
| 152 |
|
| 153 |
|
| 154 |
|
| 155 |
## check if all audio are in float32 dtype or not.
|
| 156 |
## a fix is: https://github.com/huggingface/datasets/issues/5345
|
| 157 |
+
# print("\n Checking all audio dtype is float32 or not... \n")
|
| 158 |
+
# print(f'Common Voice Train: {common_voice["train"][0]["audio"]["array"].dtype}')
|
| 159 |
+
# print(f'Common Voice Test: {common_voice["test"][0]["audio"]["array"].dtype}')
|
| 160 |
|
| 161 |
#####################
|
| 162 |
# print(f'Google Fleurs Train: {google_fleurs["train"][0]["audio"]["array"].dtype}')
|
| 163 |
# print(f'Google Fleurs Test: {google_fleurs["test"][0]["audio"]["array"].dtype}')
|
| 164 |
+
print(f'OpenSlR: {openslr["train"][0]["audio"]["array"].dtype}')
|
| 165 |
+
print("\n")
|
| 166 |
|
| 167 |
|
| 168 |
## merge the three datasets
|
|
|
|
| 171 |
# my_dataset['train'] = concatenate_datasets([common_voice['train'], google_fleurs['train'], openslr['train']]) #for linux
|
| 172 |
|
| 173 |
#####################
|
| 174 |
+
my_dataset['train'] = concatenate_datasets([openslr['train']])
|
| 175 |
+
my_dataset['test'] = concatenate_datasets([openslr['test']])
|
| 176 |
|
| 177 |
# my_dataset['test'] = concatenate_datasets([common_voice['test'], google_fleurs['test'], openslr['test']]) #for linux
|
| 178 |
|
|
|
|
| 287 |
## Removes unused cached files & returns the number of removed cache files
|
| 288 |
print("\n Removing UNUSED Cache Files: \n")
|
| 289 |
try:
|
| 290 |
+
# print(f"{common_voice.cleanup_cache_files()} for common_voice")
|
| 291 |
# print(f"{google_fleurs.cleanup_cache_files()} for google_fleurs")
|
| 292 |
+
print(f"{openslr.cleanup_cache_files()} for openslr")
|
| 293 |
+
|
| 294 |
+
|
| 295 |
# print(f"{crblp.cleanup_cache_files()} for crblp")
|
| 296 |
print(f"{my_dataset.cleanup_cache_files()} for my_dataset")
|
| 297 |
|
training_args.bin
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 5432
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2d15624a789abc926420d80f49a8125d726268cd886dcc520fda26a4eb609ff5
|
| 3 |
size 5432
|