HAMMALE committed on
Commit bd0e2b2 · verified · 1 Parent(s): 05cab38

Add model, Gradio demo, and README

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. BATSH.sh +18 -0
  2. PUSH.PY +204 -0
  3. README.md +65 -178
  4. Readme.md +39 -0
  5. app.py +56 -0
  6. checkpoint-1000/config.json +107 -0
  7. checkpoint-1000/model.safetensors +3 -0
  8. checkpoint-1000/optimizer.pt +3 -0
  9. checkpoint-1000/rng_state.pth +3 -0
  10. checkpoint-1000/scaler.pt +3 -0
  11. checkpoint-1000/scheduler.pt +3 -0
  12. checkpoint-1000/trainer_state.json +131 -0
  13. checkpoint-1000/training_args.bin +3 -0
  14. checkpoint-1500/config.json +107 -0
  15. checkpoint-1500/model.safetensors +3 -0
  16. checkpoint-1500/optimizer.pt +3 -0
  17. checkpoint-1500/rng_state.pth +3 -0
  18. checkpoint-1500/scaler.pt +3 -0
  19. checkpoint-1500/scheduler.pt +3 -0
  20. checkpoint-1500/trainer_state.json +175 -0
  21. checkpoint-1500/training_args.bin +3 -0
  22. checkpoint-2000/config.json +107 -0
  23. checkpoint-2000/model.safetensors +3 -0
  24. checkpoint-2000/optimizer.pt +3 -0
  25. checkpoint-2000/rng_state.pth +3 -0
  26. checkpoint-2000/scaler.pt +3 -0
  27. checkpoint-2000/scheduler.pt +3 -0
  28. checkpoint-2000/trainer_state.json +219 -0
  29. checkpoint-2000/training_args.bin +3 -0
  30. checkpoint-2500/config.json +107 -0
  31. checkpoint-2500/model.safetensors +3 -0
  32. checkpoint-2500/optimizer.pt +3 -0
  33. checkpoint-2500/rng_state.pth +3 -0
  34. checkpoint-2500/scaler.pt +3 -0
  35. checkpoint-2500/scheduler.pt +3 -0
  36. checkpoint-2500/trainer_state.json +263 -0
  37. checkpoint-2500/training_args.bin +3 -0
  38. checkpoint-3000/config.json +107 -0
  39. checkpoint-3000/model.safetensors +3 -0
  40. checkpoint-3000/optimizer.pt +3 -0
  41. checkpoint-3000/rng_state.pth +3 -0
  42. checkpoint-3000/scaler.pt +3 -0
  43. checkpoint-3000/scheduler.pt +3 -0
  44. checkpoint-3000/trainer_state.json +307 -0
  45. checkpoint-3000/training_args.bin +3 -0
  46. checkpoint-3500/config.json +107 -0
  47. checkpoint-3500/model.safetensors +3 -0
  48. checkpoint-3500/optimizer.pt +3 -0
  49. checkpoint-3500/rng_state.pth +3 -0
  50. checkpoint-3500/scaler.pt +3 -0
BATSH.sh ADDED
@@ -0,0 +1,18 @@
+ huggingface-cli login
+ transformers-cli repo create mms-darija-finetuned --organization your-org # if needed
+
+ cd mms-darija-finetuned
+ cp -r output_dir/* . # copy the entire fine-tuned model
+ cp ../README.md .
+ cp ../app.py .
+ git add .
+ git commit -m "Upload Darija fine-tuned MMS model + Space demo"
+ git push
+ cd mms-darija-finetuned
+ git init
+ git lfs install
+ git remote add origin \
+   https://huggingface.co/YOUR_USERNAME/mms-darija-finetuned
+ git add .
+ git commit -m "Upload fine-tuned Darija MMS model"
+ git push -u origin master
PUSH.PY ADDED
@@ -0,0 +1,204 @@
+ #!/usr/bin/env python3
+ """
+ Complete script to push model, demo, and README to HuggingFace Hub
+ """
+
+ import os
+ from huggingface_hub import HfApi
+
+ # Configuration
+ REPO_ID = "HAMMALE/mms-darija-finetuned"
+ MODEL_DIR = "./mms-darija-finetuned"
+
+ # Initialize API
+ api = HfApi()
+
+ # Create repository
+ print("📦 Creating repository...")
+ api.create_repo(
+     repo_id=REPO_ID,
+     repo_type="model",
+     exist_ok=True
+ )
+
+ # Create README.md
+ readme_content = """---
+ language:
+ - ar
+ - ary
+ tags:
+ - speech-recognition
+ - audio
+ - wav2vec2
+ - mms
+ - darija
+ - moroccan-arabic
+ - bible
+ license: apache-2.0
+ datasets:
+ - atlasia/darija_bible_aligned
+ metrics:
+ - wer
+ widget:
+ - example_title: "Darija Speech Example"
+   src: "https://example.com/darija_sample.wav"
+ ---
+
+ # MMS-1B-All Fine-tuned on Darija Bible Dataset
+
+ This model is a fine-tuned version of [facebook/mms-1b-all](https://huggingface.co/facebook/mms-1b-all) on the [atlasia/darija_bible_aligned](https://huggingface.co/datasets/atlasia/darija_bible_aligned) dataset for Moroccan Arabic (Darija) speech recognition.
+
+ ## Model Description
+
+ - **Model type:** Speech Recognition (CTC)
+ - **Language:** Moroccan Arabic (Darija)
+ - **Base model:** facebook/mms-1b-all
+ - **Dataset:** Darija Bible Aligned Dataset
+ - **License:** Apache 2.0
+
+ ## Usage
+
+ ```python
+ from transformers import AutoProcessor, AutoModelForCTC
+ import torch
+ import librosa
+
+ # Load model and processor
+ processor = AutoProcessor.from_pretrained("HAMMALE/mms-darija-finetuned")
+ model = AutoModelForCTC.from_pretrained("HAMMALE/mms-darija-finetuned")
+
+ # Load and preprocess audio
+ audio, sr = librosa.load("path/to/darija/audio.wav", sr=16000)
+ inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
+
+ # Inference
+ with torch.no_grad():
+     logits = model(**inputs).logits
+
+ predicted_ids = torch.argmax(logits, dim=-1)
+ transcription = processor.batch_decode(predicted_ids)[0]
+ print(f"Transcription: {transcription}")
+ ```
+
+ ## Training Details
+
+ The model was fine-tuned on the Darija Bible Aligned Dataset, which contains audio segments from the Moroccan Standard Translation (MSTD) of the Bible with aligned text transcriptions.
+
+ ## Limitations
+
+ - Trained specifically on religious text (Bible translations)
+ - May not perform well on colloquial/everyday Darija speech
+ - Limited vocabulary outside religious domain
+
+ ## Citation
+
+ ```bibtex
+ @misc{darija-mms-finetuned,
+   title={MMS-1B-All Fine-tuned on Darija Bible Dataset},
+   author={HAMMALE},
+   year={2025},
+   publisher={Hugging Face},
+   journal={Hugging Face Model Hub},
+   howpublished={\\url{https://huggingface.co/HAMMALE/mms-darija-finetuned}}
+ }
+ ```
+
+ ## Acknowledgments
+
+ - Original MMS model by Meta AI
+ - Darija Bible dataset by Morocco Bible Society
+ - Audio alignment using Facebook's MMS toolkit
+ """
+
+ # Create app.py for Gradio demo
+ app_content = """import gradio as gr
+ import torch
+ import librosa
+ import numpy as np
+ from transformers import AutoProcessor, AutoModelForCTC
+
+ # Load model and processor
+ print("Loading model...")
+ processor = AutoProcessor.from_pretrained("HAMMALE/mms-darija-finetuned")
+ model = AutoModelForCTC.from_pretrained("HAMMALE/mms-darija-finetuned")
+
+ def transcribe_audio(audio_file):
+     try:
+         # Load audio
+         if audio_file is None:
+             return "Please upload an audio file."
+
+         # Load and preprocess audio
+         audio, sr = librosa.load(audio_file, sr=16000)
+
+         # Handle very short audio
+         if len(audio) < 1600:  # Less than 0.1 seconds
+             return "Audio too short. Please upload a longer audio file."
+
+         # Process with model
+         inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
+
+         # Inference
+         with torch.no_grad():
+             logits = model(**inputs).logits
+
+         predicted_ids = torch.argmax(logits, dim=-1)
+         transcription = processor.batch_decode(predicted_ids)[0]
+
+         return transcription if transcription.strip() else "No transcription generated."
+
+     except Exception as e:
+         return f"Error processing audio: {str(e)}"
+
+ # Create Gradio interface
+ demo = gr.Interface(
+     fn=transcribe_audio,
+     inputs=gr.Audio(type="filepath", label="Upload Darija Audio"),
+     outputs=gr.Textbox(label="Transcription", placeholder="Transcription will appear here..."),
+     title="🎤 Darija Speech Recognition",
+     description="Upload an audio file in Moroccan Arabic (Darija) and get the transcription. This model was fine-tuned on the Darija Bible dataset.",
+     article="Model: [HAMMALE/mms-darija-finetuned](https://huggingface.co/HAMMALE/mms-darija-finetuned)",
+     examples=[
+         # You can add example audio files here if you have them
+     ],
+     cache_examples=False,
+     theme=gr.themes.Soft()
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
+ """
+
+ # Create requirements.txt
+ requirements_content = """torch
+ transformers
+ librosa
+ gradio
+ numpy
+ """
+
+ # Save files to model directory
+ print("📝 Creating files...")
+ os.makedirs(MODEL_DIR, exist_ok=True)
+
+ with open(f"{MODEL_DIR}/README.md", "w", encoding="utf-8") as f:
+     f.write(readme_content)
+
+ with open(f"{MODEL_DIR}/app.py", "w", encoding="utf-8") as f:
+     f.write(app_content)
+
+ with open(f"{MODEL_DIR}/requirements.txt", "w", encoding="utf-8") as f:
+     f.write(requirements_content)
+
+ # Upload entire directory
+ print("🚀 Uploading to HuggingFace Hub...")
+ api.upload_folder(
+     folder_path=MODEL_DIR,
+     repo_id=REPO_ID,
+     repo_type="model",
+     commit_message="Add model, Gradio demo, and README"
+ )
+
+ print(f"✅ Complete model pushed to: https://huggingface.co/{REPO_ID}")
+ print(f"🎯 Demo available at: https://huggingface.co/spaces/{REPO_ID}")
+ print("📱 The Gradio demo will be automatically deployed!")
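Once PUSH.PY has run, a quick way to confirm the upload is to list what actually landed in the repo. This is a small sketch added here for illustration (not part of the commit); it assumes `huggingface_hub` is installed and the user is logged in with a token that can read the repo.

```python
from huggingface_hub import HfApi

# Sketch: list the files now present in the model repo after the upload
api = HfApi()
for path in sorted(api.list_repo_files("HAMMALE/mms-darija-finetuned", repo_type="model")):
    print(path)  # expect README.md, app.py, requirements.txt, config.json, model.safetensors, ...
```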
README.md CHANGED
@@ -1,199 +1,86 @@
  ---
- library_name: transformers
- tags: []
  ---

- # Model Card for Model ID

- <!-- Provide a quick summary of what the model is/does. -->



- ## Model Details

- ### Model Description

- <!-- Provide a longer summary of what this model is. -->

- This is the model card of a 🤗 transformers model that has been pushed on the Hub. This model card has been automatically generated.

- - **Developed by:** [More Information Needed]
- - **Funded by [optional]:** [More Information Needed]
- - **Shared by [optional]:** [More Information Needed]
- - **Model type:** [More Information Needed]
- - **Language(s) (NLP):** [More Information Needed]
- - **License:** [More Information Needed]
- - **Finetuned from model [optional]:** [More Information Needed]

- ### Model Sources [optional]
-
- <!-- Provide the basic links for the model. -->
-
- - **Repository:** [More Information Needed]
- - **Paper [optional]:** [More Information Needed]
- - **Demo [optional]:** [More Information Needed]
-
- ## Uses
-
- <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
-
- ### Direct Use
-
- <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
-
- [More Information Needed]
-
- ### Downstream Use [optional]
-
- <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
-
- [More Information Needed]
-
- ### Out-of-Scope Use
-
- <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
-
- [More Information Needed]
-
- ## Bias, Risks, and Limitations
-
- <!-- This section is meant to convey both technical and sociotechnical limitations. -->
-
- [More Information Needed]
-
- ### Recommendations
-
- <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
-
- Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
-
- ## How to Get Started with the Model
-
- Use the code below to get started with the model.
-
- [More Information Needed]

  ## Training Details

- ### Training Data
-
- <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
-
- [More Information Needed]
-
- ### Training Procedure
-
- <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
-
- #### Preprocessing [optional]
-
- [More Information Needed]
-
-
- #### Training Hyperparameters
-
- - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
-
- #### Speeds, Sizes, Times [optional]
-
- <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
-
- [More Information Needed]
-
- ## Evaluation
-
- <!-- This section describes the evaluation protocols and provides the results. -->
-
- ### Testing Data, Factors & Metrics
-
- #### Testing Data
-
- <!-- This should link to a Dataset Card if possible. -->
-
- [More Information Needed]
-
- #### Factors
-
- <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
-
- [More Information Needed]
-
- #### Metrics
-
- <!-- These are the evaluation metrics being used, ideally with a description of why. -->
-
- [More Information Needed]
-
- ### Results
-
- [More Information Needed]
-
- #### Summary
-
-
-
- ## Model Examination [optional]
-
- <!-- Relevant interpretability work for the model goes here -->
-
- [More Information Needed]
-
- ## Environmental Impact
-
- <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
-
- Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
-
- - **Hardware Type:** [More Information Needed]
- - **Hours used:** [More Information Needed]
- - **Cloud Provider:** [More Information Needed]
- - **Compute Region:** [More Information Needed]
- - **Carbon Emitted:** [More Information Needed]
-
- ## Technical Specifications [optional]
-
- ### Model Architecture and Objective
-
- [More Information Needed]
-
- ### Compute Infrastructure
-
- [More Information Needed]
-
- #### Hardware
-
- [More Information Needed]
-
- #### Software
-
- [More Information Needed]
-
- ## Citation [optional]
-
- <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
-
- **BibTeX:**
-
- [More Information Needed]
-
- **APA:**
-
- [More Information Needed]
-
- ## Glossary [optional]
-
- <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
-
- [More Information Needed]

- ## More Information [optional]

- [More Information Needed]

- ## Model Card Authors [optional]

- [More Information Needed]

- ## Model Card Contact

- [More Information Needed]
  ---
+ language:
+ - ar
+ - ary
+ tags:
+ - speech-recognition
+ - audio
+ - wav2vec2
+ - mms
+ - darija
+ - moroccan-arabic
+ - bible
+ license: apache-2.0
+ datasets:
+ - atlasia/darija_bible_aligned
+ metrics:
+ - wer
+ widget:
+ - example_title: "Darija Speech Example"
+   src: "https://example.com/darija_sample.wav"
  ---

+ # MMS-1B-All Fine-tuned on Darija Bible Dataset

+ This model is a fine-tuned version of [facebook/mms-1b-all](https://huggingface.co/facebook/mms-1b-all) on the [atlasia/darija_bible_aligned](https://huggingface.co/datasets/atlasia/darija_bible_aligned) dataset for Moroccan Arabic (Darija) speech recognition.

+ ## Model Description

+ - **Model type:** Speech Recognition (CTC)
+ - **Language:** Moroccan Arabic (Darija)
+ - **Base model:** facebook/mms-1b-all
+ - **Dataset:** Darija Bible Aligned Dataset
+ - **License:** Apache 2.0

+ ## Usage

+ ```python
+ from transformers import AutoProcessor, AutoModelForCTC
+ import torch
+ import librosa

+ # Load model and processor
+ processor = AutoProcessor.from_pretrained("HAMMALE/mms-darija-finetuned")
+ model = AutoModelForCTC.from_pretrained("HAMMALE/mms-darija-finetuned")

+ # Load and preprocess audio
+ audio, sr = librosa.load("path/to/darija/audio.wav", sr=16000)
+ inputs = processor(audio, sampling_rate=16000, return_tensors="pt")

+ # Inference
+ with torch.no_grad():
+     logits = model(**inputs).logits

+ predicted_ids = torch.argmax(logits, dim=-1)
+ transcription = processor.batch_decode(predicted_ids)[0]
+ print(f"Transcription: {transcription}")
+ ```

  ## Training Details

+ The model was fine-tuned on the Darija Bible Aligned Dataset, which contains audio segments from the Moroccan Standard Translation (MSTD) of the Bible with aligned text transcriptions.

+ ## Limitations

+ - Trained specifically on religious text (Bible translations)
+ - May not perform well on colloquial/everyday Darija speech
+ - Limited vocabulary outside religious domain

+ ## Citation

+ ```bibtex
+ @misc{darija-mms-finetuned,
+   title={MMS-1B-All Fine-tuned on Darija Bible Dataset},
+   author={HAMMALE},
+   year={2025},
+   publisher={Hugging Face},
+   journal={Hugging Face Model Hub},
+   howpublished={\url{https://huggingface.co/HAMMALE/mms-darija-finetuned}}
+ }
+ ```

+ ## Acknowledgments

+ - Original MMS model by Meta AI
+ - Darija Bible dataset by Morocco Bible Society
+ - Audio alignment using Facebook's MMS toolkit
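The card above lists WER as its metric but does not show how it would be computed. A minimal sketch using the `evaluate` library (added for illustration, not part of the committed README; the reference and prediction lists are placeholders that would come from the eval split and the model's transcriptions):

```python
import evaluate

# Sketch: word error rate between reference transcripts and model output
wer_metric = evaluate.load("wer")
references = ["text of a reference Darija transcript"]   # placeholder ground truth
predictions = ["text produced by the model"]             # placeholder hypotheses
print(f"WER: {wer_metric.compute(predictions=predictions, references=references):.3f}")
```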
Readme.md ADDED
@@ -0,0 +1,39 @@
+ # MMS-1B-All fine-tuned on Darija Bible Aligned Dataset
+
+ This model is a fine-tuned version of `facebook/mms-1b-all` for Moroccan Arabic (Darija) speech recognition, trained on the **Darija Bible Aligned Dataset** provided by [AtlasAI](https://huggingface.co/atlasia).
+
+ ## 📊 Dataset
+ - **Name**: `atlasia/darija_bible_aligned`
+ - **Domain**: Religious texts (Moroccan Darija audio ↔ Arabic text)
+ - **License**: See original dataset page.
+
+ ## 🎯 Intended use
+ This model is intended for research and experimentation in low-resource Arabic dialect ASR.
+
+ ## 🚀 Training Details
+ - Base model: `facebook/mms-1b-all`
+ - Language: `ara` (Moroccan Arabic)
+ - Framework: `Transformers` + `datasets`
+ - WER on eval set: *To be filled after training*
+
+ ## 🙏 Acknowledgements
+ Special thanks to **AtlasAI** for providing the aligned Darija Bible dataset.
+
+ ## 🧪 Demo
+ Try the model on your own audio! Check out the demo in the Space or use this snippet:
+
+ ```python
+ from transformers import AutoProcessor, AutoModelForCTC
+ import torch
+ import torchaudio
+
+ processor = AutoProcessor.from_pretrained("your-username/mms-darija-finetuned")
+ model = AutoModelForCTC.from_pretrained("your-username/mms-darija-finetuned")
+
+ waveform, sr = torchaudio.load("your_audio.wav")
+ inputs = processor(waveform.squeeze().numpy(), sampling_rate=sr, return_tensors="pt")
+ with torch.no_grad():
+     logits = model(**inputs).logits
+
+ predicted_ids = torch.argmax(logits, dim=-1)
+ transcription = processor.batch_decode(predicted_ids)[0]
+ print(transcription)
+ ```
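One caveat on the snippet above: wav2vec2/MMS checkpoints expect 16 kHz audio, while `torchaudio.load` returns the file's native sample rate. A hedged sketch of resampling before calling the processor (illustration only, not part of the committed file; `processor` is the one loaded in the snippet):

```python
import torch
import torchaudio

waveform, sr = torchaudio.load("your_audio.wav")
if sr != 16000:
    # Resample to the 16 kHz rate the MMS feature extractor expects
    waveform = torchaudio.functional.resample(waveform, orig_freq=sr, new_freq=16000)
    sr = 16000
inputs = processor(waveform.squeeze().numpy(), sampling_rate=sr, return_tensors="pt")
```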
app.py ADDED
@@ -0,0 +1,56 @@
+ import gradio as gr
+ import torch
+ import librosa
+ import numpy as np
+ from transformers import AutoProcessor, AutoModelForCTC
+
+ # Load model and processor
+ print("Loading model...")
+ processor = AutoProcessor.from_pretrained("HAMMALE/mms-darija-finetuned")
+ model = AutoModelForCTC.from_pretrained("HAMMALE/mms-darija-finetuned")
+
+ def transcribe_audio(audio_file):
+     try:
+         # Load audio
+         if audio_file is None:
+             return "Please upload an audio file."
+
+         # Load and preprocess audio
+         audio, sr = librosa.load(audio_file, sr=16000)
+
+         # Handle very short audio
+         if len(audio) < 1600:  # Less than 0.1 seconds
+             return "Audio too short. Please upload a longer audio file."
+
+         # Process with model
+         inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
+
+         # Inference
+         with torch.no_grad():
+             logits = model(**inputs).logits
+
+         predicted_ids = torch.argmax(logits, dim=-1)
+         transcription = processor.batch_decode(predicted_ids)[0]
+
+         return transcription if transcription.strip() else "No transcription generated."
+
+     except Exception as e:
+         return f"Error processing audio: {str(e)}"
+
+ # Create Gradio interface
+ demo = gr.Interface(
+     fn=transcribe_audio,
+     inputs=gr.Audio(type="filepath", label="Upload Darija Audio"),
+     outputs=gr.Textbox(label="Transcription", placeholder="Transcription will appear here..."),
+     title="🎤 Darija Speech Recognition",
+     description="Upload an audio file in Moroccan Arabic (Darija) and get the transcription. This model was fine-tuned on the Darija Bible dataset.",
+     article="Model: [HAMMALE/mms-darija-finetuned](https://huggingface.co/HAMMALE/mms-darija-finetuned)",
+     examples=[
+         # You can add example audio files here if you have them
+     ],
+     cache_examples=False,
+     theme=gr.themes.Soft()
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
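app.py runs the model on CPU. On a machine with a GPU, inference can be sped up by moving the model and inputs to CUDA; a small sketch of that variant of the inference step (illustration only, assuming CUDA is available; `model`, `processor`, and `audio` are the objects defined in app.py):

```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)

inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
inputs = {k: v.to(device) for k, v in inputs.items()}  # move tensors to the same device as the model
with torch.no_grad():
    logits = model(**inputs).logits
```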
checkpoint-1000/config.json ADDED
@@ -0,0 +1,107 @@
1
+ {
2
+ "activation_dropout": 0.05,
3
+ "adapter_attn_dim": 16,
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": true,
8
+ "architectures": [
9
+ "Wav2Vec2ForCTC"
10
+ ],
11
+ "attention_dropout": 0.05,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 256,
14
+ "codevector_dim": 1024,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": true,
17
+ "conv_dim": [
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 2,
33
+ 2
34
+ ],
35
+ "conv_stride": [
36
+ 5,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2
43
+ ],
44
+ "ctc_loss_reduction": "mean",
45
+ "ctc_zero_infinity": false,
46
+ "diversity_loss_weight": 0.1,
47
+ "do_stable_layer_norm": true,
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_dropout": 0.0,
51
+ "feat_extract_norm": "layer",
52
+ "feat_proj_dropout": 0.05,
53
+ "feat_quantizer_dropout": 0.0,
54
+ "final_dropout": 0.05,
55
+ "hidden_act": "gelu",
56
+ "hidden_dropout": 0.05,
57
+ "hidden_size": 1280,
58
+ "initializer_range": 0.02,
59
+ "intermediate_size": 5120,
60
+ "layer_norm_eps": 1e-05,
61
+ "layerdrop": 0.05,
62
+ "mask_feature_length": 10,
63
+ "mask_feature_min_masks": 0,
64
+ "mask_feature_prob": 0.0,
65
+ "mask_time_length": 10,
66
+ "mask_time_min_masks": 2,
67
+ "mask_time_prob": 0.05,
68
+ "model_type": "wav2vec2",
69
+ "num_adapter_layers": 3,
70
+ "num_attention_heads": 16,
71
+ "num_codevector_groups": 2,
72
+ "num_codevectors_per_group": 320,
73
+ "num_conv_pos_embedding_groups": 16,
74
+ "num_conv_pos_embeddings": 128,
75
+ "num_feat_extract_layers": 7,
76
+ "num_hidden_layers": 48,
77
+ "num_negatives": 100,
78
+ "output_hidden_size": 1280,
79
+ "pad_token_id": 0,
80
+ "proj_codevector_dim": 1024,
81
+ "tdnn_dilation": [
82
+ 1,
83
+ 2,
84
+ 3,
85
+ 1,
86
+ 1
87
+ ],
88
+ "tdnn_dim": [
89
+ 512,
90
+ 512,
91
+ 512,
92
+ 512,
93
+ 1500
94
+ ],
95
+ "tdnn_kernel": [
96
+ 5,
97
+ 3,
98
+ 3,
99
+ 1,
100
+ 1
101
+ ],
102
+ "torch_dtype": "float32",
103
+ "transformers_version": "4.52.3",
104
+ "use_weighted_layer_sum": false,
105
+ "vocab_size": 154,
106
+ "xvector_output_dim": 512
107
+ }
checkpoint-1000/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2840e5e0a3373a4483fb4fa588ea832e45d43305b55ccac5233530fdd1db34e4
3
+ size 3859521176
checkpoint-1000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66e3aae72f2fe0371afdfb4fbc827842f2643e4b50c0e839c1dc9d9a4ef9d3e9
3
+ size 7686012659
checkpoint-1000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5dda9e7edbad7e1f2ea9b88ea55f72ba9c14c99c293a599e98092f98f8fb1ae0
3
+ size 14709
checkpoint-1000/scaler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37bb1581d202aa22ea74daa929181abdfb2a3fa33a6d916b965e475f8b8905b1
3
+ size 1383
checkpoint-1000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0be48007f207c421506e912144b194f5d92eee8abae75376301624430209a58a
3
+ size 1465
checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,131 @@
1
+ {
2
+ "best_global_step": 1000,
3
+ "best_metric": 0.250485358356253,
4
+ "best_model_checkpoint": "mms-darija-finetuned/checkpoint-1000",
5
+ "epoch": 1.1286681715575622,
6
+ "eval_steps": 500,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.11286681715575621,
14
+ "grad_norm": 2.7396411895751953,
15
+ "learning_rate": 5.88e-05,
16
+ "loss": 2.3959,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.22573363431151242,
21
+ "grad_norm": 2.7468762397766113,
22
+ "learning_rate": 0.0001188,
23
+ "loss": 0.5308,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.33860045146726864,
28
+ "grad_norm": 2.6180174350738525,
29
+ "learning_rate": 0.00017879999999999998,
30
+ "loss": 0.53,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.45146726862302483,
35
+ "grad_norm": 2.046238660812378,
36
+ "learning_rate": 0.0002388,
37
+ "loss": 0.5489,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.5643340857787811,
42
+ "grad_norm": 2.4370131492614746,
43
+ "learning_rate": 0.0002982,
44
+ "loss": 0.5507,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.5643340857787811,
49
+ "eval_loss": 0.32221749424934387,
50
+ "eval_runtime": 49.3694,
51
+ "eval_samples_per_second": 31.923,
52
+ "eval_steps_per_second": 15.961,
53
+ "eval_wer": 0.38112764924769454,
54
+ "step": 500
55
+ },
56
+ {
57
+ "epoch": 0.6772009029345373,
58
+ "grad_norm": 0.8341818451881409,
59
+ "learning_rate": 0.0002965191387559808,
60
+ "loss": 0.5311,
61
+ "step": 600
62
+ },
63
+ {
64
+ "epoch": 0.7900677200902935,
65
+ "grad_norm": 0.6782581210136414,
66
+ "learning_rate": 0.00029293062200956934,
67
+ "loss": 0.4588,
68
+ "step": 700
69
+ },
70
+ {
71
+ "epoch": 0.9029345372460497,
72
+ "grad_norm": 3.4923579692840576,
73
+ "learning_rate": 0.00028934210526315783,
74
+ "loss": 0.3537,
75
+ "step": 800
76
+ },
77
+ {
78
+ "epoch": 1.0158013544018059,
79
+ "grad_norm": 0.758439302444458,
80
+ "learning_rate": 0.0002857535885167464,
81
+ "loss": 0.375,
82
+ "step": 900
83
+ },
84
+ {
85
+ "epoch": 1.1286681715575622,
86
+ "grad_norm": 0.712623119354248,
87
+ "learning_rate": 0.0002821650717703349,
88
+ "loss": 0.3764,
89
+ "step": 1000
90
+ },
91
+ {
92
+ "epoch": 1.1286681715575622,
93
+ "eval_loss": 0.28351786732673645,
94
+ "eval_runtime": 48.5264,
95
+ "eval_samples_per_second": 32.477,
96
+ "eval_steps_per_second": 16.239,
97
+ "eval_wer": 0.250485358356253,
98
+ "step": 1000
99
+ }
100
+ ],
101
+ "logging_steps": 100,
102
+ "max_steps": 8860,
103
+ "num_input_tokens_seen": 0,
104
+ "num_train_epochs": 10,
105
+ "save_steps": 500,
106
+ "stateful_callbacks": {
107
+ "EarlyStoppingCallback": {
108
+ "args": {
109
+ "early_stopping_patience": 3,
110
+ "early_stopping_threshold": 0.0
111
+ },
112
+ "attributes": {
113
+ "early_stopping_patience_counter": 0
114
+ }
115
+ },
116
+ "TrainerControl": {
117
+ "args": {
118
+ "should_epoch_stop": false,
119
+ "should_evaluate": false,
120
+ "should_log": false,
121
+ "should_save": true,
122
+ "should_training_stop": false
123
+ },
124
+ "attributes": {}
125
+ }
126
+ },
127
+ "total_flos": 1.5035300125095938e+19,
128
+ "train_batch_size": 2,
129
+ "trial_name": null,
130
+ "trial_params": null
131
+ }
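The trainer state above implies a fairly standard `transformers` Trainer run: batch size 2, 10 epochs, logging every 100 steps, eval/save every 500 steps, best model tracked by WER, early stopping with patience 3. A hedged reconstruction of what those arguments might have looked like — a sketch inferred from the logged values, not the actual training script; `model`, `train_ds`, `eval_ds`, `data_collator`, and `compute_metrics` are placeholders:

```python
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback

args = TrainingArguments(
    output_dir="mms-darija-finetuned",
    per_device_train_batch_size=2,   # "train_batch_size": 2
    num_train_epochs=10,             # "num_train_epochs": 10
    eval_strategy="steps",
    eval_steps=500,                  # "eval_steps": 500
    save_steps=500,                  # "save_steps": 500
    logging_steps=100,               # "logging_steps": 100
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)

trainer = Trainer(
    model=model,                     # placeholder: the Wav2Vec2ForCTC model being fine-tuned
    args=args,
    train_dataset=train_ds,          # placeholder datasets
    eval_dataset=eval_ds,
    data_collator=data_collator,     # placeholder CTC padding collator
    compute_metrics=compute_metrics, # placeholder: returns {"wer": ...}
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
```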
checkpoint-1000/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6e049bd7921eb48f996e726c39ae31633a72809a2ae3db8056230d9b767c6a3
3
+ size 5713
checkpoint-1500/config.json ADDED
@@ -0,0 +1,107 @@
1
+ {
2
+ "activation_dropout": 0.05,
3
+ "adapter_attn_dim": 16,
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": true,
8
+ "architectures": [
9
+ "Wav2Vec2ForCTC"
10
+ ],
11
+ "attention_dropout": 0.05,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 256,
14
+ "codevector_dim": 1024,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": true,
17
+ "conv_dim": [
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 2,
33
+ 2
34
+ ],
35
+ "conv_stride": [
36
+ 5,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2
43
+ ],
44
+ "ctc_loss_reduction": "mean",
45
+ "ctc_zero_infinity": false,
46
+ "diversity_loss_weight": 0.1,
47
+ "do_stable_layer_norm": true,
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_dropout": 0.0,
51
+ "feat_extract_norm": "layer",
52
+ "feat_proj_dropout": 0.05,
53
+ "feat_quantizer_dropout": 0.0,
54
+ "final_dropout": 0.05,
55
+ "hidden_act": "gelu",
56
+ "hidden_dropout": 0.05,
57
+ "hidden_size": 1280,
58
+ "initializer_range": 0.02,
59
+ "intermediate_size": 5120,
60
+ "layer_norm_eps": 1e-05,
61
+ "layerdrop": 0.05,
62
+ "mask_feature_length": 10,
63
+ "mask_feature_min_masks": 0,
64
+ "mask_feature_prob": 0.0,
65
+ "mask_time_length": 10,
66
+ "mask_time_min_masks": 2,
67
+ "mask_time_prob": 0.05,
68
+ "model_type": "wav2vec2",
69
+ "num_adapter_layers": 3,
70
+ "num_attention_heads": 16,
71
+ "num_codevector_groups": 2,
72
+ "num_codevectors_per_group": 320,
73
+ "num_conv_pos_embedding_groups": 16,
74
+ "num_conv_pos_embeddings": 128,
75
+ "num_feat_extract_layers": 7,
76
+ "num_hidden_layers": 48,
77
+ "num_negatives": 100,
78
+ "output_hidden_size": 1280,
79
+ "pad_token_id": 0,
80
+ "proj_codevector_dim": 1024,
81
+ "tdnn_dilation": [
82
+ 1,
83
+ 2,
84
+ 3,
85
+ 1,
86
+ 1
87
+ ],
88
+ "tdnn_dim": [
89
+ 512,
90
+ 512,
91
+ 512,
92
+ 512,
93
+ 1500
94
+ ],
95
+ "tdnn_kernel": [
96
+ 5,
97
+ 3,
98
+ 3,
99
+ 1,
100
+ 1
101
+ ],
102
+ "torch_dtype": "float32",
103
+ "transformers_version": "4.52.3",
104
+ "use_weighted_layer_sum": false,
105
+ "vocab_size": 154,
106
+ "xvector_output_dim": 512
107
+ }
checkpoint-1500/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:575f59479f8b3633fddcb5a7beab9bce91620ffe36bd68e87d4c96cd78392d47
3
+ size 3859521176
checkpoint-1500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d10c9c5099f71b6f921a96eba107c9a3a9fb36274dfae6cda0f1cb5321037163
3
+ size 7686012659
checkpoint-1500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:97e9becf39296478be8cdd62efb4615adb5468be1464023d29e1ad944a51605b
3
+ size 14709
checkpoint-1500/scaler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59575f2c6feaf27bb4a927c2c0d53a080cc10e83e7738fc167f66129f6dc7670
3
+ size 1383
checkpoint-1500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:32669f0fa422025ae7cdba910e1f0a84f65bbb7f79fa856c8bba9eebd541bc0a
3
+ size 1465
checkpoint-1500/trainer_state.json ADDED
@@ -0,0 +1,175 @@
1
+ {
2
+ "best_global_step": 1500,
3
+ "best_metric": 0.21003882866850024,
4
+ "best_model_checkpoint": "mms-darija-finetuned/checkpoint-1500",
5
+ "epoch": 1.6930022573363432,
6
+ "eval_steps": 500,
7
+ "global_step": 1500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.11286681715575621,
14
+ "grad_norm": 2.7396411895751953,
15
+ "learning_rate": 5.88e-05,
16
+ "loss": 2.3959,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.22573363431151242,
21
+ "grad_norm": 2.7468762397766113,
22
+ "learning_rate": 0.0001188,
23
+ "loss": 0.5308,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.33860045146726864,
28
+ "grad_norm": 2.6180174350738525,
29
+ "learning_rate": 0.00017879999999999998,
30
+ "loss": 0.53,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.45146726862302483,
35
+ "grad_norm": 2.046238660812378,
36
+ "learning_rate": 0.0002388,
37
+ "loss": 0.5489,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.5643340857787811,
42
+ "grad_norm": 2.4370131492614746,
43
+ "learning_rate": 0.0002982,
44
+ "loss": 0.5507,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.5643340857787811,
49
+ "eval_loss": 0.32221749424934387,
50
+ "eval_runtime": 49.3694,
51
+ "eval_samples_per_second": 31.923,
52
+ "eval_steps_per_second": 15.961,
53
+ "eval_wer": 0.38112764924769454,
54
+ "step": 500
55
+ },
56
+ {
57
+ "epoch": 0.6772009029345373,
58
+ "grad_norm": 0.8341818451881409,
59
+ "learning_rate": 0.0002965191387559808,
60
+ "loss": 0.5311,
61
+ "step": 600
62
+ },
63
+ {
64
+ "epoch": 0.7900677200902935,
65
+ "grad_norm": 0.6782581210136414,
66
+ "learning_rate": 0.00029293062200956934,
67
+ "loss": 0.4588,
68
+ "step": 700
69
+ },
70
+ {
71
+ "epoch": 0.9029345372460497,
72
+ "grad_norm": 3.4923579692840576,
73
+ "learning_rate": 0.00028934210526315783,
74
+ "loss": 0.3537,
75
+ "step": 800
76
+ },
77
+ {
78
+ "epoch": 1.0158013544018059,
79
+ "grad_norm": 0.758439302444458,
80
+ "learning_rate": 0.0002857535885167464,
81
+ "loss": 0.375,
82
+ "step": 900
83
+ },
84
+ {
85
+ "epoch": 1.1286681715575622,
86
+ "grad_norm": 0.712623119354248,
87
+ "learning_rate": 0.0002821650717703349,
88
+ "loss": 0.3764,
89
+ "step": 1000
90
+ },
91
+ {
92
+ "epoch": 1.1286681715575622,
93
+ "eval_loss": 0.28351786732673645,
94
+ "eval_runtime": 48.5264,
95
+ "eval_samples_per_second": 32.477,
96
+ "eval_steps_per_second": 16.239,
97
+ "eval_wer": 0.250485358356253,
98
+ "step": 1000
99
+ },
100
+ {
101
+ "epoch": 1.2415349887133182,
102
+ "grad_norm": 0.4382162094116211,
103
+ "learning_rate": 0.0002785765550239234,
104
+ "loss": 0.3205,
105
+ "step": 1100
106
+ },
107
+ {
108
+ "epoch": 1.3544018058690745,
109
+ "grad_norm": 5.204100131988525,
110
+ "learning_rate": 0.00027498803827751196,
111
+ "loss": 0.3739,
112
+ "step": 1200
113
+ },
114
+ {
115
+ "epoch": 1.4672686230248306,
116
+ "grad_norm": 0.893830418586731,
117
+ "learning_rate": 0.00027139952153110045,
118
+ "loss": 0.2979,
119
+ "step": 1300
120
+ },
121
+ {
122
+ "epoch": 1.580135440180587,
123
+ "grad_norm": 0.426662415266037,
124
+ "learning_rate": 0.000267811004784689,
125
+ "loss": 0.3017,
126
+ "step": 1400
127
+ },
128
+ {
129
+ "epoch": 1.6930022573363432,
130
+ "grad_norm": 0.4134344458580017,
131
+ "learning_rate": 0.0002642224880382775,
132
+ "loss": 0.2575,
133
+ "step": 1500
134
+ },
135
+ {
136
+ "epoch": 1.6930022573363432,
137
+ "eval_loss": 0.19595518708229065,
138
+ "eval_runtime": 58.1681,
139
+ "eval_samples_per_second": 27.094,
140
+ "eval_steps_per_second": 13.547,
141
+ "eval_wer": 0.21003882866850024,
142
+ "step": 1500
143
+ }
144
+ ],
145
+ "logging_steps": 100,
146
+ "max_steps": 8860,
147
+ "num_input_tokens_seen": 0,
148
+ "num_train_epochs": 10,
149
+ "save_steps": 500,
150
+ "stateful_callbacks": {
151
+ "EarlyStoppingCallback": {
152
+ "args": {
153
+ "early_stopping_patience": 3,
154
+ "early_stopping_threshold": 0.0
155
+ },
156
+ "attributes": {
157
+ "early_stopping_patience_counter": 0
158
+ }
159
+ },
160
+ "TrainerControl": {
161
+ "args": {
162
+ "should_epoch_stop": false,
163
+ "should_evaluate": false,
164
+ "should_log": false,
165
+ "should_save": true,
166
+ "should_training_stop": false
167
+ },
168
+ "attributes": {}
169
+ }
170
+ },
171
+ "total_flos": 2.256611183513416e+19,
172
+ "train_batch_size": 2,
173
+ "trial_name": null,
174
+ "trial_params": null
175
+ }
checkpoint-1500/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6e049bd7921eb48f996e726c39ae31633a72809a2ae3db8056230d9b767c6a3
3
+ size 5713
checkpoint-2000/config.json ADDED
@@ -0,0 +1,107 @@
1
+ {
2
+ "activation_dropout": 0.05,
3
+ "adapter_attn_dim": 16,
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": true,
8
+ "architectures": [
9
+ "Wav2Vec2ForCTC"
10
+ ],
11
+ "attention_dropout": 0.05,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 256,
14
+ "codevector_dim": 1024,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": true,
17
+ "conv_dim": [
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 2,
33
+ 2
34
+ ],
35
+ "conv_stride": [
36
+ 5,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2
43
+ ],
44
+ "ctc_loss_reduction": "mean",
45
+ "ctc_zero_infinity": false,
46
+ "diversity_loss_weight": 0.1,
47
+ "do_stable_layer_norm": true,
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_dropout": 0.0,
51
+ "feat_extract_norm": "layer",
52
+ "feat_proj_dropout": 0.05,
53
+ "feat_quantizer_dropout": 0.0,
54
+ "final_dropout": 0.05,
55
+ "hidden_act": "gelu",
56
+ "hidden_dropout": 0.05,
57
+ "hidden_size": 1280,
58
+ "initializer_range": 0.02,
59
+ "intermediate_size": 5120,
60
+ "layer_norm_eps": 1e-05,
61
+ "layerdrop": 0.05,
62
+ "mask_feature_length": 10,
63
+ "mask_feature_min_masks": 0,
64
+ "mask_feature_prob": 0.0,
65
+ "mask_time_length": 10,
66
+ "mask_time_min_masks": 2,
67
+ "mask_time_prob": 0.05,
68
+ "model_type": "wav2vec2",
69
+ "num_adapter_layers": 3,
70
+ "num_attention_heads": 16,
71
+ "num_codevector_groups": 2,
72
+ "num_codevectors_per_group": 320,
73
+ "num_conv_pos_embedding_groups": 16,
74
+ "num_conv_pos_embeddings": 128,
75
+ "num_feat_extract_layers": 7,
76
+ "num_hidden_layers": 48,
77
+ "num_negatives": 100,
78
+ "output_hidden_size": 1280,
79
+ "pad_token_id": 0,
80
+ "proj_codevector_dim": 1024,
81
+ "tdnn_dilation": [
82
+ 1,
83
+ 2,
84
+ 3,
85
+ 1,
86
+ 1
87
+ ],
88
+ "tdnn_dim": [
89
+ 512,
90
+ 512,
91
+ 512,
92
+ 512,
93
+ 1500
94
+ ],
95
+ "tdnn_kernel": [
96
+ 5,
97
+ 3,
98
+ 3,
99
+ 1,
100
+ 1
101
+ ],
102
+ "torch_dtype": "float32",
103
+ "transformers_version": "4.52.3",
104
+ "use_weighted_layer_sum": false,
105
+ "vocab_size": 154,
106
+ "xvector_output_dim": 512
107
+ }
checkpoint-2000/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a221fc95c57e6200dd7e661b0c997ca8954a15016cd7f5bc0cf319d9f7244528
3
+ size 3859521176
checkpoint-2000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed49f9a59d57ff6914314eb71f624ed68f547b36c55b81910b91762db375655b
3
+ size 7686012659
checkpoint-2000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d0bbe6d08d335cc8bb9ebc70e4c641eb2edde72bae1e631fd648b17db81fc267
3
+ size 14709
checkpoint-2000/scaler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce6af8c84c84b3d38abbacb49b636d4c0d288e97bb1e4c1dab714289dd100208
3
+ size 1383
checkpoint-2000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:194cdd674af00063e2d937369d227b46a8b8f2fe4d95aff0e801d7a579175691
3
+ size 1465
checkpoint-2000/trainer_state.json ADDED
@@ -0,0 +1,219 @@
1
+ {
2
+ "best_global_step": 2000,
3
+ "best_metric": 0.18823814916680148,
4
+ "best_model_checkpoint": "mms-darija-finetuned/checkpoint-2000",
5
+ "epoch": 2.2573363431151243,
6
+ "eval_steps": 500,
7
+ "global_step": 2000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.11286681715575621,
14
+ "grad_norm": 2.7396411895751953,
15
+ "learning_rate": 5.88e-05,
16
+ "loss": 2.3959,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.22573363431151242,
21
+ "grad_norm": 2.7468762397766113,
22
+ "learning_rate": 0.0001188,
23
+ "loss": 0.5308,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.33860045146726864,
28
+ "grad_norm": 2.6180174350738525,
29
+ "learning_rate": 0.00017879999999999998,
30
+ "loss": 0.53,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.45146726862302483,
35
+ "grad_norm": 2.046238660812378,
36
+ "learning_rate": 0.0002388,
37
+ "loss": 0.5489,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.5643340857787811,
42
+ "grad_norm": 2.4370131492614746,
43
+ "learning_rate": 0.0002982,
44
+ "loss": 0.5507,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.5643340857787811,
49
+ "eval_loss": 0.32221749424934387,
50
+ "eval_runtime": 49.3694,
51
+ "eval_samples_per_second": 31.923,
52
+ "eval_steps_per_second": 15.961,
53
+ "eval_wer": 0.38112764924769454,
54
+ "step": 500
55
+ },
56
+ {
57
+ "epoch": 0.6772009029345373,
58
+ "grad_norm": 0.8341818451881409,
59
+ "learning_rate": 0.0002965191387559808,
60
+ "loss": 0.5311,
61
+ "step": 600
62
+ },
63
+ {
64
+ "epoch": 0.7900677200902935,
65
+ "grad_norm": 0.6782581210136414,
66
+ "learning_rate": 0.00029293062200956934,
67
+ "loss": 0.4588,
68
+ "step": 700
69
+ },
70
+ {
71
+ "epoch": 0.9029345372460497,
72
+ "grad_norm": 3.4923579692840576,
73
+ "learning_rate": 0.00028934210526315783,
74
+ "loss": 0.3537,
75
+ "step": 800
76
+ },
77
+ {
78
+ "epoch": 1.0158013544018059,
79
+ "grad_norm": 0.758439302444458,
80
+ "learning_rate": 0.0002857535885167464,
81
+ "loss": 0.375,
82
+ "step": 900
83
+ },
84
+ {
85
+ "epoch": 1.1286681715575622,
86
+ "grad_norm": 0.712623119354248,
87
+ "learning_rate": 0.0002821650717703349,
88
+ "loss": 0.3764,
89
+ "step": 1000
90
+ },
91
+ {
92
+ "epoch": 1.1286681715575622,
93
+ "eval_loss": 0.28351786732673645,
94
+ "eval_runtime": 48.5264,
95
+ "eval_samples_per_second": 32.477,
96
+ "eval_steps_per_second": 16.239,
97
+ "eval_wer": 0.250485358356253,
98
+ "step": 1000
99
+ },
100
+ {
101
+ "epoch": 1.2415349887133182,
102
+ "grad_norm": 0.4382162094116211,
103
+ "learning_rate": 0.0002785765550239234,
104
+ "loss": 0.3205,
105
+ "step": 1100
106
+ },
107
+ {
108
+ "epoch": 1.3544018058690745,
109
+ "grad_norm": 5.204100131988525,
110
+ "learning_rate": 0.00027498803827751196,
111
+ "loss": 0.3739,
112
+ "step": 1200
113
+ },
114
+ {
115
+ "epoch": 1.4672686230248306,
116
+ "grad_norm": 0.893830418586731,
117
+ "learning_rate": 0.00027139952153110045,
118
+ "loss": 0.2979,
119
+ "step": 1300
120
+ },
121
+ {
122
+ "epoch": 1.580135440180587,
123
+ "grad_norm": 0.426662415266037,
124
+ "learning_rate": 0.000267811004784689,
125
+ "loss": 0.3017,
126
+ "step": 1400
127
+ },
128
+ {
129
+ "epoch": 1.6930022573363432,
130
+ "grad_norm": 0.4134344458580017,
131
+ "learning_rate": 0.0002642224880382775,
132
+ "loss": 0.2575,
133
+ "step": 1500
134
+ },
135
+ {
136
+ "epoch": 1.6930022573363432,
137
+ "eval_loss": 0.19595518708229065,
138
+ "eval_runtime": 58.1681,
139
+ "eval_samples_per_second": 27.094,
140
+ "eval_steps_per_second": 13.547,
141
+ "eval_wer": 0.21003882866850024,
142
+ "step": 1500
143
+ },
144
+ {
145
+ "epoch": 1.8058690744920993,
146
+ "grad_norm": 1.7899537086486816,
147
+ "learning_rate": 0.000260633971291866,
148
+ "loss": 0.2805,
149
+ "step": 1600
150
+ },
151
+ {
152
+ "epoch": 1.9187358916478554,
153
+ "grad_norm": 0.36495277285575867,
154
+ "learning_rate": 0.0002570454545454545,
155
+ "loss": 0.2567,
156
+ "step": 1700
157
+ },
158
+ {
159
+ "epoch": 2.0316027088036117,
160
+ "grad_norm": 0.4699445068836212,
161
+ "learning_rate": 0.00025345693779904306,
162
+ "loss": 0.2234,
163
+ "step": 1800
164
+ },
165
+ {
166
+ "epoch": 2.144469525959368,
167
+ "grad_norm": 0.6241821646690369,
168
+ "learning_rate": 0.00024986842105263155,
169
+ "loss": 0.2331,
170
+ "step": 1900
171
+ },
172
+ {
173
+ "epoch": 2.2573363431151243,
174
+ "grad_norm": 0.5561006665229797,
175
+ "learning_rate": 0.00024631578947368417,
176
+ "loss": 0.2519,
177
+ "step": 2000
178
+ },
179
+ {
180
+ "epoch": 2.2573363431151243,
181
+ "eval_loss": 0.1508670300245285,
182
+ "eval_runtime": 47.958,
183
+ "eval_samples_per_second": 32.862,
184
+ "eval_steps_per_second": 16.431,
185
+ "eval_wer": 0.18823814916680148,
186
+ "step": 2000
187
+ }
188
+ ],
189
+ "logging_steps": 100,
190
+ "max_steps": 8860,
191
+ "num_input_tokens_seen": 0,
192
+ "num_train_epochs": 10,
193
+ "save_steps": 500,
194
+ "stateful_callbacks": {
195
+ "EarlyStoppingCallback": {
196
+ "args": {
197
+ "early_stopping_patience": 3,
198
+ "early_stopping_threshold": 0.0
199
+ },
200
+ "attributes": {
201
+ "early_stopping_patience_counter": 0
202
+ }
203
+ },
204
+ "TrainerControl": {
205
+ "args": {
206
+ "should_epoch_stop": false,
207
+ "should_evaluate": false,
208
+ "should_log": false,
209
+ "should_save": true,
210
+ "should_training_stop": false
211
+ },
212
+ "attributes": {}
213
+ }
214
+ },
215
+ "total_flos": 3.004473369008046e+19,
216
+ "train_batch_size": 2,
217
+ "trial_name": null,
218
+ "trial_params": null
219
+ }
checkpoint-2000/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6e049bd7921eb48f996e726c39ae31633a72809a2ae3db8056230d9b767c6a3
3
+ size 5713
checkpoint-2500/config.json ADDED
@@ -0,0 +1,107 @@
1
+ {
2
+ "activation_dropout": 0.05,
3
+ "adapter_attn_dim": 16,
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": true,
8
+ "architectures": [
9
+ "Wav2Vec2ForCTC"
10
+ ],
11
+ "attention_dropout": 0.05,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 256,
14
+ "codevector_dim": 1024,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": true,
17
+ "conv_dim": [
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 2,
33
+ 2
34
+ ],
35
+ "conv_stride": [
36
+ 5,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2
43
+ ],
44
+ "ctc_loss_reduction": "mean",
45
+ "ctc_zero_infinity": false,
46
+ "diversity_loss_weight": 0.1,
47
+ "do_stable_layer_norm": true,
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_dropout": 0.0,
51
+ "feat_extract_norm": "layer",
52
+ "feat_proj_dropout": 0.05,
53
+ "feat_quantizer_dropout": 0.0,
54
+ "final_dropout": 0.05,
55
+ "hidden_act": "gelu",
56
+ "hidden_dropout": 0.05,
57
+ "hidden_size": 1280,
58
+ "initializer_range": 0.02,
59
+ "intermediate_size": 5120,
60
+ "layer_norm_eps": 1e-05,
61
+ "layerdrop": 0.05,
62
+ "mask_feature_length": 10,
63
+ "mask_feature_min_masks": 0,
64
+ "mask_feature_prob": 0.0,
65
+ "mask_time_length": 10,
66
+ "mask_time_min_masks": 2,
67
+ "mask_time_prob": 0.05,
68
+ "model_type": "wav2vec2",
69
+ "num_adapter_layers": 3,
70
+ "num_attention_heads": 16,
71
+ "num_codevector_groups": 2,
72
+ "num_codevectors_per_group": 320,
73
+ "num_conv_pos_embedding_groups": 16,
74
+ "num_conv_pos_embeddings": 128,
75
+ "num_feat_extract_layers": 7,
76
+ "num_hidden_layers": 48,
77
+ "num_negatives": 100,
78
+ "output_hidden_size": 1280,
79
+ "pad_token_id": 0,
80
+ "proj_codevector_dim": 1024,
81
+ "tdnn_dilation": [
82
+ 1,
83
+ 2,
84
+ 3,
85
+ 1,
86
+ 1
87
+ ],
88
+ "tdnn_dim": [
89
+ 512,
90
+ 512,
91
+ 512,
92
+ 512,
93
+ 1500
94
+ ],
95
+ "tdnn_kernel": [
96
+ 5,
97
+ 3,
98
+ 3,
99
+ 1,
100
+ 1
101
+ ],
102
+ "torch_dtype": "float32",
103
+ "transformers_version": "4.52.3",
104
+ "use_weighted_layer_sum": false,
105
+ "vocab_size": 154,
106
+ "xvector_output_dim": 512
107
+ }
checkpoint-2500/model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50a132b2753ce6ded08dfc70b9dd315d50a15033eb0640dd5e275d06ac794006
3
+ size 3859521176
checkpoint-2500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ca018b139d013b53efd41b63aa5c7c87120fbaee22d2e926a9a7e1555a18a75
3
+ size 7686012659
checkpoint-2500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f8d79f63b2f87f840f2d64ead18ac3510040553c28f44f5c4f2265b02365327
3
+ size 14709
checkpoint-2500/scaler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:91d4f81509232003d5e6c179ff0078f6becbd8bc508439a20f21939ccba8ae7d
3
+ size 1383
checkpoint-2500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5c9a463e4b195573c6e3a3087023d396ca4d0bc7418bfd033429d7cc80f73a89
3
+ size 1465
checkpoint-2500/trainer_state.json ADDED
@@ -0,0 +1,263 @@
1
+ {
2
+ "best_global_step": 2000,
3
+ "best_metric": 0.18823814916680148,
4
+ "best_model_checkpoint": "mms-darija-finetuned/checkpoint-2000",
5
+ "epoch": 2.8216704288939054,
6
+ "eval_steps": 500,
7
+ "global_step": 2500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.11286681715575621,
14
+ "grad_norm": 2.7396411895751953,
15
+ "learning_rate": 5.88e-05,
16
+ "loss": 2.3959,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.22573363431151242,
21
+ "grad_norm": 2.7468762397766113,
22
+ "learning_rate": 0.0001188,
23
+ "loss": 0.5308,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.33860045146726864,
28
+ "grad_norm": 2.6180174350738525,
29
+ "learning_rate": 0.00017879999999999998,
30
+ "loss": 0.53,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.45146726862302483,
35
+ "grad_norm": 2.046238660812378,
36
+ "learning_rate": 0.0002388,
37
+ "loss": 0.5489,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.5643340857787811,
42
+ "grad_norm": 2.4370131492614746,
43
+ "learning_rate": 0.0002982,
44
+ "loss": 0.5507,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.5643340857787811,
49
+ "eval_loss": 0.32221749424934387,
50
+ "eval_runtime": 49.3694,
51
+ "eval_samples_per_second": 31.923,
52
+ "eval_steps_per_second": 15.961,
53
+ "eval_wer": 0.38112764924769454,
54
+ "step": 500
55
+ },
56
+ {
57
+ "epoch": 0.6772009029345373,
58
+ "grad_norm": 0.8341818451881409,
59
+ "learning_rate": 0.0002965191387559808,
60
+ "loss": 0.5311,
61
+ "step": 600
62
+ },
63
+ {
64
+ "epoch": 0.7900677200902935,
65
+ "grad_norm": 0.6782581210136414,
66
+ "learning_rate": 0.00029293062200956934,
67
+ "loss": 0.4588,
68
+ "step": 700
69
+ },
70
+ {
71
+ "epoch": 0.9029345372460497,
72
+ "grad_norm": 3.4923579692840576,
73
+ "learning_rate": 0.00028934210526315783,
74
+ "loss": 0.3537,
75
+ "step": 800
76
+ },
77
+ {
78
+ "epoch": 1.0158013544018059,
79
+ "grad_norm": 0.758439302444458,
80
+ "learning_rate": 0.0002857535885167464,
81
+ "loss": 0.375,
82
+ "step": 900
83
+ },
84
+ {
85
+ "epoch": 1.1286681715575622,
86
+ "grad_norm": 0.712623119354248,
87
+ "learning_rate": 0.0002821650717703349,
88
+ "loss": 0.3764,
89
+ "step": 1000
90
+ },
91
+ {
92
+ "epoch": 1.1286681715575622,
93
+ "eval_loss": 0.28351786732673645,
94
+ "eval_runtime": 48.5264,
95
+ "eval_samples_per_second": 32.477,
96
+ "eval_steps_per_second": 16.239,
97
+ "eval_wer": 0.250485358356253,
98
+ "step": 1000
99
+ },
100
+ {
101
+ "epoch": 1.2415349887133182,
102
+ "grad_norm": 0.4382162094116211,
103
+ "learning_rate": 0.0002785765550239234,
104
+ "loss": 0.3205,
105
+ "step": 1100
106
+ },
107
+ {
108
+ "epoch": 1.3544018058690745,
109
+ "grad_norm": 5.204100131988525,
110
+ "learning_rate": 0.00027498803827751196,
111
+ "loss": 0.3739,
112
+ "step": 1200
113
+ },
114
+ {
115
+ "epoch": 1.4672686230248306,
116
+ "grad_norm": 0.893830418586731,
117
+ "learning_rate": 0.00027139952153110045,
118
+ "loss": 0.2979,
119
+ "step": 1300
120
+ },
121
+ {
122
+ "epoch": 1.580135440180587,
123
+ "grad_norm": 0.426662415266037,
124
+ "learning_rate": 0.000267811004784689,
125
+ "loss": 0.3017,
126
+ "step": 1400
127
+ },
128
+ {
129
+ "epoch": 1.6930022573363432,
130
+ "grad_norm": 0.4134344458580017,
131
+ "learning_rate": 0.0002642224880382775,
132
+ "loss": 0.2575,
133
+ "step": 1500
134
+ },
135
+ {
136
+ "epoch": 1.6930022573363432,
137
+ "eval_loss": 0.19595518708229065,
138
+ "eval_runtime": 58.1681,
139
+ "eval_samples_per_second": 27.094,
140
+ "eval_steps_per_second": 13.547,
141
+ "eval_wer": 0.21003882866850024,
142
+ "step": 1500
143
+ },
144
+ {
145
+ "epoch": 1.8058690744920993,
146
+ "grad_norm": 1.7899537086486816,
147
+ "learning_rate": 0.000260633971291866,
148
+ "loss": 0.2805,
149
+ "step": 1600
150
+ },
151
+ {
152
+ "epoch": 1.9187358916478554,
153
+ "grad_norm": 0.36495277285575867,
154
+ "learning_rate": 0.0002570454545454545,
155
+ "loss": 0.2567,
156
+ "step": 1700
157
+ },
158
+ {
159
+ "epoch": 2.0316027088036117,
160
+ "grad_norm": 0.4699445068836212,
161
+ "learning_rate": 0.00025345693779904306,
162
+ "loss": 0.2234,
163
+ "step": 1800
164
+ },
165
+ {
166
+ "epoch": 2.144469525959368,
167
+ "grad_norm": 0.6241821646690369,
168
+ "learning_rate": 0.00024986842105263155,
169
+ "loss": 0.2331,
170
+ "step": 1900
171
+ },
172
+ {
173
+ "epoch": 2.2573363431151243,
174
+ "grad_norm": 0.5561006665229797,
175
+ "learning_rate": 0.00024631578947368417,
176
+ "loss": 0.2519,
177
+ "step": 2000
178
+ },
179
+ {
180
+ "epoch": 2.2573363431151243,
181
+ "eval_loss": 0.1508670300245285,
182
+ "eval_runtime": 47.958,
183
+ "eval_samples_per_second": 32.862,
184
+ "eval_steps_per_second": 16.431,
185
+ "eval_wer": 0.18823814916680148,
186
+ "step": 2000
187
+ },
188
+ {
189
+ "epoch": 2.37020316027088,
190
+ "grad_norm": 0.5059154033660889,
191
+ "learning_rate": 0.0002427272727272727,
192
+ "loss": 0.2144,
193
+ "step": 2100
194
+ },
195
+ {
196
+ "epoch": 2.4830699774266365,
197
+ "grad_norm": 0.5262096524238586,
198
+ "learning_rate": 0.00023913875598086123,
199
+ "loss": 0.2034,
200
+ "step": 2200
201
+ },
202
+ {
203
+ "epoch": 2.595936794582393,
204
+ "grad_norm": 0.46527203917503357,
205
+ "learning_rate": 0.00023555023923444972,
206
+ "loss": 0.201,
207
+ "step": 2300
208
+ },
209
+ {
210
+ "epoch": 2.708803611738149,
211
+ "grad_norm": 3.4555609226226807,
212
+ "learning_rate": 0.00023196172248803824,
213
+ "loss": 0.2248,
214
+ "step": 2400
215
+ },
216
+ {
217
+ "epoch": 2.8216704288939054,
218
+ "grad_norm": 0.497281938791275,
219
+ "learning_rate": 0.00022837320574162676,
220
+ "loss": 0.2515,
221
+ "step": 2500
222
+ },
223
+ {
224
+ "epoch": 2.8216704288939054,
225
+ "eval_loss": 0.17336268723011017,
226
+ "eval_runtime": 49.1468,
227
+ "eval_samples_per_second": 32.067,
228
+ "eval_steps_per_second": 16.034,
229
+ "eval_wer": 0.1939815563824624,
230
+ "step": 2500
231
+ }
232
+ ],
233
+ "logging_steps": 100,
234
+ "max_steps": 8860,
235
+ "num_input_tokens_seen": 0,
236
+ "num_train_epochs": 10,
237
+ "save_steps": 500,
238
+ "stateful_callbacks": {
239
+ "EarlyStoppingCallback": {
240
+ "args": {
241
+ "early_stopping_patience": 3,
242
+ "early_stopping_threshold": 0.0
243
+ },
244
+ "attributes": {
245
+ "early_stopping_patience_counter": 1
246
+ }
247
+ },
248
+ "TrainerControl": {
249
+ "args": {
250
+ "should_epoch_stop": false,
251
+ "should_evaluate": false,
252
+ "should_log": false,
253
+ "should_save": true,
254
+ "should_training_stop": false
255
+ },
256
+ "attributes": {}
257
+ }
258
+ },
259
+ "total_flos": 3.7579227205569675e+19,
260
+ "train_batch_size": 2,
261
+ "trial_name": null,
262
+ "trial_params": null
263
+ }
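The eval_wer values above tell the story of this checkpoint: word error rate falls from 0.381 at step 500 to 0.188 at step 2000, then ticks up to 0.194 at step 2500, which is why the EarlyStoppingCallback patience counter sits at 1. A minimal Python sketch for pulling that trajectory out of a downloaded checkpoint's trainer_state.json; the local path is an assumption, not something shipped in this commit:

import json

# Assumed local path to a cloned copy of the repository; adjust as needed.
STATE_PATH = "mms-darija-finetuned/checkpoint-2500/trainer_state.json"

with open(STATE_PATH, encoding="utf-8") as f:
    state = json.load(f)

# Evaluation records are the log_history entries that carry an "eval_wer" key.
for record in state["log_history"]:
    if "eval_wer" in record:
        print(f"step {record['step']:>5}  eval_loss {record['eval_loss']:.4f}  WER {record['eval_wer']:.4f}")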
checkpoint-2500/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a6e049bd7921eb48f996e726c39ae31633a72809a2ae3db8056230d9b767c6a3
+ size 5713
checkpoint-3000/config.json ADDED
@@ -0,0 +1,107 @@
+ {
+ "activation_dropout": 0.05,
+ "adapter_attn_dim": 16,
+ "adapter_kernel_size": 3,
+ "adapter_stride": 2,
+ "add_adapter": false,
+ "apply_spec_augment": true,
+ "architectures": [
+ "Wav2Vec2ForCTC"
+ ],
+ "attention_dropout": 0.05,
+ "bos_token_id": 1,
+ "classifier_proj_size": 256,
+ "codevector_dim": 1024,
+ "contrastive_logits_temperature": 0.1,
+ "conv_bias": true,
+ "conv_dim": [
+ 512,
+ 512,
+ 512,
+ 512,
+ 512,
+ 512,
+ 512
+ ],
+ "conv_kernel": [
+ 10,
+ 3,
+ 3,
+ 3,
+ 3,
+ 2,
+ 2
+ ],
+ "conv_stride": [
+ 5,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2
+ ],
+ "ctc_loss_reduction": "mean",
+ "ctc_zero_infinity": false,
+ "diversity_loss_weight": 0.1,
+ "do_stable_layer_norm": true,
+ "eos_token_id": 2,
+ "feat_extract_activation": "gelu",
+ "feat_extract_dropout": 0.0,
+ "feat_extract_norm": "layer",
+ "feat_proj_dropout": 0.05,
+ "feat_quantizer_dropout": 0.0,
+ "final_dropout": 0.05,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.05,
+ "hidden_size": 1280,
+ "initializer_range": 0.02,
+ "intermediate_size": 5120,
+ "layer_norm_eps": 1e-05,
+ "layerdrop": 0.05,
+ "mask_feature_length": 10,
+ "mask_feature_min_masks": 0,
+ "mask_feature_prob": 0.0,
+ "mask_time_length": 10,
+ "mask_time_min_masks": 2,
+ "mask_time_prob": 0.05,
+ "model_type": "wav2vec2",
+ "num_adapter_layers": 3,
+ "num_attention_heads": 16,
+ "num_codevector_groups": 2,
+ "num_codevectors_per_group": 320,
+ "num_conv_pos_embedding_groups": 16,
+ "num_conv_pos_embeddings": 128,
+ "num_feat_extract_layers": 7,
+ "num_hidden_layers": 48,
+ "num_negatives": 100,
+ "output_hidden_size": 1280,
+ "pad_token_id": 0,
+ "proj_codevector_dim": 1024,
+ "tdnn_dilation": [
+ 1,
+ 2,
+ 3,
+ 1,
+ 1
+ ],
+ "tdnn_dim": [
+ 512,
+ 512,
+ 512,
+ 512,
+ 1500
+ ],
+ "tdnn_kernel": [
+ 5,
+ 3,
+ 3,
+ 1,
+ 1
+ ],
+ "torch_dtype": "float32",
+ "transformers_version": "4.52.3",
+ "use_weighted_layer_sum": false,
+ "vocab_size": 154,
+ "xvector_output_dim": 512
+ }
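The config above describes a Wav2Vec2ForCTC model in the MMS-1B family (48 hidden layers, hidden size 1280) with adapter_attn_dim set to 16, i.e. attention adapters for language-specific fine-tuning, and a 154-token CTC vocabulary. A minimal sketch for loading the checkpoint folder and inspecting those fields; the local path is an assumption:

from transformers import Wav2Vec2ForCTC

# Assumed local path to the downloaded checkpoint folder.
model = Wav2Vec2ForCTC.from_pretrained("mms-darija-finetuned/checkpoint-3000")

print(model.config.num_hidden_layers)  # 48
print(model.config.vocab_size)         # 154 output tokens for the CTC head
print(model.config.adapter_attn_dim)   # 16 -> attention adapters are enabled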
checkpoint-3000/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8242023b491381e4f595c2e69aacb3c51066600428ef42b58accfc8c65e8d432
+ size 3859521176
checkpoint-3000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:97855f6703b7ceabac1a46c6ba3c259417727e8f7aa93beb6d7aba2845704153
+ size 7686012659
checkpoint-3000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ee68bd8550ab0601f2aabc6a49ba6790e07dc6599d8fbc38cc02e0aeed39d34c
+ size 14709
checkpoint-3000/scaler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:22a0dc82f610208770bebb932d695a43fcf87277d31d9d5bca339ba51c41ce3a
+ size 1383
checkpoint-3000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6b520150ed007dadc448186006366134cd4e755edaed99c2cfe231eacf21cd4f
+ size 1465
checkpoint-3000/trainer_state.json ADDED
@@ -0,0 +1,307 @@
+ {
+ "best_global_step": 3000,
+ "best_metric": 0.16409157094321308,
+ "best_model_checkpoint": "mms-darija-finetuned/checkpoint-3000",
+ "epoch": 3.386004514672686,
+ "eval_steps": 500,
+ "global_step": 3000,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.11286681715575621,
+ "grad_norm": 2.7396411895751953,
+ "learning_rate": 5.88e-05,
+ "loss": 2.3959,
+ "step": 100
+ },
+ {
+ "epoch": 0.22573363431151242,
+ "grad_norm": 2.7468762397766113,
+ "learning_rate": 0.0001188,
+ "loss": 0.5308,
+ "step": 200
+ },
+ {
+ "epoch": 0.33860045146726864,
+ "grad_norm": 2.6180174350738525,
+ "learning_rate": 0.00017879999999999998,
+ "loss": 0.53,
+ "step": 300
+ },
+ {
+ "epoch": 0.45146726862302483,
+ "grad_norm": 2.046238660812378,
+ "learning_rate": 0.0002388,
+ "loss": 0.5489,
+ "step": 400
+ },
+ {
+ "epoch": 0.5643340857787811,
+ "grad_norm": 2.4370131492614746,
+ "learning_rate": 0.0002982,
+ "loss": 0.5507,
+ "step": 500
+ },
+ {
+ "epoch": 0.5643340857787811,
+ "eval_loss": 0.32221749424934387,
+ "eval_runtime": 49.3694,
+ "eval_samples_per_second": 31.923,
+ "eval_steps_per_second": 15.961,
+ "eval_wer": 0.38112764924769454,
+ "step": 500
+ },
+ {
+ "epoch": 0.6772009029345373,
+ "grad_norm": 0.8341818451881409,
+ "learning_rate": 0.0002965191387559808,
+ "loss": 0.5311,
+ "step": 600
+ },
+ {
+ "epoch": 0.7900677200902935,
+ "grad_norm": 0.6782581210136414,
+ "learning_rate": 0.00029293062200956934,
+ "loss": 0.4588,
+ "step": 700
+ },
+ {
+ "epoch": 0.9029345372460497,
+ "grad_norm": 3.4923579692840576,
+ "learning_rate": 0.00028934210526315783,
+ "loss": 0.3537,
+ "step": 800
+ },
+ {
+ "epoch": 1.0158013544018059,
+ "grad_norm": 0.758439302444458,
+ "learning_rate": 0.0002857535885167464,
+ "loss": 0.375,
+ "step": 900
+ },
+ {
+ "epoch": 1.1286681715575622,
+ "grad_norm": 0.712623119354248,
+ "learning_rate": 0.0002821650717703349,
+ "loss": 0.3764,
+ "step": 1000
+ },
+ {
+ "epoch": 1.1286681715575622,
+ "eval_loss": 0.28351786732673645,
+ "eval_runtime": 48.5264,
+ "eval_samples_per_second": 32.477,
+ "eval_steps_per_second": 16.239,
+ "eval_wer": 0.250485358356253,
+ "step": 1000
+ },
+ {
+ "epoch": 1.2415349887133182,
+ "grad_norm": 0.4382162094116211,
+ "learning_rate": 0.0002785765550239234,
+ "loss": 0.3205,
+ "step": 1100
+ },
+ {
+ "epoch": 1.3544018058690745,
+ "grad_norm": 5.204100131988525,
+ "learning_rate": 0.00027498803827751196,
+ "loss": 0.3739,
+ "step": 1200
+ },
+ {
+ "epoch": 1.4672686230248306,
+ "grad_norm": 0.893830418586731,
+ "learning_rate": 0.00027139952153110045,
+ "loss": 0.2979,
+ "step": 1300
+ },
+ {
+ "epoch": 1.580135440180587,
+ "grad_norm": 0.426662415266037,
+ "learning_rate": 0.000267811004784689,
+ "loss": 0.3017,
+ "step": 1400
+ },
+ {
+ "epoch": 1.6930022573363432,
+ "grad_norm": 0.4134344458580017,
+ "learning_rate": 0.0002642224880382775,
+ "loss": 0.2575,
+ "step": 1500
+ },
+ {
+ "epoch": 1.6930022573363432,
+ "eval_loss": 0.19595518708229065,
+ "eval_runtime": 58.1681,
+ "eval_samples_per_second": 27.094,
+ "eval_steps_per_second": 13.547,
+ "eval_wer": 0.21003882866850024,
+ "step": 1500
+ },
+ {
+ "epoch": 1.8058690744920993,
+ "grad_norm": 1.7899537086486816,
+ "learning_rate": 0.000260633971291866,
+ "loss": 0.2805,
+ "step": 1600
+ },
+ {
+ "epoch": 1.9187358916478554,
+ "grad_norm": 0.36495277285575867,
+ "learning_rate": 0.0002570454545454545,
+ "loss": 0.2567,
+ "step": 1700
+ },
+ {
+ "epoch": 2.0316027088036117,
+ "grad_norm": 0.4699445068836212,
+ "learning_rate": 0.00025345693779904306,
+ "loss": 0.2234,
+ "step": 1800
+ },
+ {
+ "epoch": 2.144469525959368,
+ "grad_norm": 0.6241821646690369,
+ "learning_rate": 0.00024986842105263155,
+ "loss": 0.2331,
+ "step": 1900
+ },
+ {
+ "epoch": 2.2573363431151243,
+ "grad_norm": 0.5561006665229797,
+ "learning_rate": 0.00024631578947368417,
+ "loss": 0.2519,
+ "step": 2000
+ },
+ {
+ "epoch": 2.2573363431151243,
+ "eval_loss": 0.1508670300245285,
+ "eval_runtime": 47.958,
+ "eval_samples_per_second": 32.862,
+ "eval_steps_per_second": 16.431,
+ "eval_wer": 0.18823814916680148,
+ "step": 2000
+ },
+ {
+ "epoch": 2.37020316027088,
+ "grad_norm": 0.5059154033660889,
+ "learning_rate": 0.0002427272727272727,
+ "loss": 0.2144,
+ "step": 2100
+ },
+ {
+ "epoch": 2.4830699774266365,
+ "grad_norm": 0.5262096524238586,
+ "learning_rate": 0.00023913875598086123,
+ "loss": 0.2034,
+ "step": 2200
+ },
+ {
+ "epoch": 2.595936794582393,
+ "grad_norm": 0.46527203917503357,
+ "learning_rate": 0.00023555023923444972,
+ "loss": 0.201,
+ "step": 2300
+ },
+ {
+ "epoch": 2.708803611738149,
+ "grad_norm": 3.4555609226226807,
+ "learning_rate": 0.00023196172248803824,
+ "loss": 0.2248,
+ "step": 2400
+ },
+ {
+ "epoch": 2.8216704288939054,
+ "grad_norm": 0.497281938791275,
+ "learning_rate": 0.00022837320574162676,
+ "loss": 0.2515,
+ "step": 2500
+ },
+ {
+ "epoch": 2.8216704288939054,
+ "eval_loss": 0.17336268723011017,
+ "eval_runtime": 49.1468,
+ "eval_samples_per_second": 32.067,
+ "eval_steps_per_second": 16.034,
+ "eval_wer": 0.1939815563824624,
+ "step": 2500
+ },
+ {
+ "epoch": 2.9345372460496613,
+ "grad_norm": 0.29302966594696045,
+ "learning_rate": 0.00022478468899521527,
+ "loss": 0.209,
+ "step": 2600
+ },
+ {
+ "epoch": 3.0474040632054176,
+ "grad_norm": 0.2737024426460266,
+ "learning_rate": 0.0002211961722488038,
+ "loss": 0.1449,
+ "step": 2700
+ },
+ {
+ "epoch": 3.160270880361174,
+ "grad_norm": 0.34064537286758423,
+ "learning_rate": 0.00021760765550239233,
+ "loss": 0.1597,
+ "step": 2800
+ },
+ {
+ "epoch": 3.27313769751693,
+ "grad_norm": 0.3498232662677765,
+ "learning_rate": 0.00021401913875598085,
+ "loss": 0.1855,
+ "step": 2900
+ },
+ {
+ "epoch": 3.386004514672686,
+ "grad_norm": 0.5131095051765442,
+ "learning_rate": 0.00021043062200956937,
+ "loss": 0.1715,
+ "step": 3000
+ },
+ {
+ "epoch": 3.386004514672686,
+ "eval_loss": 0.2065732479095459,
+ "eval_runtime": 48.3591,
+ "eval_samples_per_second": 32.59,
+ "eval_steps_per_second": 16.295,
+ "eval_wer": 0.16409157094321308,
+ "step": 3000
+ }
+ ],
+ "logging_steps": 100,
+ "max_steps": 8860,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 10,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "EarlyStoppingCallback": {
+ "args": {
+ "early_stopping_patience": 3,
+ "early_stopping_threshold": 0.0
+ },
+ "attributes": {
+ "early_stopping_patience_counter": 0
+ }
+ },
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.506282119291601e+19,
+ "train_batch_size": 2,
+ "trial_name": null,
+ "trial_params": null
+ }
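Per best_metric and best_model_checkpoint above, checkpoint-3000 is the best checkpoint so far, at roughly 16.4% word error rate on the evaluation set. The eval_wer figures are standard WER scores; a minimal sketch of how such a score is computed with the evaluate library (the training/eval script itself is not part of this commit, so this is only an illustration):

import evaluate

# Word error rate = (substitutions + insertions + deletions) / reference words.
wer_metric = evaluate.load("wer")

predictions = ["salam labas 3lik"]
references = ["salam labas 3likom"]

# One substituted word out of three reference words -> WER = 1/3.
print(wer_metric.compute(predictions=predictions, references=references))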
checkpoint-3000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a6e049bd7921eb48f996e726c39ae31633a72809a2ae3db8056230d9b767c6a3
+ size 5713
checkpoint-3500/config.json ADDED
@@ -0,0 +1,107 @@
+ {
+ "activation_dropout": 0.05,
+ "adapter_attn_dim": 16,
+ "adapter_kernel_size": 3,
+ "adapter_stride": 2,
+ "add_adapter": false,
+ "apply_spec_augment": true,
+ "architectures": [
+ "Wav2Vec2ForCTC"
+ ],
+ "attention_dropout": 0.05,
+ "bos_token_id": 1,
+ "classifier_proj_size": 256,
+ "codevector_dim": 1024,
+ "contrastive_logits_temperature": 0.1,
+ "conv_bias": true,
+ "conv_dim": [
+ 512,
+ 512,
+ 512,
+ 512,
+ 512,
+ 512,
+ 512
+ ],
+ "conv_kernel": [
+ 10,
+ 3,
+ 3,
+ 3,
+ 3,
+ 2,
+ 2
+ ],
+ "conv_stride": [
+ 5,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2,
+ 2
+ ],
+ "ctc_loss_reduction": "mean",
+ "ctc_zero_infinity": false,
+ "diversity_loss_weight": 0.1,
+ "do_stable_layer_norm": true,
+ "eos_token_id": 2,
+ "feat_extract_activation": "gelu",
+ "feat_extract_dropout": 0.0,
+ "feat_extract_norm": "layer",
+ "feat_proj_dropout": 0.05,
+ "feat_quantizer_dropout": 0.0,
+ "final_dropout": 0.05,
+ "hidden_act": "gelu",
+ "hidden_dropout": 0.05,
+ "hidden_size": 1280,
+ "initializer_range": 0.02,
+ "intermediate_size": 5120,
+ "layer_norm_eps": 1e-05,
+ "layerdrop": 0.05,
+ "mask_feature_length": 10,
+ "mask_feature_min_masks": 0,
+ "mask_feature_prob": 0.0,
+ "mask_time_length": 10,
+ "mask_time_min_masks": 2,
+ "mask_time_prob": 0.05,
+ "model_type": "wav2vec2",
+ "num_adapter_layers": 3,
+ "num_attention_heads": 16,
+ "num_codevector_groups": 2,
+ "num_codevectors_per_group": 320,
+ "num_conv_pos_embedding_groups": 16,
+ "num_conv_pos_embeddings": 128,
+ "num_feat_extract_layers": 7,
+ "num_hidden_layers": 48,
+ "num_negatives": 100,
+ "output_hidden_size": 1280,
+ "pad_token_id": 0,
+ "proj_codevector_dim": 1024,
+ "tdnn_dilation": [
+ 1,
+ 2,
+ 3,
+ 1,
+ 1
+ ],
+ "tdnn_dim": [
+ 512,
+ 512,
+ 512,
+ 512,
+ 1500
+ ],
+ "tdnn_kernel": [
+ 5,
+ 3,
+ 3,
+ 1,
+ 1
+ ],
+ "torch_dtype": "float32",
+ "transformers_version": "4.52.3",
+ "use_weighted_layer_sum": false,
+ "vocab_size": 154,
+ "xvector_output_dim": 512
+ }
checkpoint-3500/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:99aeca7a440077a24f1c0b09897ed36f5a932570dd2496729faae5b4396914d3
+ size 3859521176
checkpoint-3500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0dfbc69b38fe026009bcc11b146b0c29cfb62bfd590ce87773a3d5d40ef0f29e
+ size 7686012659
checkpoint-3500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5849d0d95cfbe5dd2e51751b724005066bf8a4a36b5dfb9defee57e73469572d
+ size 14709
checkpoint-3500/scaler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:43f5e71a7a99649a841958dea51b3ddabaa13446a4bd011a06e48dc4379e99d1
+ size 1383