Selest committed on
Commit
d35c4dc
·
verified ·
1 Parent(s): 67be8d1

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -1,35 +1,2 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ *.bin filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
MODEL_INFO.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "MMS Urmi ASR (fine-tuned adapters)",
3
+ "language": "urmi",
4
+ "task": "speech_recognition_ctc",
5
+ "best_checkpoint": {
6
+ "path_relative": "mms-urmi-out/checkpoint-8000",
7
+ "global_step": 8000,
8
+ "epoch_approx": 6.95,
9
+ "selection": "HuggingFace Trainer load_best_model_at_end; metric_for_best_model = eval_cer (lower is better)"
10
+ },
11
+ "eval_metrics_dev_split_at_best_step": {
12
+ "eval_cer": 0.23638727853930758,
13
+ "eval_wer": 0.7476359338061466,
14
+ "eval_loss": 0.7667429447174072
15
+ },
16
+ "eval_metrics_last_saved_checkpoints_for_reference": {
17
+ "checkpoint-16500": {
18
+ "eval_cer": 0.3157609579021509,
19
+ "eval_wer": 0.9042553191489362,
20
+ "eval_loss": 1.0155612230300903
21
+ },
22
+ "checkpoint-17000": {
23
+ "eval_cer": 0.3179281573386791,
24
+ "eval_wer": 0.9051418439716312,
25
+ "eval_loss": 1.0153851509094238
26
+ }
27
+ },
28
+ "base_pretrained_model": "facebook/mms-1b-all",
29
+ "fine_tuning_mode": "train_adapters_only (MMS attention adapters + lm_head; base wav2vec2 frozen)",
30
+ "config_highlights": {
31
+ "architecture": "Wav2Vec2ForCTC",
32
+ "hidden_size": 1280,
33
+ "num_hidden_layers": 48,
34
+ "vocab_size": 160,
35
+ "adapter_attn_dim": 16,
36
+ "transformers_version_used_in_training": "5.7.0"
37
+ },
38
+ "training_run": {
39
+ "script": "train_mms_urmi.py",
40
+ "train_dir": "5h_train",
41
+ "test_dir": "5h_test",
42
+ "output_dir": "./mms-urmi-out",
43
+ "wandb_project": "mms-urmi-asr",
44
+ "wandb_run_dir_example": "wandb/run-20260503_202229-6jkw5dsk",
45
+ "started_at_utc_note": "see wandb-metadata.json in that run folder",
46
+ "hardware_from_wandb": {
47
+ "gpu": "NVIDIA GeForce RTX 5090",
48
+ "cuda": "12.8",
49
+ "python": "3.12.13"
50
+ }
51
+ },
52
+ "archive_contents": [
53
+ "MODEL_INFO.json",
54
+ "model.safetensors",
55
+ "config.json",
56
+ "tokenizer_config.json",
57
+ "vocab.json",
58
+ "added_tokens.json",
59
+ "processor_config.json",
60
+ "trainer_state.json (from checkpoint-8000 only; partial training history)"
61
+ ],
62
+ "not_included_in_inference_archive": [
63
+ "optimizer.pt",
64
+ "scheduler.pt",
65
+ "scaler.pt",
66
+ "rng_state.pth",
67
+ "training_args.bin"
68
+ ],
69
+ "inference_dependencies": "see requirements_asr.txt in project root (torch, transformers, librosa, soundfile, etc.)",
70
+ "load_with_transformers_example": "from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor; import torch; model = Wav2Vec2ForCTC.from_pretrained('PATH_TO_EXTRACTED_FOLDER'); processor = Wav2Vec2Processor.from_pretrained('PATH_TO_EXTRACTED_FOLDER'); model.eval()"
71
+ }
README.md ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: aii
3
+ license: apache-2.0
4
+ tags:
5
+ - automatic-speech-recognition
6
+ - wav2vec2
7
+ - ctc
8
+ - mms
9
+ - assyrian
10
+ - neo-aramaic
11
+ - aramaic
12
+ - syriac
13
+ - urmi
14
+ - urmia
15
+ library_name: transformers
16
+ pipeline_tag: automatic-speech-recognition
17
+ base_model: facebook/mms-1b-all
18
+ metrics:
19
+ - cer
20
+ - wer
21
+ ---
22
+
23
+ # MMS Urmi ASR (adapter fine-tune, checkpoint step 8000)
24
+
25
+ Speech recognition (CTC) for **Neo-Aramaic (Assyrian / Syriac tradition)**, **Urmi (Urmia) Christian dialect**. Hub metadata uses language code **`aii`** (Assyrian Neo-Aramaic, ISO 639-3), not the non-ISO label `urmi`. Fine-tuned from [`facebook/mms-1b-all`](https://huggingface.co/facebook/mms-1b-all). Training kept the wav2vec2 encoder frozen and updated **MMS attention adapters** and the **CTC head** only.
26
+
27
+ ## Metrics (dev split at best checkpoint)
28
+
29
+ - **CER:** 0.236
30
+ - **WER:** 0.748
31
+
32
+ See `MODEL_INFO.json` in this repo for full training metadata.
33
+
34
+ ## Requirements
35
+
36
+ - `transformers` ≥ 4.30 (the checkpoint's `config.json` records 5.7.0)
37
+ - `torch`, plus `soundfile` (or a similar library) for audio I/O
38
+
39
+ ## Load and transcribe
40
+
41
+ ```python
42
+ import torch
43
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
44
+ import soundfile as sf
45
+
46
+ model_id = "<YOUR_USERNAME>/<THIS_REPO_NAME>" # after upload
47
+
48
+ processor = Wav2Vec2Processor.from_pretrained(model_id)
49
+ model = Wav2Vec2ForCTC.from_pretrained(model_id)
50
+ model.eval()
51
+
52
+ audio, sr = sf.read("audio.wav")
53
+ if audio.ndim > 1:
54
+     audio = audio.mean(axis=1)
55
+ if sr != 16000:
56
+     raise ValueError(f"Expected 16 kHz mono WAV; got sr={sr}")
57
+
58
+ inputs = processor(audio, sampling_rate=16000, return_tensors="pt", padding=True)
59
+ with torch.no_grad():
60
+     logits = model(inputs.input_values).logits
61
+ pred_ids = torch.argmax(logits, dim=-1)
62
+ text = processor.batch_decode(pred_ids)[0]
63
+ print(text)
64
+ ```
65
+
66
+ A local folder (after clone/download) works the same way: pass the directory path in place of the Hub `model_id`.
67
+
68
+ ## Base model
69
+
70
+ This checkpoint was trained from **facebook/mms-1b-all** with adapter-only updates; the saved `config.json` and weights in this repo are sufficient for `Wav2Vec2ForCTC.from_pretrained` without loading the base separately.
added_tokens.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ " ": 154,
3
+ "ǧ": 155,
4
+ "ɣ": 156,
5
+ "̣": 157,
6
+ "ṗ": 158,
7
+ "ṭ": 159
8
+ }
config.json ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.05,
3
+ "adapter_attn_dim": 16,
4
+ "adapter_kernel_size": 3,
5
+ "adapter_stride": 2,
6
+ "add_adapter": false,
7
+ "apply_spec_augment": true,
8
+ "architectures": [
9
+ "Wav2Vec2ForCTC"
10
+ ],
11
+ "attention_dropout": 0.0,
12
+ "bos_token_id": 1,
13
+ "classifier_proj_size": 256,
14
+ "codevector_dim": 1024,
15
+ "contrastive_logits_temperature": 0.1,
16
+ "conv_bias": true,
17
+ "conv_dim": [
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512
25
+ ],
26
+ "conv_kernel": [
27
+ 10,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 3,
32
+ 2,
33
+ 2
34
+ ],
35
+ "conv_stride": [
36
+ 5,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2
43
+ ],
44
+ "ctc_loss_reduction": "mean",
45
+ "ctc_zero_infinity": false,
46
+ "diversity_loss_weight": 0.1,
47
+ "do_stable_layer_norm": true,
48
+ "dtype": "float32",
49
+ "eos_token_id": 2,
50
+ "feat_extract_activation": "gelu",
51
+ "feat_extract_dropout": 0.0,
52
+ "feat_extract_norm": "layer",
53
+ "feat_proj_dropout": 0.0,
54
+ "feat_quantizer_dropout": 0.0,
55
+ "final_dropout": 0.05,
56
+ "hidden_act": "gelu",
57
+ "hidden_dropout": 0.0,
58
+ "hidden_size": 1280,
59
+ "initializer_range": 0.02,
60
+ "intermediate_size": 5120,
61
+ "layer_norm_eps": 1e-05,
62
+ "layerdrop": 0.0,
63
+ "mask_feature_length": 10,
64
+ "mask_feature_min_masks": 0,
65
+ "mask_feature_prob": 0.0,
66
+ "mask_time_length": 10,
67
+ "mask_time_min_masks": 2,
68
+ "mask_time_prob": 0.05,
69
+ "model_type": "wav2vec2",
70
+ "num_adapter_layers": 3,
71
+ "num_attention_heads": 16,
72
+ "num_codevector_groups": 2,
73
+ "num_codevectors_per_group": 320,
74
+ "num_conv_pos_embedding_groups": 16,
75
+ "num_conv_pos_embeddings": 128,
76
+ "num_feat_extract_layers": 7,
77
+ "num_hidden_layers": 48,
78
+ "num_negatives": 100,
79
+ "output_hidden_size": 1280,
80
+ "pad_token_id": 0,
81
+ "proj_codevector_dim": 1024,
82
+ "tdnn_dilation": [
83
+ 1,
84
+ 2,
85
+ 3,
86
+ 1,
87
+ 1
88
+ ],
89
+ "tdnn_dim": [
90
+ 512,
91
+ 512,
92
+ 512,
93
+ 512,
94
+ 1500
95
+ ],
96
+ "tdnn_kernel": [
97
+ 5,
98
+ 3,
99
+ 3,
100
+ 1,
101
+ 1
102
+ ],
103
+ "transformers_version": "5.7.0",
104
+ "use_cache": false,
105
+ "use_weighted_layer_sum": false,
106
+ "vocab_size": 160,
107
+ "xvector_output_dim": 512
108
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cb5be0a677460fa492b703e2805d4729379148e6f0d8fe69cc770b713b94dda
3
+ size 3859551880
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000
9
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "unk_token": "<unk>",
5
+ "pad_token": "<pad>",
6
+ "word_delimiter_token": "|"
7
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<pad>",
5
+ "lstrip": true,
6
+ "normalized": false,
7
+ "rstrip": true,
8
+ "single_word": false,
9
+ "special": false
10
+ },
11
+ "1": {
12
+ "content": "<s>",
13
+ "lstrip": true,
14
+ "normalized": false,
15
+ "rstrip": true,
16
+ "single_word": false,
17
+ "special": false
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": true,
22
+ "normalized": false,
23
+ "rstrip": true,
24
+ "single_word": false,
25
+ "special": false
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": true,
30
+ "normalized": false,
31
+ "rstrip": true,
32
+ "single_word": false,
33
+ "special": false
34
+ },
35
+ "4": {
36
+ "content": "|",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "154": {
44
+ "content": " ",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": false
50
+ },
51
+ "155": {
52
+ "content": "ǧ",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": false
58
+ },
59
+ "156": {
60
+ "content": "ɣ",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": false
66
+ },
67
+ "157": {
68
+ "content": "̣",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": false
74
+ },
75
+ "158": {
76
+ "content": "ṗ",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": false
82
+ },
83
+ "159": {
84
+ "content": "ṭ",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": false
90
+ }
91
+ },
92
+ "backend": "custom",
93
+ "bos_token": "<s>",
94
+ "clean_up_tokenization_spaces": true,
95
+ "do_lower_case": false,
96
+ "eos_token": "</s>",
97
+ "is_local": false,
98
+ "local_files_only": false,
99
+ "model_max_length": 1000000000000000019884624838656,
100
+ "model_specific_special_tokens": {
101
+ "word_delimiter_token": "|"
102
+ },
103
+ "pad_token": "<pad>",
104
+ "processor_class": "Wav2Vec2Processor",
105
+ "replace_word_delimiter_char": " ",
106
+ "target_lang": "eng",
107
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
108
+ "unk_token": "<unk>",
109
+ "word_delimiter_token": "|"
110
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff