onecxi committed on
Commit
9bd306d
·
verified ·
1 Parent(s): 071337f

Initial commit

Browse files
README.md ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ - hi
5
+ - or
6
+ - bn
7
+ - ta
8
+ - te
9
+ - kn
10
+ - ml
11
+ - mr
12
+ - gu
13
+ - pa
14
+ - as
15
+ ---
16
+
17
+ # **Vakgyata**
18
+
19
+ **Language Identification for Indian Languages from Speech**
20
+
21
+ ---
22
+
23
+ ## **Model Overview**
24
+
25
+ `vakgyata` is an open-source language identification model specifically designed to classify Indian languages from raw speech audio. It is built upon the pretrained [`Harveenchadha/wav2vec2-pretrained-clsril-23-10k`](https://huggingface.co/Harveenchadha/wav2vec2-pretrained-clsril-23-10k) with additional **Layer Normalization** integrated to improve stability and performance for audio classification tasks.
26
+
27
+ ---
28
+
29
+ ## **Variants and Model Sizes**
30
+
31
+ | Variant | Parameters | Accuracy |
32
+ | ---------------- | ---------- | -------- |
33
+ | `vakgyata-base` | 95M | 95.88% |
34
+ | `vakgyata-small` | 52M | 95.06% |
35
+ | `vakgyata-mini` | 38M | 95.06% |
36
+ | `vakgyata-tiny` | 24M | 93.63% |
37
+
38
+ ---
39
+
40
+ ## **Supported Languages**
41
+
42
+ | Language | Code |
43
+ | --------------- | ----- |
44
+ | English (India) | en-IN |
45
+ | Hindi | hi-IN |
46
+ | Odia | or-IN |
47
+ | Bengali | bn-IN |
48
+ | Tamil | ta-IN |
49
+ | Telugu | te-IN |
50
+ | Kannada | kn-IN |
51
+ | Malayalam | ml-IN |
52
+ | Marathi | mr-IN |
53
+ | Gujarati | gu-IN |
54
+ | Punjabi | pa-IN |
55
+ | Assamese | as-IN |
56
+
57
+ ---
58
+
59
+ ## **Specifications**
60
+
61
+ * **Supported Sampling Rate:** 16,000 Hz (16 kHz)
62
+ * **Recommended Audio Format:** 16 kHz, 16-bit PCM, mono
63
+
64
+ ---
65
+
66
+ ## **Installation**
67
+
68
+ ```bash
69
+ pip install transformers torchaudio
70
+ ```
71
+
72
+ ---
73
+
74
+ ## **Usage**
75
+
76
+ ```python
77
+ from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor
78
+ import torch
79
+
80
+ device = "cuda" if torch.cuda.is_available() else "cpu"
81
+
82
+ model_id = "onecxi/vakgyata-base" # You can replace with tiny/small/mini variants
83
+
84
+ processor = AutoFeatureExtractor.from_pretrained(model_id)
85
+ model = Wav2Vec2ForSequenceClassification.from_pretrained(model_id).to(device)
86
+ ```
87
+
88
+ ---
89
+
90
+ ## **Inference Example**
91
+
92
+ ```python
93
+ import torchaudio
94
+
95
+ # Load the audio and resample to the model's required 16 kHz if needed
96
+ audio, sr = torchaudio.load("path/to/audio.wav")
+ if sr != 16000:
+     audio = torchaudio.functional.resample(audio, sr, 16000)
+     sr = 16000
97
+
98
+ # Preprocess
99
+ inputs = processor(audio.squeeze(), sampling_rate=sr, return_tensors="pt").to(device)
100
+
101
+ # Inference
102
+ with torch.no_grad():
103
+ logits = model(**inputs).logits
104
+
105
+ # Softmax to get probabilities
106
+ probs = logits.softmax(dim=-1).cpu().numpy()
107
+
108
+ # Predicted language (cast to int: argmax returns a NumPy scalar, id2label keys are ints;
+ # direct indexing fails loudly instead of silently yielding None on a mismatch)
109
+ language = model.config.id2label[int(probs.argmax())]
110
+ print("Predicted Language:", language)
111
+ ```
112
+
113
+ ---
114
+
115
+ ## **Citation**
116
+
117
+ If you use this model in your research or application, please consider citing the model and its base source:
118
+
119
+ ```
120
+ @misc{vakgyata2024,
121
+ title={vakgyata: Language Identification for Indian Speech},
122
+ author={OneCXI},
123
+ year={2024},
124
+ url={https://huggingface.co/onecxi/vakgyata-base}
125
+ }
126
+ ```
127
+
128
+ ---
config.json ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "onecxi/vakgyata-base/",
3
+ "activation_dropout": 0.1,
4
+ "adapter_attn_dim": null,
5
+ "adapter_kernel_size": 3,
6
+ "adapter_stride": 2,
7
+ "add_adapter": false,
8
+ "apply_spec_augment": true,
9
+ "architectures": [
10
+ "Wav2Vec2ForSequenceClassification"
11
+ ],
12
+ "attention_dropout": 0.1,
13
+ "bos_token": "<s>",
14
+ "bos_token_id": 1,
15
+ "classifier_proj_size": 1024,
16
+ "codevector_dim": 256,
17
+ "contrastive_logits_temperature": 0.1,
18
+ "conv_bias": false,
19
+ "conv_dim": [
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512,
24
+ 512,
25
+ 512,
26
+ 512
27
+ ],
28
+ "conv_kernel": [
29
+ 10,
30
+ 3,
31
+ 3,
32
+ 3,
33
+ 3,
34
+ 2,
35
+ 2
36
+ ],
37
+ "conv_stride": [
38
+ 5,
39
+ 2,
40
+ 2,
41
+ 2,
42
+ 2,
43
+ 2,
44
+ 2
45
+ ],
46
+ "ctc_loss_reduction": "sum",
47
+ "ctc_zero_infinity": false,
48
+ "diversity_loss_weight": 0.1,
49
+ "do_lower_case": false,
50
+ "do_stable_layer_norm": true,
51
+ "eos_token": "</s>",
52
+ "eos_token_id": 2,
53
+ "feat_extract_activation": "gelu",
54
+ "feat_extract_norm": "group",
55
+ "feat_proj_dropout": 0.1,
56
+ "feat_quantizer_dropout": 0.0,
57
+ "final_dropout": 0.1,
58
+ "gradient_checkpointing": false,
59
+ "hidden_act": "gelu",
60
+ "hidden_dropout": 0.1,
61
+ "hidden_size": 768,
62
+ "id2label": {
63
+ "0": "en-IN",
64
+ "1": "hi-IN",
65
+ "2": "or-IN",
66
+ "3": "bn-IN",
67
+ "4": "ta-IN",
68
+ "5": "te-IN",
69
+ "6": "kn-IN",
70
+ "7": "ml-IN",
71
+ "8": "mr-IN",
72
+ "9": "gu-IN",
73
+ "10": "pa-IN",
74
+ "11": "as-IN"
75
+ },
76
+ "initializer_range": 0.02,
77
+ "intermediate_size": 3072,
78
+ "label2id": {
79
+ "as-IN": 11,
80
+ "bn-IN": 3,
81
+ "en-IN": 0,
82
+ "gu-IN": 9,
83
+ "hi-IN": 1,
84
+ "kn-IN": 6,
85
+ "ml-IN": 7,
86
+ "mr-IN": 8,
87
+ "or-IN": 2,
88
+ "pa-IN": 10,
89
+ "ta-IN": 4,
90
+ "te-IN": 5
91
+ },
92
+ "layer_norm_eps": 1e-05,
93
+ "layerdrop": 0.1,
94
+ "mask_feature_length": 10,
95
+ "mask_feature_min_masks": 0,
96
+ "mask_feature_prob": 0.0,
97
+ "mask_time_length": 10,
98
+ "mask_time_min_masks": 2,
99
+ "mask_time_prob": 0.05,
100
+ "model_name": "vakgyata",
101
+ "model_type": "wav2vec2",
102
+ "num_adapter_layers": 3,
103
+ "num_attention_heads": 12,
104
+ "num_codevector_groups": 2,
105
+ "num_codevectors_per_group": 320,
106
+ "num_conv_pos_embedding_groups": 16,
107
+ "num_conv_pos_embeddings": 128,
108
+ "num_feat_extract_layers": 7,
109
+ "num_hidden_layers": 12,
110
+ "num_negatives": 100,
111
+ "output_hidden_size": 768,
112
+ "pad_token": "[PAD]",
113
+ "pad_token_id": 0,
114
+ "proj_codevector_dim": 256,
115
+ "tdnn_dilation": [
116
+ 1,
117
+ 2,
118
+ 3,
119
+ 1,
120
+ 1
121
+ ],
122
+ "tdnn_dim": [
123
+ 512,
124
+ 512,
125
+ 512,
126
+ 512,
127
+ 1500
128
+ ],
129
+ "tdnn_kernel": [
130
+ 5,
131
+ 3,
132
+ 3,
133
+ 1,
134
+ 1
135
+ ],
136
+ "torch_dtype": "float32",
137
+ "transformers_version": "4.48.3",
138
+ "unk_token": "[UNK]",
139
+ "use_weighted_layer_sum": false,
140
+ "vocab_size": 12,
141
+ "word_delimiter_token": "|",
142
+ "xvector_output_dim": 512
143
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2389efee5b6f4602d061219a8475ec31545e5a1fa93a7b09ae7f225dca1de52
3
+ size 380711896
onnx/model_quantized.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f10668607c452d27eab588188d55ad7d83733a75b5419d2fd067328d9968fcb
3
+ size 380953511
preprocessor_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "return_attention_mask": true,
8
+ "sampling_rate": 16000
9
+ }