huper29 committed on
Commit
47fc0f6
·
verified ·
1 Parent(s): 9a99b81

Upload model + model card

Browse files
README.md CHANGED
@@ -1,5 +1,58 @@
1
  ---
2
  license: mit
3
- base_model:
4
- - microsoft/wavlm-large
5
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: mit
3
+ base_model: microsoft/wavlm-large
4
+ tags:
5
+ - audio
6
+ - speech
7
+ - wavlm
8
+ - ctc
9
+ - phone-recognition
10
+ - arpabet
11
+ ---
12
+
13
+ # HuPER Recognizer (ARPAbet phone recognition)
14
+
15
+ A CTC phone recognizer fine-tuned from **WavLM-Large** that maps **16 kHz** speech audio to an **ARPAbet** phone sequence.
16
+ See the HuPER paper for details: **arXiv:2602.01634**.
17
+
18
+ ## Quickstart
19
+
20
+ ```bash
21
+ pip install -U transformers torchaudio
22
+ ```
23
+
24
+ ```python
25
+ import torch
26
+ import torchaudio
27
+ from transformers import Wav2Vec2Processor, WavLMForCTC
28
+
29
+ repo_id = "huper29/huper_recognizer"
30
+ processor = Wav2Vec2Processor.from_pretrained(repo_id)
31
+ model = WavLMForCTC.from_pretrained(repo_id)
32
+ model.eval()
33
+
34
+ waveform, sr = torchaudio.load("sample.wav")
35
+ if waveform.shape[0] > 1:
36
+ waveform = waveform.mean(dim=0, keepdim=True)
37
+ if sr != 16000:
38
+ waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)
39
+
40
+ inputs = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt")
41
+ with torch.no_grad():
42
+ logits = model(**inputs).logits
43
+
44
+ pred_ids = torch.argmax(logits, dim=-1)[0]
45
+ phone_seq = processor.tokenizer.decode(pred_ids, skip_special_tokens=True)
46
+ print(phone_seq)
47
+ ```
48
+
49
+ ## Citation
50
+
51
+ ```bibtex
52
+ @article{guo2026huper,
53
+ title = {HuPER: A Human-Inspired Framework for Phonetic Perception},
54
+ author = {Guo, Chenxu and Lian, Jiachen and Liu, Yisi and Huang, Baihe and Narayanan, Shriyaa and Cho, Cheol Jun and Anumanchipalli, Gopala},
55
+ journal = {arXiv preprint arXiv:2602.01634},
56
+ year = {2026}
57
+ }
58
+ ```
added_tokens.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "</s>": 45,
3
+ "<s>": 44
4
+ }
config.json ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_dropout": 0.0,
3
+ "adapter_kernel_size": 3,
4
+ "adapter_stride": 2,
5
+ "add_adapter": false,
6
+ "apply_spec_augment": true,
7
+ "architectures": [
8
+ "WavLMForCTC"
9
+ ],
10
+ "attention_dropout": 0.1,
11
+ "bos_token_id": 1,
12
+ "classifier_proj_size": 256,
13
+ "codevector_dim": 768,
14
+ "contrastive_logits_temperature": 0.1,
15
+ "conv_bias": false,
16
+ "conv_dim": [
17
+ 512,
18
+ 512,
19
+ 512,
20
+ 512,
21
+ 512,
22
+ 512,
23
+ 512
24
+ ],
25
+ "conv_kernel": [
26
+ 10,
27
+ 3,
28
+ 3,
29
+ 3,
30
+ 3,
31
+ 2,
32
+ 2
33
+ ],
34
+ "conv_stride": [
35
+ 5,
36
+ 2,
37
+ 2,
38
+ 2,
39
+ 2,
40
+ 2,
41
+ 2
42
+ ],
43
+ "ctc_loss_reduction": "mean",
44
+ "ctc_zero_infinity": false,
45
+ "diversity_loss_weight": 0.1,
46
+ "do_stable_layer_norm": true,
47
+ "dtype": "float32",
48
+ "eos_token_id": 2,
49
+ "feat_extract_activation": "gelu",
50
+ "feat_extract_dropout": 0.0,
51
+ "feat_extract_norm": "layer",
52
+ "feat_proj_dropout": 0.1,
53
+ "feat_quantizer_dropout": 0.0,
54
+ "final_dropout": 0.0,
55
+ "gradient_checkpointing": false,
56
+ "hidden_act": "gelu",
57
+ "hidden_dropout": 0.1,
58
+ "hidden_size": 1024,
59
+ "id2label": {
60
+ "0": "<PAD>",
61
+ "1": "<UNK>",
62
+ "2": "<BOS>",
63
+ "3": "<EOS>",
64
+ "4": "|",
65
+ "5": "AA",
66
+ "6": "AE",
67
+ "7": "AH",
68
+ "8": "AW",
69
+ "9": "AY",
70
+ "10": "B",
71
+ "11": "CH",
72
+ "12": "D",
73
+ "13": "DH",
74
+ "14": "DX",
75
+ "15": "EH",
76
+ "16": "ER",
77
+ "17": "EY",
78
+ "18": "F",
79
+ "19": "G",
80
+ "20": "HH",
81
+ "21": "IH",
82
+ "22": "IY",
83
+ "23": "JH",
84
+ "24": "K",
85
+ "25": "L",
86
+ "26": "M",
87
+ "27": "N",
88
+ "28": "NG",
89
+ "29": "OW",
90
+ "30": "OY",
91
+ "31": "P",
92
+ "32": "R",
93
+ "33": "S",
94
+ "34": "SH",
95
+ "35": "T",
96
+ "36": "TH",
97
+ "37": "UH",
98
+ "38": "UW",
99
+ "39": "V",
100
+ "40": "W",
101
+ "41": "Y",
102
+ "42": "Z",
103
+ "43": "ZH"
104
+ },
105
+ "initializer_range": 0.02,
106
+ "intermediate_size": 4096,
107
+ "label2id": {
108
+ "<BOS>": 2,
109
+ "<EOS>": 3,
110
+ "<PAD>": 0,
111
+ "<UNK>": 1,
112
+ "AA": 5,
113
+ "AE": 6,
114
+ "AH": 7,
115
+ "AW": 8,
116
+ "AY": 9,
117
+ "B": 10,
118
+ "CH": 11,
119
+ "D": 12,
120
+ "DH": 13,
121
+ "DX": 14,
122
+ "EH": 15,
123
+ "ER": 16,
124
+ "EY": 17,
125
+ "F": 18,
126
+ "G": 19,
127
+ "HH": 20,
128
+ "IH": 21,
129
+ "IY": 22,
130
+ "JH": 23,
131
+ "K": 24,
132
+ "L": 25,
133
+ "M": 26,
134
+ "N": 27,
135
+ "NG": 28,
136
+ "OW": 29,
137
+ "OY": 30,
138
+ "P": 31,
139
+ "R": 32,
140
+ "S": 33,
141
+ "SH": 34,
142
+ "T": 35,
143
+ "TH": 36,
144
+ "UH": 37,
145
+ "UW": 38,
146
+ "V": 39,
147
+ "W": 40,
148
+ "Y": 41,
149
+ "Z": 42,
150
+ "ZH": 43,
151
+ "|": 4
152
+ },
153
+ "layer_norm_eps": 1e-05,
154
+ "layerdrop": 0.1,
155
+ "mask_channel_length": 10,
156
+ "mask_channel_min_space": 1,
157
+ "mask_channel_other": 0.0,
158
+ "mask_channel_prob": 0.0,
159
+ "mask_channel_selection": "static",
160
+ "mask_feature_length": 10,
161
+ "mask_feature_min_masks": 0,
162
+ "mask_feature_prob": 0.0,
163
+ "mask_time_length": 10,
164
+ "mask_time_min_masks": 2,
165
+ "mask_time_min_space": 1,
166
+ "mask_time_other": 0.0,
167
+ "mask_time_prob": 0.075,
168
+ "mask_time_selection": "static",
169
+ "max_bucket_distance": 800,
170
+ "model_type": "wavlm",
171
+ "num_adapter_layers": 3,
172
+ "num_attention_heads": 16,
173
+ "num_buckets": 320,
174
+ "num_codevector_groups": 2,
175
+ "num_codevectors_per_group": 320,
176
+ "num_conv_pos_embedding_groups": 16,
177
+ "num_conv_pos_embeddings": 128,
178
+ "num_ctc_classes": 80,
179
+ "num_feat_extract_layers": 7,
180
+ "num_hidden_layers": 24,
181
+ "num_negatives": 100,
182
+ "output_hidden_size": 1024,
183
+ "pad_token_id": 0,
184
+ "proj_codevector_dim": 768,
185
+ "replace_prob": 0.5,
186
+ "tdnn_dilation": [
187
+ 1,
188
+ 2,
189
+ 3,
190
+ 1,
191
+ 1
192
+ ],
193
+ "tdnn_dim": [
194
+ 512,
195
+ 512,
196
+ 512,
197
+ 512,
198
+ 1500
199
+ ],
200
+ "tdnn_kernel": [
201
+ 5,
202
+ 3,
203
+ 3,
204
+ 1,
205
+ 1
206
+ ],
207
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
208
+ "transformers_version": "4.57.3",
209
+ "use_weighted_layer_sum": false,
210
+ "vocab_size": 46,
211
+ "xvector_output_dim": 512
212
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6117404b02dcdeeabf3602feae10c2f95f4f5dcf73df8af92a942e301f510113
3
+ size 1262060824
preprocessor_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_normalize": true,
3
+ "feature_extractor_type": "Wav2Vec2FeatureExtractor",
4
+ "feature_size": 1,
5
+ "padding_side": "right",
6
+ "padding_value": 0.0,
7
+ "processor_class": "Wav2Vec2Processor",
8
+ "return_attention_mask": false,
9
+ "sampling_rate": 16000
10
+ }
special_tokens_map.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": "<s>",
3
+ "eos_token": "</s>",
4
+ "pad_token": "<PAD>",
5
+ "unk_token": "<UNK>"
6
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<PAD>",
5
+ "lstrip": true,
6
+ "normalized": false,
7
+ "rstrip": true,
8
+ "single_word": false,
9
+ "special": false
10
+ },
11
+ "1": {
12
+ "content": "<UNK>",
13
+ "lstrip": true,
14
+ "normalized": false,
15
+ "rstrip": true,
16
+ "single_word": false,
17
+ "special": false
18
+ },
19
+ "2": {
20
+ "content": "<BOS>",
21
+ "lstrip": true,
22
+ "normalized": false,
23
+ "rstrip": true,
24
+ "single_word": false,
25
+ "special": false
26
+ },
27
+ "3": {
28
+ "content": "<EOS>",
29
+ "lstrip": true,
30
+ "normalized": false,
31
+ "rstrip": true,
32
+ "single_word": false,
33
+ "special": false
34
+ },
35
+ "5": {
36
+ "content": "AA",
37
+ "lstrip": true,
38
+ "normalized": false,
39
+ "rstrip": true,
40
+ "single_word": false,
41
+ "special": false
42
+ },
43
+ "6": {
44
+ "content": "AE",
45
+ "lstrip": true,
46
+ "normalized": false,
47
+ "rstrip": true,
48
+ "single_word": false,
49
+ "special": false
50
+ },
51
+ "7": {
52
+ "content": "AH",
53
+ "lstrip": true,
54
+ "normalized": false,
55
+ "rstrip": true,
56
+ "single_word": false,
57
+ "special": false
58
+ },
59
+ "8": {
60
+ "content": "AW",
61
+ "lstrip": true,
62
+ "normalized": false,
63
+ "rstrip": true,
64
+ "single_word": false,
65
+ "special": false
66
+ },
67
+ "9": {
68
+ "content": "AY",
69
+ "lstrip": true,
70
+ "normalized": false,
71
+ "rstrip": true,
72
+ "single_word": false,
73
+ "special": false
74
+ },
75
+ "11": {
76
+ "content": "CH",
77
+ "lstrip": true,
78
+ "normalized": false,
79
+ "rstrip": true,
80
+ "single_word": false,
81
+ "special": false
82
+ },
83
+ "13": {
84
+ "content": "DH",
85
+ "lstrip": true,
86
+ "normalized": false,
87
+ "rstrip": true,
88
+ "single_word": false,
89
+ "special": false
90
+ },
91
+ "14": {
92
+ "content": "DX",
93
+ "lstrip": true,
94
+ "normalized": false,
95
+ "rstrip": true,
96
+ "single_word": false,
97
+ "special": false
98
+ },
99
+ "15": {
100
+ "content": "EH",
101
+ "lstrip": true,
102
+ "normalized": false,
103
+ "rstrip": true,
104
+ "single_word": false,
105
+ "special": false
106
+ },
107
+ "16": {
108
+ "content": "ER",
109
+ "lstrip": true,
110
+ "normalized": false,
111
+ "rstrip": true,
112
+ "single_word": false,
113
+ "special": false
114
+ },
115
+ "17": {
116
+ "content": "EY",
117
+ "lstrip": true,
118
+ "normalized": false,
119
+ "rstrip": true,
120
+ "single_word": false,
121
+ "special": false
122
+ },
123
+ "20": {
124
+ "content": "HH",
125
+ "lstrip": true,
126
+ "normalized": false,
127
+ "rstrip": true,
128
+ "single_word": false,
129
+ "special": false
130
+ },
131
+ "21": {
132
+ "content": "IH",
133
+ "lstrip": true,
134
+ "normalized": false,
135
+ "rstrip": true,
136
+ "single_word": false,
137
+ "special": false
138
+ },
139
+ "22": {
140
+ "content": "IY",
141
+ "lstrip": true,
142
+ "normalized": false,
143
+ "rstrip": true,
144
+ "single_word": false,
145
+ "special": false
146
+ },
147
+ "23": {
148
+ "content": "JH",
149
+ "lstrip": true,
150
+ "normalized": false,
151
+ "rstrip": true,
152
+ "single_word": false,
153
+ "special": false
154
+ },
155
+ "28": {
156
+ "content": "NG",
157
+ "lstrip": true,
158
+ "normalized": false,
159
+ "rstrip": true,
160
+ "single_word": false,
161
+ "special": false
162
+ },
163
+ "29": {
164
+ "content": "OW",
165
+ "lstrip": true,
166
+ "normalized": false,
167
+ "rstrip": true,
168
+ "single_word": false,
169
+ "special": false
170
+ },
171
+ "30": {
172
+ "content": "OY",
173
+ "lstrip": true,
174
+ "normalized": false,
175
+ "rstrip": true,
176
+ "single_word": false,
177
+ "special": false
178
+ },
179
+ "34": {
180
+ "content": "SH",
181
+ "lstrip": true,
182
+ "normalized": false,
183
+ "rstrip": true,
184
+ "single_word": false,
185
+ "special": false
186
+ },
187
+ "36": {
188
+ "content": "TH",
189
+ "lstrip": true,
190
+ "normalized": false,
191
+ "rstrip": true,
192
+ "single_word": false,
193
+ "special": false
194
+ },
195
+ "37": {
196
+ "content": "UH",
197
+ "lstrip": true,
198
+ "normalized": false,
199
+ "rstrip": true,
200
+ "single_word": false,
201
+ "special": false
202
+ },
203
+ "38": {
204
+ "content": "UW",
205
+ "lstrip": true,
206
+ "normalized": false,
207
+ "rstrip": true,
208
+ "single_word": false,
209
+ "special": false
210
+ },
211
+ "43": {
212
+ "content": "ZH",
213
+ "lstrip": true,
214
+ "normalized": false,
215
+ "rstrip": true,
216
+ "single_word": false,
217
+ "special": false
218
+ },
219
+ "44": {
220
+ "content": "<s>",
221
+ "lstrip": false,
222
+ "normalized": false,
223
+ "rstrip": false,
224
+ "single_word": false,
225
+ "special": true
226
+ },
227
+ "45": {
228
+ "content": "</s>",
229
+ "lstrip": false,
230
+ "normalized": false,
231
+ "rstrip": false,
232
+ "single_word": false,
233
+ "special": true
234
+ }
235
+ },
236
+ "bos_token": "<s>",
237
+ "clean_up_tokenization_spaces": false,
238
+ "do_lower_case": false,
239
+ "eos_token": "</s>",
240
+ "extra_special_tokens": {},
241
+ "model_max_length": 1000000000000000019884624838656,
242
+ "pad_token": "<PAD>",
243
+ "processor_class": "Wav2Vec2Processor",
244
+ "replace_word_delimiter_char": " ",
245
+ "target_lang": null,
246
+ "tokenizer_class": "Wav2Vec2CTCTokenizer",
247
+ "unk_token": "<UNK>",
248
+ "word_delimiter_token": "|"
249
+ }
vocab.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "<BOS>": 2,
3
+ "<EOS>": 3,
4
+ "<PAD>": 0,
5
+ "<UNK>": 1,
6
+ "AA": 5,
7
+ "AE": 6,
8
+ "AH": 7,
9
+ "AW": 8,
10
+ "AY": 9,
11
+ "B": 10,
12
+ "CH": 11,
13
+ "D": 12,
14
+ "DH": 13,
15
+ "DX": 14,
16
+ "EH": 15,
17
+ "ER": 16,
18
+ "EY": 17,
19
+ "F": 18,
20
+ "G": 19,
21
+ "HH": 20,
22
+ "IH": 21,
23
+ "IY": 22,
24
+ "JH": 23,
25
+ "K": 24,
26
+ "L": 25,
27
+ "M": 26,
28
+ "N": 27,
29
+ "NG": 28,
30
+ "OW": 29,
31
+ "OY": 30,
32
+ "P": 31,
33
+ "R": 32,
34
+ "S": 33,
35
+ "SH": 34,
36
+ "T": 35,
37
+ "TH": 36,
38
+ "UH": 37,
39
+ "UW": 38,
40
+ "V": 39,
41
+ "W": 40,
42
+ "Y": 41,
43
+ "Z": 42,
44
+ "ZH": 43,
45
+ "|": 4
46
+ }