---
license: mit
base_model: microsoft/wavlm-large
tags:
- audio
- speech
- wavlm
- ctc
- phone-recognition
- arpabet
---

# HuPER Recognizer (ARPAbet phone recognition)

A CTC phone recognizer fine-tuned from **WavLM-Large** that maps **16 kHz** speech audio to an **ARPAbet** phone sequence.
See the HuPER paper for details: **arXiv:2602.01634**.

## Quickstart

```bash
pip install -U transformers torchaudio
```

```python
import torch
import torchaudio
from transformers import Wav2Vec2Processor, WavLMForCTC

repo_id = "huper29/huper_recognizer"
processor = Wav2Vec2Processor.from_pretrained(repo_id)
model = WavLMForCTC.from_pretrained(repo_id)
model.eval()

# Load audio, downmix to mono, and resample to the model's 16 kHz rate.
waveform, sr = torchaudio.load("sample.wav")
if waveform.shape[0] > 1:
    waveform = waveform.mean(dim=0, keepdim=True)
if sr != 16000:
    waveform = torchaudio.transforms.Resample(sr, 16000)(waveform)

inputs = processor(waveform.squeeze().numpy(), sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# Greedy CTC decode: collapse repeats, then drop blanks and special tokens.
pred_ids = torch.argmax(logits, dim=-1)[0].tolist()
blank_id = processor.tokenizer.pad_token_id

phone_tokens = []
prev = None
for token_id in pred_ids:
    if token_id != blank_id and token_id != prev:
        token = model.config.id2label.get(token_id, processor.tokenizer.convert_ids_to_tokens(token_id))
        if token not in {"<PAD>", "<UNK>", "<BOS>", "<EOS>", "|"}:
            phone_tokens.append(token)
    prev = token_id

print(" ".join(phone_tokens))
```

## Citation

```bibtex
@article{guo2026huper,
  title   = {HuPER: A Human-Inspired Framework for Phonetic Perception},
  author  = {Guo, Chenxu and Lian, Jiachen and Liu, Yisi and Huang, Baihe and Narayanan, Shriyaa and Cho, Cheol Jun and Anumanchipalli, Gopala},
  journal = {arXiv preprint arXiv:2602.01634},
  year    = {2026}
}
```