Commit ·
187d79d
1
Parent(s): 7d0662d
Release HuPER Corrector weights and inference code
Browse files- README.md +69 -0
- hparams.json +26 -0
- model.safetensors +3 -0
- requirements.txt +7 -0
README.md
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- speech
|
| 4 |
+
- phoneme
|
| 5 |
+
- pytorch-lightning
|
| 6 |
+
library_name: pytorch
|
| 7 |
+
---
|
| 8 |
+
|
| 9 |
+
# HuPER Corrector (Phoneme Corrector)
|
| 10 |
+
|
| 11 |
+
This repo releases the HuPER Corrector model checkpoints and inference code.
|
| 12 |
+
|
| 13 |
+
## What it does
|
| 14 |
+
Given (1) a canonical phoneme sequence (ARPAbet) and (2) discrete audio tokens, the model predicts edit operations (KEEP/DEL/SUB:PHN) and optional insertions, so that the edited sequence better matches the phones actually realized in the audio.
|
| 15 |
+
|
| 16 |
+
## Files
|
| 17 |
+
- `model.safetensors`: model weights
|
| 18 |
+
- `hparams.json`: training hyperparameters saved by PyTorch Lightning
|
| 19 |
+
- `edit_seq_speech/`: inference + model definition
|
| 20 |
+
- `edit_seq_speech/config/vocab.json`: op/insert mappings
|
| 21 |
+
|
| 22 |
+
## Quickstart
|
| 23 |
+
|
| 24 |
+
```python
|
| 25 |
+
import os
|
| 26 |
+
from huggingface_hub import snapshot_download
|
| 27 |
+
|
| 28 |
+
repo_dir = snapshot_download("huper29/huper_corrector")
|
| 29 |
+
|
| 30 |
+
# Make sure Python can import the edit_seq_speech package from the downloaded repo
|
| 31 |
+
import sys
|
| 32 |
+
sys.path.append(repo_dir)
|
| 33 |
+
|
| 34 |
+
from edit_seq_speech.inference import PhonemeCorrectionInference
|
| 35 |
+
|
| 36 |
+
ckpt_path = os.path.join(repo_dir, "model.safetensors") # or the path to a .ckpt checkpoint, if one is available
|
| 37 |
+
vocab_path = os.path.join(repo_dir, "edit_seq_speech/config/vocab.json")
|
| 38 |
+
|
| 39 |
+
infer = PhonemeCorrectionInference(
|
| 40 |
+
checkpoint_path=ckpt_path,
|
| 41 |
+
vocab_path=vocab_path,
|
| 42 |
+
)
|
| 43 |
+
|
| 44 |
+
wav_path = "your.wav"
|
| 45 |
+
text = "AY R OW T AH L EH T ER" # phonemized input: ARPAbet tokens separated by spaces
|
| 46 |
+
final_phns, log = infer.predict(wav_path, text)
|
| 47 |
+
print(final_phns)
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
## Notes / Limitations
|
| 51 |
+
|
| 52 |
+
- Audio tokenization at inference time must match the tokenization used during training (see the code and provided artifacts).
|
| 53 |
+
|
| 54 |
+
- Input phoneme format: ARPAbet tokens separated by spaces.
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
## Citation
|
| 58 |
+
|
| 59 |
+
If you use this model, please cite:
|
| 60 |
+
|
| 61 |
+
```bibtex
|
| 62 |
+
@article{guo2026huper,
|
| 63 |
+
title = {HuPER: A Human-Inspired Framework for Phonetic Perception},
|
| 64 |
+
author = {Guo, Chenxu and Lian, Jiachen and Liu, Yisi and Huang, Baihe and Narayanan, Shriyaa and Cho, Cheol Jun and Anumanchipalli, Gopala},
|
| 65 |
+
journal = {arXiv preprint arXiv:2602.01634},
|
| 66 |
+
year = {2026}
|
| 67 |
+
}
|
| 68 |
+
```
|
| 69 |
+
|
hparams.json
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"vocab_size": 42,
|
| 3 |
+
"audio_vocab_size": 2048,
|
| 4 |
+
"d_model": 512,
|
| 5 |
+
"nhead": 8,
|
| 6 |
+
"num_layers": 8,
|
| 7 |
+
"dropout": 0.2,
|
| 8 |
+
"lr": 0.0002,
|
| 9 |
+
"weight_decay": 0.01,
|
| 10 |
+
"scheduler_config": {
|
| 11 |
+
"type": "cosine",
|
| 12 |
+
"warmup_ratio": 0.1,
|
| 13 |
+
"eta_min": 1e-06,
|
| 14 |
+
"factor": 0.5,
|
| 15 |
+
"patience": 3,
|
| 16 |
+
"min_lr": 1e-06
|
| 17 |
+
},
|
| 18 |
+
"optimizer_config": {
|
| 19 |
+
"name": "adamw",
|
| 20 |
+
"betas": [
|
| 21 |
+
0.9,
|
| 22 |
+
0.999
|
| 23 |
+
],
|
| 24 |
+
"eps": 1e-08
|
| 25 |
+
}
|
| 26 |
+
}
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:79845cbe992dedaf9d9bab95157466647a8431d35dab06e805b245edcba4ead4
|
| 3 |
+
size 149242192
|
requirements.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
|
| 2 |
+
torchaudio
|
| 3 |
+
pytorch-lightning
|
| 4 |
+
transformers
|
| 5 |
+
huggingface_hub
|
| 6 |
+
g2p_en
|
| 7 |
+
safetensors
|