| --- |
| base_model: |
| - Qwen/Qwen3-0.6B |
| language: |
| - aae |
| - aal |
| - aao |
| - ab |
| - abb |
| - abn |
| - abr |
| - abs |
| - abv |
| - acm |
| - acw |
| - acx |
| - adf |
| - adx |
| - ady |
| - aeb |
| - aec |
| - af |
| - afb |
| - afo |
| - ahl |
| - ahs |
| - ajg |
| - aju |
| - ala |
| - aln |
| - alo |
| - am |
| - amu |
| - an |
| - anc |
| - ank |
| - anp |
| - anw |
| - aom |
| - apc |
| - apd |
| - arb |
| - arq |
| - ars |
| - ary |
| - arz |
| - as |
| - ast |
| - avl |
| - awo |
| - ayl |
| - ayp |
| - az |
| - ba |
| - bag |
| - bas |
| - bax |
| - bba |
| - bbj |
| - bbl |
| - bbu |
| - bce |
| - bci |
| - bcs |
| - bcy |
| - bda |
| - bde |
| - bdm |
| - be |
| - beb |
| - bew |
| - bfd |
| - bft |
| - bg |
| - bgp |
| - bhb |
| - bhh |
| - bho |
| - bhp |
| - bhr |
| - bjj |
| - bjk |
| - bjn |
| - bjt |
| - bkh |
| - bkm |
| - bky |
| - bmm |
| - bmq |
| - bn |
| - bnm |
| - bnn |
| - bns |
| - bo |
| - bou |
| - bqg |
| - br |
| - bra |
| - brh |
| - bri |
| - brx |
| - bs |
| - bsh |
| - bsj |
| - bsk |
| - btm |
| - btv |
| - bug |
| - bum |
| - buo |
| - bux |
| - bwr |
| - bxf |
| - byc |
| - bys |
| - byv |
| - byx |
| - bzc |
| - bzw |
| - ca |
| - ccg |
| - ceb |
| - cen |
| - cfa |
| - cgg |
| - chq |
| - cjk |
| - ckb |
| - ckl |
| - ckr |
| - cky |
| - cnh |
| - cpy |
| - cs |
| - cte |
| - ctl |
| - cut |
| - cux |
| - cv |
| - cy |
| - da |
| - dag |
| - dar |
| - dav |
| - dbd |
| - dcc |
| - de |
| - deg |
| - dgh |
| - dgo |
| - dje |
| - dmk |
| - dml |
| - dru |
| - dty |
| - dua |
| - dv |
| - dyu |
| - dzg |
| - ebr |
| - ebu |
| - ego |
| - eiv |
| - eko |
| - ekr |
| - el |
| - elm |
| - en |
| - eo |
| - es |
| - esu |
| - et |
| - eto |
| - ets |
| - etu |
| - eu |
| - ewo |
| - ext |
| - eyo |
| - fa |
| - fan |
| - fat |
| - ff |
| - ffm |
| - fi |
| - fia |
| - fil |
| - fip |
| - fkk |
| - fmp |
| - fr |
| - fub |
| - fuc |
| - fue |
| - fuf |
| - fuh |
| - fui |
| - fuq |
| - fuv |
| - fy |
| - ga |
| - gbm |
| - gbr |
| - gby |
| - gcc |
| - gdf |
| - gej |
| - ges |
| - ggg |
| - gid |
| - gig |
| - giz |
| - gjk |
| - gju |
| - gl |
| - glw |
| - gn |
| - gol |
| - gom |
| - gsl |
| - gu |
| - gui |
| - gur |
| - guz |
| - gv |
| - gwc |
| - gwe |
| - gwt |
| - gya |
| - gyz |
| - ha |
| - hah |
| - hao |
| - haw |
| - haz |
| - hbb |
| - he |
| - hem |
| - hi |
| - hia |
| - hkk |
| - hla |
| - hno |
| - hoj |
| - hr |
| - hsb |
| - ht |
| - hu |
| - hue |
| - hul |
| - hux |
| - hwo |
| - hy |
| - hz |
| - ia |
| - ibb |
| - id |
| - ida |
| - idu |
| - ig |
| - ijc |
| - ijn |
| - ik |
| - ikw |
| - is |
| - ish |
| - iso |
| - it |
| - its |
| - itw |
| - itz |
| - ja |
| - jal |
| - jax |
| - jgo |
| - jmx |
| - jns |
| - jqr |
| - juk |
| - juo |
| - jv |
| - ka |
| - kab |
| - kai |
| - kaj |
| - kam |
| - kbd |
| - kbl |
| - kbt |
| - kcq |
| - kdh |
| - kea |
| - keu |
| - kfe |
| - kfk |
| - kfp |
| - khg |
| - khw |
| - kj |
| - kjc |
| - kjk |
| - kk |
| - kln |
| - kls |
| - km |
| - kmr |
| - kmy |
| - kn |
| - kna |
| - knn |
| - ko |
| - kol |
| - koo |
| - kpo |
| - kqo |
| - ks |
| - ksd |
| - ksf |
| - kto |
| - kuh |
| - kvx |
| - kw |
| - kwm |
| - kxp |
| - ky |
| - kyx |
| - lag |
| - lb |
| - lcm |
| - ldb |
| - lg |
| - lij |
| - lir |
| - lkb |
| - lla |
| - ln |
| - lnu |
| - lo |
| - loa |
| - lrk |
| - lss |
| - lt |
| - ltg |
| - lto |
| - lua |
| - luo |
| - lus |
| - lv |
| - lwg |
| - mab |
| - maf |
| - mai |
| - mau |
| - max |
| - mbo |
| - mcf |
| - mcn |
| - mcx |
| - mdd |
| - mde |
| - mdf |
| - mek |
| - mer |
| - meu |
| - mfm |
| - mfn |
| - mfo |
| - mfv |
| - mgg |
| - mgi |
| - mhk |
| - mhr |
| - mi |
| - mig |
| - miu |
| - mk |
| - mkf |
| - mki |
| - ml |
| - mlq |
| - mn |
| - mne |
| - mni |
| - mqy |
| - mr |
| - mrj |
| - mrr |
| - mrt |
| - ms |
| - mse |
| - msh |
| - msw |
| - mt |
| - mtr |
| - mtu |
| - mtx |
| - mua |
| - mug |
| - mui |
| - mve |
| - mvy |
| - mxs |
| - mxu |
| - mxy |
| - my |
| - myv |
| - mzl |
| - nal |
| - nan |
| - nap |
| - nb |
| - nbh |
| - ncf |
| - nco |
| - ncx |
| - ndi |
| - ng |
| - ngi |
| - nhg |
| - nhi |
| - nhn |
| - nhq |
| - nja |
| - nl |
| - nla |
| - nlv |
| - nmg |
| - nmz |
| - nn |
| - nnh |
| - 'no' |
| - noe |
| - npi |
| - nso |
| - ny |
| - nyu |
| - oc |
| - odk |
| - odu |
| - ogo |
| - om |
| - orc |
| - oru |
| - ory |
| - os |
| - pa |
| - pbs |
| - pbt |
| - pbu |
| - pcm |
| - pex |
| - phl |
| - phr |
| - pip |
| - piy |
| - pko |
| - pl |
| - plk |
| - plt |
| - pmq |
| - pms |
| - pmy |
| - pnb |
| - poc |
| - poe |
| - pow |
| - prq |
| - ps |
| - pst |
| - pt |
| - pua |
| - pwn |
| - qug |
| - qum |
| - qup |
| - qur |
| - qus |
| - quv |
| - qux |
| - quy |
| - qva |
| - qvi |
| - qvj |
| - qvl |
| - qwa |
| - qws |
| - qxa |
| - qxp |
| - qxt |
| - qxu |
| - qxw |
| - rag |
| - rm |
| - ro |
| - rob |
| - rof |
| - roo |
| - rth |
| - ru |
| - rup |
| - rw |
| - sa |
| - sah |
| - sat |
| - sau |
| - say |
| - sbn |
| - sc |
| - scl |
| - scn |
| - sd |
| - sei |
| - shu |
| - si |
| - sip |
| - siw |
| - sjr |
| - sk |
| - skg |
| - skr |
| - sl |
| - sn |
| - snc |
| - snk |
| - so |
| - sol |
| - sps |
| - sq |
| - sr |
| - src |
| - sro |
| - ssi |
| - ste |
| - sua |
| - sv |
| - sva |
| - sw |
| - szy |
| - ta |
| - tan |
| - tar |
| - tay |
| - tbf |
| - tcf |
| - tcy |
| - tdn |
| - tdx |
| - te |
| - tg |
| - tgc |
| - th |
| - the |
| - thq |
| - thr |
| - thv |
| - ti |
| - tig |
| - tio |
| - tk |
| - tkg |
| - tkt |
| - tli |
| - tlp |
| - tn |
| - tok |
| - tpl |
| - tpz |
| - tqp |
| - tr |
| - trp |
| - trq |
| - trv |
| - trw |
| - tt |
| - ttj |
| - ttr |
| - ttu |
| - tui |
| - tul |
| - tuq |
| - tuv |
| - tuy |
| - tvo |
| - tvu |
| - tw |
| - twu |
| - txs |
| - txy |
| - udl |
| - ug |
| - uk |
| - uki |
| - umb |
| - ur |
| - ush |
| - uz |
| - uzn |
| - vai |
| - var |
| - ver |
| - vi |
| - vmc |
| - vmj |
| - vmm |
| - vmp |
| - vmz |
| - vot |
| - vro |
| - wbl |
| - wci |
| - weo |
| - wes |
| - wja |
| - wji |
| - wo |
| - wof |
| - xh |
| - xhe |
| - xka |
| - xmf |
| - xmv |
| - xmw |
| - xpe |
| - xti |
| - xtu |
| - yaq |
| - yav |
| - yay |
| - ydd |
| - ydg |
| - yer |
| - 'yes' |
| - yi |
| - yo |
| - yue |
| - zga |
| - zgh |
| - zh |
| - zoc |
| - zoh |
| - zor |
| - zpv |
| - zpy |
| - ztg |
| - ztn |
| - ztp |
| - zts |
| - ztu |
| - zu |
| - zza |
| license: apache-2.0 |
| pipeline_tag: text-to-speech |
| tags: |
| - zero-shot |
| - multilingual |
| - voice-cloning |
| - voice-design |
| --- |
| |
| # OmniVoice 🌍 |
|
|
| <p align="center"> |
| <img width="200" height="200" alt="OmniVoice" src="https://zhu-han.github.io/omnivoice/pics/omnivoice.jpg" /> |
| </p> |
|
|
| <p align="center"> |
| <a href="https://huggingface.co/k2-fsa/OmniVoice"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Model-FFD21E" alt="Hugging Face Model"></a> |
| |
| <a href="https://huggingface.co/spaces/k2-fsa/OmniVoice"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Space-blue" alt="Hugging Face Space"></a> |
| |
| <a href="https://huggingface.co/papers/2604.00688"><img src="https://img.shields.io/badge/arXiv-Paper-B31B1B.svg" alt="arXiv Paper"></a> |
| |
| <a href="https://github.com/k2-fsa/OmniVoice"><img src="https://img.shields.io/badge/GitHub-Code-181717?logo=GitHub" alt="GitHub Code"></a> |
| |
| <a href="https://zhu-han.github.io/omnivoice"><img src="https://img.shields.io/badge/GitHub.io-Demo_Page-blue?logo=GitHub&style=flat-square" alt="Demo Page"></a> |
| </p> |
|
|
|
|
| OmniVoice is a massively multilingual zero-shot text-to-speech (TTS) model supporting over 600 languages. Built on a novel diffusion language model-style architecture, it delivers high-quality speech with superior inference speed, supporting voice cloning and voice design. |
|
|
| - **Paper:** [OmniVoice: Towards Omnilingual Zero-Shot Text-to-Speech with Diffusion Language Models](https://huggingface.co/papers/2604.00688) |
| - **Repository:** [GitHub](https://github.com/k2-fsa/OmniVoice) |
| - **Demo:** [Hugging Face Space](https://huggingface.co/spaces/k2-fsa/OmniVoice) |
|
|
| ## Key Features |
|
|
| - **600+ Languages Supported**: The broadest language coverage among zero-shot TTS models. |
| - **Voice Cloning**: State-of-the-art voice cloning quality from a short reference audio clip. |
| - **Voice Design**: Control voices via assigned speaker attributes (gender, age, pitch, dialect/accent, whisper, etc.). |
| - **Fine-grained Control**: Non-verbal symbols (e.g., `[laughter]`) and pronunciation correction via pinyin or phonemes. |
| - **Fast Inference**: RTF as low as 0.025 (40x faster than real-time). |
| - **Diffusion Language Model-style Architecture**: A clean, streamlined, and scalable design that delivers both quality and speed. |
|
|
| ## Sample Usage |
|
|
| To get started, install the `omnivoice` library: |
|
|
| > We recommend using a fresh virtual environment (e.g., `conda`, `venv`, etc.) to avoid conflicts. |
|
|
| **Step 1**: Install PyTorch |
|
|
| <details> |
| <summary>NVIDIA GPU</summary> |
|
|
| ```bash |
| # Install pytorch with your CUDA version, e.g. |
| pip install torch==2.8.0+cu128 torchaudio==2.8.0+cu128 --extra-index-url https://download.pytorch.org/whl/cu128 |
| ``` |
| > See the [official PyTorch site](https://pytorch.org/get-started/locally/) for instructions on installing other versions. |
|
|
| </details> |
|
|
| <details> |
| <summary>Apple Silicon</summary> |
|
|
| ```bash |
| pip install torch==2.8.0 torchaudio==2.8.0 |
| ``` |
|
|
| </details> |
|
|
| **Step 2**: Install OmniVoice |
|
|
| ```bash |
| pip install omnivoice |
| ``` |
|
|
| ### Python API |
|
|
| You can use OmniVoice for zero-shot voice cloning as follows: |
|
|
| ```python |
| from omnivoice import OmniVoice |
| import torch |
| import torchaudio |
| |
| # Load the model |
| model = OmniVoice.from_pretrained( |
| "k2-fsa/OmniVoice", |
| device_map="cuda:0", |
| dtype=torch.float16 |
| ) |
| |
| # Generate audio |
| audio = model.generate( |
| text="Hello, this is a test of zero-shot voice cloning.", |
| ref_audio="ref.wav", |
| ref_text="Transcription of the reference audio.", |
| ) # audio is a list of `torch.Tensor` with shape (1, T) at 24 kHz. |
| |
| torchaudio.save("out.wav", audio[0], 24000) |
| ``` |
|
|
| For more generation modes (e.g., voice design), functions (e.g., non-verbal symbols, pronunciation correction) and comprehensive usage instructions, see our [GitHub Repository](https://github.com/k2-fsa/OmniVoice). |
|
|
|
|
| ## Discussion & Communication |
|
|
| You can start a discussion directly on [GitHub Issues](https://github.com/k2-fsa/OmniVoice/issues). |
|
|
| You can also scan the QR codes below to join our WeChat group or follow our WeChat official account. |
|
|
| | WeChat Group | WeChat Official Account | |
| | ------------ | ----------------------- | |
| | | | |
|
|
| ## Citation |
|
|
| ```bibtex |
| @article{zhu2026omnivoice, |
| title={OmniVoice: Towards Omnilingual Zero-Shot Text-to-Speech with Diffusion Language Models}, |
| author={Zhu, Han and Ye, Lingxuan and Kang, Wei and Yao, Zengwei and Guo, Liyong and Kuang, Fangjun and Han, Zhifeng and Zhuang, Weiji and Lin, Long and Povey, Daniel}, |
| journal={arXiv preprint arXiv:2604.00688}, |
| year={2026} |
| } |
| ``` |