Upload 48 files
Browse files- .gitattributes +8 -0
- iteration_1/README.md +151 -0
- iteration_1/manifest.json +216 -0
- iteration_1/packages/.DS_Store +0 -0
- iteration_1/packages/bert_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- iteration_1/packages/bert_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- iteration_1/packages/bert_fp16.mlpackage/Manifest.json +18 -0
- iteration_1/packages/decoder_pre_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- iteration_1/packages/decoder_pre_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- iteration_1/packages/decoder_pre_fp16.mlpackage/Manifest.json +18 -0
- iteration_1/packages/decoder_upsample_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- iteration_1/packages/decoder_upsample_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- iteration_1/packages/decoder_upsample_fp16.mlpackage/Manifest.json +18 -0
- iteration_1/packages/diffusion_unet_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- iteration_1/packages/diffusion_unet_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- iteration_1/packages/diffusion_unet_fp16.mlpackage/Manifest.json +18 -0
- iteration_1/packages/duration_predictor_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- iteration_1/packages/duration_predictor_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- iteration_1/packages/duration_predictor_fp16.mlpackage/Manifest.json +18 -0
- iteration_1/packages/f0n_predictor_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- iteration_1/packages/f0n_predictor_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- iteration_1/packages/f0n_predictor_fp16.mlpackage/Manifest.json +18 -0
- iteration_1/packages/har_source.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- iteration_1/packages/har_source.mlpackage/Manifest.json +18 -0
- iteration_1/packages/ref_encoder_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- iteration_1/packages/ref_encoder_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- iteration_1/packages/ref_encoder_fp16.mlpackage/Manifest.json +18 -0
- iteration_1/packages/text_encoder_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- iteration_1/packages/text_encoder_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- iteration_1/packages/text_encoder_fp16.mlpackage/Manifest.json +18 -0
- iteration_1/samples/sample_python.wav +3 -0
- iteration_1/samples/sample_swift.wav +3 -0
- iteration_1/voices/1221-135767-0014.wav +0 -0
- iteration_1/voices/1789_142896_000022_000005.wav +3 -0
- iteration_1/voices/3.wav +0 -0
- iteration_1/voices/4.wav +0 -0
- iteration_1/voices/4077-13754-0000.wav +0 -0
- iteration_1/voices/5.wav +0 -0
- iteration_1/voices/5639-40744-0020.wav +0 -0
- iteration_1/voices/696_92939_000016_000006.wav +3 -0
- iteration_1/voices/908-157963-0027.wav +0 -0
- iteration_1/voices/Gavin.wav +3 -0
- iteration_1/voices/Nima.wav +3 -0
- iteration_1/voices/Vinay.wav +3 -0
- iteration_1/voices/Yinghao.wav +3 -0
- iteration_1/voices/amused.wav +0 -0
- iteration_1/voices/anger.wav +0 -0
- iteration_1/voices/disgusted.wav +0 -0
- iteration_1/voices/sleepy.wav +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
iteration_1/samples/sample_python.wav filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
iteration_1/samples/sample_swift.wav filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
iteration_1/voices/1789_142896_000022_000005.wav filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
iteration_1/voices/696_92939_000016_000006.wav filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
iteration_1/voices/Gavin.wav filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
iteration_1/voices/Nima.wav filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
iteration_1/voices/Vinay.wav filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
iteration_1/voices/Yinghao.wav filter=lfs diff=lfs merge=lfs -text
|
iteration_1/README.md
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language:
|
| 3 |
+
- en
|
| 4 |
+
license: mit
|
| 5 |
+
library_name: coreml
|
| 6 |
+
tags:
|
| 7 |
+
- text-to-speech
|
| 8 |
+
- tts
|
| 9 |
+
- styletts2
|
| 10 |
+
- coreml
|
| 11 |
+
- apple-silicon
|
| 12 |
+
- voice-cloning
|
| 13 |
+
pipeline_tag: text-to-speech
|
| 14 |
+
---
|
| 15 |
+
|
| 16 |
+
# StyleTTS2 LibriTTS — CoreML
|
| 17 |
+
|
| 18 |
+
Apple CoreML port of [yl4579/StyleTTS2](https://github.com/yl4579/StyleTTS2) (LibriTTS 2nd-stage checkpoint, epoch 20). 9-stage `.mlpackage` chain with mixed-precision and per-stage compute-unit assignments tuned for Apple Silicon (CPU + ANE + GPU).
|
| 19 |
+
|
| 20 |
+
24 kHz mono synthesis. Zero-shot voice cloning from a 3-10 second reference WAV.
|
| 21 |
+
|
| 22 |
+
## Highlights
|
| 23 |
+
|
| 24 |
+
- **9 stages, 258 MB on disk**, all fp16 except `har_source` (fp32 required for sin(2π·cumsum(f0)) numerical stability)
|
| 25 |
+
- **~390 ms warm CoreML predict** per utterance (M-series, mixed CPU+ANE+GPU)
|
| 26 |
+
- **RTFx ~9.4×** end-to-end (3.7 s of audio in ~390 ms)
|
| 27 |
+
- **~13 s cold start** (Apple `anecompilerservice` compiles ANE-targeted graphs on first call; fully cached afterwards)
|
| 28 |
+
- **Per-stage placement**: `text_encoder`/`duration_predictor`/`decoder_upsample` on CPU, `bert`/`ref_encoder`/`diffusion_unet`/`f0n_predictor`/`decoder_pre` on ANE, `har_source` on GPU
|
| 29 |
+
|
| 30 |
+
## Repository contents
|
| 31 |
+
|
| 32 |
+
```
|
| 33 |
+
packages/ 9 mlpackages (258 MB)
|
| 34 |
+
text_encoder_fp16.mlpackage 11 MB text → 512-dim embedding (LSTM, RangeDim T)
|
| 35 |
+
bert_fp16.mlpackage 12 MB Albert + bert_encoder (fixed T=57)
|
| 36 |
+
ref_encoder_fp16.mlpackage 53 MB reference mel → 256-dim style (CNN)
|
| 37 |
+
diffusion_unet_fp16.mlpackage 48 MB cross-attention U-Net (fixed T=57; ADPM2 sampler)
|
| 38 |
+
duration_predictor_fp16.mlpackage 15 MB LSTM + duration logits (RangeDim T)
|
| 39 |
+
f0n_predictor_fp16.mlpackage 16 MB F0 + noise prediction (RangeDim F)
|
| 40 |
+
har_source.mlpackage 12 KB F0 → harmonic source (RangeDim F0_LEN, fp32)
|
| 41 |
+
decoder_pre_fp16.mlpackage 64 MB AdaIN encode/decode + F0/N convs (RangeDim F)
|
| 42 |
+
decoder_upsample_fp16.mlpackage 40 MB HiFi-GAN Generator (RangeDim F→audio)
|
| 43 |
+
voices/ 17 reference clips (4 MB)
|
| 44 |
+
Yinghao.wav, Nima.wav, Gavin.wav, Vinay.wav Identity speakers
|
| 45 |
+
amused.wav, anger.wav, disgusted.wav, sleepy.wav Emotion clips
|
| 46 |
+
*.wav LibriTTS samples
|
| 47 |
+
samples/ End-to-end synthesis samples
|
| 48 |
+
sample_swift.wav Produced by the Swift CoreML driver
|
| 49 |
+
sample_python.wav Produced by the Python CoreML pipeline
|
| 50 |
+
manifest.json Machine-readable spec for all stages
|
| 51 |
+
README.md This file
|
| 52 |
+
```
|
| 53 |
+
|
| 54 |
+
## Limits
|
| 55 |
+
|
| 56 |
+
- **Phoneme cap: 57.** `bert` and `diffusion_unet` are pinned to a fixed token axis of 57 because the CoreML CPU MLProgram backend rejects RangeDim on their cross-attention shape ops. Inputs that phonemize to >57 tokens will fail. The other 7 stages support flexible token (1-512) and frame (1-2048) axes.
|
| 57 |
+
- **ANE compile fails** for the HiFi-GAN ConvTranspose1d ups stack inside `decoder_upsample`. CPU is the most predictable placement; GPU has slightly lower warm latency but contends with `har_source`.
|
| 58 |
+
- **Apple Silicon recommended.** Intel Macs have not been validated for CoreML mlprogram inference at scale.
|
| 59 |
+
|
| 60 |
+
## Pipeline (per utterance)
|
| 61 |
+
|
| 62 |
+
```
|
| 63 |
+
text → espeak-ng IPA → tokenize → token_ids
|
| 64 |
+
│
|
| 65 |
+
┌───────────────────────────────┼──────────────────────────────────┐
|
| 66 |
+
│ │ │
|
| 67 |
+
▼ ▼ ▼
|
| 68 |
+
text_encoder bert (fixed T=57) reference WAV → mel → ref_encoder
|
| 69 |
+
t_en [1,512,T] bert_dur [1,57,768] ref_s [1,256]
|
| 70 |
+
d_en [1,512,57]
|
| 71 |
+
│
|
| 72 |
+
▼
|
| 73 |
+
diffusion_unet × 5 ADPM2 steps (10 dispatches)
|
| 74 |
+
│
|
| 75 |
+
▼
|
| 76 |
+
s_pred [1,256]
|
| 77 |
+
↓ blend(α, β, ref_s) ↓
|
| 78 |
+
ref [1,128] s [1,128]
|
| 79 |
+
│
|
| 80 |
+
▼
|
| 81 |
+
duration_predictor
|
| 82 |
+
d [1,T,640] pred_dur → pred_aln_trg
|
| 83 |
+
│
|
| 84 |
+
▼ (matmul + hifigan tail-shift)
|
| 85 |
+
en [1,640,F] asr [1,512,F]
|
| 86 |
+
│
|
| 87 |
+
▼
|
| 88 |
+
f0n_predictor
|
| 89 |
+
f0_pred, n_pred [1, 2F]
|
| 90 |
+
│
|
| 91 |
+
▼
|
| 92 |
+
har_source
|
| 93 |
+
har [1,1,600F]
|
| 94 |
+
│
|
| 95 |
+
▼
|
| 96 |
+
decoder_pre
|
| 97 |
+
x_pre [1,512,2F]
|
| 98 |
+
│
|
| 99 |
+
▼
|
| 100 |
+
decoder_upsample
|
| 101 |
+
audio [1,1,72k+]
|
| 102 |
+
│
|
| 103 |
+
▼
|
| 104 |
+
tail-trim 50 samples → WAV @ 24 kHz
|
| 105 |
+
```
|
| 106 |
+
|
| 107 |
+
The non-CoreML steps (espeak phonemize, Karras sigma schedule + ADPM2 sampler loop, style blend, mel extraction, alignment matrix + en/asr matmul, tail-shift) run host-side. See `manifest.json#non_coreml_pipeline_steps` for exact specs.
|
| 108 |
+
|
| 109 |
+
## Voices
|
| 110 |
+
|
| 111 |
+
`voices/*.wav` are zero-shot reference clips. The `ref_encoder` stage reads a mel of the chosen reference and produces a 256-dim style embedding that conditions every downstream stage. Bring your own clip — any 3-10 s mono recording at any sample rate works (resampled to 24 kHz internally). Quality is sensitive to reference cleanliness (background noise transfers).
|
| 112 |
+
|
| 113 |
+
## Quick demo (Swift)
|
| 114 |
+
|
| 115 |
+
A self-contained Swift demo exists that drives the last 4 stages directly from CoreML, given pre-computed inputs from the Python preprocessor. End-to-end Swift synthesis (no Python) requires porting espeak phonemize + mel + ADPM2 sampler + alignment, ~600 lines of Swift on top of these packages.
|
| 116 |
+
|
| 117 |
+
## Quick demo (Python)
|
| 118 |
+
|
| 119 |
+
```bash
|
| 120 |
+
git clone https://github.com/yl4579/StyleTTS2 # for the espeak/text frontend + checkpoint config
|
| 121 |
+
# Place this repo's packages/ as coreml/packages/ in the StyleTTS2 working tree.
|
| 122 |
+
uv run python coreml/inference.py \
|
| 123 |
+
--text "StyleTTS 2 is a text to speech model." \
|
| 124 |
+
--reference voices/Yinghao.wav \
|
| 125 |
+
--output out.wav
|
| 126 |
+
```
|
| 127 |
+
|
| 128 |
+
## Conversion notes
|
| 129 |
+
|
| 130 |
+
- Source: PyTorch StyleTTS2 LibriTTS 2nd-stage checkpoint (yl4579/StyleTTS2 epoch 20).
|
| 131 |
+
- coremltools mlprogram, deployment target macOS15, fp16 compute precision.
|
| 132 |
+
- Mixed precision: 8 stages fp16, 1 stage fp32 (`har_source`). The original `decoder` is split into two fp16 stages for ANE compatibility (`decoder` → `decoder_pre` + `decoder_upsample`).
|
| 133 |
+
- Trace parity: all 9 stages mse=0 against eager PyTorch on the trace input.
|
| 134 |
+
- Quantization trials (linear int8, 8-bit k-means palettization) were run on `decoder_upsample`; both were rejected — int8 is slower than fp16 on CPU (no native ConvTranspose1d kernel), and palettization is too lossy (19 dB SNR). fp16 is the production setting.
|
| 135 |
+
|
| 136 |
+
## License
|
| 137 |
+
|
| 138 |
+
MIT (matches upstream yl4579/StyleTTS2). LibriTTS reference clips remain under the LibriTTS corpus license (CC BY 4.0).
|
| 139 |
+
|
| 140 |
+
## Citation
|
| 141 |
+
|
| 142 |
+
If you use this port, please cite the original StyleTTS2 paper:
|
| 143 |
+
|
| 144 |
+
```bibtex
|
| 145 |
+
@article{li2023styletts,
|
| 146 |
+
title={StyleTTS 2: Towards Human-Level Text-to-Speech through Style Diffusion and Adversarial Training with Large Speech Language Models},
|
| 147 |
+
author={Li, Yinghao Aaron and Han, Cong and Raghavan, Vinay and Mischler, Gavin and Mesgarani, Nima},
|
| 148 |
+
journal={arXiv preprint arXiv:2306.07691},
|
| 149 |
+
year={2023}
|
| 150 |
+
}
|
| 151 |
+
```
|
iteration_1/manifest.json
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_name": "styletts2-libritts-coreml",
|
| 3 |
+
"version": "1.0.0",
|
| 4 |
+
"base_model": "yl4579/StyleTTS2 LibriTTS checkpoint (epoch 20, 2nd-stage)",
|
| 5 |
+
"sample_rate": 24000,
|
| 6 |
+
"frame_hop": 300,
|
| 7 |
+
"phoneme_vocab": "espeak-ng en-us IPA + StyleTTS2 TextCleaner",
|
| 8 |
+
"limits": {
|
| 9 |
+
"max_phonemes": 57,
|
| 10 |
+
"note": "bert and diffusion_unet stages have a fixed token axis of 57 (CoreML CPU MLProgram backend rejects RangeDim on these graphs). Inputs producing more than 57 phonemes will fail until token-bucketed packages are added."
|
| 11 |
+
},
|
| 12 |
+
"stages": [
|
| 13 |
+
{
|
| 14 |
+
"name": "text_encoder",
|
| 15 |
+
"package": "packages/text_encoder_fp16.mlpackage",
|
| 16 |
+
"precision": "fp16",
|
| 17 |
+
"compute_units": "CPU_ONLY",
|
| 18 |
+
"inputs": [
|
| 19 |
+
{ "name": "tokens", "shape": [1, "T_token"], "dtype": "int32", "range": [1, 512] },
|
| 20 |
+
{ "name": "input_lengths", "shape": [1], "dtype": "int32" },
|
| 21 |
+
{ "name": "text_mask", "shape": [1, "T_token"], "dtype": "float32" }
|
| 22 |
+
],
|
| 23 |
+
"outputs": [
|
| 24 |
+
{ "name": "t_en", "shape": [1, 512, "T_token"], "dtype": "float32" }
|
| 25 |
+
]
|
| 26 |
+
},
|
| 27 |
+
{
|
| 28 |
+
"name": "bert",
|
| 29 |
+
"package": "packages/bert_fp16.mlpackage",
|
| 30 |
+
"precision": "fp16",
|
| 31 |
+
"compute_units": "CPU_AND_NE",
|
| 32 |
+
"fixed_token_axis": 57,
|
| 33 |
+
"inputs": [
|
| 34 |
+
{ "name": "tokens", "shape": [1, 57], "dtype": "int32" },
|
| 35 |
+
{ "name": "attention_mask", "shape": [1, 57], "dtype": "int32" }
|
| 36 |
+
],
|
| 37 |
+
"outputs": [
|
| 38 |
+
{ "name": "bert_dur", "shape": [1, 57, 768] },
|
| 39 |
+
{ "name": "d_en", "shape": [1, 512, 57] }
|
| 40 |
+
]
|
| 41 |
+
},
|
| 42 |
+
{
|
| 43 |
+
"name": "ref_encoder",
|
| 44 |
+
"package": "packages/ref_encoder_fp16.mlpackage",
|
| 45 |
+
"precision": "fp16",
|
| 46 |
+
"compute_units": "CPU_AND_NE",
|
| 47 |
+
"inputs": [
|
| 48 |
+
{ "name": "mel", "shape": [1, 1, 80, "T_mel"], "dtype": "float32",
|
| 49 |
+
"note": "24 kHz mel spectrogram of reference audio. n_fft=2048, hop=300, win=1200, n_mels=80." }
|
| 50 |
+
],
|
| 51 |
+
"outputs": [
|
| 52 |
+
{ "name": "ref_s", "shape": [1, 256], "dtype": "float32",
|
| 53 |
+
"note": "Style embedding. ref_s[:, :128] is reference timbre, ref_s[:, 128:] is reference prosody." }
|
| 54 |
+
]
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"name": "diffusion_unet",
|
| 58 |
+
"package": "packages/diffusion_unet_fp16.mlpackage",
|
| 59 |
+
"precision": "fp16",
|
| 60 |
+
"compute_units": "CPU_AND_NE",
|
| 61 |
+
"fixed_token_axis": 57,
|
| 62 |
+
"inputs": [
|
| 63 |
+
{ "name": "x_noisy", "shape": [1, 1, 256] },
|
| 64 |
+
{ "name": "sigma", "shape": [1] },
|
| 65 |
+
{ "name": "embedding", "shape": [1, 57, 768] },
|
| 66 |
+
{ "name": "features", "shape": [1, 256] }
|
| 67 |
+
],
|
| 68 |
+
"outputs": [
|
| 69 |
+
{ "name": "x_denoised", "shape": [1, 1, 256] }
|
| 70 |
+
],
|
| 71 |
+
"note": "Called num_steps × 2 dispatches per utterance under ADPM2 sampler. Use Karras sigmas (sigma_min=0.0001, sigma_max=3.0, rho_schedule=9.0). 5 steps default."
|
| 72 |
+
},
|
| 73 |
+
{
|
| 74 |
+
"name": "duration_predictor",
|
| 75 |
+
"package": "packages/duration_predictor_fp16.mlpackage",
|
| 76 |
+
"precision": "fp16",
|
| 77 |
+
"compute_units": "CPU_ONLY",
|
| 78 |
+
"inputs": [
|
| 79 |
+
{ "name": "d_en", "shape": [1, 512, "T_token"] },
|
| 80 |
+
{ "name": "s", "shape": [1, 128] },
|
| 81 |
+
{ "name": "text_mask", "shape": [1, "T_token"] }
|
| 82 |
+
],
|
| 83 |
+
"outputs": [
|
| 84 |
+
{ "name": "d", "shape": [1, "T_token", 640] },
|
| 85 |
+
{ "name": "duration_logits", "shape": [1, "T_token", 50] }
|
| 86 |
+
]
|
| 87 |
+
},
|
| 88 |
+
{
|
| 89 |
+
"name": "f0n_predictor",
|
| 90 |
+
"package": "packages/f0n_predictor_fp16.mlpackage",
|
| 91 |
+
"precision": "fp16",
|
| 92 |
+
"compute_units": "CPU_AND_NE",
|
| 93 |
+
"inputs": [
|
| 94 |
+
{ "name": "en", "shape": [1, 640, "T_frame"] },
|
| 95 |
+
{ "name": "s", "shape": [1, 128] }
|
| 96 |
+
],
|
| 97 |
+
"outputs": [
|
| 98 |
+
{ "name": "f0_pred", "shape": [1, "F0_LEN"] },
|
| 99 |
+
{ "name": "n_pred", "shape": [1, "F0_LEN"] }
|
| 100 |
+
],
|
| 101 |
+
"note": "F0_LEN = 2 * T_frame."
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"name": "har_source",
|
| 105 |
+
"package": "packages/har_source.mlpackage",
|
| 106 |
+
"precision": "fp32",
|
| 107 |
+
"compute_units": "CPU_AND_GPU",
|
| 108 |
+
"inputs": [
|
| 109 |
+
{ "name": "f0", "shape": [1, "F0_LEN"] }
|
| 110 |
+
],
|
| 111 |
+
"outputs": [
|
| 112 |
+
{ "name": "har", "shape": [1, 1, "HAR_LEN"] }
|
| 113 |
+
],
|
| 114 |
+
"note": "HAR_LEN = 300 * F0_LEN. fp32 required: computes sin(2π · cumsum(f0)) at audio rate; fp16 cumsum drifts ~10 bits over 74400 samples and produces audible phase distortion."
|
| 115 |
+
},
|
| 116 |
+
{
|
| 117 |
+
"name": "decoder_pre",
|
| 118 |
+
"package": "packages/decoder_pre_fp16.mlpackage",
|
| 119 |
+
"precision": "fp16",
|
| 120 |
+
"compute_units": "CPU_AND_NE",
|
| 121 |
+
"inputs": [
|
| 122 |
+
{ "name": "asr", "shape": [1, 512, "T_frame"] },
|
| 123 |
+
{ "name": "f0_pred", "shape": [1, "F0_LEN"] },
|
| 124 |
+
{ "name": "n_pred", "shape": [1, "F0_LEN"] },
|
| 125 |
+
{ "name": "ref", "shape": [1, 128] }
|
| 126 |
+
],
|
| 127 |
+
"outputs": [
|
| 128 |
+
{ "name": "x_pre", "shape": [1, 512, "T_frame2"] }
|
| 129 |
+
],
|
| 130 |
+
"note": "T_frame2 = 2 * T_frame. Splits the HiFi-GAN decoder: pre-stage (AdaIN encode/decode + F0/N convs) is ANE-clean."
|
| 131 |
+
},
|
| 132 |
+
{
|
| 133 |
+
"name": "decoder_upsample",
|
| 134 |
+
"package": "packages/decoder_upsample_fp16.mlpackage",
|
| 135 |
+
"precision": "fp16",
|
| 136 |
+
"compute_units": "CPU_ONLY",
|
| 137 |
+
"inputs": [
|
| 138 |
+
{ "name": "x_pre", "shape": [1, 512, "T_frame2"] },
|
| 139 |
+
{ "name": "ref", "shape": [1, 128] },
|
| 140 |
+
{ "name": "har_source", "shape": [1, 1, "HAR_LEN"] }
|
| 141 |
+
],
|
| 142 |
+
"outputs": [
|
| 143 |
+
{ "name": "audio", "shape": [1, 1, "AUDIO_LEN"] }
|
| 144 |
+
],
|
| 145 |
+
"note": "HiFi-GAN Generator (ConvTranspose1d ups stack). ANE compile fails (ANECCompile() FAILED), CPU_ONLY is the most predictable. Tail-trim 50 samples."
|
| 146 |
+
}
|
| 147 |
+
],
|
| 148 |
+
"pipeline_order": [
|
| 149 |
+
"text_encoder",
|
| 150 |
+
"bert",
|
| 151 |
+
"ref_encoder",
|
| 152 |
+
"diffusion_unet (×N steps × 2 dispatches under ADPM2)",
|
| 153 |
+
"duration_predictor",
|
| 154 |
+
"f0n_predictor",
|
| 155 |
+
"har_source",
|
| 156 |
+
"decoder_pre",
|
| 157 |
+
"decoder_upsample"
|
| 158 |
+
],
|
| 159 |
+
"non_coreml_pipeline_steps": [
|
| 160 |
+
"espeak-ng phonemize + StyleTTS2 TextCleaner tokenize",
|
| 161 |
+
"Karras sigma schedule (CPU)",
|
| 162 |
+
"ADPM2 step loop (5 steps default; each step = 2 diffusion_unet dispatches + RNG noise add)",
|
| 163 |
+
"Style blend: ref = α · s_pred[:, :128] + (1-α) · ref_s[:, :128]; s = β · s_pred[:, 128:] + (1-β) · ref_s[:, 128:]",
|
| 164 |
+
"Reference mel: librosa.load(sr=24000) → librosa.effects.trim(top_db=30) → mel(n_fft=2048, hop=300, win=1200, n_mels=80, fmin=0, fmax=8000)",
|
| 165 |
+
"pred_aln_trg construction from rounded predicted durations (data-dependent)",
|
| 166 |
+
"en/asr matmul: en = d.transpose(-1,-2) @ pred_aln_trg; asr = t_en @ pred_aln_trg",
|
| 167 |
+
"HiFi-GAN tail shift: roll asr/en right by one frame, repeat first frame"
|
| 168 |
+
],
|
| 169 |
+
"totals": {
|
| 170 |
+
"n_stages": 9,
|
| 171 |
+
"disk_size_mb": 258,
|
| 172 |
+
"warm_predict_ms_typical": 390,
|
| 173 |
+
"rtfx_typical": 9.4,
|
| 174 |
+
"cold_start_s_typical": 13,
|
| 175 |
+
"cold_start_breakdown": {
|
| 176 |
+
"anecompiler_first_call": "12s (Apple ANE compilation cache miss)",
|
| 177 |
+
"fp16_load": "~1s warm"
|
| 178 |
+
}
|
| 179 |
+
},
|
| 180 |
+
"voices": {
|
| 181 |
+
"directory": "voices/",
|
| 182 |
+
"type": "zero-shot reference clips (any 3-10s mono WAV, resampled to 24 kHz on load; the model copies timbre + prosody)",
|
| 183 |
+
"samples": [
|
| 184 |
+
{"file": "Yinghao.wav", "lang": "en", "note": "neutral male"},
|
| 185 |
+
{"file": "Nima.wav", "lang": "en", "note": "neutral male"},
|
| 186 |
+
{"file": "Gavin.wav", "lang": "en", "note": "neutral male"},
|
| 187 |
+
{"file": "Vinay.wav", "lang": "en", "note": "neutral male"},
|
| 188 |
+
{"file": "amused.wav", "lang": "en", "note": "amused emotion"},
|
| 189 |
+
{"file": "anger.wav", "lang": "en", "note": "angry emotion"},
|
| 190 |
+
{"file": "disgusted.wav","lang": "en", "note": "disgusted emotion"},
|
| 191 |
+
{"file": "sleepy.wav", "lang": "en", "note": "sleepy emotion"},
|
| 192 |
+
{"file": "696_92939_000016_000006.wav", "lang": "en", "note": "LibriTTS sample, default reference"},
|
| 193 |
+
{"file": "1221-135767-0014.wav", "lang": "en", "note": "LibriTTS sample"},
|
| 194 |
+
{"file": "1789_142896_000022_000005.wav","lang":"en", "note": "LibriTTS sample"},
|
| 195 |
+
{"file": "4077-13754-0000.wav", "lang": "en", "note": "LibriTTS sample"},
|
| 196 |
+
{"file": "5639-40744-0020.wav", "lang": "en", "note": "LibriTTS sample"},
|
| 197 |
+
{"file": "908-157963-0027.wav", "lang": "en", "note": "LibriTTS sample"},
|
| 198 |
+
{"file": "3.wav", "lang": "en", "note": "misc reference"},
|
| 199 |
+
{"file": "4.wav", "lang": "en", "note": "misc reference"},
|
| 200 |
+
{"file": "5.wav", "lang": "en", "note": "misc reference"}
|
| 201 |
+
]
|
| 202 |
+
},
|
| 203 |
+
"samples": {
|
| 204 |
+
"directory": "samples/",
|
| 205 |
+
"files": [
|
| 206 |
+
{"file": "sample_swift.wav", "text": "Hello, this is StyleTTS 2.", "voice": "696_92939_000016_000006.wav", "produced_by": "Swift CoreML driver", "duration_s": 3.02},
|
| 207 |
+
{"file": "sample_python.wav", "text": "StyleTTS 2 is a text to speech model.", "voice": "696_92939_000016_000006.wav", "produced_by": "Python CoreML pipeline (coreml/inference.py)"}
|
| 208 |
+
]
|
| 209 |
+
},
|
| 210 |
+
"platform_requirements": {
|
| 211 |
+
"macos_min": "15.0",
|
| 212 |
+
"ios_min": "18.0 (mlprogram macOS15 deployment target corresponds to iOS18 — verify on iOS)",
|
| 213 |
+
"deployment_target": "macOS15",
|
| 214 |
+
"hardware": "Apple Silicon recommended"
|
| 215 |
+
}
|
| 216 |
+
}
|
iteration_1/packages/.DS_Store
ADDED
|
Binary file (10.2 kB). View file
|
|
|
iteration_1/packages/bert_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:92c06d63856f46e8788c54fb2f2e7228d7da9798e2192c3078fb96a5f1de4074
|
| 3 |
+
size 85458
|
iteration_1/packages/bert_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fc4a9fb3870729f9572b0830993351524b04b99eba6cab982cef2a17507d9ba0
|
| 3 |
+
size 12090496
|
iteration_1/packages/bert_fp16.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"514C9E67-3E15-43D6-AE2B-6179B9113D2E": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Weights",
|
| 7 |
+
"name": "weights",
|
| 8 |
+
"path": "com.apple.CoreML/weights"
|
| 9 |
+
},
|
| 10 |
+
"BED7A6A1-56C6-4FB3-AB4B-06ADAD7C844E": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Specification",
|
| 13 |
+
"name": "model.mlmodel",
|
| 14 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "BED7A6A1-56C6-4FB3-AB4B-06ADAD7C844E"
|
| 18 |
+
}
|
iteration_1/packages/decoder_pre_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:753dbab37d3232a69b52d48f5d0732632e9307d388ed5224736e9c585db6029c
|
| 3 |
+
size 55933
|
iteration_1/packages/decoder_pre_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:db81849a38ce1959ea345219332051947f22f00dc2445cb9b7a119673ca4bf93
|
| 3 |
+
size 67190976
|
iteration_1/packages/decoder_pre_fp16.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"211DC47B-E839-4B47-B64D-EE04F9C081B9": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Specification",
|
| 7 |
+
"name": "model.mlmodel",
|
| 8 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 9 |
+
},
|
| 10 |
+
"BE7D7840-FCB4-4491-B2ED-0D81B5FD33AA": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Weights",
|
| 13 |
+
"name": "weights",
|
| 14 |
+
"path": "com.apple.CoreML/weights"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "211DC47B-E839-4B47-B64D-EE04F9C081B9"
|
| 18 |
+
}
|
iteration_1/packages/decoder_upsample_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff29829e3c92a4208ef07d307293fd576c4484c6048e519b90cd32ee80180038
|
| 3 |
+
size 491796
|
iteration_1/packages/decoder_upsample_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:43161151f001bb951c34952465adfc3c4f5fb8ab2845f31903be09ea9f1a6bc5
|
| 3 |
+
size 41400320
|
iteration_1/packages/decoder_upsample_fp16.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"26E8FCA8-BD9B-4185-B59E-00453487B2B3": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Specification",
|
| 7 |
+
"name": "model.mlmodel",
|
| 8 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 9 |
+
},
|
| 10 |
+
"BFFB197D-D576-4F27-85E5-48F5438F08C2": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Weights",
|
| 13 |
+
"name": "weights",
|
| 14 |
+
"path": "com.apple.CoreML/weights"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "26E8FCA8-BD9B-4185-B59E-00453487B2B3"
|
| 18 |
+
}
|
iteration_1/packages/diffusion_unet_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f9638ea3705d5cc55e8b5572e3c38562536b8c20656bcb4fa1047edbb8af375b
|
| 3 |
+
size 54989
|
iteration_1/packages/diffusion_unet_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:17ba48a8bdc68851289a23593b223573aaddd1b445e8c77765f5350feed8a251
|
| 3 |
+
size 49873792
|
iteration_1/packages/diffusion_unet_fp16.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"97A6E9E7-9101-417E-A712-61EF425AB960": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Specification",
|
| 7 |
+
"name": "model.mlmodel",
|
| 8 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 9 |
+
},
|
| 10 |
+
"F9ED6F9B-AAC4-4FAA-90E0-E2C4593DDE7D": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Weights",
|
| 13 |
+
"name": "weights",
|
| 14 |
+
"path": "com.apple.CoreML/weights"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "97A6E9E7-9101-417E-A712-61EF425AB960"
|
| 18 |
+
}
|
iteration_1/packages/duration_predictor_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b60eccf1aff0c09069d4eeebb5611c11caee89788229d0780ef606ac8fa1384
|
| 3 |
+
size 29886
|
iteration_1/packages/duration_predictor_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:75ba0b7b2f7dc6a687e9ec01d226c300b09f07832d8e4aac2705a16b5079910c
|
| 3 |
+
size 15543524
|
iteration_1/packages/duration_predictor_fp16.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"EA4FC14C-8DE2-414B-A6C4-B93190F89ED0": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Weights",
|
| 7 |
+
"name": "weights",
|
| 8 |
+
"path": "com.apple.CoreML/weights"
|
| 9 |
+
},
|
| 10 |
+
"EAAA83DE-C745-4884-AE8D-1ED5C06BC490": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Specification",
|
| 13 |
+
"name": "model.mlmodel",
|
| 14 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "EAAA83DE-C745-4884-AE8D-1ED5C06BC490"
|
| 18 |
+
}
|
iteration_1/packages/f0n_predictor_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:578305e49608b3685aac87a7f45aa188709e27b1f10b096a0a6ae0a66170871d
|
| 3 |
+
size 62172
|
iteration_1/packages/f0n_predictor_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b21f183d2ff876842ea2df14cdc033c8935a1805382b70b241c4f5a1bf32b3a8
|
| 3 |
+
size 16822272
|
iteration_1/packages/f0n_predictor_fp16.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"85A98E5E-641F-442C-9B6C-DCFEFD1BB71F": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Weights",
|
| 7 |
+
"name": "weights",
|
| 8 |
+
"path": "com.apple.CoreML/weights"
|
| 9 |
+
},
|
| 10 |
+
"C8FFE55D-6CFE-4EFD-9798-D2D005C4676F": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Specification",
|
| 13 |
+
"name": "model.mlmodel",
|
| 14 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "C8FFE55D-6CFE-4EFD-9798-D2D005C4676F"
|
| 18 |
+
}
|
iteration_1/packages/har_source.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:49dae5b07689818410c81a0fb8af58a586d88ef211be419ccdc8fd5dc6467ae5
|
| 3 |
+
size 6956
|
iteration_1/packages/har_source.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"43EF7B77-88D8-4FB5-B59A-B9551E121DB3": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Specification",
|
| 7 |
+
"name": "model.mlmodel",
|
| 8 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 9 |
+
},
|
| 10 |
+
"C511A11E-3A7D-41FC-82E5-3BEB08F2D35D": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Weights",
|
| 13 |
+
"name": "weights",
|
| 14 |
+
"path": "com.apple.CoreML/weights"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "43EF7B77-88D8-4FB5-B59A-B9551E121DB3"
|
| 18 |
+
}
|
iteration_1/packages/ref_encoder_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5cbd0cf223b874ed6b2de35606a5690bc6355b4890ea32ec30119db5dc00497e
|
| 3 |
+
size 68843
|
iteration_1/packages/ref_encoder_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:011d14fdb46589dfb79efb619d63846430be4e4ac86372f8819f35f5e0157391
|
| 3 |
+
size 55386048
|
iteration_1/packages/ref_encoder_fp16.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"32FE6195-7355-4635-AECB-58D9F49F1E17": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Weights",
|
| 7 |
+
"name": "weights",
|
| 8 |
+
"path": "com.apple.CoreML/weights"
|
| 9 |
+
},
|
| 10 |
+
"343F4722-A338-4705-8547-09E9A93DE8EC": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Specification",
|
| 13 |
+
"name": "model.mlmodel",
|
| 14 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "343F4722-A338-4705-8547-09E9A93DE8EC"
|
| 18 |
+
}
|
iteration_1/packages/text_encoder_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d88b74cb84892f7ff1e4d013517dd3d4dab56688b0a0fb4d920f72d0caf9e961
|
| 3 |
+
size 16587
|
iteration_1/packages/text_encoder_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0d7f6e5869bb9d523956183e0facdff160c301d28113290efa329ae7bf72d3ce
|
| 3 |
+
size 11208000
|
iteration_1/packages/text_encoder_fp16.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"7F3243AB-2AFC-40E5-A6DE-069619301D63": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Specification",
|
| 7 |
+
"name": "model.mlmodel",
|
| 8 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 9 |
+
},
|
| 10 |
+
"F67A2205-52AD-4B8E-A19F-A7FB9AEB48F9": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Weights",
|
| 13 |
+
"name": "weights",
|
| 14 |
+
"path": "com.apple.CoreML/weights"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "7F3243AB-2AFC-40E5-A6DE-069619301D63"
|
| 18 |
+
}
|
iteration_1/samples/sample_python.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0aa2e0b8bd1b89e8d1db8c38666d6d16b970a58eb19c056b9075eece852d422a
|
| 3 |
+
size 176344
|
iteration_1/samples/sample_swift.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f95f71e282b55d77fd70ac68cc3b41280337a60dff0179481ae85950f69d0cdd
|
| 3 |
+
size 145144
|
iteration_1/voices/1221-135767-0014.wav
ADDED
|
Binary file (96 kB). View file
|
|
|
iteration_1/voices/1789_142896_000022_000005.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0afdfcafb65ee88a6db67dc83c53e1e1c73346813df381afc9a0812c43f8ddbd
|
| 3 |
+
size 150284
|
iteration_1/voices/3.wav
ADDED
|
Binary file (96 kB). View file
|
|
|
iteration_1/voices/4.wav
ADDED
|
Binary file (96 kB). View file
|
|
|
iteration_1/voices/4077-13754-0000.wav
ADDED
|
Binary file (96 kB). View file
|
|
|
iteration_1/voices/5.wav
ADDED
|
Binary file (96 kB). View file
|
|
|
iteration_1/voices/5639-40744-0020.wav
ADDED
|
Binary file (96 kB). View file
|
|
|
iteration_1/voices/696_92939_000016_000006.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6a7d39beddd2c24d864163ce38e799b261ab0bc23cbea492f0ece046feb131f1
|
| 3 |
+
size 145484
|
iteration_1/voices/908-157963-0027.wav
ADDED
|
Binary file (96 kB). View file
|
|
|
iteration_1/voices/Gavin.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:912208b6beaeff83cd6307bacf6b3842a4b32c9f0780f95146ea6806af436c83
|
| 3 |
+
size 998740
|
iteration_1/voices/Nima.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:44695a7c2723a6d857a6c075d75f57b525febe601f9162984adda8c713bc1ad4
|
| 3 |
+
size 758318
|
iteration_1/voices/Vinay.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cc194f600ad9a7fff3c8b8914851006fabea010c48258602933f92c9b0b8bbf1
|
| 3 |
+
size 694194
|
iteration_1/voices/Yinghao.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1d37acfa68e59401afec39c89b17509dda48b40f7ac60650e0e668145355799b
|
| 3 |
+
size 404574
|
iteration_1/voices/amused.wav
ADDED
|
Binary file (96 kB). View file
|
|
|
iteration_1/voices/anger.wav
ADDED
|
Binary file (96 kB). View file
|
|
|
iteration_1/voices/disgusted.wav
ADDED
|
Binary file (96 kB). View file
|
|
|
iteration_1/voices/sleepy.wav
ADDED
|
Binary file (96 kB). View file
|
|
|