alexwengg committed on
Commit
77c9940
·
verified ·
1 Parent(s): 677224d

Upload 48 files

Browse files
Files changed (49) hide show
  1. .gitattributes +8 -0
  2. iteration_1/README.md +151 -0
  3. iteration_1/manifest.json +216 -0
  4. iteration_1/packages/.DS_Store +0 -0
  5. iteration_1/packages/bert_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  6. iteration_1/packages/bert_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  7. iteration_1/packages/bert_fp16.mlpackage/Manifest.json +18 -0
  8. iteration_1/packages/decoder_pre_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  9. iteration_1/packages/decoder_pre_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  10. iteration_1/packages/decoder_pre_fp16.mlpackage/Manifest.json +18 -0
  11. iteration_1/packages/decoder_upsample_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  12. iteration_1/packages/decoder_upsample_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  13. iteration_1/packages/decoder_upsample_fp16.mlpackage/Manifest.json +18 -0
  14. iteration_1/packages/diffusion_unet_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  15. iteration_1/packages/diffusion_unet_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  16. iteration_1/packages/diffusion_unet_fp16.mlpackage/Manifest.json +18 -0
  17. iteration_1/packages/duration_predictor_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  18. iteration_1/packages/duration_predictor_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  19. iteration_1/packages/duration_predictor_fp16.mlpackage/Manifest.json +18 -0
  20. iteration_1/packages/f0n_predictor_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  21. iteration_1/packages/f0n_predictor_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  22. iteration_1/packages/f0n_predictor_fp16.mlpackage/Manifest.json +18 -0
  23. iteration_1/packages/har_source.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  24. iteration_1/packages/har_source.mlpackage/Manifest.json +18 -0
  25. iteration_1/packages/ref_encoder_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  26. iteration_1/packages/ref_encoder_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  27. iteration_1/packages/ref_encoder_fp16.mlpackage/Manifest.json +18 -0
  28. iteration_1/packages/text_encoder_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  29. iteration_1/packages/text_encoder_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  30. iteration_1/packages/text_encoder_fp16.mlpackage/Manifest.json +18 -0
  31. iteration_1/samples/sample_python.wav +3 -0
  32. iteration_1/samples/sample_swift.wav +3 -0
  33. iteration_1/voices/1221-135767-0014.wav +0 -0
  34. iteration_1/voices/1789_142896_000022_000005.wav +3 -0
  35. iteration_1/voices/3.wav +0 -0
  36. iteration_1/voices/4.wav +0 -0
  37. iteration_1/voices/4077-13754-0000.wav +0 -0
  38. iteration_1/voices/5.wav +0 -0
  39. iteration_1/voices/5639-40744-0020.wav +0 -0
  40. iteration_1/voices/696_92939_000016_000006.wav +3 -0
  41. iteration_1/voices/908-157963-0027.wav +0 -0
  42. iteration_1/voices/Gavin.wav +3 -0
  43. iteration_1/voices/Nima.wav +3 -0
  44. iteration_1/voices/Vinay.wav +3 -0
  45. iteration_1/voices/Yinghao.wav +3 -0
  46. iteration_1/voices/amused.wav +0 -0
  47. iteration_1/voices/anger.wav +0 -0
  48. iteration_1/voices/disgusted.wav +0 -0
  49. iteration_1/voices/sleepy.wav +0 -0
.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ iteration_1/samples/sample_python.wav filter=lfs diff=lfs merge=lfs -text
37
+ iteration_1/samples/sample_swift.wav filter=lfs diff=lfs merge=lfs -text
38
+ iteration_1/voices/1789_142896_000022_000005.wav filter=lfs diff=lfs merge=lfs -text
39
+ iteration_1/voices/696_92939_000016_000006.wav filter=lfs diff=lfs merge=lfs -text
40
+ iteration_1/voices/Gavin.wav filter=lfs diff=lfs merge=lfs -text
41
+ iteration_1/voices/Nima.wav filter=lfs diff=lfs merge=lfs -text
42
+ iteration_1/voices/Vinay.wav filter=lfs diff=lfs merge=lfs -text
43
+ iteration_1/voices/Yinghao.wav filter=lfs diff=lfs merge=lfs -text
iteration_1/README.md ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language:
3
+ - en
4
+ license: mit
5
+ library_name: coreml
6
+ tags:
7
+ - text-to-speech
8
+ - tts
9
+ - styletts2
10
+ - coreml
11
+ - apple-silicon
12
+ - voice-cloning
13
+ pipeline_tag: text-to-speech
14
+ ---
15
+
16
+ # StyleTTS2 LibriTTS — CoreML
17
+
18
+ Apple CoreML port of [yl4579/StyleTTS2](https://github.com/yl4579/StyleTTS2) (LibriTTS 2nd-stage checkpoint, epoch 20). 9-stage `.mlpackage` chain with mixed-precision and per-stage compute-unit assignments tuned for Apple Silicon (CPU + ANE + GPU).
19
+
20
+ 24 kHz mono synthesis. Zero-shot voice cloning from a 3-10 second reference WAV.
21
+
22
+ ## Highlights
23
+
24
+ - **9 stages, 258 MB on disk**, all fp16 except `har_source` (fp32 required for sin(2π·cumsum(f0)) numerical stability)
25
+ - **~390 ms warm CoreML predict** per utterance (M-series, mixed CPU+ANE+GPU)
26
+ - **RTFx ~9.4×** end-to-end (3.7 s of audio in ~390 ms)
27
+ - **~13 s cold start** (Apple `anecompilerservice` compiles ANE-targeted graphs on first call; fully cached afterwards)
28
+ - **Per-stage placement**: `text_encoder`/`duration_predictor`/`decoder_upsample` on CPU, `bert`/`ref_encoder`/`diffusion_unet`/`f0n_predictor`/`decoder_pre` on ANE, `har_source` on GPU
29
+
30
+ ## Repository contents
31
+
32
+ ```
33
+ packages/ 9 mlpackages (258 MB)
34
+ text_encoder_fp16.mlpackage 11 MB text → 512-dim embedding (LSTM, RangeDim T)
35
+ bert_fp16.mlpackage 12 MB Albert + bert_encoder (fixed T=57)
36
+ ref_encoder_fp16.mlpackage 53 MB reference mel → 256-dim style (CNN)
37
+ diffusion_unet_fp16.mlpackage 48 MB cross-attention U-Net (fixed T=57; ADPM2 sampler)
38
+ duration_predictor_fp16.mlpackage 15 MB LSTM + duration logits (RangeDim T)
39
+ f0n_predictor_fp16.mlpackage 16 MB F0 + noise prediction (RangeDim F)
40
+ har_source.mlpackage 12 KB F0 → harmonic source (RangeDim F0_LEN, fp32)
41
+ decoder_pre_fp16.mlpackage 64 MB AdaIN encode/decode + F0/N convs (RangeDim F)
42
+ decoder_upsample_fp16.mlpackage 40 MB HiFi-GAN Generator (RangeDim F→audio)
43
+ voices/ 17 reference clips (4 MB)
44
+ Yinghao.wav, Nima.wav, Gavin.wav, Vinay.wav Identity speakers
45
+ amused.wav, anger.wav, disgusted.wav, sleepy.wav Emotion clips
46
+ *.wav LibriTTS samples
47
+ samples/ End-to-end synthesis samples
48
+ sample_swift.wav Produced by the Swift CoreML driver
49
+ sample_python.wav Produced by the Python CoreML pipeline
50
+ manifest.json Machine-readable spec for all stages
51
+ README.md This file
52
+ ```
53
+
54
+ ## Limits
55
+
56
+ - **Phoneme cap: 57.** `bert` and `diffusion_unet` are pinned to a fixed token axis of 57 because the CoreML CPU MLProgram backend rejects RangeDim on their cross-attention shape ops. Inputs that phonemize to >57 tokens will fail. The other 7 stages support flexible token (1-512) and frame (1-2048) axes.
57
+ - **ANE compile fails** for the HiFi-GAN ConvTranspose1d ups stack inside `decoder_upsample`. CPU is the most predictable placement; GPU has slightly lower warm latency but contends with `har_source`.
58
+ - **Apple Silicon recommended.** Intel Macs have not been validated for CoreML mlprogram inference at scale.
59
+
60
+ ## Pipeline (per utterance)
61
+
62
+ ```
63
+ text → espeak-ng IPA → tokenize → token_ids
64
+
65
+ ┌───────────────────────────────┼──────────────────────────────────┐
66
+ │ │ │
67
+ ▼ ▼ ▼
68
+ text_encoder bert (fixed T=57) reference WAV → mel → ref_encoder
69
+ t_en [1,512,T] bert_dur [1,57,768] ref_s [1,256]
70
+ d_en [1,512,57]
71
+
72
+
73
+ diffusion_unet × 5 ADPM2 steps (10 dispatches)
74
+
75
+
76
+ s_pred [1,256]
77
+ ↓ blend(α, β, ref_s) ↓
78
+ ref [1,128] s [1,128]
79
+
80
+
81
+ duration_predictor
82
+ d [1,T,640] pred_dur → pred_aln_trg
83
+
84
+ ▼ (matmul + hifigan tail-shift)
85
+ en [1,640,F] asr [1,512,F]
86
+
87
+
88
+ f0n_predictor
89
+ f0_pred, n_pred [1, 2F]
90
+
91
+
92
+ har_source
93
+ har [1,1,600F]
94
+
95
+
96
+ decoder_pre
97
+ x_pre [1,512,2F]
98
+
99
+
100
+ decoder_upsample
101
+ audio [1,1,72k+]
102
+
103
+
104
+ tail-trim 50 samples → WAV @ 24 kHz
105
+ ```
106
+
107
+ The non-CoreML steps (espeak phonemize + tokenize, Karras sigma schedule, ADPM2 sampler loop, style blend, reference mel extraction, alignment-matrix construction, en/asr matmul, tail-shift) run host-side. See `manifest.json#non_coreml_pipeline_steps` for exact specs.
108
+
109
+ ## Voices
110
+
111
+ `voices/*.wav` are zero-shot reference clips. The `ref_encoder` stage reads a mel of the chosen reference and produces a 256-dim style embedding that conditions every downstream stage. Bring your own clip — any 3-10 s mono recording at any sample rate works (resampled to 24 kHz internally). Quality is sensitive to reference cleanliness (background noise transfers).
112
+
113
+ ## Quick demo (Swift)
114
+
115
+ A self-contained Swift demo exists that drives the last 4 stages directly from CoreML, given pre-computed inputs from the Python preprocessor. End-to-end Swift synthesis (no Python) requires porting espeak phonemize + mel + ADPM2 sampler + alignment, ~600 lines of Swift on top of these packages.
116
+
117
+ ## Quick demo (Python)
118
+
119
+ ```bash
120
+ git clone https://github.com/yl4579/StyleTTS2 # for the espeak/text frontend + checkpoint config
121
+ # Place this repo's packages/ as coreml/packages/ in StyleTTS2 working tree.
122
+ uv run python coreml/inference.py \
123
+ --text "StyleTTS 2 is a text to speech model." \
124
+ --reference voices/Yinghao.wav \
125
+ --output out.wav
126
+ ```
127
+
128
+ ## Conversion notes
129
+
130
+ - Source: PyTorch StyleTTS2 LibriTTS 2nd-stage checkpoint (yl4579/StyleTTS2 epoch 20).
131
+ - coremltools mlprogram, deployment target macOS15, fp16 compute precision.
132
+ - Mixed precision: 8 stages fp16, 1 stage fp32 (`har_source`). The original `decoder` is split into two fp16 stages for ANE compatibility (`decoder` → `decoder_pre` + `decoder_upsample`).
133
+ - Trace parity: all 9 stages mse=0 against eager PyTorch on the trace input.
134
+ - Quantization trials (linear int8, 8-bit k-means palettization) were run on `decoder_upsample`; both were rejected. Linear int8 is slower than fp16 on CPU (no native ConvTranspose1d kernel), and palettization is too lossy (19 dB SNR). fp16 is the production setting.
135
+
136
+ ## License
137
+
138
+ MIT (matches upstream yl4579/StyleTTS2). LibriTTS reference clips retain their original LibriTTS licensing (CC BY 4.0).
139
+
140
+ ## Citation
141
+
142
+ If you use this port, please cite the original StyleTTS2 paper:
143
+
144
+ ```bibtex
145
+ @article{li2023styletts,
146
+ title={StyleTTS 2: Towards Human-Level Text-to-Speech through Style Diffusion and Adversarial Training with Large Speech Language Models},
147
+ author={Li, Yinghao Aaron and Han, Cong and Raghavan, Vinay and Mischler, Gavin and Mesgarani, Nima},
148
+ journal={arXiv preprint arXiv:2306.07691},
149
+ year={2023}
150
+ }
151
+ ```
iteration_1/manifest.json ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_name": "styletts2-libritts-coreml",
3
+ "version": "1.0.0",
4
+ "base_model": "yl4579/StyleTTS2 LibriTTS checkpoint (epoch 20, 2nd-stage)",
5
+ "sample_rate": 24000,
6
+ "frame_hop": 300,
7
+ "phoneme_vocab": "espeak-ng en-us IPA + StyleTTS2 TextCleaner",
8
+ "limits": {
9
+ "max_phonemes": 57,
10
+ "note": "bert and diffusion_unet stages have a fixed token axis of 57 (CoreML CPU MLProgram backend rejects RangeDim on these graphs). Inputs producing more than 57 phonemes will fail until token-bucketed packages are added."
11
+ },
12
+ "stages": [
13
+ {
14
+ "name": "text_encoder",
15
+ "package": "packages/text_encoder_fp16.mlpackage",
16
+ "precision": "fp16",
17
+ "compute_units": "CPU_ONLY",
18
+ "inputs": [
19
+ { "name": "tokens", "shape": [1, "T_token"], "dtype": "int32", "range": [1, 512] },
20
+ { "name": "input_lengths", "shape": [1], "dtype": "int32" },
21
+ { "name": "text_mask", "shape": [1, "T_token"], "dtype": "float32" }
22
+ ],
23
+ "outputs": [
24
+ { "name": "t_en", "shape": [1, 512, "T_token"], "dtype": "float32" }
25
+ ]
26
+ },
27
+ {
28
+ "name": "bert",
29
+ "package": "packages/bert_fp16.mlpackage",
30
+ "precision": "fp16",
31
+ "compute_units": "CPU_AND_NE",
32
+ "fixed_token_axis": 57,
33
+ "inputs": [
34
+ { "name": "tokens", "shape": [1, 57], "dtype": "int32" },
35
+ { "name": "attention_mask", "shape": [1, 57], "dtype": "int32" }
36
+ ],
37
+ "outputs": [
38
+ { "name": "bert_dur", "shape": [1, 57, 768] },
39
+ { "name": "d_en", "shape": [1, 512, 57] }
40
+ ]
41
+ },
42
+ {
43
+ "name": "ref_encoder",
44
+ "package": "packages/ref_encoder_fp16.mlpackage",
45
+ "precision": "fp16",
46
+ "compute_units": "CPU_AND_NE",
47
+ "inputs": [
48
+ { "name": "mel", "shape": [1, 1, 80, "T_mel"], "dtype": "float32",
49
+ "note": "24 kHz mel spectrogram of reference audio. n_fft=2048, hop=300, win=1200, n_mels=80." }
50
+ ],
51
+ "outputs": [
52
+ { "name": "ref_s", "shape": [1, 256], "dtype": "float32",
53
+ "note": "Style embedding. ref_s[:, :128] is reference timbre, ref_s[:, 128:] is reference prosody." }
54
+ ]
55
+ },
56
+ {
57
+ "name": "diffusion_unet",
58
+ "package": "packages/diffusion_unet_fp16.mlpackage",
59
+ "precision": "fp16",
60
+ "compute_units": "CPU_AND_NE",
61
+ "fixed_token_axis": 57,
62
+ "inputs": [
63
+ { "name": "x_noisy", "shape": [1, 1, 256] },
64
+ { "name": "sigma", "shape": [1] },
65
+ { "name": "embedding", "shape": [1, 57, 768] },
66
+ { "name": "features", "shape": [1, 256] }
67
+ ],
68
+ "outputs": [
69
+ { "name": "x_denoised", "shape": [1, 1, 256] }
70
+ ],
71
+ "note": "Called num_steps × 2 dispatches per utterance under ADPM2 sampler. Use Karras sigmas (sigma_min=0.0001, sigma_max=3.0, rho_schedule=9.0). 5 steps default."
72
+ },
73
+ {
74
+ "name": "duration_predictor",
75
+ "package": "packages/duration_predictor_fp16.mlpackage",
76
+ "precision": "fp16",
77
+ "compute_units": "CPU_ONLY",
78
+ "inputs": [
79
+ { "name": "d_en", "shape": [1, 512, "T_token"] },
80
+ { "name": "s", "shape": [1, 128] },
81
+ { "name": "text_mask", "shape": [1, "T_token"] }
82
+ ],
83
+ "outputs": [
84
+ { "name": "d", "shape": [1, "T_token", 640] },
85
+ { "name": "duration_logits", "shape": [1, "T_token", 50] }
86
+ ]
87
+ },
88
+ {
89
+ "name": "f0n_predictor",
90
+ "package": "packages/f0n_predictor_fp16.mlpackage",
91
+ "precision": "fp16",
92
+ "compute_units": "CPU_AND_NE",
93
+ "inputs": [
94
+ { "name": "en", "shape": [1, 640, "T_frame"] },
95
+ { "name": "s", "shape": [1, 128] }
96
+ ],
97
+ "outputs": [
98
+ { "name": "f0_pred", "shape": [1, "F0_LEN"] },
99
+ { "name": "n_pred", "shape": [1, "F0_LEN"] }
100
+ ],
101
+ "note": "F0_LEN = 2 * T_frame."
102
+ },
103
+ {
104
+ "name": "har_source",
105
+ "package": "packages/har_source.mlpackage",
106
+ "precision": "fp32",
107
+ "compute_units": "CPU_AND_GPU",
108
+ "inputs": [
109
+ { "name": "f0", "shape": [1, "F0_LEN"] }
110
+ ],
111
+ "outputs": [
112
+ { "name": "har", "shape": [1, 1, "HAR_LEN"] }
113
+ ],
114
+ "note": "HAR_LEN = 300 * F0_LEN. fp32 required: computes sin(2π · cumsum(f0)) at audio rate; fp16 cumsum drifts ~10 bits over 74400 samples and produces audible phase distortion."
115
+ },
116
+ {
117
+ "name": "decoder_pre",
118
+ "package": "packages/decoder_pre_fp16.mlpackage",
119
+ "precision": "fp16",
120
+ "compute_units": "CPU_AND_NE",
121
+ "inputs": [
122
+ { "name": "asr", "shape": [1, 512, "T_frame"] },
123
+ { "name": "f0_pred", "shape": [1, "F0_LEN"] },
124
+ { "name": "n_pred", "shape": [1, "F0_LEN"] },
125
+ { "name": "ref", "shape": [1, 128] }
126
+ ],
127
+ "outputs": [
128
+ { "name": "x_pre", "shape": [1, 512, "T_frame2"] }
129
+ ],
130
+ "note": "T_frame2 = 2 * T_frame. Splits the HiFi-GAN decoder: pre-stage (AdaIN encode/decode + F0/N convs) is ANE-clean."
131
+ },
132
+ {
133
+ "name": "decoder_upsample",
134
+ "package": "packages/decoder_upsample_fp16.mlpackage",
135
+ "precision": "fp16",
136
+ "compute_units": "CPU_ONLY",
137
+ "inputs": [
138
+ { "name": "x_pre", "shape": [1, 512, "T_frame2"] },
139
+ { "name": "ref", "shape": [1, 128] },
140
+ { "name": "har_source", "shape": [1, 1, "HAR_LEN"] }
141
+ ],
142
+ "outputs": [
143
+ { "name": "audio", "shape": [1, 1, "AUDIO_LEN"] }
144
+ ],
145
+ "note": "HiFi-GAN Generator (ConvTranspose1d ups stack). ANE compile fails (ANECCompile() FAILED), CPU_ONLY is the most predictable. Tail-trim 50 samples."
146
+ }
147
+ ],
148
+ "pipeline_order": [
149
+ "text_encoder",
150
+ "bert",
151
+ "ref_encoder",
152
+ "diffusion_unet",
153
+ "duration_predictor",
154
+ "f0n_predictor",
155
+ "har_source",
156
+ "decoder_pre",
157
+ "decoder_upsample"
158
+ ],
159
+ "non_coreml_pipeline_steps": [
160
+ "espeak-ng phonemize + StyleTTS2 TextCleaner tokenize",
161
+ "Karras sigma schedule (CPU)",
162
+ "ADPM2 step loop (5 steps default; each step = 2 diffusion_unet dispatches + RNG noise add)",
163
+ "Style blend: ref = α · s_pred[:, :128] + (1-α) · ref_s[:, :128]; s = β · s_pred[:, 128:] + (1-β) · ref_s[:, 128:]",
164
+ "Reference mel: librosa.load(sr=24000) → librosa.effects.trim(top_db=30) → mel(n_fft=2048, hop=300, win=1200, n_mels=80, fmin=0, fmax=8000)",
165
+ "pred_aln_trg construction from rounded predicted durations (data-dependent)",
166
+ "en/asr matmul: en = d.transpose(-1,-2) @ pred_aln_trg; asr = t_en @ pred_aln_trg",
167
+ "HiFi-GAN tail shift: roll asr/en right by one frame, repeat first frame"
168
+ ],
169
+ "totals": {
170
+ "n_stages": 9,
171
+ "disk_size_mb": 258,
172
+ "warm_predict_ms_typical": 390,
173
+ "rtfx_typical": 9.4,
174
+ "cold_start_s_typical": 13,
175
+ "cold_start_breakdown": {
176
+ "anecompiler_first_call": "12s (Apple ANE compilation cache miss)",
177
+ "fp16_load": "~1s warm"
178
+ }
179
+ },
180
+ "voices": {
181
+ "directory": "voices/",
182
+ "type": "zero-shot reference clips (any 3-10s mono WAV at any sample rate, resampled to 24 kHz on load; the model copies timbre + prosody)",
183
+ "samples": [
184
+ {"file": "Yinghao.wav", "lang": "en", "note": "neutral male"},
185
+ {"file": "Nima.wav", "lang": "en", "note": "neutral male"},
186
+ {"file": "Gavin.wav", "lang": "en", "note": "neutral male"},
187
+ {"file": "Vinay.wav", "lang": "en", "note": "neutral male"},
188
+ {"file": "amused.wav", "lang": "en", "note": "amused emotion"},
189
+ {"file": "anger.wav", "lang": "en", "note": "angry emotion"},
190
+ {"file": "disgusted.wav","lang": "en", "note": "disgusted emotion"},
191
+ {"file": "sleepy.wav", "lang": "en", "note": "sleepy emotion"},
192
+ {"file": "696_92939_000016_000006.wav", "lang": "en", "note": "LibriTTS sample, default reference"},
193
+ {"file": "1221-135767-0014.wav", "lang": "en", "note": "LibriTTS sample"},
194
+ {"file": "1789_142896_000022_000005.wav","lang":"en", "note": "LibriTTS sample"},
195
+ {"file": "4077-13754-0000.wav", "lang": "en", "note": "LibriTTS sample"},
196
+ {"file": "5639-40744-0020.wav", "lang": "en", "note": "LibriTTS sample"},
197
+ {"file": "908-157963-0027.wav", "lang": "en", "note": "LibriTTS sample"},
198
+ {"file": "3.wav", "lang": "en", "note": "misc reference"},
199
+ {"file": "4.wav", "lang": "en", "note": "misc reference"},
200
+ {"file": "5.wav", "lang": "en", "note": "misc reference"}
201
+ ]
202
+ },
203
+ "samples": {
204
+ "directory": "samples/",
205
+ "files": [
206
+ {"file": "sample_swift.wav", "text": "Hello, this is StyleTTS 2.", "voice": "696_92939_000016_000006.wav", "produced_by": "Swift CoreML driver", "duration_s": 3.02},
207
+ {"file": "sample_python.wav", "text": "StyleTTS 2 is a text to speech model.", "voice": "696_92939_000016_000006.wav", "produced_by": "Python CoreML pipeline (coreml/inference.py)"}
208
+ ]
209
+ },
210
+ "platform_requirements": {
211
+ "macos_min": "15.0",
212
+ "ios_min": "18.0 (mlprogram macOS15 deployment target corresponds to iOS 18 — verify on iOS)",
213
+ "deployment_target": "macOS15",
214
+ "hardware": "Apple Silicon recommended"
215
+ }
216
+ }
iteration_1/packages/.DS_Store ADDED
Binary file (10.2 kB). View file
 
iteration_1/packages/bert_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92c06d63856f46e8788c54fb2f2e7228d7da9798e2192c3078fb96a5f1de4074
3
+ size 85458
iteration_1/packages/bert_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc4a9fb3870729f9572b0830993351524b04b99eba6cab982cef2a17507d9ba0
3
+ size 12090496
iteration_1/packages/bert_fp16.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "514C9E67-3E15-43D6-AE2B-6179B9113D2E": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "BED7A6A1-56C6-4FB3-AB4B-06ADAD7C844E": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "BED7A6A1-56C6-4FB3-AB4B-06ADAD7C844E"
18
+ }
iteration_1/packages/decoder_pre_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:753dbab37d3232a69b52d48f5d0732632e9307d388ed5224736e9c585db6029c
3
+ size 55933
iteration_1/packages/decoder_pre_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db81849a38ce1959ea345219332051947f22f00dc2445cb9b7a119673ca4bf93
3
+ size 67190976
iteration_1/packages/decoder_pre_fp16.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "211DC47B-E839-4B47-B64D-EE04F9C081B9": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Specification",
7
+ "name": "model.mlmodel",
8
+ "path": "com.apple.CoreML/model.mlmodel"
9
+ },
10
+ "BE7D7840-FCB4-4491-B2ED-0D81B5FD33AA": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Weights",
13
+ "name": "weights",
14
+ "path": "com.apple.CoreML/weights"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "211DC47B-E839-4B47-B64D-EE04F9C081B9"
18
+ }
iteration_1/packages/decoder_upsample_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff29829e3c92a4208ef07d307293fd576c4484c6048e519b90cd32ee80180038
3
+ size 491796
iteration_1/packages/decoder_upsample_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:43161151f001bb951c34952465adfc3c4f5fb8ab2845f31903be09ea9f1a6bc5
3
+ size 41400320
iteration_1/packages/decoder_upsample_fp16.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "26E8FCA8-BD9B-4185-B59E-00453487B2B3": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Specification",
7
+ "name": "model.mlmodel",
8
+ "path": "com.apple.CoreML/model.mlmodel"
9
+ },
10
+ "BFFB197D-D576-4F27-85E5-48F5438F08C2": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Weights",
13
+ "name": "weights",
14
+ "path": "com.apple.CoreML/weights"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "26E8FCA8-BD9B-4185-B59E-00453487B2B3"
18
+ }
iteration_1/packages/diffusion_unet_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9638ea3705d5cc55e8b5572e3c38562536b8c20656bcb4fa1047edbb8af375b
3
+ size 54989
iteration_1/packages/diffusion_unet_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17ba48a8bdc68851289a23593b223573aaddd1b445e8c77765f5350feed8a251
3
+ size 49873792
iteration_1/packages/diffusion_unet_fp16.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "97A6E9E7-9101-417E-A712-61EF425AB960": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Specification",
7
+ "name": "model.mlmodel",
8
+ "path": "com.apple.CoreML/model.mlmodel"
9
+ },
10
+ "F9ED6F9B-AAC4-4FAA-90E0-E2C4593DDE7D": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Weights",
13
+ "name": "weights",
14
+ "path": "com.apple.CoreML/weights"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "97A6E9E7-9101-417E-A712-61EF425AB960"
18
+ }
iteration_1/packages/duration_predictor_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b60eccf1aff0c09069d4eeebb5611c11caee89788229d0780ef606ac8fa1384
3
+ size 29886
iteration_1/packages/duration_predictor_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75ba0b7b2f7dc6a687e9ec01d226c300b09f07832d8e4aac2705a16b5079910c
3
+ size 15543524
iteration_1/packages/duration_predictor_fp16.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "EA4FC14C-8DE2-414B-A6C4-B93190F89ED0": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "EAAA83DE-C745-4884-AE8D-1ED5C06BC490": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "EAAA83DE-C745-4884-AE8D-1ED5C06BC490"
18
+ }
iteration_1/packages/f0n_predictor_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:578305e49608b3685aac87a7f45aa188709e27b1f10b096a0a6ae0a66170871d
3
+ size 62172
iteration_1/packages/f0n_predictor_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b21f183d2ff876842ea2df14cdc033c8935a1805382b70b241c4f5a1bf32b3a8
3
+ size 16822272
iteration_1/packages/f0n_predictor_fp16.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "85A98E5E-641F-442C-9B6C-DCFEFD1BB71F": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "C8FFE55D-6CFE-4EFD-9798-D2D005C4676F": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "C8FFE55D-6CFE-4EFD-9798-D2D005C4676F"
18
+ }
iteration_1/packages/har_source.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49dae5b07689818410c81a0fb8af58a586d88ef211be419ccdc8fd5dc6467ae5
3
+ size 6956
iteration_1/packages/har_source.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "43EF7B77-88D8-4FB5-B59A-B9551E121DB3": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Specification",
7
+ "name": "model.mlmodel",
8
+ "path": "com.apple.CoreML/model.mlmodel"
9
+ },
10
+ "C511A11E-3A7D-41FC-82E5-3BEB08F2D35D": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Weights",
13
+ "name": "weights",
14
+ "path": "com.apple.CoreML/weights"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "43EF7B77-88D8-4FB5-B59A-B9551E121DB3"
18
+ }
iteration_1/packages/ref_encoder_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5cbd0cf223b874ed6b2de35606a5690bc6355b4890ea32ec30119db5dc00497e
3
+ size 68843
iteration_1/packages/ref_encoder_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:011d14fdb46589dfb79efb619d63846430be4e4ac86372f8819f35f5e0157391
3
+ size 55386048
iteration_1/packages/ref_encoder_fp16.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "32FE6195-7355-4635-AECB-58D9F49F1E17": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Weights",
7
+ "name": "weights",
8
+ "path": "com.apple.CoreML/weights"
9
+ },
10
+ "343F4722-A338-4705-8547-09E9A93DE8EC": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Specification",
13
+ "name": "model.mlmodel",
14
+ "path": "com.apple.CoreML/model.mlmodel"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "343F4722-A338-4705-8547-09E9A93DE8EC"
18
+ }
iteration_1/packages/text_encoder_fp16.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d88b74cb84892f7ff1e4d013517dd3d4dab56688b0a0fb4d920f72d0caf9e961
3
+ size 16587
iteration_1/packages/text_encoder_fp16.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d7f6e5869bb9d523956183e0facdff160c301d28113290efa329ae7bf72d3ce
3
+ size 11208000
iteration_1/packages/text_encoder_fp16.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "7F3243AB-2AFC-40E5-A6DE-069619301D63": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Specification",
7
+ "name": "model.mlmodel",
8
+ "path": "com.apple.CoreML/model.mlmodel"
9
+ },
10
+ "F67A2205-52AD-4B8E-A19F-A7FB9AEB48F9": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Weights",
13
+ "name": "weights",
14
+ "path": "com.apple.CoreML/weights"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "7F3243AB-2AFC-40E5-A6DE-069619301D63"
18
+ }
iteration_1/samples/sample_python.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0aa2e0b8bd1b89e8d1db8c38666d6d16b970a58eb19c056b9075eece852d422a
3
+ size 176344
iteration_1/samples/sample_swift.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f95f71e282b55d77fd70ac68cc3b41280337a60dff0179481ae85950f69d0cdd
3
+ size 145144
iteration_1/voices/1221-135767-0014.wav ADDED
Binary file (96 kB). View file
 
iteration_1/voices/1789_142896_000022_000005.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0afdfcafb65ee88a6db67dc83c53e1e1c73346813df381afc9a0812c43f8ddbd
3
+ size 150284
iteration_1/voices/3.wav ADDED
Binary file (96 kB). View file
 
iteration_1/voices/4.wav ADDED
Binary file (96 kB). View file
 
iteration_1/voices/4077-13754-0000.wav ADDED
Binary file (96 kB). View file
 
iteration_1/voices/5.wav ADDED
Binary file (96 kB). View file
 
iteration_1/voices/5639-40744-0020.wav ADDED
Binary file (96 kB). View file
 
iteration_1/voices/696_92939_000016_000006.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6a7d39beddd2c24d864163ce38e799b261ab0bc23cbea492f0ece046feb131f1
3
+ size 145484
iteration_1/voices/908-157963-0027.wav ADDED
Binary file (96 kB). View file
 
iteration_1/voices/Gavin.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:912208b6beaeff83cd6307bacf6b3842a4b32c9f0780f95146ea6806af436c83
3
+ size 998740
iteration_1/voices/Nima.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:44695a7c2723a6d857a6c075d75f57b525febe601f9162984adda8c713bc1ad4
3
+ size 758318
iteration_1/voices/Vinay.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc194f600ad9a7fff3c8b8914851006fabea010c48258602933f92c9b0b8bbf1
3
+ size 694194
iteration_1/voices/Yinghao.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d37acfa68e59401afec39c89b17509dda48b40f7ac60650e0e668145355799b
3
+ size 404574
iteration_1/voices/amused.wav ADDED
Binary file (96 kB). View file
 
iteration_1/voices/anger.wav ADDED
Binary file (96 kB). View file
 
iteration_1/voices/disgusted.wav ADDED
Binary file (96 kB). View file
 
iteration_1/voices/sleepy.wav ADDED
Binary file (96 kB). View file