Upload 401 files
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +20 -0
- cli/CtcEarningsBenchmark.swift +1048 -0
- cli/HybridEarningsBenchmark.swift +554 -0
- convert/.DS_Store +0 -0
- convert/parakeet-tdt-ctc-110m/convert_tdt_decoder.py +323 -0
- convert/parakeet-tdt-ctc-110m/coreml/audio/yc_first_minute_16k_15s.wav +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/analytics/coremldata.bin +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/coremldata.bin +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/metadata.json +66 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/model.mil +24 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/weights/weight.bin +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/analytics/coremldata.bin +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/coremldata.bin +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/metadata.json +118 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/model.mil +45 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/weights/weight.bin +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/analytics/coremldata.bin +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/coremldata.bin +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/metadata.json +105 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/model.mil +0 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/weights/weight.bin +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/analytics/coremldata.bin +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/coremldata.bin +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/metadata.json +102 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/model.mil +58 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/weights/weight.bin +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/analytics/coremldata.bin +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/coremldata.bin +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/metadata.json +123 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/model.mil +69 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/weights/weight.bin +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/analytics/coremldata.bin +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/coremldata.bin +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/metadata.json +112 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/model.mil +191 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/weights/weight.bin +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/metadata.json +247 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/vocab.json +1 -0
- convert/parakeet-tdt-ctc-110m/coreml/convert-parakeet.py +697 -0
- convert/parakeet-tdt-ctc-110m/coreml/hybrid_earnings_benchmark.json +35 -0
- convert/parakeet-tdt-ctc-110m/coreml/individual_components.py +265 -0
- convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/metadata.json +247 -0
- convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_ctc_head.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_ctc_head.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_ctc_head.mlpackage/Manifest.json +18 -0
- convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_decoder.mlpackage/Manifest.json +18 -0
- convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_encoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
- convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_encoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,23 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
convert/parakeet-tdt-ctc-110m/coreml/audio/yc_first_minute_16k_15s.wav filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
convert/parakeet-tdt-v2-0.6b/coreml/audio/yc_first_minute_16k_15s.wav filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
convert/parakeet-tdt-v2-0.6b/coreml/audio/yc_first_minute_16k.wav filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
convert/parakeet-tdt-v2-0.6b/coreml/audio/yc_first_minute.wav filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
convert/parakeet-tdt-v2-0.6b/coreml/context/Efficient[[:space:]]Sequence[[:space:]]Transduction[[:space:]]by[[:space:]]Jointly[[:space:]]Predicting[[:space:]]Tokens[[:space:]]and[[:space:]]Durations.pdf filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
convert/parakeet-tdt-v2-0.6b/coreml/context/FAST[[:space:]]CONFORMER[[:space:]]WITH[[:space:]]LINEARLY[[:space:]]SCALABLE[[:space:]]ATTENTION.pdf filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
convert/parakeet-tdt-v2-0.6b/coreml/plots/compare-components/mel_composite.png filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
convert/parakeet-tdt-v2-0.6b/coreml/plots/quantize/cpu_and_ne/all_components_compile.png filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
convert/parakeet-tdt-v2-0.6b/coreml/plots/quantize/cpu_and_ne/all_components_compression.png filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
convert/parakeet-tdt-v2-0.6b/coreml/plots/quantize/cpu_and_ne/all_components_quality.png filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
parakeet-tdt-ctc-110m/coreml/audio/yc_first_minute_16k_15s.wav filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
parakeet-tdt-v2-0.6b/coreml/audio/yc_first_minute_16k_15s.wav filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
parakeet-tdt-v2-0.6b/coreml/audio/yc_first_minute_16k.wav filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
parakeet-tdt-v2-0.6b/coreml/audio/yc_first_minute.wav filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
parakeet-tdt-v2-0.6b/coreml/context/Efficient[[:space:]]Sequence[[:space:]]Transduction[[:space:]]by[[:space:]]Jointly[[:space:]]Predicting[[:space:]]Tokens[[:space:]]and[[:space:]]Durations.pdf filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
parakeet-tdt-v2-0.6b/coreml/context/FAST[[:space:]]CONFORMER[[:space:]]WITH[[:space:]]LINEARLY[[:space:]]SCALABLE[[:space:]]ATTENTION.pdf filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
parakeet-tdt-v2-0.6b/coreml/plots/compare-components/mel_composite.png filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
parakeet-tdt-v2-0.6b/coreml/plots/quantize/cpu_and_ne/all_components_compile.png filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
parakeet-tdt-v2-0.6b/coreml/plots/quantize/cpu_and_ne/all_components_compression.png filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
parakeet-tdt-v2-0.6b/coreml/plots/quantize/cpu_and_ne/all_components_quality.png filter=lfs diff=lfs merge=lfs -text
|
cli/CtcEarningsBenchmark.swift
ADDED
|
@@ -0,0 +1,1048 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#if os(macOS)
|
| 2 |
+
import AVFoundation
|
| 3 |
+
import CoreML
|
| 4 |
+
import FluidAudio
|
| 5 |
+
import Foundation
|
| 6 |
+
|
| 7 |
+
/// Earnings22 benchmark using TDT for transcription + CTC for keyword spotting.
|
| 8 |
+
/// TDT provides low WER transcription, CTC provides high recall dictionary detection.
|
| 9 |
+
public enum CtcEarningsBenchmark {
|
| 10 |
+
|
| 11 |
+
private enum KeywordMode: String {
|
| 12 |
+
case chunk
|
| 13 |
+
case file
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
/// Default CTC model directory
|
| 17 |
+
private static func defaultCtcModelPath() -> String? {
|
| 18 |
+
let appSupport = FileManager.default.urls(
|
| 19 |
+
for: .applicationSupportDirectory, in: .userDomainMask
|
| 20 |
+
).first!
|
| 21 |
+
let modelPath = appSupport.appendingPathComponent("FluidAudio/Models/parakeet-ctc-110m-coreml")
|
| 22 |
+
if FileManager.default.fileExists(atPath: modelPath.path) {
|
| 23 |
+
return modelPath.path
|
| 24 |
+
}
|
| 25 |
+
return nil
|
| 26 |
+
}
|
| 27 |
+
|
| 28 |
+
/// Default data directory (from download command)
|
| 29 |
+
private static func defaultDataDir() -> String? {
|
| 30 |
+
let dataDir = DatasetDownloader.getEarnings22Directory().appendingPathComponent("test-dataset")
|
| 31 |
+
if FileManager.default.fileExists(atPath: dataDir.path) {
|
| 32 |
+
return dataDir.path
|
| 33 |
+
}
|
| 34 |
+
return nil
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
public static func runCLI(arguments: [String]) async {
|
| 38 |
+
// Check for help
|
| 39 |
+
if arguments.contains("--help") || arguments.contains("-h") {
|
| 40 |
+
printUsage()
|
| 41 |
+
return
|
| 42 |
+
}
|
| 43 |
+
|
| 44 |
+
// Parse arguments
|
| 45 |
+
var dataDir: String? = nil
|
| 46 |
+
var outputFile = "ctc_earnings_benchmark.json"
|
| 47 |
+
var maxFiles: Int? = nil
|
| 48 |
+
var ctcModelPath: String? = nil
|
| 49 |
+
// Note: Using v2 by default because v3 has issues with certain audio files
|
| 50 |
+
// (returns empty transcription for ~7 files in Earnings22 dataset)
|
| 51 |
+
var tdtVersion: AsrModelVersion = .v2
|
| 52 |
+
var autoDownload = false
|
| 53 |
+
var keywordMode: KeywordMode = .chunk
|
| 54 |
+
|
| 55 |
+
var i = 0
|
| 56 |
+
while i < arguments.count {
|
| 57 |
+
switch arguments[i] {
|
| 58 |
+
case "--data-dir":
|
| 59 |
+
if i + 1 < arguments.count {
|
| 60 |
+
dataDir = arguments[i + 1]
|
| 61 |
+
i += 1
|
| 62 |
+
}
|
| 63 |
+
case "--output", "-o":
|
| 64 |
+
if i + 1 < arguments.count {
|
| 65 |
+
outputFile = arguments[i + 1]
|
| 66 |
+
i += 1
|
| 67 |
+
}
|
| 68 |
+
case "--max-files":
|
| 69 |
+
if i + 1 < arguments.count {
|
| 70 |
+
maxFiles = Int(arguments[i + 1])
|
| 71 |
+
i += 1
|
| 72 |
+
}
|
| 73 |
+
case "--ctc-model":
|
| 74 |
+
if i + 1 < arguments.count {
|
| 75 |
+
ctcModelPath = arguments[i + 1]
|
| 76 |
+
i += 1
|
| 77 |
+
}
|
| 78 |
+
case "--tdt-version":
|
| 79 |
+
if i + 1 < arguments.count {
|
| 80 |
+
if arguments[i + 1] == "v2" || arguments[i + 1] == "2" {
|
| 81 |
+
tdtVersion = .v2
|
| 82 |
+
}
|
| 83 |
+
i += 1
|
| 84 |
+
}
|
| 85 |
+
case "--auto-download":
|
| 86 |
+
autoDownload = true
|
| 87 |
+
case "--keyword-mode":
|
| 88 |
+
if i + 1 < arguments.count, let mode = parseKeywordMode(arguments[i + 1]) {
|
| 89 |
+
keywordMode = mode
|
| 90 |
+
i += 1
|
| 91 |
+
}
|
| 92 |
+
default:
|
| 93 |
+
break
|
| 94 |
+
}
|
| 95 |
+
i += 1
|
| 96 |
+
}
|
| 97 |
+
|
| 98 |
+
// Use defaults if not specified
|
| 99 |
+
if dataDir == nil {
|
| 100 |
+
dataDir = defaultDataDir()
|
| 101 |
+
}
|
| 102 |
+
if ctcModelPath == nil {
|
| 103 |
+
ctcModelPath = defaultCtcModelPath()
|
| 104 |
+
}
|
| 105 |
+
|
| 106 |
+
// Handle auto-download for dataset
|
| 107 |
+
if autoDownload && dataDir == nil {
|
| 108 |
+
print("📥 Downloading earnings22-kws dataset...")
|
| 109 |
+
await DatasetDownloader.downloadEarnings22KWS(force: false)
|
| 110 |
+
dataDir = defaultDataDir()
|
| 111 |
+
}
|
| 112 |
+
|
| 113 |
+
// Handle auto-download for CTC models
|
| 114 |
+
if autoDownload && ctcModelPath == nil {
|
| 115 |
+
print("📥 Downloading CTC models...")
|
| 116 |
+
do {
|
| 117 |
+
_ = try await CtcModels.download()
|
| 118 |
+
ctcModelPath = defaultCtcModelPath()
|
| 119 |
+
} catch {
|
| 120 |
+
print("ERROR: Failed to download CTC models: \(error)")
|
| 121 |
+
}
|
| 122 |
+
}
|
| 123 |
+
|
| 124 |
+
print("Earnings Benchmark (TDT transcription + CTC keyword spotting)")
|
| 125 |
+
print(" Data directory: \(dataDir ?? "not found")")
|
| 126 |
+
print(" Output file: \(outputFile)")
|
| 127 |
+
print(" TDT version: \(tdtVersion == .v2 ? "v2" : "v3")")
|
| 128 |
+
print(" CTC model: \(ctcModelPath ?? "not found")")
|
| 129 |
+
print(" Keyword mode: \(keywordMode.rawValue)")
|
| 130 |
+
|
| 131 |
+
guard let finalDataDir = dataDir else {
|
| 132 |
+
print("ERROR: Data directory not found")
|
| 133 |
+
print("💡 Download with: fluidaudio download --dataset earnings22-kws")
|
| 134 |
+
print(" Or specify: --data-dir <path>")
|
| 135 |
+
printUsage()
|
| 136 |
+
return
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
guard let modelPath = ctcModelPath else {
|
| 140 |
+
print("ERROR: CTC model not found")
|
| 141 |
+
print("💡 Download parakeet-ctc-110m-coreml model to:")
|
| 142 |
+
print(" ~/Library/Application Support/FluidAudio/Models/parakeet-ctc-110m-coreml/")
|
| 143 |
+
print(" Or specify: --ctc-model <path>")
|
| 144 |
+
printUsage()
|
| 145 |
+
return
|
| 146 |
+
}
|
| 147 |
+
|
| 148 |
+
let dataDirResolved = finalDataDir
|
| 149 |
+
|
| 150 |
+
do {
|
| 151 |
+
// Load TDT models for transcription
|
| 152 |
+
print("Loading TDT models (\(tdtVersion == .v2 ? "v2" : "v3")) for transcription...")
|
| 153 |
+
let tdtModels = try await AsrModels.downloadAndLoad(version: tdtVersion)
|
| 154 |
+
let asrManager = AsrManager(config: .default)
|
| 155 |
+
try await asrManager.initialize(models: tdtModels)
|
| 156 |
+
print("TDT models loaded successfully")
|
| 157 |
+
|
| 158 |
+
// Load CTC models for keyword spotting
|
| 159 |
+
print("Loading CTC models from: \(modelPath)")
|
| 160 |
+
let modelDir = URL(fileURLWithPath: modelPath)
|
| 161 |
+
let ctcModels = try await CtcModels.loadDirect(from: modelDir)
|
| 162 |
+
print("Loaded CTC vocabulary with \(ctcModels.vocabulary.count) tokens")
|
| 163 |
+
|
| 164 |
+
// Create keyword spotter
|
| 165 |
+
let vocabSize = ctcModels.vocabulary.count
|
| 166 |
+
let blankId = vocabSize // Blank is at index = vocab_size
|
| 167 |
+
let spotter = CtcKeywordSpotter(models: ctcModels, blankId: blankId)
|
| 168 |
+
print("Created CTC spotter with blankId=\(blankId)")
|
| 169 |
+
|
| 170 |
+
// Collect test files
|
| 171 |
+
let dataDirURL = URL(fileURLWithPath: dataDirResolved)
|
| 172 |
+
let fileIds = try collectFileIds(from: dataDirURL, maxFiles: maxFiles)
|
| 173 |
+
let keywordIndex = try buildKeywordIndex(dataDir: dataDirURL, keywordMode: keywordMode)
|
| 174 |
+
|
| 175 |
+
if fileIds.isEmpty {
|
| 176 |
+
print("ERROR: No test files found in \(dataDirResolved)")
|
| 177 |
+
return
|
| 178 |
+
}
|
| 179 |
+
|
| 180 |
+
print("Processing \(fileIds.count) test files...")
|
| 181 |
+
|
| 182 |
+
var results: [[String: Any]] = []
|
| 183 |
+
var totalWer = 0.0
|
| 184 |
+
var totalKeywordReference = 0
|
| 185 |
+
var totalKeywordPredicted = 0
|
| 186 |
+
var totalKeywordTruePositives = 0
|
| 187 |
+
var totalKeywordFalsePositives = 0
|
| 188 |
+
var totalKeywordFalseNegatives = 0
|
| 189 |
+
var totalAudioDuration = 0.0
|
| 190 |
+
var totalProcessingTime = 0.0
|
| 191 |
+
|
| 192 |
+
for (index, fileId) in fileIds.enumerated() {
|
| 193 |
+
print("[\(index + 1)/\(fileIds.count)] \(fileId)")
|
| 194 |
+
|
| 195 |
+
if let result = try await processFile(
|
| 196 |
+
fileId: fileId,
|
| 197 |
+
dataDir: dataDirURL,
|
| 198 |
+
asrManager: asrManager,
|
| 199 |
+
ctcModels: ctcModels,
|
| 200 |
+
spotter: spotter,
|
| 201 |
+
keywordMode: keywordMode,
|
| 202 |
+
keywordIndex: keywordIndex
|
| 203 |
+
) {
|
| 204 |
+
results.append(result)
|
| 205 |
+
totalWer += result["wer"] as? Double ?? 0
|
| 206 |
+
totalKeywordReference += result["keywordReference"] as? Int ?? 0
|
| 207 |
+
totalKeywordPredicted += result["keywordPredicted"] as? Int ?? 0
|
| 208 |
+
totalKeywordTruePositives += result["keywordTruePositives"] as? Int ?? 0
|
| 209 |
+
totalKeywordFalsePositives += result["keywordFalsePositives"] as? Int ?? 0
|
| 210 |
+
totalKeywordFalseNegatives += result["keywordFalseNegatives"] as? Int ?? 0
|
| 211 |
+
totalAudioDuration += result["audioLength"] as? Double ?? 0
|
| 212 |
+
totalProcessingTime += result["processingTime"] as? Double ?? 0
|
| 213 |
+
|
| 214 |
+
let wer = result["wer"] as? Double ?? 0
|
| 215 |
+
let precision = result["keywordPrecision"] as? Double ?? 0
|
| 216 |
+
let recall = result["keywordRecall"] as? Double ?? 0
|
| 217 |
+
let fscore = result["keywordFscore"] as? Double ?? 0
|
| 218 |
+
print(
|
| 219 |
+
" WER: \(String(format: "%.1f", wer))%, " +
|
| 220 |
+
"KW P/R/F: \(String(format: "%.2f", precision))/" +
|
| 221 |
+
"\(String(format: "%.2f", recall))/" +
|
| 222 |
+
"\(String(format: "%.2f", fscore))"
|
| 223 |
+
)
|
| 224 |
+
}
|
| 225 |
+
}
|
| 226 |
+
|
| 227 |
+
// Calculate summary
|
| 228 |
+
let avgWer = results.isEmpty ? 0.0 : totalWer / Double(results.count)
|
| 229 |
+
let keywordPrecision =
|
| 230 |
+
totalKeywordPredicted > 0
|
| 231 |
+
? Double(totalKeywordTruePositives) / Double(totalKeywordPredicted)
|
| 232 |
+
: 0
|
| 233 |
+
let keywordRecall =
|
| 234 |
+
totalKeywordReference > 0
|
| 235 |
+
? Double(totalKeywordTruePositives) / Double(totalKeywordReference)
|
| 236 |
+
: 0
|
| 237 |
+
let keywordFscore =
|
| 238 |
+
(keywordPrecision + keywordRecall) > 0
|
| 239 |
+
? 2 * keywordPrecision * keywordRecall / (keywordPrecision + keywordRecall)
|
| 240 |
+
: 0
|
| 241 |
+
|
| 242 |
+
// Print summary
|
| 243 |
+
print("\n" + String(repeating: "=", count: 60))
|
| 244 |
+
print("EARNINGS22 BENCHMARK (TDT + CTC)")
|
| 245 |
+
print(String(repeating: "=", count: 60))
|
| 246 |
+
print("Model: \(modelPath)")
|
| 247 |
+
print("Total tests: \(results.count)")
|
| 248 |
+
print("Average WER: \(String(format: "%.2f", avgWer))%")
|
| 249 |
+
print(
|
| 250 |
+
"Keyword Precision/Recall/F1: " +
|
| 251 |
+
"\(String(format: "%.2f", keywordPrecision))/" +
|
| 252 |
+
"\(String(format: "%.2f", keywordRecall))/" +
|
| 253 |
+
"\(String(format: "%.2f", keywordFscore))"
|
| 254 |
+
)
|
| 255 |
+
print("Total audio: \(String(format: "%.1f", totalAudioDuration))s")
|
| 256 |
+
print("Total processing: \(String(format: "%.1f", totalProcessingTime))s")
|
| 257 |
+
if totalProcessingTime > 0 {
|
| 258 |
+
print("RTFx: \(String(format: "%.2f", totalAudioDuration / totalProcessingTime))x")
|
| 259 |
+
}
|
| 260 |
+
print(String(repeating: "=", count: 60))
|
| 261 |
+
|
| 262 |
+
// Sort results by WER descending (worst first)
|
| 263 |
+
let sortedResults = results.sorted { r1, r2 in
|
| 264 |
+
let wer1 = r1["wer"] as? Double ?? 0
|
| 265 |
+
let wer2 = r2["wer"] as? Double ?? 0
|
| 266 |
+
return wer1 > wer2
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
+
// Save to JSON
|
| 270 |
+
let summaryDict: [String: Any] = [
|
| 271 |
+
"totalTests": results.count,
|
| 272 |
+
"avgWer": round(avgWer * 100) / 100,
|
| 273 |
+
"keywordTruePositives": totalKeywordTruePositives,
|
| 274 |
+
"keywordFalsePositives": totalKeywordFalsePositives,
|
| 275 |
+
"keywordFalseNegatives": totalKeywordFalseNegatives,
|
| 276 |
+
"keywordPredicted": totalKeywordPredicted,
|
| 277 |
+
"keywordReference": totalKeywordReference,
|
| 278 |
+
"keywordPrecision": round(keywordPrecision * 1000) / 1000,
|
| 279 |
+
"keywordRecall": round(keywordRecall * 1000) / 1000,
|
| 280 |
+
"keywordFscore": round(keywordFscore * 1000) / 1000,
|
| 281 |
+
"totalAudioDuration": round(totalAudioDuration * 100) / 100,
|
| 282 |
+
"totalProcessingTime": round(totalProcessingTime * 100) / 100,
|
| 283 |
+
]
|
| 284 |
+
|
| 285 |
+
let output: [String: Any] = [
|
| 286 |
+
"model": modelPath,
|
| 287 |
+
"keywordMode": keywordMode.rawValue,
|
| 288 |
+
"summary": summaryDict,
|
| 289 |
+
"results": sortedResults,
|
| 290 |
+
]
|
| 291 |
+
|
| 292 |
+
let jsonData = try JSONSerialization.data(withJSONObject: output, options: [.prettyPrinted, .sortedKeys])
|
| 293 |
+
try jsonData.write(to: URL(fileURLWithPath: outputFile))
|
| 294 |
+
print("\nResults written to: \(outputFile)")
|
| 295 |
+
|
| 296 |
+
} catch {
|
| 297 |
+
print("ERROR: Benchmark failed: \(error)")
|
| 298 |
+
}
|
| 299 |
+
}
|
| 300 |
+
|
| 301 |
+
private static func collectFileIds(from dataDir: URL, maxFiles: Int?) throws -> [String] {
|
| 302 |
+
var fileIds: [String] = []
|
| 303 |
+
let suffix = ".dictionary.txt"
|
| 304 |
+
|
| 305 |
+
let fileManager = FileManager.default
|
| 306 |
+
let contents = try fileManager.contentsOfDirectory(at: dataDir, includingPropertiesForKeys: nil)
|
| 307 |
+
|
| 308 |
+
for url in contents.sorted(by: { $0.path < $1.path }) {
|
| 309 |
+
let name = url.lastPathComponent
|
| 310 |
+
if name.hasSuffix(suffix) {
|
| 311 |
+
let data = try? Data(contentsOf: url)
|
| 312 |
+
if let data = data, !data.isEmpty {
|
| 313 |
+
let fileId = String(name.dropLast(suffix.count))
|
| 314 |
+
fileIds.append(fileId)
|
| 315 |
+
}
|
| 316 |
+
}
|
| 317 |
+
}
|
| 318 |
+
|
| 319 |
+
if let maxFiles = maxFiles {
|
| 320 |
+
return Array(fileIds.prefix(maxFiles))
|
| 321 |
+
}
|
| 322 |
+
return fileIds
|
| 323 |
+
}
|
| 324 |
+
|
| 325 |
+
private static func processFile(
|
| 326 |
+
fileId: String,
|
| 327 |
+
dataDir: URL,
|
| 328 |
+
asrManager: AsrManager,
|
| 329 |
+
ctcModels: CtcModels,
|
| 330 |
+
spotter: CtcKeywordSpotter,
|
| 331 |
+
keywordMode: KeywordMode,
|
| 332 |
+
keywordIndex: [String: [String]]
|
| 333 |
+
) async throws -> [String: Any]? {
|
| 334 |
+
let wavFile = dataDir.appendingPathComponent("\(fileId).wav")
|
| 335 |
+
let dictionaryFile = dataDir.appendingPathComponent("\(fileId).dictionary.txt")
|
| 336 |
+
let textFile = dataDir.appendingPathComponent("\(fileId).text.txt")
|
| 337 |
+
|
| 338 |
+
let fm = FileManager.default
|
| 339 |
+
guard fm.fileExists(atPath: wavFile.path),
|
| 340 |
+
fm.fileExists(atPath: dictionaryFile.path)
|
| 341 |
+
else {
|
| 342 |
+
return nil
|
| 343 |
+
}
|
| 344 |
+
|
| 345 |
+
// Load dictionary words (chunk or file keywords)
|
| 346 |
+
let dictionaryWords = try loadDictionaryWords(
|
| 347 |
+
fileId: fileId,
|
| 348 |
+
dictionaryFile: dictionaryFile,
|
| 349 |
+
keywordMode: keywordMode,
|
| 350 |
+
keywordIndex: keywordIndex
|
| 351 |
+
)
|
| 352 |
+
|
| 353 |
+
// Load reference text
|
| 354 |
+
let referenceRaw =
|
| 355 |
+
(try? String(contentsOf: textFile, encoding: .utf8))?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
|
| 356 |
+
|
| 357 |
+
// Get audio samples
|
| 358 |
+
let audioFile = try AVAudioFile(forReading: wavFile)
|
| 359 |
+
let audioLength = Double(audioFile.length) / audioFile.processingFormat.sampleRate
|
| 360 |
+
let format = audioFile.processingFormat
|
| 361 |
+
let frameCount = AVAudioFrameCount(audioFile.length)
|
| 362 |
+
|
| 363 |
+
guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount) else {
|
| 364 |
+
throw NSError(
|
| 365 |
+
domain: "CtcEarningsBenchmark", code: 1,
|
| 366 |
+
userInfo: [NSLocalizedDescriptionKey: "Failed to create audio buffer"])
|
| 367 |
+
}
|
| 368 |
+
try audioFile.read(into: buffer)
|
| 369 |
+
|
| 370 |
+
// Resample to 16kHz
|
| 371 |
+
let converter = AudioConverter()
|
| 372 |
+
let samples = try converter.resampleBuffer(buffer)
|
| 373 |
+
|
| 374 |
+
let startTime = Date()
|
| 375 |
+
|
| 376 |
+
// 1. TDT transcription for low WER
|
| 377 |
+
let tdtResult = try await asrManager.transcribe(wavFile)
|
| 378 |
+
|
| 379 |
+
// Skip files where TDT returns empty (some audio files cause model issues)
|
| 380 |
+
if tdtResult.text.isEmpty {
|
| 381 |
+
print(" SKIPPED: TDT returned empty transcription")
|
| 382 |
+
return nil
|
| 383 |
+
}
|
| 384 |
+
|
| 385 |
+
// 2. Build custom vocabulary for CTC keyword spotting
|
| 386 |
+
var vocabTerms: [CustomVocabularyTerm] = []
|
| 387 |
+
for word in dictionaryWords {
|
| 388 |
+
let tokenIds = tokenize(word, vocabulary: ctcModels.vocabulary)
|
| 389 |
+
if !tokenIds.isEmpty {
|
| 390 |
+
let term = CustomVocabularyTerm(
|
| 391 |
+
text: word,
|
| 392 |
+
weight: nil,
|
| 393 |
+
aliases: nil,
|
| 394 |
+
tokenIds: nil,
|
| 395 |
+
ctcTokenIds: tokenIds
|
| 396 |
+
)
|
| 397 |
+
vocabTerms.append(term)
|
| 398 |
+
}
|
| 399 |
+
}
|
| 400 |
+
let customVocab = CustomVocabularyContext(terms: vocabTerms)
|
| 401 |
+
|
| 402 |
+
// 3. CTC keyword spotting for high recall dictionary detection
|
| 403 |
+
let spotResult = try await spotter.spotKeywordsWithLogProbs(
|
| 404 |
+
audioSamples: samples,
|
| 405 |
+
customVocabulary: customVocab,
|
| 406 |
+
minScore: nil
|
| 407 |
+
)
|
| 408 |
+
|
| 409 |
+
// 4. Post-process: Use VocabularyRescorer with Argmax-style parameters
|
| 410 |
+
// Argmax uses cbw=3.0 (context-biasing weight) for boosting vocab terms
|
| 411 |
+
let useRescorer = ProcessInfo.processInfo.environment["NO_CTC_RESCORING"] != "1"
|
| 412 |
+
let hypothesis: String
|
| 413 |
+
if useRescorer {
|
| 414 |
+
let rescorerConfig = VocabularyRescorer.Config(
|
| 415 |
+
minScoreAdvantage: 1.0, // Lower threshold - rely more on CTC scoring
|
| 416 |
+
minVocabScore: -15.0, // Permissive to include more detections
|
| 417 |
+
maxOriginalScoreForReplacement: -2.0, // Don't replace very confident words
|
| 418 |
+
vocabBoostWeight: 3.0 // Argmax cbw=3.0
|
| 419 |
+
)
|
| 420 |
+
let rescorer = VocabularyRescorer(
|
| 421 |
+
spotter: spotter,
|
| 422 |
+
vocabulary: customVocab,
|
| 423 |
+
config: rescorerConfig
|
| 424 |
+
)
|
| 425 |
+
let rescoreResult = rescorer.rescore(transcript: tdtResult.text, spotResult: spotResult)
|
| 426 |
+
hypothesis = rescoreResult.text
|
| 427 |
+
} else {
|
| 428 |
+
hypothesis = tdtResult.text // Baseline: no CTC corrections
|
| 429 |
+
}
|
| 430 |
+
|
| 431 |
+
let processingTime = Date().timeIntervalSince(startTime)
|
| 432 |
+
|
| 433 |
+
// Normalize texts
|
| 434 |
+
let referenceNormalized = TextNormalizer.normalize(referenceRaw)
|
| 435 |
+
let hypothesisNormalized = TextNormalizer.normalize(hypothesis)
|
| 436 |
+
|
| 437 |
+
// Keyword sets for precision/recall
|
| 438 |
+
let referenceKeywords = keywordsInText(referenceNormalized, dictionaryWords: dictionaryWords)
|
| 439 |
+
let predictedKeywords = keywordsInText(hypothesisNormalized, dictionaryWords: dictionaryWords)
|
| 440 |
+
let truePositives = referenceKeywords.intersection(predictedKeywords)
|
| 441 |
+
let falsePositives = predictedKeywords.subtracting(referenceKeywords)
|
| 442 |
+
let falseNegatives = referenceKeywords.subtracting(predictedKeywords)
|
| 443 |
+
let keywordPrecision = predictedKeywords.isEmpty ? 0 : Double(truePositives.count) / Double(predictedKeywords.count)
|
| 444 |
+
let keywordRecall = referenceKeywords.isEmpty ? 0 : Double(truePositives.count) / Double(referenceKeywords.count)
|
| 445 |
+
let keywordFscore =
|
| 446 |
+
(keywordPrecision + keywordRecall) > 0
|
| 447 |
+
? 2 * keywordPrecision * keywordRecall / (keywordPrecision + keywordRecall)
|
| 448 |
+
: 0
|
| 449 |
+
|
| 450 |
+
let referenceWords = referenceNormalized.components(separatedBy: CharacterSet.whitespacesAndNewlines).filter {
|
| 451 |
+
!$0.isEmpty
|
| 452 |
+
}
|
| 453 |
+
let hypothesisWords = hypothesisNormalized.components(separatedBy: CharacterSet.whitespacesAndNewlines).filter {
|
| 454 |
+
!$0.isEmpty
|
| 455 |
+
}
|
| 456 |
+
|
| 457 |
+
// Calculate WER
|
| 458 |
+
let wer: Double
|
| 459 |
+
if referenceWords.isEmpty {
|
| 460 |
+
wer = hypothesisWords.isEmpty ? 0.0 : 1.0
|
| 461 |
+
} else {
|
| 462 |
+
wer = calculateWER(reference: referenceWords, hypothesis: hypothesisWords)
|
| 463 |
+
}
|
| 464 |
+
|
| 465 |
+
// Count dictionary detections (debug only)
|
| 466 |
+
let minCtcScore: Float = -15.0 // Permissive threshold for detection
|
| 467 |
+
var detectionDetails: [[String: Any]] = []
|
| 468 |
+
var ctcFoundWords: Set<String> = []
|
| 469 |
+
|
| 470 |
+
// 1. CTC detections
|
| 471 |
+
for detection in spotResult.detections {
|
| 472 |
+
let inRef = referenceKeywords.contains(detection.term.text.lowercased())
|
| 473 |
+
let detail: [String: Any] = [
|
| 474 |
+
"word": detection.term.text,
|
| 475 |
+
"score": round(Double(detection.score) * 100) / 100,
|
| 476 |
+
"startTime": round(detection.startTime * 100) / 100,
|
| 477 |
+
"endTime": round(detection.endTime * 100) / 100,
|
| 478 |
+
"source": "ctc",
|
| 479 |
+
"inReference": inRef,
|
| 480 |
+
]
|
| 481 |
+
detectionDetails.append(detail)
|
| 482 |
+
|
| 483 |
+
if detection.score >= minCtcScore { // Use >= to include edge cases
|
| 484 |
+
ctcFoundWords.insert(detection.term.text.lowercased())
|
| 485 |
+
}
|
| 486 |
+
}
|
| 487 |
+
|
| 488 |
+
// 2. Fallback: check hypothesis for dictionary words not found by CTC
|
| 489 |
+
let hypothesisLower = hypothesis.lowercased()
|
| 490 |
+
for word in dictionaryWords {
|
| 491 |
+
let wordLower = word.lowercased()
|
| 492 |
+
if !ctcFoundWords.contains(wordLower) {
|
| 493 |
+
// Check if word appears as whole word in hypothesis (avoid substring false positives)
|
| 494 |
+
let pattern = "\\b\(NSRegularExpression.escapedPattern(for: wordLower))\\b"
|
| 495 |
+
if let regex = try? NSRegularExpression(pattern: pattern, options: []),
|
| 496 |
+
regex.firstMatch(
|
| 497 |
+
in: hypothesisLower, options: [],
|
| 498 |
+
range: NSRange(hypothesisLower.startIndex..., in: hypothesisLower)) != nil
|
| 499 |
+
{
|
| 500 |
+
ctcFoundWords.insert(wordLower)
|
| 501 |
+
let inRef = referenceKeywords.contains(wordLower)
|
| 502 |
+
let detail: [String: Any] = [
|
| 503 |
+
"word": word,
|
| 504 |
+
"score": 0.0,
|
| 505 |
+
"startTime": 0.0,
|
| 506 |
+
"endTime": 0.0,
|
| 507 |
+
"source": "hypothesis",
|
| 508 |
+
"inReference": inRef,
|
| 509 |
+
]
|
| 510 |
+
detectionDetails.append(detail)
|
| 511 |
+
}
|
| 512 |
+
}
|
| 513 |
+
}
|
| 514 |
+
|
| 515 |
+
let result: [String: Any] = [
|
| 516 |
+
"fileId": fileId,
|
| 517 |
+
"reference": referenceNormalized,
|
| 518 |
+
"hypothesis": hypothesisNormalized,
|
| 519 |
+
"wer": round(wer * 10000) / 100,
|
| 520 |
+
"dictFound": predictedKeywords.count,
|
| 521 |
+
"dictTotal": referenceKeywords.count,
|
| 522 |
+
"keywordPredicted": predictedKeywords.count,
|
| 523 |
+
"keywordReference": referenceKeywords.count,
|
| 524 |
+
"keywordTruePositives": truePositives.count,
|
| 525 |
+
"keywordFalsePositives": falsePositives.count,
|
| 526 |
+
"keywordFalseNegatives": falseNegatives.count,
|
| 527 |
+
"keywordPrecision": round(keywordPrecision * 1000) / 1000,
|
| 528 |
+
"keywordRecall": round(keywordRecall * 1000) / 1000,
|
| 529 |
+
"keywordFscore": round(keywordFscore * 1000) / 1000,
|
| 530 |
+
"audioLength": round(audioLength * 100) / 100,
|
| 531 |
+
"processingTime": round(processingTime * 1000) / 1000,
|
| 532 |
+
"ctcDetections": detectionDetails,
|
| 533 |
+
]
|
| 534 |
+
return result
|
| 535 |
+
}
|
| 536 |
+
|
| 537 |
+
/// Greedy longest-match tokenization against a SentencePiece-style vocabulary.
/// Word-initial pieces are tried with the "▁" marker first; unknown characters
/// are silently skipped.
private static func tokenize(_ text: String, vocabulary: [Int: String]) -> [Int] {
    // Invert the id -> token table so tokens can be looked up by surface form.
    var idForToken: [String: Int] = [:]
    for (tokenId, surface) in vocabulary {
        idForToken[surface] = tokenId
    }

    let lowered = text.lowercased()
    var tokenIds: [Int] = []
    var cursor = lowered.startIndex
    var atWordBoundary = true

    while cursor < lowered.endIndex {
        let charsLeft = lowered.distance(from: cursor, to: lowered.endIndex)
        var span = min(20, charsLeft)
        var foundMatch = false

        // Try the longest candidate first, shrinking until something matches.
        while span > 0 {
            let sliceEnd = lowered.index(cursor, offsetBy: span)
            let piece = String(lowered[cursor..<sliceEnd])
            // Word-initial pieces carry the SentencePiece "▁" marker; prefer
            // the marked form, then fall back to the bare piece.
            let candidates = atWordBoundary ? ["▁" + piece, piece] : [piece]

            if let hit = candidates.compactMap({ idForToken[$0] }).first {
                tokenIds.append(hit)
                cursor = sliceEnd
                atWordBoundary = false
                foundMatch = true
                break
            }
            span -= 1
        }

        if !foundMatch {
            // Advance one character; a space re-arms the word-boundary flag,
            // any other unmatched character is dropped.
            atWordBoundary = lowered[cursor] == " "
            cursor = lowered.index(after: cursor)
        }
    }

    return tokenIds
}
|
| 594 |
+
|
| 595 |
+
/// Apply CTC keyword corrections to TDT transcription using multiple strategies:
/// 1. Fuzzy matching (for words that are phonetically similar)
/// 2. Context pattern matching (for "this is X" type patterns)
/// 3. Proper noun replacement (for names after common patterns)
///
/// - Parameters:
///   - tdtResult: transcription whose `.text` is corrected
///   - detections: CTC keyword detections with confidence scores
///   - minScore: detections scoring below this are ignored
/// - Returns: the corrected transcript text
private static func applyKeywordCorrections(
    tdtResult: ASRResult,
    detections: [CtcKeywordSpotter.KeywordDetection],
    minScore: Float
) -> String {
    // Filter detections by score
    let validDetections = detections.filter { $0.score >= minScore }
    guard !validDetections.isEmpty else {
        return tdtResult.text
    }

    var text = tdtResult.text
    var usedDetections: Set<String> = []

    // PASS 1: Fuzzy matching for phonetically similar words
    for detection in validDetections {
        let keyword = detection.term.text
        let keywordLower = keyword.lowercased()
        let keywordParts = keywordLower.components(separatedBy: " ").filter { !$0.isEmpty }

        let words = text.components(separatedBy: .whitespacesAndNewlines).filter { !$0.isEmpty }

        // Handle multi-word keywords
        if keywordParts.count > 1 {
            // BUGFIX: the original `0..<(words.count - keywordParts.count + 1)`
            // traps at runtime when the transcript has fewer words than the
            // keyword (negative upper bound). Guard before forming the range.
            guard words.count >= keywordParts.count else { continue }
            for i in 0...(words.count - keywordParts.count) {
                var allMatch = true
                var matchedWords: [String] = []

                for j in 0..<keywordParts.count {
                    let wordClean = words[i + j].trimmingCharacters(in: .punctuationCharacters).lowercased()
                    if isSimilar(wordClean, keywordParts[j]) {
                        matchedWords.append(words[i + j])
                    } else {
                        allMatch = false
                        break
                    }
                }

                if allMatch && !matchedWords.isEmpty {
                    let originalPhrase = matchedWords.joined(separator: " ")
                    let replacement = matchCase(keyword, to: matchedWords[0])
                    // NOTE: replaces every occurrence of the matched phrase.
                    text = text.replacingOccurrences(of: originalPhrase, with: replacement)
                    usedDetections.insert(keyword)
                    break
                }
            }
        } else {
            // Single word keyword
            for word in words {
                let wordClean = word.trimmingCharacters(in: .punctuationCharacters).lowercased()
                guard !wordClean.isEmpty else { continue }

                if isSimilar(wordClean, keywordLower) && wordClean != keywordLower {
                    let replacement = matchCase(keyword, to: word)
                    text = text.replacingOccurrences(of: word, with: replacement)
                    usedDetections.insert(keyword)
                    break
                }
            }
        }
    }

    // PASS 2: Context pattern matching - specifically for "this is X" pattern
    // Only replace if keyword is NOT already in the text
    for detection in validDetections {
        let keyword = detection.term.text
        guard !usedDetections.contains(keyword) else { continue }

        let keywordLower = keyword.lowercased()

        // Skip if keyword already exists in text (case-insensitive)
        if text.lowercased().contains(keywordLower) {
            usedDetections.insert(keyword)  // Mark as handled
            continue
        }

        // Check if keyword looks like a proper noun (starts with uppercase)
        let isProperNoun =
            keyword.first?.isUppercase == true
            && keyword.count >= 3
            && !stopWords.contains(keywordLower)

        guard isProperNoun else { continue }

        // Look for "this is X" pattern specifically for names
        let thisIsPattern = try? NSRegularExpression(pattern: "this is ([A-Z][a-z]+)", options: [])
        if let regex = thisIsPattern {
            let textRange = NSRange(text.startIndex..., in: text)
            if let match = regex.firstMatch(in: text, options: [], range: textRange),
                match.numberOfRanges > 1,
                let captureRange = Range(match.range(at: 1), in: text)
            {
                let capturedWord = String(text[captureRange])
                let capturedLower = capturedWord.lowercased()

                // Skip if captured word is already a detected keyword
                let isOtherKeyword = validDetections.contains { det in
                    det.term.text.lowercased() == capturedLower
                }

                if !isOtherKeyword && !stopWords.contains(capturedLower) {
                    // Similar length check
                    if abs(capturedWord.count - keyword.count) <= 3 {
                        text = text.replacingOccurrences(of: capturedWord, with: keyword)
                        usedDetections.insert(keyword)
                    }
                }
            }
        }
    }

    return text
}
|
| 712 |
+
|
| 713 |
+
/// Merge token-level timings into word-level timings. Tokens carrying the
/// SentencePiece "▁" prefix open a new word; other tokens extend the current
/// one. Blank/pad tokens are dropped.
private static func buildWordTimings(
    from tokenTimings: [TokenTiming]
) -> [(word: String, startTime: Double, endTime: Double)] {
    var merged: [(word: String, startTime: Double, endTime: Double)] = []
    var pieces = ""
    var start: Double = 0
    var end: Double = 0

    for entry in tokenTimings {
        let piece = entry.token

        // Ignore special tokens.
        guard !piece.isEmpty, piece != "<blank>", piece != "<pad>" else { continue }

        // A "▁"-prefixed token (or the very first piece) begins a new word.
        let opensWord = piece.hasPrefix("▁") || pieces.isEmpty
        if opensWord {
            if !pieces.isEmpty {
                // Flush the word accumulated so far.
                merged.append((word: pieces, startTime: start, endTime: end))
            }
            pieces = piece.hasPrefix("▁") ? String(piece.dropFirst()) : piece
            start = entry.startTime
        } else {
            pieces += piece
        }
        end = entry.endTime
    }

    // Flush the trailing word, if any.
    if !pieces.isEmpty {
        merged.append((word: pieces, startTime: start, endTime: end))
    }

    return merged
}
|
| 755 |
+
|
| 756 |
+
/// Common English words that should never be replaced by keyword matching.
/// Used by `isSimilar` to veto fuzzy matches and by the proper-noun pass in
/// `applyKeywordCorrections` — these words are too frequent to be names.
private static let stopWords: Set<String> = [
    // Pronouns
    "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them",
    "my", "your", "his", "its", "our", "their", "mine", "yours", "hers", "ours", "theirs",
    "this", "that", "these", "those", "who", "whom", "what", "which", "whose",
    // Common verbs
    "is", "are", "was", "were", "be", "been", "being", "am",
    "have", "has", "had", "having", "do", "does", "did", "doing", "done",
    "will", "would", "shall", "should", "may", "might", "must", "can", "could",
    "get", "got", "getting", "go", "goes", "went", "going", "gone",
    "come", "came", "coming", "see", "saw", "seen", "know", "knew", "known",
    "think", "thought", "make", "made", "take", "took", "taken", "give", "gave", "given",
    "say", "said", "tell", "told", "ask", "asked", "use", "used", "want", "wanted",
    "need", "needed", "try", "tried", "let", "put", "keep", "kept", "look", "looked",
    // Articles and determiners
    "a", "an", "the", "some", "any", "no", "every", "each", "all", "both", "few", "many",
    "much", "more", "most", "other", "another", "such",
    // Prepositions
    "in", "on", "at", "to", "for", "of", "with", "by", "from", "up", "down", "out",
    "about", "into", "over", "after", "before", "between", "under", "through", "during",
    // Conjunctions
    "and", "or", "but", "so", "yet", "nor", "if", "then", "than", "because", "while",
    "although", "unless", "since", "when", "where", "as",
    // Adverbs
    "not", "very", "just", "also", "only", "even", "still", "already", "always", "never",
    "often", "sometimes", "usually", "really", "well", "now", "here", "there", "how", "why",
    // Common words
    "yes", "no", "okay", "ok", "thank", "thanks", "please", "sorry", "hello", "hi", "bye",
    "good", "great", "bad", "new", "old", "first", "last", "long", "short", "big", "small",
    "high", "low", "right", "left", "next", "back", "same", "different", "own", "able",
    "way", "thing", "things", "time", "times", "year", "years", "day", "days", "week", "weeks",
    "part", "place", "case", "point", "fact", "end", "kind", "lot", "set",
    // Numbers
    "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
    "hundred", "thousand", "million", "billion",
]
|
| 793 |
+
|
| 794 |
+
/// Check if two words are similar (edit distance / length ratio).
/// Stop words never match; containment counts as a match; a shared prefix or
/// suffix of length >= 2 relaxes the edit-distance threshold by one.
private static func isSimilar(_ a: String, _ b: String) -> Bool {
    // Never match stop words - they're too common to be proper nouns
    if stopWords.contains(a) || stopWords.contains(b) {
        return false
    }

    let maxLen = max(a.count, b.count)
    let minLen = min(a.count, b.count)
    guard maxLen > 0, minLen >= 3 else { return false }

    // Allow more length difference for longer words
    let lenDiff = abs(a.count - b.count)
    if lenDiff > max(3, maxLen / 2) { return false }

    // Containment is an immediate match (handles "Erik" in "Ririek").
    // PERF: checked before computing the O(n*m) edit distance, which the
    // original computed even when this branch made it unnecessary.
    if a.contains(b) || b.contains(a) {
        return true
    }

    // Calculate edit distance
    let distance = editDistance(a, b)

    // More aggressive threshold: allow up to 40% of max length as edits
    let threshold = max(2, Int(Double(maxLen) * 0.4))

    // Check common prefix/suffix (handles "Heri" vs "Harry")
    let commonPrefix = commonPrefixLength(a, b)
    let commonSuffix = commonSuffixLength(a, b)
    if commonPrefix >= 2 || commonSuffix >= 2 {
        return distance <= threshold + 1
    }

    return distance <= threshold
}
|
| 829 |
+
|
| 830 |
+
/// Length of the longest shared leading run of characters.
private static func commonPrefixLength(_ a: String, _ b: String) -> Int {
    var shared = 0
    for (left, right) in zip(a, b) {
        guard left == right else { break }
        shared += 1
    }
    return shared
}
|
| 844 |
+
|
| 845 |
+
/// Length of the longest shared trailing run of characters.
private static func commonSuffixLength(_ a: String, _ b: String) -> Int {
    var shared = 0
    for (left, right) in zip(a.reversed(), b.reversed()) {
        guard left == right else { break }
        shared += 1
    }
    return shared
}
|
| 859 |
+
|
| 860 |
+
/// Levenshtein distance between two strings (insert/delete/substitute, each
/// cost 1), computed with a rolling single-row DP table.
private static func editDistance(_ a: String, _ b: String) -> Int {
    let source = Array(a)
    let target = Array(b)
    if source.isEmpty { return target.count }
    if target.isEmpty { return source.count }

    // prev[j] holds the distance from the previous source prefix to target[0..<j].
    var prev = Array(0...target.count)
    for (i, sChar) in source.enumerated() {
        var current = [i + 1]
        current.reserveCapacity(target.count + 1)
        for (j, tChar) in target.enumerated() {
            if sChar == tChar {
                current.append(prev[j])
            } else {
                current.append(1 + min(prev[j], prev[j + 1], current[j]))
            }
        }
        prev = current
    }
    return prev[target.count]
}
|
| 887 |
+
|
| 888 |
+
/// Mirror the capitalization of the replaced word: if the original (after
/// stripping punctuation) starts uppercase, capitalize the keyword's first
/// letter; otherwise return the keyword unchanged.
private static func matchCase(_ keyword: String, to original: String) -> String {
    let stripped = original.trimmingCharacters(in: .punctuationCharacters)
    guard stripped.first?.isUppercase == true else { return keyword }
    return keyword.prefix(1).uppercased() + keyword.dropFirst()
}
|
| 899 |
+
|
| 900 |
+
/// Word error rate: word-level Levenshtein distance normalized by the
/// reference length. Empty reference yields 0 (empty hypothesis) or 1.
private static func calculateWER(reference: [String], hypothesis: [String]) -> Double {
    guard !reference.isEmpty else {
        return hypothesis.isEmpty ? 0.0 : 1.0
    }

    let refCount = reference.count
    let hypCount = hypothesis.count
    var table = Array(repeating: Array(repeating: 0, count: hypCount + 1), count: refCount + 1)

    // Base cases: transforming to/from the empty sequence.
    for r in 0...refCount { table[r][0] = r }
    for h in 0...hypCount { table[0][h] = h }

    for r in 1...refCount {
        for h in 1...hypCount {
            let substitution = table[r - 1][h - 1] + (reference[r - 1] == hypothesis[h - 1] ? 0 : 1)
            let deletion = table[r - 1][h] + 1
            let insertion = table[r][h - 1] + 1
            table[r][h] = min(substitution, deletion, insertion)
        }
    }

    return Double(table[refCount][hypCount]) / Double(refCount)
}
|
| 924 |
+
|
| 925 |
+
/// Print usage/help text for the `ctc-earnings-benchmark` subcommand.
private static func printUsage() {
    print(
        """
        CTC Earnings Benchmark (TDT + CTC keyword spotting)

        Usage: fluidaudio ctc-earnings-benchmark [options]

        Options:
          --data-dir <path>      Path to earnings test dataset (auto-detected if downloaded)
          --ctc-model <path>     Path to CTC model directory (auto-detected if in standard location)
          --max-files <n>        Maximum number of files to process
          --output, -o <path>    Output JSON file (default: ctc_earnings_benchmark.json)
          --auto-download        Download earnings22-kws dataset if not found
          --keyword-mode <mode>  Keyword mode: chunk or file (default: chunk)

        Default locations:
          Dataset: ~/Library/Application Support/FluidAudio/earnings22-kws/test-dataset/
          CTC Model: ~/Library/Application Support/FluidAudio/Models/parakeet-ctc-110m-coreml/

        Setup:
          1. Download dataset: fluidaudio download --dataset earnings22-kws
          2. Place CTC model in standard location
          3. Run: fluidaudio ctc-earnings-benchmark

        Examples:
          # Run with auto-detected paths
          fluidaudio ctc-earnings-benchmark

          # Run with auto-download
          fluidaudio ctc-earnings-benchmark --auto-download

          # Run with explicit paths
          fluidaudio ctc-earnings-benchmark \\
            --data-dir /path/to/test-dataset \\
            --ctc-model /path/to/parakeet-ctc-110m-coreml \\
            --max-files 100
        """)
}
|
| 963 |
+
|
| 964 |
+
/// Parse a `--keyword-mode` argument value; returns nil when unrecognized.
private static func parseKeywordMode(_ value: String) -> KeywordMode? {
    let normalized = value.lowercased()
    if normalized == "chunk" || normalized == "chunk-keywords" {
        return .chunk
    }
    if normalized == "file" || normalized == "file-keywords" {
        return .file
    }
    return nil
}
|
| 974 |
+
|
| 975 |
+
/// Strip everything from the first "_chunk" marker onward so chunk ids map to
/// their parent recording id; ids without the marker are returned unchanged.
private static func parentId(from fileId: String) -> String {
    if let marker = fileId.range(of: "_chunk") {
        return String(fileId[..<marker.lowerBound])
    }
    return fileId
}
|
| 981 |
+
|
| 982 |
+
/// For file-level keyword mode, union every chunk's dictionary words under the
/// parent recording id (values sorted). Chunk mode returns an empty index.
private static func buildKeywordIndex(dataDir: URL, keywordMode: KeywordMode) throws -> [String: [String]] {
    guard keywordMode == .file else { return [:] }

    let suffix = ".dictionary.txt"
    var grouped: [String: Set<String>] = [:]

    for entry in try FileManager.default.contentsOfDirectory(at: dataDir, includingPropertiesForKeys: nil) {
        let fileName = entry.lastPathComponent
        guard fileName.hasSuffix(suffix) else { continue }

        let chunkId = String(fileName.dropLast(suffix.count))
        let parent = parentId(from: chunkId)
        let words = try loadDictionaryWords(from: entry)
        grouped[parent, default: []].formUnion(words)
    }

    return grouped.mapValues { Array($0).sorted() }
}
|
| 1005 |
+
|
| 1006 |
+
/// Resolve the dictionary for a chunk according to the keyword mode:
/// chunk mode reads the chunk's own dictionary file; file mode prefers the
/// precomputed parent-level index and falls back to the chunk dictionary.
private static func loadDictionaryWords(
    fileId: String,
    dictionaryFile: URL,
    keywordMode: KeywordMode,
    keywordIndex: [String: [String]]
) throws -> [String] {
    if keywordMode == .file, let fileLevelWords = keywordIndex[parentId(from: fileId)] {
        return fileLevelWords
    }
    return try loadDictionaryWords(from: dictionaryFile)
}
|
| 1023 |
+
|
| 1024 |
+
/// Read a dictionary file: one keyword per line, whitespace-trimmed, with
/// blank lines dropped.
private static func loadDictionaryWords(from url: URL) throws -> [String] {
    let raw = try String(contentsOf: url, encoding: .utf8)
    return raw.components(separatedBy: .newlines)
        .compactMap { line in
            let trimmed = line.trimmingCharacters(in: .whitespacesAndNewlines)
            return trimmed.isEmpty ? nil : trimmed
        }
}
|
| 1031 |
+
|
| 1032 |
+
/// Return the subset of dictionary words that occur as whole words
/// (case-insensitive, "\b"-bounded) in `text`. Results are lowercased.
private static func keywordsInText(_ text: String, dictionaryWords: [String]) -> Set<String> {
    let haystack = text.lowercased()
    let fullRange = NSRange(haystack.startIndex..., in: haystack)
    var found: Set<String> = []

    for candidate in dictionaryWords {
        let needle = candidate.lowercased()
        let pattern = "\\b\(NSRegularExpression.escapedPattern(for: needle))\\b"
        // Skip terms that cannot be compiled into a pattern.
        guard let matcher = try? NSRegularExpression(pattern: pattern, options: []) else { continue }
        if matcher.firstMatch(in: haystack, options: [], range: fullRange) != nil {
            found.insert(needle)
        }
    }
    return found
}
|
| 1047 |
+
}
|
| 1048 |
+
#endif
|
cli/HybridEarningsBenchmark.swift
ADDED
|
@@ -0,0 +1,554 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#if os(macOS)
|
| 2 |
+
import AVFoundation
|
| 3 |
+
import FluidAudio
|
| 4 |
+
import Foundation
|
| 5 |
+
|
| 6 |
+
/// Earnings22 benchmark using ONLY the Hybrid 110M model (single encoder).
|
| 7 |
+
/// CTC head provides both transcription AND keyword spotting from the same encoder.
|
| 8 |
+
public enum HybridEarningsBenchmark {
|
| 9 |
+
|
| 10 |
+
/// Scope of the dictionary keywords used for evaluation. `.chunk` uses each
/// audio chunk's own dictionary; `.file` appears to aggregate keywords per
/// parent recording (see `buildKeywordIndex`) — confirm against its usage.
/// Raw string values are printed in the benchmark's configuration summary.
private enum KeywordMode: String {
    case chunk
    case file
}
|
| 14 |
+
|
| 15 |
+
/// Entry point for the `hybrid-earnings-benchmark` subcommand.
///
/// Parses CLI flags, runs the Hybrid 110M model over the Earnings22 test set,
/// prints per-file and aggregate WER / keyword metrics, and writes a JSON report.
public static func runCLI(arguments: [String]) async {
    if arguments.contains("--help") || arguments.contains("-h") {
        printUsage()
        return
    }

    // Option defaults.
    var outputFile = "hybrid_earnings_benchmark.json"
    var maxFiles: Int? = nil
    var decodingMode: HybridDecodingMode = .tdt
    var useRescoring = false
    var keywordMode: KeywordMode = .chunk

    // Manual cursor so value-carrying flags can consume the next argument.
    var cursor = 0
    while cursor < arguments.count {
        switch arguments[cursor] {
        case "--output", "-o":
            if cursor + 1 < arguments.count {
                outputFile = arguments[cursor + 1]
                cursor += 1
            }
        case "--max-files":
            if cursor + 1 < arguments.count {
                maxFiles = Int(arguments[cursor + 1])
                cursor += 1
            }
        case "--ctc":
            decodingMode = .ctc
        case "--tdt":
            decodingMode = .tdt
        case "--rescore":
            useRescoring = true
        case "--keyword-mode":
            if cursor + 1 < arguments.count, let parsed = parseKeywordMode(arguments[cursor + 1]) {
                keywordMode = parsed
                cursor += 1
            }
        default:
            break  // Unknown flags are ignored.
        }
        cursor += 1
    }

    let dataDir = DatasetDownloader.getEarnings22Directory().appendingPathComponent("test-dataset")
    guard FileManager.default.fileExists(atPath: dataDir.path) else {
        print("ERROR: Earnings dataset not found at \(dataDir.path)")
        print("Download with: fluidaudio download --dataset earnings22-kws")
        return
    }

    let modeStr = decodingMode == .ctc ? "CTC" : "TDT"
    let rescoringStr = useRescoring ? " + Rescoring" : ""
    print("Hybrid 110M Earnings Benchmark (Decoding: \(modeStr)\(rescoringStr))")
    print(" Output file: \(outputFile)")
    print(" Decoding mode: \(modeStr)")
    print(" Rescoring: \(useRescoring ? "enabled" : "disabled")")
    print(" Keyword mode: \(keywordMode.rawValue)")

    do {
        // One encoder serves both transcription and keyword spotting.
        print("Loading Hybrid 110M model...")
        let hybridModels = try await HybridAsrModels.downloadAndLoad()
        let hybridManager = HybridAsrManager(models: hybridModels, decodingMode: decodingMode)
        let spotter = HybridKeywordSpotter(vocabulary: hybridModels.vocabulary, blankId: hybridModels.blankId)
        print(" Vocab size: \(hybridModels.vocabSize)")

        let fileIds = try collectFileIds(from: dataDir, maxFiles: maxFiles)
        let keywordIndex = try buildKeywordIndex(dataDir: dataDir, keywordMode: keywordMode)

        if fileIds.isEmpty {
            print("ERROR: No test files found")
            return
        }

        print("Processing \(fileIds.count) test files...")

        // Aggregate accumulators across all processed files.
        var fileResults: [[String: Any]] = []
        var werSum = 0.0
        var referenceKeywordTotal = 0
        var predictedKeywordTotal = 0
        var truePositiveTotal = 0
        var falsePositiveTotal = 0
        var falseNegativeTotal = 0
        var audioSecondsTotal = 0.0
        var processingSecondsTotal = 0.0

        for (index, fileId) in fileIds.enumerated() {
            print("[\(index + 1)/\(fileIds.count)] \(fileId)")

            guard let result = try await processFile(
                fileId: fileId,
                dataDir: dataDir,
                hybridManager: hybridManager,
                spotter: spotter,
                useRescoring: useRescoring,
                keywordMode: keywordMode,
                keywordIndex: keywordIndex
            ) else { continue }

            fileResults.append(result)
            werSum += result["wer"] as? Double ?? 0
            referenceKeywordTotal += result["keywordReference"] as? Int ?? 0
            predictedKeywordTotal += result["keywordPredicted"] as? Int ?? 0
            truePositiveTotal += result["keywordTruePositives"] as? Int ?? 0
            falsePositiveTotal += result["keywordFalsePositives"] as? Int ?? 0
            falseNegativeTotal += result["keywordFalseNegatives"] as? Int ?? 0
            audioSecondsTotal += result["audioLength"] as? Double ?? 0
            processingSecondsTotal += result["processingTime"] as? Double ?? 0

            let wer = result["wer"] as? Double ?? 0
            let precision = result["keywordPrecision"] as? Double ?? 0
            let recall = result["keywordRecall"] as? Double ?? 0
            let fscore = result["keywordFscore"] as? Double ?? 0
            print(
                " WER: \(String(format: "%.1f", wer))%, " +
                "KW P/R/F: \(String(format: "%.2f", precision))/" +
                "\(String(format: "%.2f", recall))/" +
                "\(String(format: "%.2f", fscore))"
            )
        }

        // Micro-averaged keyword metrics over all files.
        let avgWer = fileResults.isEmpty ? 0.0 : werSum / Double(fileResults.count)
        let keywordPrecision =
            predictedKeywordTotal > 0
            ? Double(truePositiveTotal) / Double(predictedKeywordTotal)
            : 0
        let keywordRecall =
            referenceKeywordTotal > 0
            ? Double(truePositiveTotal) / Double(referenceKeywordTotal)
            : 0
        let keywordFscore =
            (keywordPrecision + keywordRecall) > 0
            ? 2 * keywordPrecision * keywordRecall / (keywordPrecision + keywordRecall)
            : 0

        print("\n" + String(repeating: "=", count: 60))
        print("HYBRID 110M BENCHMARK (\(modeStr)\(rescoringStr))")
        print(String(repeating: "=", count: 60))
        print("Model: parakeet-tdt-ctc-110m-hybrid")
        print("Decoding: \(modeStr), Rescoring: \(useRescoring ? "yes" : "no")")
        print("Total tests: \(fileResults.count)")
        print("Average WER: \(String(format: "%.2f", avgWer))%")
        print(
            "Keyword Precision/Recall/F1: " +
            "\(String(format: "%.2f", keywordPrecision))/" +
            "\(String(format: "%.2f", keywordRecall))/" +
            "\(String(format: "%.2f", keywordFscore))"
        )
        print("Total audio: \(String(format: "%.1f", audioSecondsTotal))s")
        print("Total processing: \(String(format: "%.1f", processingSecondsTotal))s")
        if processingSecondsTotal > 0 {
            print("RTFx: \(String(format: "%.2f", audioSecondsTotal / processingSecondsTotal))x")
        }
        print(String(repeating: "=", count: 60))

        // Worst WER first, so regressions surface at the top of the report.
        let sortedResults = fileResults.sorted {
            (($0["wer"] as? Double) ?? 0) > (($1["wer"] as? Double) ?? 0)
        }

        let summaryDict: [String: Any] = [
            "totalTests": fileResults.count,
            "avgWer": round(avgWer * 100) / 100,
            "keywordTruePositives": truePositiveTotal,
            "keywordFalsePositives": falsePositiveTotal,
            "keywordFalseNegatives": falseNegativeTotal,
            "keywordPredicted": predictedKeywordTotal,
            "keywordReference": referenceKeywordTotal,
            "keywordPrecision": round(keywordPrecision * 1000) / 1000,
            "keywordRecall": round(keywordRecall * 1000) / 1000,
            "keywordFscore": round(keywordFscore * 1000) / 1000,
            "totalAudioDuration": round(audioSecondsTotal * 100) / 100,
            "totalProcessingTime": round(processingSecondsTotal * 100) / 100,
        ]

        let output: [String: Any] = [
            "model": "parakeet-tdt-ctc-110m-hybrid",
            "approach": "single-encoder",
            "decodingMode": modeStr,
            "rescoring": useRescoring,
            "keywordMode": keywordMode.rawValue,
            "summary": summaryDict,
            "results": sortedResults,
        ]

        let jsonData = try JSONSerialization.data(withJSONObject: output, options: [.prettyPrinted, .sortedKeys])
        try jsonData.write(to: URL(fileURLWithPath: outputFile))
        print("\nResults written to: \(outputFile)")

    } catch {
        print("ERROR: \(error)")
    }
}
|
| 214 |
+
|
| 215 |
+
/// Scans `dataDir` for `<id>.dictionary.txt` files and returns the ids
/// (sorted by path) whose dictionary file is non-empty, optionally capped
/// at `maxFiles`.
private static func collectFileIds(from dataDir: URL, maxFiles: Int?) throws -> [String] {
    let suffix = ".dictionary.txt"
    let entries = try FileManager.default
        .contentsOfDirectory(at: dataDir, includingPropertiesForKeys: nil)
        .sorted { $0.path < $1.path }

    var discovered: [String] = []
    for entry in entries {
        let name = entry.lastPathComponent
        guard name.hasSuffix(suffix) else { continue }
        // Skip chunks whose dictionary is missing or empty.
        guard let contents = try? Data(contentsOf: entry), !contents.isEmpty else { continue }
        discovered.append(String(name.dropLast(suffix.count)))
    }

    guard let limit = maxFiles else { return discovered }
    return Array(discovered.prefix(limit))
}
|
| 238 |
+
|
| 239 |
+
/// Runs the hybrid model on one test file and scores it.
///
/// Returns a JSON-ready dictionary of WER and keyword precision/recall/F1
/// metrics, or `nil` when required inputs are missing, audio cannot be
/// buffered, or the transcription comes back empty.
/// NOTE(review): `spotter` is accepted but unused here — keyword detections
/// come from `transcribeHybrid`; parameter kept for interface stability.
private static func processFile(
    fileId: String,
    dataDir: URL,
    hybridManager: HybridAsrManager,
    spotter: HybridKeywordSpotter,
    useRescoring: Bool,
    keywordMode: KeywordMode,
    keywordIndex: [String: [String]]
) async throws -> [String: Any]? {
    let wavFile = dataDir.appendingPathComponent("\(fileId).wav")
    let dictionaryFile = dataDir.appendingPathComponent("\(fileId).dictionary.txt")
    let textFile = dataDir.appendingPathComponent("\(fileId).text.txt")

    let fileManager = FileManager.default
    guard fileManager.fileExists(atPath: wavFile.path),
        fileManager.fileExists(atPath: dictionaryFile.path)
    else {
        return nil
    }

    // Keyword list for this chunk (or the parent file, in file mode).
    let dictionaryWords = try loadDictionaryWords(
        fileId: fileId,
        dictionaryFile: dictionaryFile,
        keywordMode: keywordMode,
        keywordIndex: keywordIndex
    )

    // A missing transcript file yields an empty reference (WER scored as 1.0 below).
    let referenceRaw =
        (try? String(contentsOf: textFile, encoding: .utf8))?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""

    // Read the full audio file into a PCM buffer.
    let audioFile = try AVAudioFile(forReading: wavFile)
    let audioLength = Double(audioFile.length) / audioFile.processingFormat.sampleRate
    let format = audioFile.processingFormat
    let frameCount = AVAudioFrameCount(audioFile.length)

    guard let pcmBuffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount) else {
        return nil
    }
    try audioFile.read(into: pcmBuffer)

    // Model expects 16 kHz samples.
    let audioConverter = AudioConverter()
    let samples = try audioConverter.resampleBuffer(pcmBuffer)

    // Dictionary words become the custom vocabulary for CTC spotting.
    let vocabTerms = dictionaryWords.map { word in
        CustomVocabularyTerm(
            text: word,
            weight: nil,
            aliases: nil,
            tokenIds: nil,
            ctcTokenIds: nil
        )
    }
    let customVocab = CustomVocabularyContext(terms: vocabTerms)

    // Single pass: TDT/CTC transcription plus CTC keyword detections.
    let rescorerConfig: HybridTextRescorer.Config? = useRescoring ? .default : nil
    let hybridResult = try await hybridManager.transcribeHybrid(
        audioSamples: samples,
        customVocabulary: customVocab,
        rescorerConfig: rescorerConfig
    )

    guard !hybridResult.text.isEmpty else {
        print(" SKIPPED: Empty transcription")
        return nil
    }

    let detections = hybridResult.keywordDetections
    let processingTime = hybridResult.processingTime
    let hypothesis = hybridResult.text

    let referenceNormalized = TextNormalizer.normalize(referenceRaw)
    let hypothesisNormalized = TextNormalizer.normalize(hypothesis)

    // Set-based keyword precision/recall over the normalized texts.
    let referenceKeywords = keywordsInText(referenceNormalized, dictionaryWords: dictionaryWords)
    let predictedKeywords = keywordsInText(hypothesisNormalized, dictionaryWords: dictionaryWords)
    let truePositives = referenceKeywords.intersection(predictedKeywords)
    let falsePositives = predictedKeywords.subtracting(referenceKeywords)
    let falseNegatives = referenceKeywords.subtracting(predictedKeywords)
    let keywordPrecision = predictedKeywords.isEmpty ? 0 : Double(truePositives.count) / Double(predictedKeywords.count)
    let keywordRecall = referenceKeywords.isEmpty ? 0 : Double(truePositives.count) / Double(referenceKeywords.count)
    let keywordFscore =
        (keywordPrecision + keywordRecall) > 0
        ? 2 * keywordPrecision * keywordRecall / (keywordPrecision + keywordRecall)
        : 0

    let referenceWords = referenceNormalized
        .components(separatedBy: CharacterSet.whitespacesAndNewlines)
        .filter { !$0.isEmpty }
    let hypothesisWords = hypothesisNormalized
        .components(separatedBy: CharacterSet.whitespacesAndNewlines)
        .filter { !$0.isEmpty }

    let wer: Double
    if referenceWords.isEmpty {
        wer = hypothesisWords.isEmpty ? 0.0 : 1.0
    } else {
        wer = calculateWER(reference: referenceWords, hypothesis: hypothesisWords)
    }

    // Per-detection debug records; detections below this CTC log-score are
    // still recorded but not counted as "found".
    let minCtcScore: Float = -15.0
    var detectionDetails: [[String: Any]] = []
    var foundWords: Set<String> = []

    for detection in detections {
        let inRef = referenceKeywords.contains(detection.term.text.lowercased())
        detectionDetails.append([
            "word": detection.term.text,
            "score": round(Double(detection.score) * 100) / 100,
            "startTime": round(detection.startTime * 100) / 100,
            "endTime": round(detection.endTime * 100) / 100,
            "source": "ctc",
            "inReference": inRef,
        ])

        if detection.score >= minCtcScore {
            foundWords.insert(detection.term.text.lowercased())
        }
    }

    // Fallback: credit dictionary words that the transcription contains even
    // though the CTC spotter missed them.
    let hypothesisLower = hypothesis.lowercased()
    for word in dictionaryWords {
        let wordLower = word.lowercased()
        guard !foundWords.contains(wordLower) else { continue }
        let pattern = "\\b\(NSRegularExpression.escapedPattern(for: wordLower))\\b"
        guard let regex = try? NSRegularExpression(pattern: pattern, options: []),
            regex.firstMatch(
                in: hypothesisLower, options: [],
                range: NSRange(hypothesisLower.startIndex..., in: hypothesisLower)) != nil
        else { continue }

        foundWords.insert(wordLower)
        detectionDetails.append([
            "word": word,
            "score": 0.0,
            "startTime": 0.0,
            "endTime": 0.0,
            "source": "hypothesis",
            "inReference": referenceKeywords.contains(wordLower),
        ])
    }

    let result: [String: Any] = [
        "fileId": fileId,
        "reference": referenceNormalized,
        "hypothesis": hypothesisNormalized,
        "wer": round(wer * 10000) / 100,  // fraction -> percent, 2 decimals
        "dictFound": predictedKeywords.count,
        "dictTotal": referenceKeywords.count,
        "keywordPredicted": predictedKeywords.count,
        "keywordReference": referenceKeywords.count,
        "keywordTruePositives": truePositives.count,
        "keywordFalsePositives": falsePositives.count,
        "keywordFalseNegatives": falseNegatives.count,
        "keywordPrecision": round(keywordPrecision * 1000) / 1000,
        "keywordRecall": round(keywordRecall * 1000) / 1000,
        "keywordFscore": round(keywordFscore * 1000) / 1000,
        "audioLength": round(audioLength * 100) / 100,
        "processingTime": round(processingTime * 1000) / 1000,
        "ctcDetections": detectionDetails,
    ]
    return result
}
|
| 422 |
+
|
| 423 |
+
/// Word error rate: word-level Levenshtein edit distance divided by the
/// reference length. Substitution, insertion, and deletion each cost 1.
///
/// - Parameters:
///   - reference: ground-truth words (normalized, whitespace-split).
///   - hypothesis: transcribed words (normalized, whitespace-split).
/// - Returns: edits / reference.count; 0.0 when both are empty, 1.0 when
///   only the reference is empty.
private static func calculateWER(reference: [String], hypothesis: [String]) -> Double {
    if reference.isEmpty {
        return hypothesis.isEmpty ? 0.0 : 1.0
    }

    let m = reference.count
    let n = hypothesis.count

    // Two-row DP: each row only depends on the previous one, so O(n) memory
    // instead of the full O(m*n) table. Results are identical.
    var previous = Array(0...n)
    var current = Array(repeating: 0, count: n + 1)

    for i in 1...m {
        current[0] = i
        for j in 1...n {
            if reference[i - 1] == hypothesis[j - 1] {
                current[j] = previous[j - 1]
            } else {
                current[j] = min(previous[j - 1], previous[j], current[j - 1]) + 1
            }
        }
        swap(&previous, &current)
    }

    // After the final swap, `previous` holds the last computed row.
    return Double(previous[n]) / Double(m)
}
|
| 447 |
+
|
| 448 |
+
/// Prints CLI help text.
///
/// Fix: the previous text omitted the `--ctc`, `--tdt`, and `--rescore`
/// flags even though `runCLI` parses them; they are now documented.
private static func printUsage() {
    print(
        """
        Hybrid 110M Earnings Benchmark (Single Encoder)

        Usage: fluidaudio hybrid-earnings-benchmark [options]

        This benchmark uses ONLY the Hybrid 110M model:
          - Single encoder provides CTC log-probs
          - CTC greedy decode for transcription
          - CTC keyword spotting from same encoder output

        Options:
          --max-files <n>         Maximum number of files to process
          --output, -o <path>     Output JSON file (default: hybrid_earnings_benchmark.json)
          --keyword-mode <mode>   Keyword mode: chunk or file (default: chunk)
          --tdt                   Use TDT decoding for transcription (default)
          --ctc                   Use CTC greedy decoding for transcription
          --rescore               Enable hybrid text rescoring

        Compare with:
          fluidaudio ctc-earnings-benchmark  (Canary-CTC + TDT 0.6B, two encoders)
        """)
}
|
| 469 |
+
|
| 470 |
+
/// Maps a CLI string to a `KeywordMode`, accepting both short and long
/// spellings case-insensitively; returns `nil` for anything else.
private static func parseKeywordMode(_ value: String) -> KeywordMode? {
    let normalized = value.lowercased()
    if normalized == "chunk" || normalized == "chunk-keywords" {
        return .chunk
    }
    if normalized == "file" || normalized == "file-keywords" {
        return .file
    }
    return nil
}
|
| 480 |
+
|
| 481 |
+
/// Strips the `_chunk…` suffix: `"<parent>_chunkNNN"` -> `"<parent>"`.
/// Ids without a chunk marker map to themselves.
private static func parentId(from fileId: String) -> String {
    if let markerRange = fileId.range(of: "_chunk") {
        return String(fileId[..<markerRange.lowerBound])
    }
    return fileId
}
|
| 487 |
+
|
| 488 |
+
/// Builds a parent-file -> sorted keyword list index by unioning every
/// chunk's dictionary. Only needed in `.file` mode; returns `[:]` otherwise.
private static func buildKeywordIndex(dataDir: URL, keywordMode: KeywordMode) throws -> [String: [String]] {
    guard keywordMode == .file else { return [:] }

    let suffix = ".dictionary.txt"
    var grouped: [String: Set<String>] = [:]

    for url in try FileManager.default.contentsOfDirectory(at: dataDir, includingPropertiesForKeys: nil) {
        let name = url.lastPathComponent
        guard name.hasSuffix(suffix) else { continue }
        let chunkId = String(name.dropLast(suffix.count))
        let words = try loadDictionaryWords(from: url)
        grouped[parentId(from: chunkId), default: Set<String>()].formUnion(words)
    }

    // Sorted arrays keep the JSON output deterministic.
    return grouped.mapValues { Array($0).sorted() }
}
|
| 511 |
+
|
| 512 |
+
/// Resolves the keyword list for one chunk: in `.file` mode, the parent's
/// aggregated keywords when indexed; otherwise (or on a lookup miss) the
/// chunk's own dictionary file.
private static func loadDictionaryWords(
    fileId: String,
    dictionaryFile: URL,
    keywordMode: KeywordMode,
    keywordIndex: [String: [String]]
) throws -> [String] {
    if keywordMode == .file, let aggregated = keywordIndex[parentId(from: fileId)] {
        return aggregated
    }
    return try loadDictionaryWords(from: dictionaryFile)
}
|
| 529 |
+
|
| 530 |
+
/// Reads a dictionary file: one keyword per line, trimmed, blanks dropped.
private static func loadDictionaryWords(from url: URL) throws -> [String] {
    try String(contentsOf: url, encoding: .utf8)
        .components(separatedBy: .newlines)
        .compactMap { line in
            let trimmed = line.trimmingCharacters(in: .whitespacesAndNewlines)
            return trimmed.isEmpty ? nil : trimmed
        }
}
|
| 537 |
+
|
| 538 |
+
/// Returns the lowercased dictionary words that appear in `text` as whole
/// words (regex `\b` boundaries, so "art" does not fire inside "party").
private static func keywordsInText(_ text: String, dictionaryWords: [String]) -> Set<String> {
    let haystack = text.lowercased()
    // The search range is loop-invariant; compute it once.
    let searchRange = NSRange(haystack.startIndex..., in: haystack)
    var matches: Set<String> = []

    for word in dictionaryWords {
        let needle = word.lowercased()
        let pattern = "\\b\(NSRegularExpression.escapedPattern(for: needle))\\b"
        guard let regex = try? NSRegularExpression(pattern: pattern, options: []) else { continue }
        if regex.firstMatch(in: haystack, options: [], range: searchRange) != nil {
            matches.insert(needle)
        }
    }
    return matches
}
|
| 553 |
+
}
|
| 554 |
+
#endif
|
convert/.DS_Store
ADDED
|
Binary file (10.2 kB). View file
|
|
|
convert/parakeet-tdt-ctc-110m/convert_tdt_decoder.py
ADDED
|
@@ -0,0 +1,323 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Convert Parakeet TDT-CTC 110M decoder components to CoreML.
|
| 4 |
+
|
| 5 |
+
This script exports the TDT decoder (prediction network) and joint network
|
| 6 |
+
with the SAME format as the working 0.6B model:
|
| 7 |
+
- JointDecision outputs token_id, token_prob, duration (argmax done inside)
|
| 8 |
+
- Uses shape [1, dim, 1] for encoder/decoder steps
|
| 9 |
+
- Matches the interface expected by TdtDecoderV3
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import argparse
|
| 13 |
+
import os
|
| 14 |
+
import torch
|
| 15 |
+
import torch.nn.functional as F
|
| 16 |
+
import coremltools as ct
|
| 17 |
+
import numpy as np
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
|
| 20 |
+
# NeMo imports
|
| 21 |
+
import nemo.collections.asr as nemo_asr
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def get_model_config(model):
    """Extract the encoder/decoder/joint dimensions needed for CoreML export.

    Missing attributes fall back to parakeet-style defaults.

    Returns a dict with keys: encoder_dim, pred_hidden, num_layers,
    vocab_size, num_durations.
    """
    config = {
        'encoder_dim': None,
        'pred_hidden': 640,  # Default for parakeet models
        'num_layers': 1,
        'vocab_size': 1024,
        'num_durations': 5,
    }

    # Encoder width: newer NeMo encoders expose d_model, older ones _feat_out.
    encoder = getattr(model, 'encoder', None)
    if encoder is not None:
        if hasattr(encoder, 'd_model'):
            config['encoder_dim'] = encoder.d_model
        elif hasattr(encoder, '_feat_out'):
            config['encoder_dim'] = encoder._feat_out

    decoder = getattr(model, 'decoder', None)
    if decoder is not None:
        if hasattr(decoder, 'pred_hidden'):
            config['pred_hidden'] = decoder.pred_hidden
        if hasattr(decoder, 'pred_rnn_layers'):
            config['num_layers'] = decoder.pred_rnn_layers

    joint = getattr(model, 'joint', None)
    if joint is not None:
        # num_durations must be read first: vocab_size is derived from it.
        if hasattr(joint, 'num_extra_outputs'):
            config['num_durations'] = joint.num_extra_outputs
        if hasattr(joint, 'num_classes'):
            config['vocab_size'] = joint.num_classes - config['num_durations']

    return config
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class DecoderWrapper(torch.nn.Module):
    """Wraps the RNNT/TDT prediction network for CoreML tracing.

    Mirrors the 0.6B export interface:
      inputs  : targets[1,1], target_lengths[1], h_in/c_in[num_layers,1,pred_hidden]
      outputs : decoder_output[1,pred_hidden,2], h_out, c_out
    """

    def __init__(self, decoder, pred_hidden):
        super().__init__()
        self.decoder = decoder
        self.pred_hidden = pred_hidden

    def forward(self, targets, target_lengths, h_in, c_in):
        """Run one prediction-network step.

        Args:
            targets: [1, 1] previous token id.
            target_lengths: [1] (unused by predict(); kept for the traced interface).
            h_in, c_in: [num_layers, 1, pred_hidden] LSTM state.

        Returns:
            (decoder_output [1, pred_hidden, 2], h_out, c_out)
        """
        pred, (h_out, c_out) = self.decoder.predict(
            targets, state=(h_in, c_in), add_sos=False
        )

        # predict() yields [1, 1, pred_hidden]; the 0.6B decoder emits
        # [1, pred_hidden, 2], so transpose and duplicate the single step.
        step = pred.transpose(1, 2)
        return torch.cat((step, step), dim=2), h_out, c_out
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
class JointWrapper(torch.nn.Module):
    """Wraps the TDT joint network and performs argmax internally.

    Mirrors the 0.6B export interface:
      inputs  : encoder_step[1,encoder_dim,1], decoder_step[1,pred_hidden,1]
      outputs : token_id[1,1,1], token_prob[1,1,1], duration[1,1,1]
    """

    def __init__(self, joint, vocab_size, num_durations=5):
        super().__init__()
        self.joint = joint
        self.vocab_size = vocab_size
        self.num_durations = num_durations

    def forward(self, encoder_step, decoder_step):
        """Run the joint network once and pick the best token and duration.

        Args:
            encoder_step: [1, encoder_dim, 1]
            decoder_step: [1, pred_hidden, 1]

        Returns:
            (token_id int [1,1,1], token_prob [1,1,1], duration int [1,1,1])
        """
        # The joint expects [batch, time, dim] inputs.
        joint_out = self.joint.joint(
            encoder_step.transpose(1, 2), decoder_step.transpose(1, 2)
        )

        # One-time shape dump to sanity-check the export.
        if not hasattr(self, '_debug_printed'):
            self._debug_printed = True
            print(f" Joint output shape: {joint_out.shape}")
            print(f" Expected: vocab={self.vocab_size} + blank=1 + durations={self.num_durations} = {self.vocab_size + 1 + self.num_durations}")

        # Last-axis layout: vocab tokens (0..vocab_size-1), blank (vocab_size),
        # then num_durations duration bins.
        num_tokens = self.vocab_size + 1
        token_logits = joint_out[..., :num_tokens]
        duration_logits = joint_out[..., num_tokens:]

        token_id = torch.argmax(token_logits, dim=-1, keepdim=True)  # [1,1,1,1]
        token_prob = torch.gather(F.softmax(token_logits, dim=-1), -1, token_id)
        token_id = token_id.squeeze(-1)      # [1,1,1]
        token_prob = token_prob.squeeze(-1)  # [1,1,1]
        duration = torch.argmax(duration_logits, dim=-1)  # [1,1,1]

        return token_id.int(), token_prob, duration.int()
|
| 164 |
+
|
| 165 |
+
|
| 166 |
+
def convert_decoder(model, config, output_dir: Path):
    """Trace the TDT decoder wrapper and save it as Decoder.mlpackage.

    Returns the converted CoreML model.
    """
    print(f"Converting Decoder...")
    print(f" pred_hidden={config['pred_hidden']}, num_layers={config['num_layers']}")

    wrapper = DecoderWrapper(model.decoder, config['pred_hidden'])
    wrapper.eval()

    layers = config['num_layers']
    hidden = config['pred_hidden']

    # Dummy inputs that pin the traced shapes.
    targets = torch.zeros(1, 1, dtype=torch.long)
    target_lengths = torch.ones(1, dtype=torch.long)
    h_in = torch.zeros(layers, 1, hidden)
    c_in = torch.zeros(layers, 1, hidden)

    with torch.no_grad():
        traced = torch.jit.trace(wrapper, (targets, target_lengths, h_in, c_in))

    mlmodel = ct.convert(
        traced,
        inputs=[
            ct.TensorType(name="targets", shape=(1, 1), dtype=np.int32),
            ct.TensorType(name="target_lengths", shape=(1,), dtype=np.int32),
            ct.TensorType(name="h_in", shape=(layers, 1, hidden), dtype=np.float32),
            ct.TensorType(name="c_in", shape=(layers, 1, hidden), dtype=np.float32),
        ],
        outputs=[
            ct.TensorType(name="decoder_output"),
            ct.TensorType(name="h_out"),
            ct.TensorType(name="c_out"),
        ],
        minimum_deployment_target=ct.target.iOS17,
        compute_precision=ct.precision.FLOAT16,
    )

    mlmodel.author = "Fluid Inference"
    mlmodel.short_description = "Hybrid TDT Decoder (110M)"

    output_path = output_dir / "Decoder.mlpackage"
    mlmodel.save(str(output_path))
    print(f" Saved to {output_path}")

    return mlmodel
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def convert_joint(model, config, output_dir: Path):
    """Trace the TDT joint network plus decision head and export to CoreML.

    ``JointWrapper`` fuses the joint network with the greedy decision step,
    so one CoreML call on a single (encoder frame, decoder state) pair yields
    the token id, its probability, and the predicted duration.  The FP16
    mlprogram targets iOS 17 and is saved as ``JointDecision.mlpackage``.
    """
    encoder_dim = config['encoder_dim']
    pred_hidden = config['pred_hidden']
    vocab_size = config['vocab_size']
    num_durations = config['num_durations']

    print("Converting JointDecision...")
    print(f" encoder_dim={encoder_dim}, pred_hidden={pred_hidden}")
    print(f" vocab_size={vocab_size}, num_durations={num_durations}")

    joint_wrapper = JointWrapper(
        model.joint,
        vocab_size=vocab_size,
        num_durations=num_durations,
    )
    joint_wrapper.eval()

    # Example tracing inputs: one time-step each, shaped [1, dim, 1].
    enc_example = torch.randn(1, encoder_dim, 1)
    dec_example = torch.randn(1, pred_hidden, 1)

    with torch.no_grad():
        traced_module = torch.jit.trace(joint_wrapper, (enc_example, dec_example))

    # Convert the traced module to a CoreML mlprogram.
    mlmodel = ct.convert(
        traced_module,
        inputs=[
            ct.TensorType(name="encoder_step", shape=(1, encoder_dim, 1), dtype=np.float32),
            ct.TensorType(name="decoder_step", shape=(1, pred_hidden, 1), dtype=np.float32),
        ],
        outputs=[
            ct.TensorType(name="token_id"),
            ct.TensorType(name="token_prob"),
            ct.TensorType(name="duration"),
        ],
        minimum_deployment_target=ct.target.iOS17,
        compute_precision=ct.precision.FLOAT16,
    )

    # Embed provenance metadata before saving.
    mlmodel.author = "Fluid Inference"
    mlmodel.short_description = "Hybrid Joint Decision (110M)"

    save_path = output_dir / "JointDecision.mlpackage"
    mlmodel.save(str(save_path))
    print(f" Saved to {save_path}")

    return mlmodel
def main():
    """CLI entry point: load the NeMo model and export decoder + joint to CoreML."""
    parser = argparse.ArgumentParser(description="Convert TDT decoder to CoreML (0.6B format)")
    parser.add_argument(
        "--model-name",
        default="nvidia/parakeet-tdt_ctc-110m",
        help="NeMo model name or path",
    )
    parser.add_argument(
        "--output-dir",
        type=Path,
        default=Path("./output"),
        help="Output directory for CoreML models",
    )
    args = parser.parse_args()

    # Make sure the destination exists before any conversion runs.
    args.output_dir.mkdir(parents=True, exist_ok=True)

    print(f"Loading model: {args.model_name}")
    model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(args.model_name)
    model.eval()

    config = get_model_config(model)

    # Fall back to probing the encoder with one second of dummy audio when the
    # config does not expose the encoder output dimension directly.
    if config['encoder_dim'] is None:
        print("Auto-detecting encoder dimension...")
        probe_audio = torch.randn(1, 16000)
        probe_length = torch.tensor([16000])
        with torch.no_grad():
            enc_out, enc_len = model.encoder(
                audio_signal=probe_audio,
                length=probe_length,
            )
        config['encoder_dim'] = enc_out.shape[-1]

    print("\nModel config:")
    for key, value in config.items():
        print(f" {key}: {value}")

    # Export both components.
    print()
    convert_decoder(model, config, args.output_dir)
    convert_joint(model, config, args.output_dir)

    print("\nConversion complete!")
    print(f"Models saved to: {args.output_dir}")
    print("\nNext steps:")
    print("1. Compile to .mlmodelc:")
    print(f" cd {args.output_dir}")
    print(" xcrun coremlcompiler compile Decoder.mlpackage .")
    print(" xcrun coremlcompiler compile JointDecision.mlpackage .")
    print("2. Copy to model cache:")
    print(" cp -r Decoder.mlmodelc JointDecision.mlmodelc ~/Library/Application\\ Support/FluidAudio/Models/parakeet-ctc-110m-coreml/")
    print("3. Test with: swift run fluidaudio hybrid-earnings-benchmark --max-files 1")


if __name__ == "__main__":
    main()
|
convert/parakeet-tdt-ctc-110m/coreml/audio/yc_first_minute_16k_15s.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c79c8bc763b4efccb3e12f199ec0a59aa2edc5e9e4d21ca70fde8f36762d4147
|
| 3 |
+
size 480078
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fc681823d92eca3dbece3a30c975afa7251eedae0e718b07ffbf1a8b4313b87e
|
| 3 |
+
size 243
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2ebec8fc38c063de4b2159e21b1f981309fa5947c24d7e4883aca20f7c15fbb9
|
| 3 |
+
size 377
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/metadata.json
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"metadataOutputVersion" : "3.0",
|
| 4 |
+
"shortDescription" : "Parakeet 110M CTC decoder head",
|
| 5 |
+
"outputSchema" : [
|
| 6 |
+
{
|
| 7 |
+
"hasShapeFlexibility" : "0",
|
| 8 |
+
"isOptional" : "0",
|
| 9 |
+
"dataType" : "Float32",
|
| 10 |
+
"formattedType" : "MultiArray (Float32 1 × 188 × 1025)",
|
| 11 |
+
"shortDescription" : "",
|
| 12 |
+
"shape" : "[1, 188, 1025]",
|
| 13 |
+
"name" : "ctc_logits",
|
| 14 |
+
"type" : "MultiArray"
|
| 15 |
+
}
|
| 16 |
+
],
|
| 17 |
+
"storagePrecision" : "Float16",
|
| 18 |
+
"modelParameters" : [
|
| 19 |
+
|
| 20 |
+
],
|
| 21 |
+
"author" : "Fluid Inference",
|
| 22 |
+
"specificationVersion" : 8,
|
| 23 |
+
"mlProgramOperationTypeHistogram" : {
|
| 24 |
+
"Ios17.cast" : 2,
|
| 25 |
+
"Ios17.conv" : 1,
|
| 26 |
+
"Ios17.transpose" : 1,
|
| 27 |
+
"Ios16.softmax" : 1,
|
| 28 |
+
"Ios17.log" : 1
|
| 29 |
+
},
|
| 30 |
+
"computePrecision" : "Mixed (Float16, Float32, Int32)",
|
| 31 |
+
"isUpdatable" : "0",
|
| 32 |
+
"stateSchema" : [
|
| 33 |
+
|
| 34 |
+
],
|
| 35 |
+
"availability" : {
|
| 36 |
+
"macOS" : "14.0",
|
| 37 |
+
"tvOS" : "17.0",
|
| 38 |
+
"visionOS" : "1.0",
|
| 39 |
+
"watchOS" : "10.0",
|
| 40 |
+
"iOS" : "17.0",
|
| 41 |
+
"macCatalyst" : "17.0"
|
| 42 |
+
},
|
| 43 |
+
"modelType" : {
|
| 44 |
+
"name" : "MLModelType_mlProgram"
|
| 45 |
+
},
|
| 46 |
+
"inputSchema" : [
|
| 47 |
+
{
|
| 48 |
+
"hasShapeFlexibility" : "0",
|
| 49 |
+
"isOptional" : "0",
|
| 50 |
+
"dataType" : "Float32",
|
| 51 |
+
"formattedType" : "MultiArray (Float32 1 × 512 × 188)",
|
| 52 |
+
"shortDescription" : "",
|
| 53 |
+
"shape" : "[1, 512, 188]",
|
| 54 |
+
"name" : "encoder_output",
|
| 55 |
+
"type" : "MultiArray"
|
| 56 |
+
}
|
| 57 |
+
],
|
| 58 |
+
"userDefinedMetadata" : {
|
| 59 |
+
"com.github.apple.coremltools.source_dialect" : "TorchScript",
|
| 60 |
+
"com.github.apple.coremltools.source" : "torch==2.9.0",
|
| 61 |
+
"com.github.apple.coremltools.version" : "8.3.0"
|
| 62 |
+
},
|
| 63 |
+
"generatedClassName" : "parakeet_ctc_head",
|
| 64 |
+
"method" : "predict"
|
| 65 |
+
}
|
| 66 |
+
]
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/model.mil
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
program(1.0)
|
| 2 |
+
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
|
| 3 |
+
{
|
| 4 |
+
func main<ios17>(tensor<fp32, [1, 512, 188]> encoder_output) {
|
| 5 |
+
tensor<int32, []> var_4 = const()[name = tensor<string, []>("op_4"), val = tensor<int32, []>(-1)];
|
| 6 |
+
tensor<string, []> var_18_pad_type_0 = const()[name = tensor<string, []>("op_18_pad_type_0"), val = tensor<string, []>("valid")];
|
| 7 |
+
tensor<int32, [1]> var_18_strides_0 = const()[name = tensor<string, []>("op_18_strides_0"), val = tensor<int32, [1]>([1])];
|
| 8 |
+
tensor<int32, [2]> var_18_pad_0 = const()[name = tensor<string, []>("op_18_pad_0"), val = tensor<int32, [2]>([0, 0])];
|
| 9 |
+
tensor<int32, [1]> var_18_dilations_0 = const()[name = tensor<string, []>("op_18_dilations_0"), val = tensor<int32, [1]>([1])];
|
| 10 |
+
tensor<int32, []> var_18_groups_0 = const()[name = tensor<string, []>("op_18_groups_0"), val = tensor<int32, []>(1)];
|
| 11 |
+
tensor<string, []> encoder_output_to_fp16_dtype_0 = const()[name = tensor<string, []>("encoder_output_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
|
| 12 |
+
tensor<fp16, [1025, 512, 1]> module_decoder_layers_0_weight_to_fp16 = const()[name = tensor<string, []>("module_decoder_layers_0_weight_to_fp16"), val = tensor<fp16, [1025, 512, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
|
| 13 |
+
tensor<fp16, [1025]> module_decoder_layers_0_bias_to_fp16 = const()[name = tensor<string, []>("module_decoder_layers_0_bias_to_fp16"), val = tensor<fp16, [1025]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1049728)))];
|
| 14 |
+
tensor<fp16, [1, 512, 188]> encoder_output_to_fp16 = cast(dtype = encoder_output_to_fp16_dtype_0, x = encoder_output)[name = tensor<string, []>("cast_1")];
|
| 15 |
+
tensor<fp16, [1, 1025, 188]> var_18_cast_fp16 = conv(bias = module_decoder_layers_0_bias_to_fp16, dilations = var_18_dilations_0, groups = var_18_groups_0, pad = var_18_pad_0, pad_type = var_18_pad_type_0, strides = var_18_strides_0, weight = module_decoder_layers_0_weight_to_fp16, x = encoder_output_to_fp16)[name = tensor<string, []>("op_18_cast_fp16")];
|
| 16 |
+
tensor<int32, [3]> input_perm_0 = const()[name = tensor<string, []>("input_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
|
| 17 |
+
tensor<fp16, [1, 188, 1025]> input_cast_fp16 = transpose(perm = input_perm_0, x = var_18_cast_fp16)[name = tensor<string, []>("transpose_0")];
|
| 18 |
+
tensor<fp16, [1, 188, 1025]> out_objects_softmax_cast_fp16 = softmax(axis = var_4, x = input_cast_fp16)[name = tensor<string, []>("out_objects_softmax_cast_fp16")];
|
| 19 |
+
tensor<fp32, []> out_objects_epsilon_0 = const()[name = tensor<string, []>("out_objects_epsilon_0"), val = tensor<fp32, []>(0x1p-149)];
|
| 20 |
+
tensor<fp16, [1, 188, 1025]> out_objects_cast_fp16 = log(epsilon = out_objects_epsilon_0, x = out_objects_softmax_cast_fp16)[name = tensor<string, []>("out_objects_cast_fp16")];
|
| 21 |
+
tensor<string, []> out_objects_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("out_objects_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
|
| 22 |
+
tensor<fp32, [1, 188, 1025]> ctc_logits = cast(dtype = out_objects_cast_fp16_to_fp32_dtype_0, x = out_objects_cast_fp16)[name = tensor<string, []>("cast_0")];
|
| 23 |
+
} -> (ctc_logits);
|
| 24 |
+
}
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fb9bead064427ffcb7529c0e3f378e421b4dde8e6d81447b6d1ca3352ca850e1
|
| 3 |
+
size 1051842
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:990455f6431342750254f66edf27bfb41be62a7ba17a18e1dd6afd4f5f56e9eb
|
| 3 |
+
size 243
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:29009727821ad8551ab5fe9271e93c597d92a9714f64b94aa533a9ceb6e22b93
|
| 3 |
+
size 498
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/metadata.json
ADDED
|
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"metadataOutputVersion" : "3.0",
|
| 4 |
+
"shortDescription" : "Parakeet 110M decoder (RNNT prediction network)",
|
| 5 |
+
"outputSchema" : [
|
| 6 |
+
{
|
| 7 |
+
"hasShapeFlexibility" : "0",
|
| 8 |
+
"isOptional" : "0",
|
| 9 |
+
"dataType" : "Float32",
|
| 10 |
+
"formattedType" : "MultiArray (Float32 1 × 640 × 1)",
|
| 11 |
+
"shortDescription" : "",
|
| 12 |
+
"shape" : "[1, 640, 1]",
|
| 13 |
+
"name" : "decoder",
|
| 14 |
+
"type" : "MultiArray"
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"hasShapeFlexibility" : "0",
|
| 18 |
+
"isOptional" : "0",
|
| 19 |
+
"dataType" : "Float32",
|
| 20 |
+
"formattedType" : "MultiArray (Float32 1 × 1 × 640)",
|
| 21 |
+
"shortDescription" : "",
|
| 22 |
+
"shape" : "[1, 1, 640]",
|
| 23 |
+
"name" : "h_out",
|
| 24 |
+
"type" : "MultiArray"
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"hasShapeFlexibility" : "0",
|
| 28 |
+
"isOptional" : "0",
|
| 29 |
+
"dataType" : "Float32",
|
| 30 |
+
"formattedType" : "MultiArray (Float32 1 × 1 × 640)",
|
| 31 |
+
"shortDescription" : "",
|
| 32 |
+
"shape" : "[1, 1, 640]",
|
| 33 |
+
"name" : "c_out",
|
| 34 |
+
"type" : "MultiArray"
|
| 35 |
+
}
|
| 36 |
+
],
|
| 37 |
+
"storagePrecision" : "Float16",
|
| 38 |
+
"modelParameters" : [
|
| 39 |
+
|
| 40 |
+
],
|
| 41 |
+
"author" : "Fluid Inference",
|
| 42 |
+
"specificationVersion" : 8,
|
| 43 |
+
"mlProgramOperationTypeHistogram" : {
|
| 44 |
+
"Ios17.squeeze" : 2,
|
| 45 |
+
"Ios17.gather" : 1,
|
| 46 |
+
"Ios17.cast" : 6,
|
| 47 |
+
"Ios17.lstm" : 1,
|
| 48 |
+
"Ios17.transpose" : 2,
|
| 49 |
+
"Identity" : 1,
|
| 50 |
+
"Ios17.expandDims" : 2
|
| 51 |
+
},
|
| 52 |
+
"computePrecision" : "Mixed (Float16, Float32, Int16, Int32)",
|
| 53 |
+
"isUpdatable" : "0",
|
| 54 |
+
"stateSchema" : [
|
| 55 |
+
|
| 56 |
+
],
|
| 57 |
+
"availability" : {
|
| 58 |
+
"macOS" : "14.0",
|
| 59 |
+
"tvOS" : "17.0",
|
| 60 |
+
"visionOS" : "1.0",
|
| 61 |
+
"watchOS" : "10.0",
|
| 62 |
+
"iOS" : "17.0",
|
| 63 |
+
"macCatalyst" : "17.0"
|
| 64 |
+
},
|
| 65 |
+
"modelType" : {
|
| 66 |
+
"name" : "MLModelType_mlProgram"
|
| 67 |
+
},
|
| 68 |
+
"inputSchema" : [
|
| 69 |
+
{
|
| 70 |
+
"hasShapeFlexibility" : "0",
|
| 71 |
+
"isOptional" : "0",
|
| 72 |
+
"dataType" : "Int32",
|
| 73 |
+
"formattedType" : "MultiArray (Int32 1 × 1)",
|
| 74 |
+
"shortDescription" : "",
|
| 75 |
+
"shape" : "[1, 1]",
|
| 76 |
+
"name" : "targets",
|
| 77 |
+
"type" : "MultiArray"
|
| 78 |
+
},
|
| 79 |
+
{
|
| 80 |
+
"hasShapeFlexibility" : "0",
|
| 81 |
+
"isOptional" : "0",
|
| 82 |
+
"dataType" : "Int32",
|
| 83 |
+
"formattedType" : "MultiArray (Int32 1)",
|
| 84 |
+
"shortDescription" : "",
|
| 85 |
+
"shape" : "[1]",
|
| 86 |
+
"name" : "target_length",
|
| 87 |
+
"type" : "MultiArray"
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"hasShapeFlexibility" : "0",
|
| 91 |
+
"isOptional" : "0",
|
| 92 |
+
"dataType" : "Float32",
|
| 93 |
+
"formattedType" : "MultiArray (Float32 1 × 1 × 640)",
|
| 94 |
+
"shortDescription" : "",
|
| 95 |
+
"shape" : "[1, 1, 640]",
|
| 96 |
+
"name" : "h_in",
|
| 97 |
+
"type" : "MultiArray"
|
| 98 |
+
},
|
| 99 |
+
{
|
| 100 |
+
"hasShapeFlexibility" : "0",
|
| 101 |
+
"isOptional" : "0",
|
| 102 |
+
"dataType" : "Float32",
|
| 103 |
+
"formattedType" : "MultiArray (Float32 1 × 1 × 640)",
|
| 104 |
+
"shortDescription" : "",
|
| 105 |
+
"shape" : "[1, 1, 640]",
|
| 106 |
+
"name" : "c_in",
|
| 107 |
+
"type" : "MultiArray"
|
| 108 |
+
}
|
| 109 |
+
],
|
| 110 |
+
"userDefinedMetadata" : {
|
| 111 |
+
"com.github.apple.coremltools.version" : "8.3.0",
|
| 112 |
+
"com.github.apple.coremltools.source_dialect" : "TorchScript",
|
| 113 |
+
"com.github.apple.coremltools.source" : "torch==2.9.0"
|
| 114 |
+
},
|
| 115 |
+
"generatedClassName" : "parakeet_decoder",
|
| 116 |
+
"method" : "predict"
|
| 117 |
+
}
|
| 118 |
+
]
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/model.mil
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
program(1.0)
|
| 2 |
+
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
|
| 3 |
+
{
|
| 4 |
+
func main<ios17>(tensor<fp32, [1, 1, 640]> c_in, tensor<fp32, [1, 1, 640]> h_in, tensor<int32, [1]> target_length, tensor<int32, [1, 1]> targets) {
|
| 5 |
+
tensor<int32, []> y_axis_0 = const()[name = tensor<string, []>("y_axis_0"), val = tensor<int32, []>(0)];
|
| 6 |
+
tensor<int32, []> y_batch_dims_0 = const()[name = tensor<string, []>("y_batch_dims_0"), val = tensor<int32, []>(0)];
|
| 7 |
+
tensor<bool, []> y_validate_indices_0 = const()[name = tensor<string, []>("y_validate_indices_0"), val = tensor<bool, []>(false)];
|
| 8 |
+
tensor<fp16, [1025, 640]> module_prediction_embed_weight_to_fp16 = const()[name = tensor<string, []>("module_prediction_embed_weight_to_fp16"), val = tensor<fp16, [1025, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
|
| 9 |
+
tensor<string, []> targets_to_int16_dtype_0 = const()[name = tensor<string, []>("targets_to_int16_dtype_0"), val = tensor<string, []>("int16")];
|
| 10 |
+
tensor<int16, [1, 1]> targets_to_int16 = cast(dtype = targets_to_int16_dtype_0, x = targets)[name = tensor<string, []>("cast_8")];
|
| 11 |
+
tensor<fp16, [1, 1, 640]> y_cast_fp16_cast_uint16 = gather(axis = y_axis_0, batch_dims = y_batch_dims_0, indices = targets_to_int16, validate_indices = y_validate_indices_0, x = module_prediction_embed_weight_to_fp16)[name = tensor<string, []>("y_cast_fp16_cast_uint16")];
|
| 12 |
+
tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
|
| 13 |
+
tensor<int32, [1]> input_lstm_h0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_h0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
|
| 14 |
+
tensor<string, []> h_in_to_fp16_dtype_0 = const()[name = tensor<string, []>("h_in_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
|
| 15 |
+
tensor<fp16, [1, 1, 640]> h_in_to_fp16 = cast(dtype = h_in_to_fp16_dtype_0, x = h_in)[name = tensor<string, []>("cast_7")];
|
| 16 |
+
tensor<fp16, [1, 640]> input_lstm_h0_squeeze_cast_fp16 = squeeze(axes = input_lstm_h0_squeeze_axes_0, x = h_in_to_fp16)[name = tensor<string, []>("input_lstm_h0_squeeze_cast_fp16")];
|
| 17 |
+
tensor<int32, [1]> input_lstm_c0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_c0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
|
| 18 |
+
tensor<string, []> c_in_to_fp16_dtype_0 = const()[name = tensor<string, []>("c_in_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
|
| 19 |
+
tensor<fp16, [1, 1, 640]> c_in_to_fp16 = cast(dtype = c_in_to_fp16_dtype_0, x = c_in)[name = tensor<string, []>("cast_6")];
|
| 20 |
+
tensor<fp16, [1, 640]> input_lstm_c0_squeeze_cast_fp16 = squeeze(axes = input_lstm_c0_squeeze_axes_0, x = c_in_to_fp16)[name = tensor<string, []>("input_lstm_c0_squeeze_cast_fp16")];
|
| 21 |
+
tensor<string, []> input_direction_0 = const()[name = tensor<string, []>("input_direction_0"), val = tensor<string, []>("forward")];
|
| 22 |
+
tensor<bool, []> input_output_sequence_0 = const()[name = tensor<string, []>("input_output_sequence_0"), val = tensor<bool, []>(true)];
|
| 23 |
+
tensor<string, []> input_recurrent_activation_0 = const()[name = tensor<string, []>("input_recurrent_activation_0"), val = tensor<string, []>("sigmoid")];
|
| 24 |
+
tensor<string, []> input_cell_activation_0 = const()[name = tensor<string, []>("input_cell_activation_0"), val = tensor<string, []>("tanh")];
|
| 25 |
+
tensor<string, []> input_activation_0 = const()[name = tensor<string, []>("input_activation_0"), val = tensor<string, []>("tanh")];
|
| 26 |
+
tensor<fp16, [2560, 640]> concat_1_to_fp16 = const()[name = tensor<string, []>("concat_1_to_fp16"), val = tensor<fp16, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1312128)))];
|
| 27 |
+
tensor<fp16, [2560, 640]> concat_2_to_fp16 = const()[name = tensor<string, []>("concat_2_to_fp16"), val = tensor<fp16, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4588992)))];
|
| 28 |
+
tensor<fp16, [2560]> concat_0_to_fp16 = const()[name = tensor<string, []>("concat_0_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7865856)))];
|
| 29 |
+
tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = y_cast_fp16_cast_uint16)[name = tensor<string, []>("transpose_2")];
|
| 30 |
+
tensor<fp16, [1, 1, 640]> input_cast_fp16_0, tensor<fp16, [1, 640]> input_cast_fp16_1, tensor<fp16, [1, 640]> input_cast_fp16_2 = lstm(activation = input_activation_0, bias = concat_0_to_fp16, cell_activation = input_cell_activation_0, direction = input_direction_0, initial_c = input_lstm_c0_squeeze_cast_fp16, initial_h = input_lstm_h0_squeeze_cast_fp16, output_sequence = input_output_sequence_0, recurrent_activation = input_recurrent_activation_0, weight_hh = concat_2_to_fp16, weight_ih = concat_1_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
|
| 31 |
+
tensor<int32, [1]> obj_3_axes_0 = const()[name = tensor<string, []>("obj_3_axes_0"), val = tensor<int32, [1]>([0])];
|
| 32 |
+
tensor<fp16, [1, 1, 640]> obj_3_cast_fp16 = expand_dims(axes = obj_3_axes_0, x = input_cast_fp16_1)[name = tensor<string, []>("obj_3_cast_fp16")];
|
| 33 |
+
tensor<string, []> obj_3_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("obj_3_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
|
| 34 |
+
tensor<int32, [1]> obj_axes_0 = const()[name = tensor<string, []>("obj_axes_0"), val = tensor<int32, [1]>([0])];
|
| 35 |
+
tensor<fp16, [1, 1, 640]> obj_cast_fp16 = expand_dims(axes = obj_axes_0, x = input_cast_fp16_2)[name = tensor<string, []>("obj_cast_fp16")];
|
| 36 |
+
tensor<string, []> obj_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("obj_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
|
| 37 |
+
tensor<int32, [3]> transpose_0_perm_0 = const()[name = tensor<string, []>("transpose_0_perm_0"), val = tensor<int32, [3]>([1, 2, 0])];
|
| 38 |
+
tensor<string, []> transpose_0_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("transpose_0_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
|
| 39 |
+
tensor<fp16, [1, 640, 1]> transpose_0_cast_fp16 = transpose(perm = transpose_0_perm_0, x = input_cast_fp16_0)[name = tensor<string, []>("transpose_1")];
|
| 40 |
+
tensor<fp32, [1, 640, 1]> decoder = cast(dtype = transpose_0_cast_fp16_to_fp32_dtype_0, x = transpose_0_cast_fp16)[name = tensor<string, []>("cast_3")];
|
| 41 |
+
tensor<fp32, [1, 1, 640]> c_out = cast(dtype = obj_cast_fp16_to_fp32_dtype_0, x = obj_cast_fp16)[name = tensor<string, []>("cast_4")];
|
| 42 |
+
tensor<fp32, [1, 1, 640]> h_out = cast(dtype = obj_3_cast_fp16_to_fp32_dtype_0, x = obj_3_cast_fp16)[name = tensor<string, []>("cast_5")];
|
| 43 |
+
tensor<int32, [1]> target_length_tmp = identity(x = target_length)[name = tensor<string, []>("target_length_tmp")];
|
| 44 |
+
} -> (decoder, h_out, c_out);
|
| 45 |
+
}
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dd90b58597ee2c172c672dffe13b1110898ba07394c1a15efc96cc8c6b18411b
|
| 3 |
+
size 7871040
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b7ae65e2af616df46066b7efca2d7c19941666ac0685f4ed005666890a052b0d
|
| 3 |
+
size 243
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0713c2d6ac5f8f6fb9582be250351ebd8efc925f71f4261191165f1406f2ee5d
|
| 3 |
+
size 437
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/metadata.json
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"metadataOutputVersion" : "3.0",
|
| 4 |
+
"shortDescription" : "Parakeet 110M encoder (15 s window)",
|
| 5 |
+
"outputSchema" : [
|
| 6 |
+
{
|
| 7 |
+
"hasShapeFlexibility" : "0",
|
| 8 |
+
"isOptional" : "0",
|
| 9 |
+
"dataType" : "Float32",
|
| 10 |
+
"formattedType" : "MultiArray (Float32 1 × 512 × 188)",
|
| 11 |
+
"shortDescription" : "",
|
| 12 |
+
"shape" : "[1, 512, 188]",
|
| 13 |
+
"name" : "encoder_output",
|
| 14 |
+
"type" : "MultiArray"
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"hasShapeFlexibility" : "0",
|
| 18 |
+
"isOptional" : "0",
|
| 19 |
+
"dataType" : "Int32",
|
| 20 |
+
"formattedType" : "MultiArray (Int32 1)",
|
| 21 |
+
"shortDescription" : "",
|
| 22 |
+
"shape" : "[1]",
|
| 23 |
+
"name" : "encoder_length",
|
| 24 |
+
"type" : "MultiArray"
|
| 25 |
+
}
|
| 26 |
+
],
|
| 27 |
+
"storagePrecision" : "Float16",
|
| 28 |
+
"modelParameters" : [
|
| 29 |
+
|
| 30 |
+
],
|
| 31 |
+
"author" : "Fluid Inference",
|
| 32 |
+
"specificationVersion" : 8,
|
| 33 |
+
"mlProgramOperationTypeHistogram" : {
|
| 34 |
+
"Ios17.logicalAnd" : 2,
|
| 35 |
+
"Ios17.reshape" : 103,
|
| 36 |
+
"Ios16.softmax" : 17,
|
| 37 |
+
"Ios17.matmul" : 51,
|
| 38 |
+
"Ios17.transpose" : 123,
|
| 39 |
+
"Split" : 17,
|
| 40 |
+
"Ios17.expandDims" : 17,
|
| 41 |
+
"Select" : 51,
|
| 42 |
+
"Ios17.add" : 128,
|
| 43 |
+
"Tile" : 8,
|
| 44 |
+
"Ios17.sliceByIndex" : 34,
|
| 45 |
+
"Ios16.sigmoid" : 17,
|
| 46 |
+
"Pad" : 34,
|
| 47 |
+
"Ios17.logicalNot" : 2,
|
| 48 |
+
"Ios17.layerNorm" : 85,
|
| 49 |
+
"Ios16.silu" : 51,
|
| 50 |
+
"Ios17.less" : 5,
|
| 51 |
+
"Ios17.sub" : 3,
|
| 52 |
+
"Ios17.conv" : 56,
|
| 53 |
+
"Ios16.relu" : 3,
|
| 54 |
+
"Ios17.linear" : 137,
|
| 55 |
+
"Ios17.cast" : 11,
|
| 56 |
+
"Ios17.floorDiv" : 3,
|
| 57 |
+
"Ios17.mul" : 77
|
| 58 |
+
},
|
| 59 |
+
"computePrecision" : "Mixed (Float16, Float32, Int32)",
|
| 60 |
+
"isUpdatable" : "0",
|
| 61 |
+
"stateSchema" : [
|
| 62 |
+
|
| 63 |
+
],
|
| 64 |
+
"availability" : {
|
| 65 |
+
"macOS" : "14.0",
|
| 66 |
+
"tvOS" : "17.0",
|
| 67 |
+
"visionOS" : "1.0",
|
| 68 |
+
"watchOS" : "10.0",
|
| 69 |
+
"iOS" : "17.0",
|
| 70 |
+
"macCatalyst" : "17.0"
|
| 71 |
+
},
|
| 72 |
+
"modelType" : {
|
| 73 |
+
"name" : "MLModelType_mlProgram"
|
| 74 |
+
},
|
| 75 |
+
"inputSchema" : [
|
| 76 |
+
{
|
| 77 |
+
"hasShapeFlexibility" : "0",
|
| 78 |
+
"isOptional" : "0",
|
| 79 |
+
"dataType" : "Float32",
|
| 80 |
+
"formattedType" : "MultiArray (Float32 1 × 80 × 1501)",
|
| 81 |
+
"shortDescription" : "",
|
| 82 |
+
"shape" : "[1, 80, 1501]",
|
| 83 |
+
"name" : "mel_features",
|
| 84 |
+
"type" : "MultiArray"
|
| 85 |
+
},
|
| 86 |
+
{
|
| 87 |
+
"hasShapeFlexibility" : "0",
|
| 88 |
+
"isOptional" : "0",
|
| 89 |
+
"dataType" : "Int32",
|
| 90 |
+
"formattedType" : "MultiArray (Int32 1)",
|
| 91 |
+
"shortDescription" : "",
|
| 92 |
+
"shape" : "[1]",
|
| 93 |
+
"name" : "mel_length",
|
| 94 |
+
"type" : "MultiArray"
|
| 95 |
+
}
|
| 96 |
+
],
|
| 97 |
+
"userDefinedMetadata" : {
|
| 98 |
+
"com.github.apple.coremltools.source_dialect" : "TorchScript",
|
| 99 |
+
"com.github.apple.coremltools.source" : "torch==2.9.0",
|
| 100 |
+
"com.github.apple.coremltools.version" : "8.3.0"
|
| 101 |
+
},
|
| 102 |
+
"generatedClassName" : "parakeet_encoder",
|
| 103 |
+
"method" : "predict"
|
| 104 |
+
}
|
| 105 |
+
]
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/model.mil
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cecf7994b2758397d992802a4f6e5d656e3a1aeb7bbedc2aa430b1316d62474c
|
| 3 |
+
size 215143424
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:983ba26dd9276b8d2d4f75f3475aefb1817c542df87dbd0fdac95bd63647494f
|
| 3 |
+
size 243
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0800e3bdf4ecb1bd46fd27e1826d33125cd574f9ae1e15dd9ff70ea42944ca2d
|
| 3 |
+
size 476
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/metadata.json
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"metadataOutputVersion" : "3.0",
|
| 4 |
+
"shortDescription" : "Parakeet 110M joint + decision head (split, softmax, argmax)",
|
| 5 |
+
"outputSchema" : [
|
| 6 |
+
{
|
| 7 |
+
"hasShapeFlexibility" : "0",
|
| 8 |
+
"isOptional" : "0",
|
| 9 |
+
"dataType" : "Int32",
|
| 10 |
+
"formattedType" : "MultiArray (Int32 1 × 188 × 1)",
|
| 11 |
+
"shortDescription" : "",
|
| 12 |
+
"shape" : "[1, 188, 1]",
|
| 13 |
+
"name" : "token_id",
|
| 14 |
+
"type" : "MultiArray"
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"hasShapeFlexibility" : "0",
|
| 18 |
+
"isOptional" : "0",
|
| 19 |
+
"dataType" : "Float32",
|
| 20 |
+
"formattedType" : "MultiArray (Float32 1 × 188 × 1)",
|
| 21 |
+
"shortDescription" : "",
|
| 22 |
+
"shape" : "[1, 188, 1]",
|
| 23 |
+
"name" : "token_prob",
|
| 24 |
+
"type" : "MultiArray"
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"hasShapeFlexibility" : "0",
|
| 28 |
+
"isOptional" : "0",
|
| 29 |
+
"dataType" : "Int32",
|
| 30 |
+
"formattedType" : "MultiArray (Int32 1 × 188 × 1)",
|
| 31 |
+
"shortDescription" : "",
|
| 32 |
+
"shape" : "[1, 188, 1]",
|
| 33 |
+
"name" : "duration",
|
| 34 |
+
"type" : "MultiArray"
|
| 35 |
+
}
|
| 36 |
+
],
|
| 37 |
+
"storagePrecision" : "Float16",
|
| 38 |
+
"modelParameters" : [
|
| 39 |
+
|
| 40 |
+
],
|
| 41 |
+
"author" : "Fluid Inference",
|
| 42 |
+
"specificationVersion" : 8,
|
| 43 |
+
"mlProgramOperationTypeHistogram" : {
|
| 44 |
+
"Ios17.reduceArgmax" : 2,
|
| 45 |
+
"Ios17.squeeze" : 1,
|
| 46 |
+
"Ios17.cast" : 4,
|
| 47 |
+
"Ios17.linear" : 3,
|
| 48 |
+
"Ios17.transpose" : 2,
|
| 49 |
+
"Ios17.sliceByIndex" : 2,
|
| 50 |
+
"Ios17.add" : 1,
|
| 51 |
+
"Ios16.relu" : 1,
|
| 52 |
+
"Ios16.softmax" : 1,
|
| 53 |
+
"Ios17.gatherAlongAxis" : 1,
|
| 54 |
+
"Ios17.expandDims" : 3
|
| 55 |
+
},
|
| 56 |
+
"computePrecision" : "Mixed (Float16, Float32, Int16, Int32)",
|
| 57 |
+
"isUpdatable" : "0",
|
| 58 |
+
"stateSchema" : [
|
| 59 |
+
|
| 60 |
+
],
|
| 61 |
+
"availability" : {
|
| 62 |
+
"macOS" : "14.0",
|
| 63 |
+
"tvOS" : "17.0",
|
| 64 |
+
"visionOS" : "1.0",
|
| 65 |
+
"watchOS" : "10.0",
|
| 66 |
+
"iOS" : "17.0",
|
| 67 |
+
"macCatalyst" : "17.0"
|
| 68 |
+
},
|
| 69 |
+
"modelType" : {
|
| 70 |
+
"name" : "MLModelType_mlProgram"
|
| 71 |
+
},
|
| 72 |
+
"inputSchema" : [
|
| 73 |
+
{
|
| 74 |
+
"hasShapeFlexibility" : "0",
|
| 75 |
+
"isOptional" : "0",
|
| 76 |
+
"dataType" : "Float32",
|
| 77 |
+
"formattedType" : "MultiArray (Float32 1 × 512 × 188)",
|
| 78 |
+
"shortDescription" : "",
|
| 79 |
+
"shape" : "[1, 512, 188]",
|
| 80 |
+
"name" : "encoder",
|
| 81 |
+
"type" : "MultiArray"
|
| 82 |
+
},
|
| 83 |
+
{
|
| 84 |
+
"hasShapeFlexibility" : "0",
|
| 85 |
+
"isOptional" : "0",
|
| 86 |
+
"dataType" : "Float32",
|
| 87 |
+
"formattedType" : "MultiArray (Float32 1 × 640 × 1)",
|
| 88 |
+
"shortDescription" : "",
|
| 89 |
+
"shape" : "[1, 640, 1]",
|
| 90 |
+
"name" : "decoder",
|
| 91 |
+
"type" : "MultiArray"
|
| 92 |
+
}
|
| 93 |
+
],
|
| 94 |
+
"userDefinedMetadata" : {
|
| 95 |
+
"com.github.apple.coremltools.version" : "8.3.0",
|
| 96 |
+
"com.github.apple.coremltools.source_dialect" : "TorchScript",
|
| 97 |
+
"com.github.apple.coremltools.source" : "torch==2.9.0"
|
| 98 |
+
},
|
| 99 |
+
"generatedClassName" : "parakeet_joint_decision",
|
| 100 |
+
"method" : "predict"
|
| 101 |
+
}
|
| 102 |
+
]
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/model.mil
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
program(1.0)
|
| 2 |
+
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
|
| 3 |
+
{
|
| 4 |
+
func main<ios17>(tensor<fp32, [1, 640, 1]> decoder, tensor<fp32, [1, 512, 188]> encoder) {
|
| 5 |
+
tensor<int32, [3]> input_1_perm_0 = const()[name = tensor<string, []>("input_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
|
| 6 |
+
tensor<string, []> encoder_to_fp16_dtype_0 = const()[name = tensor<string, []>("encoder_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
|
| 7 |
+
tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
|
| 8 |
+
tensor<string, []> decoder_to_fp16_dtype_0 = const()[name = tensor<string, []>("decoder_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
|
| 9 |
+
tensor<fp16, [640, 512]> joint_module_enc_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_weight_to_fp16"), val = tensor<fp16, [640, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
|
| 10 |
+
tensor<fp16, [640]> joint_module_enc_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(655488)))];
|
| 11 |
+
tensor<fp16, [1, 512, 188]> encoder_to_fp16 = cast(dtype = encoder_to_fp16_dtype_0, x = encoder)[name = tensor<string, []>("cast_6")];
|
| 12 |
+
tensor<fp16, [1, 188, 512]> input_1_cast_fp16 = transpose(perm = input_1_perm_0, x = encoder_to_fp16)[name = tensor<string, []>("transpose_1")];
|
| 13 |
+
tensor<fp16, [1, 188, 640]> linear_0_cast_fp16 = linear(bias = joint_module_enc_bias_to_fp16, weight = joint_module_enc_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("linear_0_cast_fp16")];
|
| 14 |
+
tensor<fp16, [640, 640]> joint_module_pred_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_weight_to_fp16"), val = tensor<fp16, [640, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(656832)))];
|
| 15 |
+
tensor<fp16, [640]> joint_module_pred_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1476096)))];
|
| 16 |
+
tensor<fp16, [1, 640, 1]> decoder_to_fp16 = cast(dtype = decoder_to_fp16_dtype_0, x = decoder)[name = tensor<string, []>("cast_5")];
|
| 17 |
+
tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = decoder_to_fp16)[name = tensor<string, []>("transpose_0")];
|
| 18 |
+
tensor<fp16, [1, 1, 640]> linear_1_cast_fp16 = linear(bias = joint_module_pred_bias_to_fp16, weight = joint_module_pred_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("linear_1_cast_fp16")];
|
| 19 |
+
tensor<int32, [1]> var_23_axes_0 = const()[name = tensor<string, []>("op_23_axes_0"), val = tensor<int32, [1]>([2])];
|
| 20 |
+
tensor<fp16, [1, 188, 1, 640]> var_23_cast_fp16 = expand_dims(axes = var_23_axes_0, x = linear_0_cast_fp16)[name = tensor<string, []>("op_23_cast_fp16")];
|
| 21 |
+
tensor<int32, [1]> var_24_axes_0 = const()[name = tensor<string, []>("op_24_axes_0"), val = tensor<int32, [1]>([1])];
|
| 22 |
+
tensor<fp16, [1, 1, 1, 640]> var_24_cast_fp16 = expand_dims(axes = var_24_axes_0, x = linear_1_cast_fp16)[name = tensor<string, []>("op_24_cast_fp16")];
|
| 23 |
+
tensor<fp16, [1, 188, 1, 640]> input_5_cast_fp16 = add(x = var_23_cast_fp16, y = var_24_cast_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
|
| 24 |
+
tensor<fp16, [1, 188, 1, 640]> input_7_cast_fp16 = relu(x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
|
| 25 |
+
tensor<fp16, [1030, 640]> joint_module_joint_net_2_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_weight_to_fp16"), val = tensor<fp16, [1030, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1477440)))];
|
| 26 |
+
tensor<fp16, [1030]> joint_module_joint_net_2_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_bias_to_fp16"), val = tensor<fp16, [1030]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2795904)))];
|
| 27 |
+
tensor<fp16, [1, 188, 1, 1030]> linear_2_cast_fp16 = linear(bias = joint_module_joint_net_2_bias_to_fp16, weight = joint_module_joint_net_2_weight_to_fp16, x = input_7_cast_fp16)[name = tensor<string, []>("linear_2_cast_fp16")];
|
| 28 |
+
tensor<int32, [4]> token_logits_begin_0 = const()[name = tensor<string, []>("token_logits_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
|
| 29 |
+
tensor<int32, [4]> token_logits_end_0 = const()[name = tensor<string, []>("token_logits_end_0"), val = tensor<int32, [4]>([1, 188, 1, 1025])];
|
| 30 |
+
tensor<bool, [4]> token_logits_end_mask_0 = const()[name = tensor<string, []>("token_logits_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
|
| 31 |
+
tensor<fp16, [1, 188, 1, 1025]> token_logits_cast_fp16 = slice_by_index(begin = token_logits_begin_0, end = token_logits_end_0, end_mask = token_logits_end_mask_0, x = linear_2_cast_fp16)[name = tensor<string, []>("token_logits_cast_fp16")];
|
| 32 |
+
tensor<int32, [4]> duration_logits_begin_0 = const()[name = tensor<string, []>("duration_logits_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1025])];
|
| 33 |
+
tensor<int32, [4]> duration_logits_end_0 = const()[name = tensor<string, []>("duration_logits_end_0"), val = tensor<int32, [4]>([1, 188, 1, 1030])];
|
| 34 |
+
tensor<bool, [4]> duration_logits_end_mask_0 = const()[name = tensor<string, []>("duration_logits_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
|
| 35 |
+
tensor<fp16, [1, 188, 1, 5]> duration_logits_cast_fp16 = slice_by_index(begin = duration_logits_begin_0, end = duration_logits_end_0, end_mask = duration_logits_end_mask_0, x = linear_2_cast_fp16)[name = tensor<string, []>("duration_logits_cast_fp16")];
|
| 36 |
+
tensor<int32, []> var_43_axis_0 = const()[name = tensor<string, []>("op_43_axis_0"), val = tensor<int32, []>(-1)];
|
| 37 |
+
tensor<bool, []> var_43_keep_dims_0 = const()[name = tensor<string, []>("op_43_keep_dims_0"), val = tensor<bool, []>(false)];
|
| 38 |
+
tensor<string, []> var_43_output_dtype_0 = const()[name = tensor<string, []>("op_43_output_dtype_0"), val = tensor<string, []>("int32")];
|
| 39 |
+
tensor<int32, [1, 188, 1]> token_id = reduce_argmax(axis = var_43_axis_0, keep_dims = var_43_keep_dims_0, output_dtype = var_43_output_dtype_0, x = token_logits_cast_fp16)[name = tensor<string, []>("op_43_cast_fp16")];
|
| 40 |
+
tensor<int32, []> var_49 = const()[name = tensor<string, []>("op_49"), val = tensor<int32, []>(-1)];
|
| 41 |
+
tensor<fp16, [1, 188, 1, 1025]> token_probs_all_cast_fp16 = softmax(axis = var_49, x = token_logits_cast_fp16)[name = tensor<string, []>("token_probs_all_cast_fp16")];
|
| 42 |
+
tensor<int32, [1]> var_58_axes_0 = const()[name = tensor<string, []>("op_58_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 43 |
+
tensor<int32, [1, 188, 1, 1]> var_58 = expand_dims(axes = var_58_axes_0, x = token_id)[name = tensor<string, []>("op_58")];
|
| 44 |
+
tensor<int32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<int32, []>(-1)];
|
| 45 |
+
tensor<bool, []> var_61_validate_indices_0 = const()[name = tensor<string, []>("op_61_validate_indices_0"), val = tensor<bool, []>(false)];
|
| 46 |
+
tensor<string, []> var_58_to_int16_dtype_0 = const()[name = tensor<string, []>("op_58_to_int16_dtype_0"), val = tensor<string, []>("int16")];
|
| 47 |
+
tensor<int16, [1, 188, 1, 1]> var_58_to_int16 = cast(dtype = var_58_to_int16_dtype_0, x = var_58)[name = tensor<string, []>("cast_4")];
|
| 48 |
+
tensor<fp16, [1, 188, 1, 1]> var_61_cast_fp16_cast_int16 = gather_along_axis(axis = var_59, indices = var_58_to_int16, validate_indices = var_61_validate_indices_0, x = token_probs_all_cast_fp16)[name = tensor<string, []>("op_61_cast_fp16_cast_int16")];
|
| 49 |
+
tensor<int32, [1]> var_63_axes_0 = const()[name = tensor<string, []>("op_63_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 50 |
+
tensor<fp16, [1, 188, 1]> var_63_cast_fp16 = squeeze(axes = var_63_axes_0, x = var_61_cast_fp16_cast_int16)[name = tensor<string, []>("op_63_cast_fp16")];
|
| 51 |
+
tensor<string, []> var_63_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
|
| 52 |
+
tensor<int32, []> var_66_axis_0 = const()[name = tensor<string, []>("op_66_axis_0"), val = tensor<int32, []>(-1)];
|
| 53 |
+
tensor<bool, []> var_66_keep_dims_0 = const()[name = tensor<string, []>("op_66_keep_dims_0"), val = tensor<bool, []>(false)];
|
| 54 |
+
tensor<string, []> var_66_output_dtype_0 = const()[name = tensor<string, []>("op_66_output_dtype_0"), val = tensor<string, []>("int32")];
|
| 55 |
+
tensor<int32, [1, 188, 1]> duration = reduce_argmax(axis = var_66_axis_0, keep_dims = var_66_keep_dims_0, output_dtype = var_66_output_dtype_0, x = duration_logits_cast_fp16)[name = tensor<string, []>("op_66_cast_fp16")];
|
| 56 |
+
tensor<fp32, [1, 188, 1]> token_prob = cast(dtype = var_63_cast_fp16_to_fp32_dtype_0, x = var_63_cast_fp16)[name = tensor<string, []>("cast_3")];
|
| 57 |
+
} -> (token_id, token_prob, duration);
|
| 58 |
+
}
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b3f771cb65b190f1873e39629676ed79b65a8361522f451b37bdba8b1106e6ff
|
| 3 |
+
size 2798028
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c7c11c6bb985fab7f835ba687a575f1eb04f4c93b0783155d634adbc49f0e797
|
| 3 |
+
size 243
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1af2cb9bcc13eec83ce006e4f1c2cf158393745cd9187428333fbcb6917da244
|
| 3 |
+
size 535
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/metadata.json
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"metadataOutputVersion" : "3.0",
|
| 4 |
+
"shortDescription" : "Parakeet 110M single-step joint decision (current frame)",
|
| 5 |
+
"outputSchema" : [
|
| 6 |
+
{
|
| 7 |
+
"hasShapeFlexibility" : "0",
|
| 8 |
+
"isOptional" : "0",
|
| 9 |
+
"dataType" : "Int32",
|
| 10 |
+
"formattedType" : "MultiArray (Int32 1 × 1 × 1)",
|
| 11 |
+
"shortDescription" : "",
|
| 12 |
+
"shape" : "[1, 1, 1]",
|
| 13 |
+
"name" : "token_id",
|
| 14 |
+
"type" : "MultiArray"
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"hasShapeFlexibility" : "0",
|
| 18 |
+
"isOptional" : "0",
|
| 19 |
+
"dataType" : "Float32",
|
| 20 |
+
"formattedType" : "MultiArray (Float32 1 × 1 × 1)",
|
| 21 |
+
"shortDescription" : "",
|
| 22 |
+
"shape" : "[1, 1, 1]",
|
| 23 |
+
"name" : "token_prob",
|
| 24 |
+
"type" : "MultiArray"
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"hasShapeFlexibility" : "0",
|
| 28 |
+
"isOptional" : "0",
|
| 29 |
+
"dataType" : "Int32",
|
| 30 |
+
"formattedType" : "MultiArray (Int32 1 × 1 × 1)",
|
| 31 |
+
"shortDescription" : "",
|
| 32 |
+
"shape" : "[1, 1, 1]",
|
| 33 |
+
"name" : "duration",
|
| 34 |
+
"type" : "MultiArray"
|
| 35 |
+
},
|
| 36 |
+
{
|
| 37 |
+
"hasShapeFlexibility" : "0",
|
| 38 |
+
"isOptional" : "0",
|
| 39 |
+
"dataType" : "Int32",
|
| 40 |
+
"formattedType" : "MultiArray (Int32 1 × 1 × 1 × 64)",
|
| 41 |
+
"shortDescription" : "",
|
| 42 |
+
"shape" : "[1, 1, 1, 64]",
|
| 43 |
+
"name" : "top_k_ids",
|
| 44 |
+
"type" : "MultiArray"
|
| 45 |
+
},
|
| 46 |
+
{
|
| 47 |
+
"hasShapeFlexibility" : "0",
|
| 48 |
+
"isOptional" : "0",
|
| 49 |
+
"dataType" : "Float32",
|
| 50 |
+
"formattedType" : "MultiArray (Float32 1 × 1 × 1 × 64)",
|
| 51 |
+
"shortDescription" : "",
|
| 52 |
+
"shape" : "[1, 1, 1, 64]",
|
| 53 |
+
"name" : "top_k_logits",
|
| 54 |
+
"type" : "MultiArray"
|
| 55 |
+
}
|
| 56 |
+
],
|
| 57 |
+
"storagePrecision" : "Float16",
|
| 58 |
+
"modelParameters" : [
|
| 59 |
+
|
| 60 |
+
],
|
| 61 |
+
"author" : "Fluid Inference",
|
| 62 |
+
"specificationVersion" : 8,
|
| 63 |
+
"mlProgramOperationTypeHistogram" : {
|
| 64 |
+
"Ios17.reduceArgmax" : 2,
|
| 65 |
+
"Ios17.linear" : 3,
|
| 66 |
+
"Ios17.transpose" : 2,
|
| 67 |
+
"Ios17.sliceByIndex" : 2,
|
| 68 |
+
"Ios17.add" : 1,
|
| 69 |
+
"Ios17.topk" : 1,
|
| 70 |
+
"Ios16.relu" : 1,
|
| 71 |
+
"Ios16.softmax" : 1,
|
| 72 |
+
"Ios17.expandDims" : 3,
|
| 73 |
+
"Ios17.squeeze" : 1,
|
| 74 |
+
"Ios17.cast" : 6,
|
| 75 |
+
"Ios17.gatherAlongAxis" : 1
|
| 76 |
+
},
|
| 77 |
+
"computePrecision" : "Mixed (Float16, Float32, Int16, Int32, UInt16)",
|
| 78 |
+
"isUpdatable" : "0",
|
| 79 |
+
"stateSchema" : [
|
| 80 |
+
|
| 81 |
+
],
|
| 82 |
+
"availability" : {
|
| 83 |
+
"macOS" : "14.0",
|
| 84 |
+
"tvOS" : "17.0",
|
| 85 |
+
"visionOS" : "1.0",
|
| 86 |
+
"watchOS" : "10.0",
|
| 87 |
+
"iOS" : "17.0",
|
| 88 |
+
"macCatalyst" : "17.0"
|
| 89 |
+
},
|
| 90 |
+
"modelType" : {
|
| 91 |
+
"name" : "MLModelType_mlProgram"
|
| 92 |
+
},
|
| 93 |
+
"inputSchema" : [
|
| 94 |
+
{
|
| 95 |
+
"hasShapeFlexibility" : "0",
|
| 96 |
+
"isOptional" : "0",
|
| 97 |
+
"dataType" : "Float32",
|
| 98 |
+
"formattedType" : "MultiArray (Float32 1 × 512 × 1)",
|
| 99 |
+
"shortDescription" : "",
|
| 100 |
+
"shape" : "[1, 512, 1]",
|
| 101 |
+
"name" : "encoder_step",
|
| 102 |
+
"type" : "MultiArray"
|
| 103 |
+
},
|
| 104 |
+
{
|
| 105 |
+
"hasShapeFlexibility" : "0",
|
| 106 |
+
"isOptional" : "0",
|
| 107 |
+
"dataType" : "Float32",
|
| 108 |
+
"formattedType" : "MultiArray (Float32 1 × 640 × 1)",
|
| 109 |
+
"shortDescription" : "",
|
| 110 |
+
"shape" : "[1, 640, 1]",
|
| 111 |
+
"name" : "decoder_step",
|
| 112 |
+
"type" : "MultiArray"
|
| 113 |
+
}
|
| 114 |
+
],
|
| 115 |
+
"userDefinedMetadata" : {
|
| 116 |
+
"com.github.apple.coremltools.source_dialect" : "TorchScript",
|
| 117 |
+
"com.github.apple.coremltools.source" : "torch==2.9.0",
|
| 118 |
+
"com.github.apple.coremltools.version" : "8.3.0"
|
| 119 |
+
},
|
| 120 |
+
"generatedClassName" : "parakeet_joint_decision_single_step",
|
| 121 |
+
"method" : "predict"
|
| 122 |
+
}
|
| 123 |
+
]
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/model.mil
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
program(1.0)
|
| 2 |
+
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
|
| 3 |
+
{
|
| 4 |
+
func main<ios17>(tensor<fp32, [1, 640, 1]> decoder_step, tensor<fp32, [1, 512, 1]> encoder_step) {
|
| 5 |
+
tensor<int32, [3]> input_1_perm_0 = const()[name = tensor<string, []>("input_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
|
| 6 |
+
tensor<string, []> encoder_step_to_fp16_dtype_0 = const()[name = tensor<string, []>("encoder_step_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
|
| 7 |
+
tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
|
| 8 |
+
tensor<string, []> decoder_step_to_fp16_dtype_0 = const()[name = tensor<string, []>("decoder_step_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
|
| 9 |
+
tensor<fp16, [640, 512]> joint_module_enc_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_weight_to_fp16"), val = tensor<fp16, [640, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
|
| 10 |
+
tensor<fp16, [640]> joint_module_enc_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(655488)))];
|
| 11 |
+
tensor<fp16, [1, 512, 1]> encoder_step_to_fp16 = cast(dtype = encoder_step_to_fp16_dtype_0, x = encoder_step)[name = tensor<string, []>("cast_9")];
|
| 12 |
+
tensor<fp16, [1, 1, 512]> input_1_cast_fp16 = transpose(perm = input_1_perm_0, x = encoder_step_to_fp16)[name = tensor<string, []>("transpose_1")];
|
| 13 |
+
tensor<fp16, [1, 1, 640]> linear_0_cast_fp16 = linear(bias = joint_module_enc_bias_to_fp16, weight = joint_module_enc_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("linear_0_cast_fp16")];
|
| 14 |
+
tensor<fp16, [640, 640]> joint_module_pred_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_weight_to_fp16"), val = tensor<fp16, [640, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(656832)))];
|
| 15 |
+
tensor<fp16, [640]> joint_module_pred_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1476096)))];
|
| 16 |
+
tensor<fp16, [1, 640, 1]> decoder_step_to_fp16 = cast(dtype = decoder_step_to_fp16_dtype_0, x = decoder_step)[name = tensor<string, []>("cast_8")];
|
| 17 |
+
tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = decoder_step_to_fp16)[name = tensor<string, []>("transpose_0")];
|
| 18 |
+
tensor<fp16, [1, 1, 640]> linear_1_cast_fp16 = linear(bias = joint_module_pred_bias_to_fp16, weight = joint_module_pred_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("linear_1_cast_fp16")];
|
| 19 |
+
tensor<int32, [1]> var_23_axes_0 = const()[name = tensor<string, []>("op_23_axes_0"), val = tensor<int32, [1]>([2])];
|
| 20 |
+
tensor<fp16, [1, 1, 1, 640]> var_23_cast_fp16 = expand_dims(axes = var_23_axes_0, x = linear_0_cast_fp16)[name = tensor<string, []>("op_23_cast_fp16")];
|
| 21 |
+
tensor<int32, [1]> var_24_axes_0 = const()[name = tensor<string, []>("op_24_axes_0"), val = tensor<int32, [1]>([1])];
|
| 22 |
+
tensor<fp16, [1, 1, 1, 640]> var_24_cast_fp16 = expand_dims(axes = var_24_axes_0, x = linear_1_cast_fp16)[name = tensor<string, []>("op_24_cast_fp16")];
|
| 23 |
+
tensor<fp16, [1, 1, 1, 640]> input_5_cast_fp16 = add(x = var_23_cast_fp16, y = var_24_cast_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
|
| 24 |
+
tensor<fp16, [1, 1, 1, 640]> input_7_cast_fp16 = relu(x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
|
| 25 |
+
tensor<fp16, [1030, 640]> joint_module_joint_net_2_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_weight_to_fp16"), val = tensor<fp16, [1030, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1477440)))];
|
| 26 |
+
tensor<fp16, [1030]> joint_module_joint_net_2_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_bias_to_fp16"), val = tensor<fp16, [1030]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2795904)))];
|
| 27 |
+
tensor<fp16, [1, 1, 1, 1030]> linear_2_cast_fp16 = linear(bias = joint_module_joint_net_2_bias_to_fp16, weight = joint_module_joint_net_2_weight_to_fp16, x = input_7_cast_fp16)[name = tensor<string, []>("linear_2_cast_fp16")];
|
| 28 |
+
tensor<int32, [4]> token_logits_begin_0 = const()[name = tensor<string, []>("token_logits_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
|
| 29 |
+
tensor<int32, [4]> token_logits_end_0 = const()[name = tensor<string, []>("token_logits_end_0"), val = tensor<int32, [4]>([1, 1, 1, 1025])];
|
| 30 |
+
tensor<bool, [4]> token_logits_end_mask_0 = const()[name = tensor<string, []>("token_logits_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
|
| 31 |
+
tensor<fp16, [1, 1, 1, 1025]> token_logits_cast_fp16 = slice_by_index(begin = token_logits_begin_0, end = token_logits_end_0, end_mask = token_logits_end_mask_0, x = linear_2_cast_fp16)[name = tensor<string, []>("token_logits_cast_fp16")];
|
| 32 |
+
tensor<int32, [4]> duration_logits_begin_0 = const()[name = tensor<string, []>("duration_logits_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1025])];
|
| 33 |
+
tensor<int32, [4]> duration_logits_end_0 = const()[name = tensor<string, []>("duration_logits_end_0"), val = tensor<int32, [4]>([1, 1, 1, 1030])];
|
| 34 |
+
tensor<bool, [4]> duration_logits_end_mask_0 = const()[name = tensor<string, []>("duration_logits_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
|
| 35 |
+
tensor<fp16, [1, 1, 1, 5]> duration_logits_cast_fp16 = slice_by_index(begin = duration_logits_begin_0, end = duration_logits_end_0, end_mask = duration_logits_end_mask_0, x = linear_2_cast_fp16)[name = tensor<string, []>("duration_logits_cast_fp16")];
|
| 36 |
+
tensor<int32, []> var_43_axis_0 = const()[name = tensor<string, []>("op_43_axis_0"), val = tensor<int32, []>(-1)];
|
| 37 |
+
tensor<bool, []> var_43_keep_dims_0 = const()[name = tensor<string, []>("op_43_keep_dims_0"), val = tensor<bool, []>(false)];
|
| 38 |
+
tensor<string, []> var_43_output_dtype_0 = const()[name = tensor<string, []>("op_43_output_dtype_0"), val = tensor<string, []>("int32")];
|
| 39 |
+
tensor<int32, [1, 1, 1]> token_id = reduce_argmax(axis = var_43_axis_0, keep_dims = var_43_keep_dims_0, output_dtype = var_43_output_dtype_0, x = token_logits_cast_fp16)[name = tensor<string, []>("op_43_cast_fp16")];
|
| 40 |
+
tensor<int32, []> var_49 = const()[name = tensor<string, []>("op_49"), val = tensor<int32, []>(-1)];
|
| 41 |
+
tensor<fp16, [1, 1, 1, 1025]> token_probs_all_cast_fp16 = softmax(axis = var_49, x = token_logits_cast_fp16)[name = tensor<string, []>("token_probs_all_cast_fp16")];
|
| 42 |
+
tensor<int32, [1]> var_58_axes_0 = const()[name = tensor<string, []>("op_58_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 43 |
+
tensor<int32, [1, 1, 1, 1]> var_58 = expand_dims(axes = var_58_axes_0, x = token_id)[name = tensor<string, []>("op_58")];
|
| 44 |
+
tensor<int32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<int32, []>(-1)];
|
| 45 |
+
tensor<bool, []> var_61_validate_indices_0 = const()[name = tensor<string, []>("op_61_validate_indices_0"), val = tensor<bool, []>(false)];
|
| 46 |
+
tensor<string, []> var_58_to_int16_dtype_0 = const()[name = tensor<string, []>("op_58_to_int16_dtype_0"), val = tensor<string, []>("int16")];
|
| 47 |
+
tensor<int16, [1, 1, 1, 1]> var_58_to_int16 = cast(dtype = var_58_to_int16_dtype_0, x = var_58)[name = tensor<string, []>("cast_7")];
|
| 48 |
+
tensor<fp16, [1, 1, 1, 1]> var_61_cast_fp16_cast_int16 = gather_along_axis(axis = var_59, indices = var_58_to_int16, validate_indices = var_61_validate_indices_0, x = token_probs_all_cast_fp16)[name = tensor<string, []>("op_61_cast_fp16_cast_int16")];
|
| 49 |
+
tensor<int32, [1]> var_63_axes_0 = const()[name = tensor<string, []>("op_63_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 50 |
+
tensor<fp16, [1, 1, 1]> var_63_cast_fp16 = squeeze(axes = var_63_axes_0, x = var_61_cast_fp16_cast_int16)[name = tensor<string, []>("op_63_cast_fp16")];
|
| 51 |
+
tensor<string, []> var_63_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
|
| 52 |
+
tensor<int32, []> var_66_axis_0 = const()[name = tensor<string, []>("op_66_axis_0"), val = tensor<int32, []>(-1)];
|
| 53 |
+
tensor<bool, []> var_66_keep_dims_0 = const()[name = tensor<string, []>("op_66_keep_dims_0"), val = tensor<bool, []>(false)];
|
| 54 |
+
tensor<string, []> var_66_output_dtype_0 = const()[name = tensor<string, []>("op_66_output_dtype_0"), val = tensor<string, []>("int32")];
|
| 55 |
+
tensor<int32, [1, 1, 1]> duration = reduce_argmax(axis = var_66_axis_0, keep_dims = var_66_keep_dims_0, output_dtype = var_66_output_dtype_0, x = duration_logits_cast_fp16)[name = tensor<string, []>("op_66_cast_fp16")];
|
| 56 |
+
tensor<int32, []> var_72 = const()[name = tensor<string, []>("op_72"), val = tensor<int32, []>(64)];
|
| 57 |
+
tensor<int32, []> var_76_axis_0 = const()[name = tensor<string, []>("op_76_axis_0"), val = tensor<int32, []>(-1)];
|
| 58 |
+
tensor<bool, []> var_76_ascending_0 = const()[name = tensor<string, []>("op_76_ascending_0"), val = tensor<bool, []>(false)];
|
| 59 |
+
tensor<bool, []> var_76_sort_0 = const()[name = tensor<string, []>("op_76_sort_0"), val = tensor<bool, []>(true)];
|
| 60 |
+
tensor<bool, []> var_76_return_indices_0 = const()[name = tensor<string, []>("op_76_return_indices_0"), val = tensor<bool, []>(true)];
|
| 61 |
+
tensor<string, []> var_76_cast_fp16_cast_int16_output_indices_dtype_0 = const()[name = tensor<string, []>("op_76_cast_fp16_cast_int16_output_indices_dtype_0"), val = tensor<string, []>("uint16")];
|
| 62 |
+
tensor<fp16, [1, 1, 1, 64]> var_76_cast_fp16_cast_int16_0, tensor<uint16, [1, 1, 1, 64]> var_76_cast_fp16_cast_int16_1 = topk(ascending = var_76_ascending_0, axis = var_76_axis_0, k = var_72, output_indices_dtype = var_76_cast_fp16_cast_int16_output_indices_dtype_0, return_indices = var_76_return_indices_0, sort = var_76_sort_0, x = token_logits_cast_fp16)[name = tensor<string, []>("op_76_cast_fp16_cast_int16")];
|
| 63 |
+
tensor<string, []> var_76_cast_fp16_cast_int16_1_to_int32_dtype_0 = const()[name = tensor<string, []>("op_76_cast_fp16_cast_int16_1_to_int32_dtype_0"), val = tensor<string, []>("int32")];
|
| 64 |
+
tensor<string, []> var_76_cast_fp16_0_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_76_cast_fp16_0_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
|
| 65 |
+
tensor<fp32, [1, 1, 1, 64]> top_k_logits = cast(dtype = var_76_cast_fp16_0_to_fp32_dtype_0, x = var_76_cast_fp16_cast_int16_0)[name = tensor<string, []>("cast_4")];
|
| 66 |
+
tensor<int32, [1, 1, 1, 64]> top_k_ids = cast(dtype = var_76_cast_fp16_cast_int16_1_to_int32_dtype_0, x = var_76_cast_fp16_cast_int16_1)[name = tensor<string, []>("cast_5")];
|
| 67 |
+
tensor<fp32, [1, 1, 1]> token_prob = cast(dtype = var_63_cast_fp16_to_fp32_dtype_0, x = var_63_cast_fp16)[name = tensor<string, []>("cast_6")];
|
| 68 |
+
} -> (token_id, token_prob, duration, top_k_ids, top_k_logits);
|
| 69 |
+
}
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b3f771cb65b190f1873e39629676ed79b65a8361522f451b37bdba8b1106e6ff
|
| 3 |
+
size 2798028
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/analytics/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a1ac15543fbb9301fba5f018b147e44d767479dec352aaa91dfe7bcf65949693
|
| 3 |
+
size 243
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/coremldata.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4940877938cc1b6d8830bbdd68ac8a49377cc57d75b61308883da5235b6a1914
|
| 3 |
+
size 439
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/metadata.json
ADDED
|
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"metadataOutputVersion" : "3.0",
|
| 4 |
+
"shortDescription" : "Parakeet 110M preprocessor (15 s window)",
|
| 5 |
+
"outputSchema" : [
|
| 6 |
+
{
|
| 7 |
+
"hasShapeFlexibility" : "0",
|
| 8 |
+
"isOptional" : "0",
|
| 9 |
+
"dataType" : "Float32",
|
| 10 |
+
"formattedType" : "MultiArray (Float32)",
|
| 11 |
+
"shortDescription" : "",
|
| 12 |
+
"shape" : "[]",
|
| 13 |
+
"name" : "mel_features",
|
| 14 |
+
"type" : "MultiArray"
|
| 15 |
+
},
|
| 16 |
+
{
|
| 17 |
+
"hasShapeFlexibility" : "0",
|
| 18 |
+
"isOptional" : "0",
|
| 19 |
+
"dataType" : "Int32",
|
| 20 |
+
"formattedType" : "MultiArray (Int32 1)",
|
| 21 |
+
"shortDescription" : "",
|
| 22 |
+
"shape" : "[1]",
|
| 23 |
+
"name" : "mel_length",
|
| 24 |
+
"type" : "MultiArray"
|
| 25 |
+
}
|
| 26 |
+
],
|
| 27 |
+
"storagePrecision" : "Float16",
|
| 28 |
+
"modelParameters" : [
|
| 29 |
+
|
| 30 |
+
],
|
| 31 |
+
"author" : "Fluid Inference",
|
| 32 |
+
"specificationVersion" : 8,
|
| 33 |
+
"mlProgramOperationTypeHistogram" : {
|
| 34 |
+
"Range1d" : 3,
|
| 35 |
+
"Ios17.equal" : 1,
|
| 36 |
+
"Ios17.notEqual" : 1,
|
| 37 |
+
"Ios17.reshape" : 2,
|
| 38 |
+
"Identity" : 1,
|
| 39 |
+
"Ios17.matmul" : 1,
|
| 40 |
+
"Select" : 6,
|
| 41 |
+
"Ios17.expandDims" : 12,
|
| 42 |
+
"Ios17.add" : 3,
|
| 43 |
+
"Tile" : 2,
|
| 44 |
+
"Ios17.sliceByIndex" : 3,
|
| 45 |
+
"Ios16.reduceSum" : 4,
|
| 46 |
+
"Shape" : 4,
|
| 47 |
+
"Ios17.gather" : 4,
|
| 48 |
+
"Ios17.logicalNot" : 1,
|
| 49 |
+
"Pad" : 1,
|
| 50 |
+
"Ios17.log" : 1,
|
| 51 |
+
"Ios17.less" : 2,
|
| 52 |
+
"Ios17.sub" : 4,
|
| 53 |
+
"Ios17.conv" : 2,
|
| 54 |
+
"Ios17.pow" : 2,
|
| 55 |
+
"Ios17.cast" : 10,
|
| 56 |
+
"Ios17.concat" : 3,
|
| 57 |
+
"Stack" : 1,
|
| 58 |
+
"Ios17.floorDiv" : 1,
|
| 59 |
+
"Ios17.realDiv" : 4,
|
| 60 |
+
"Ios17.sqrt" : 1,
|
| 61 |
+
"Ios17.greaterEqual" : 1,
|
| 62 |
+
"Ios17.mul" : 1
|
| 63 |
+
},
|
| 64 |
+
"computePrecision" : "Mixed (Float16, Float32, Int16, Int32, UInt16)",
|
| 65 |
+
"isUpdatable" : "0",
|
| 66 |
+
"stateSchema" : [
|
| 67 |
+
|
| 68 |
+
],
|
| 69 |
+
"availability" : {
|
| 70 |
+
"macOS" : "14.0",
|
| 71 |
+
"tvOS" : "17.0",
|
| 72 |
+
"visionOS" : "1.0",
|
| 73 |
+
"watchOS" : "10.0",
|
| 74 |
+
"iOS" : "17.0",
|
| 75 |
+
"macCatalyst" : "17.0"
|
| 76 |
+
},
|
| 77 |
+
"modelType" : {
|
| 78 |
+
"name" : "MLModelType_mlProgram"
|
| 79 |
+
},
|
| 80 |
+
"inputSchema" : [
|
| 81 |
+
{
|
| 82 |
+
"dataType" : "Float32",
|
| 83 |
+
"hasShapeFlexibility" : "1",
|
| 84 |
+
"isOptional" : "0",
|
| 85 |
+
"shapeFlexibility" : "1 × 1...240000",
|
| 86 |
+
"shapeRange" : "[[1, 1], [1, 240000]]",
|
| 87 |
+
"formattedType" : "MultiArray (Float32 1 × 1)",
|
| 88 |
+
"type" : "MultiArray",
|
| 89 |
+
"shape" : "[1, 1]",
|
| 90 |
+
"name" : "audio",
|
| 91 |
+
"shortDescription" : ""
|
| 92 |
+
},
|
| 93 |
+
{
|
| 94 |
+
"hasShapeFlexibility" : "0",
|
| 95 |
+
"isOptional" : "0",
|
| 96 |
+
"dataType" : "Int32",
|
| 97 |
+
"formattedType" : "MultiArray (Int32 1)",
|
| 98 |
+
"shortDescription" : "",
|
| 99 |
+
"shape" : "[1]",
|
| 100 |
+
"name" : "audio_length",
|
| 101 |
+
"type" : "MultiArray"
|
| 102 |
+
}
|
| 103 |
+
],
|
| 104 |
+
"userDefinedMetadata" : {
|
| 105 |
+
"com.github.apple.coremltools.source_dialect" : "TorchScript",
|
| 106 |
+
"com.github.apple.coremltools.source" : "torch==2.9.0",
|
| 107 |
+
"com.github.apple.coremltools.version" : "8.3.0"
|
| 108 |
+
},
|
| 109 |
+
"generatedClassName" : "parakeet_preprocessor",
|
| 110 |
+
"method" : "predict"
|
| 111 |
+
}
|
| 112 |
+
]
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/model.mil
ADDED
|
@@ -0,0 +1,191 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
program(1.0)
|
| 2 |
+
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
|
| 3 |
+
{
|
| 4 |
+
func main<ios17>(tensor<fp32, [1, ?]> audio, tensor<int32, [1]> audio_length) [FlexibleShapeInformation = tuple<tuple<tensor<string, []>, dict<tensor<string, []>, tensor<int32, [?]>>>, tuple<tensor<string, []>, dict<tensor<string, []>, list<tensor<int32, [2]>, ?>>>>((("DefaultShapes", {{"audio", [1, 1]}}), ("RangeDims", {{"audio", [[1, 1], [1, 240000]]}})))] {
|
| 5 |
+
tensor<int32, []> var_9 = const()[name = tensor<string, []>("op_9"), val = tensor<int32, []>(1)];
|
| 6 |
+
tensor<int32, []> var_10 = const()[name = tensor<string, []>("op_10"), val = tensor<int32, []>(160)];
|
| 7 |
+
tensor<int32, []> var_12 = const()[name = tensor<string, []>("op_12"), val = tensor<int32, []>(0)];
|
| 8 |
+
tensor<int32, []> var_34 = const()[name = tensor<string, []>("op_34"), val = tensor<int32, []>(512)];
|
| 9 |
+
tensor<int32, [1]> var_35 = add(x = audio_length, y = var_34)[name = tensor<string, []>("op_35")];
|
| 10 |
+
tensor<int32, []> var_36 = const()[name = tensor<string, []>("op_36"), val = tensor<int32, []>(512)];
|
| 11 |
+
tensor<int32, [1]> var_37 = sub(x = var_35, y = var_36)[name = tensor<string, []>("op_37")];
|
| 12 |
+
tensor<int32, [1]> floor_div_0 = floor_div(x = var_37, y = var_10)[name = tensor<string, []>("floor_div_0")];
|
| 13 |
+
tensor<bool, [1]> var_40 = equal(x = audio_length, y = var_12)[name = tensor<string, []>("op_40")];
|
| 14 |
+
tensor<int32, [1]> var_41 = const()[name = tensor<string, []>("op_41"), val = tensor<int32, [1]>([0])];
|
| 15 |
+
tensor<int32, [1]> mel_length = select(a = var_41, b = floor_div_0, cond = var_40)[name = tensor<string, []>("seq_len")];
|
| 16 |
+
tensor<string, []> audio_to_fp16_dtype_0 = const()[name = tensor<string, []>("audio_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
|
| 17 |
+
tensor<fp16, [1, ?]> audio_to_fp16 = cast(dtype = audio_to_fp16_dtype_0, x = audio)[name = tensor<string, []>("cast_27")];
|
| 18 |
+
tensor<int32, [2]> var_43_shape_cast_fp16 = shape(x = audio_to_fp16)[name = tensor<string, []>("op_43_shape_cast_fp16")];
|
| 19 |
+
tensor<int32, []> gather_0_axis_0 = const()[name = tensor<string, []>("gather_0_axis_0"), val = tensor<int32, []>(0)];
|
| 20 |
+
tensor<int32, []> gather_0_batch_dims_0 = const()[name = tensor<string, []>("gather_0_batch_dims_0"), val = tensor<int32, []>(0)];
|
| 21 |
+
tensor<bool, []> gather_0_validate_indices_0 = const()[name = tensor<string, []>("gather_0_validate_indices_0"), val = tensor<bool, []>(false)];
|
| 22 |
+
tensor<string, []> var_43_shape_cast_fp16_to_int16_dtype_0 = const()[name = tensor<string, []>("op_43_shape_cast_fp16_to_int16_dtype_0"), val = tensor<string, []>("int16")];
|
| 23 |
+
tensor<uint16, []> select_0_to_uint16 = const()[name = tensor<string, []>("select_0_to_uint16"), val = tensor<uint16, []>(1)];
|
| 24 |
+
tensor<int16, [2]> var_43_shape_cast_fp16_to_int16 = cast(dtype = var_43_shape_cast_fp16_to_int16_dtype_0, x = var_43_shape_cast_fp16)[name = tensor<string, []>("cast_26")];
|
| 25 |
+
tensor<int16, []> gather_0_cast_uint16 = gather(axis = gather_0_axis_0, batch_dims = gather_0_batch_dims_0, indices = select_0_to_uint16, validate_indices = gather_0_validate_indices_0, x = var_43_shape_cast_fp16_to_int16)[name = tensor<string, []>("gather_0_cast_uint16")];
|
| 26 |
+
tensor<string, []> gather_0_cast_uint16_to_int32_dtype_0 = const()[name = tensor<string, []>("gather_0_cast_uint16_to_int32_dtype_0"), val = tensor<string, []>("int32")];
|
| 27 |
+
tensor<int32, []> const_0 = const()[name = tensor<string, []>("const_0"), val = tensor<int32, []>(0)];
|
| 28 |
+
tensor<int32, []> const_1 = const()[name = tensor<string, []>("const_1"), val = tensor<int32, []>(1)];
|
| 29 |
+
tensor<int32, []> gather_0_cast_uint16_to_int32 = cast(dtype = gather_0_cast_uint16_to_int32_dtype_0, x = gather_0_cast_uint16)[name = tensor<string, []>("cast_25")];
|
| 30 |
+
tensor<int32, [?]> var_44 = range_1d(end = gather_0_cast_uint16_to_int32, start = const_0, step = const_1)[name = tensor<string, []>("op_44")];
|
| 31 |
+
tensor<int32, [1]> var_45_axes_0 = const()[name = tensor<string, []>("op_45_axes_0"), val = tensor<int32, [1]>([0])];
|
| 32 |
+
tensor<int32, [1, ?]> var_45 = expand_dims(axes = var_45_axes_0, x = var_44)[name = tensor<string, []>("op_45")];
|
| 33 |
+
tensor<int32, [1]> var_46_axes_0 = const()[name = tensor<string, []>("op_46_axes_0"), val = tensor<int32, [1]>([1])];
|
| 34 |
+
tensor<int32, [1, 1]> var_46 = expand_dims(axes = var_46_axes_0, x = audio_length)[name = tensor<string, []>("op_46")];
|
| 35 |
+
tensor<bool, [1, ?]> timemask = less(x = var_45, y = var_46)[name = tensor<string, []>("timemask")];
|
| 36 |
+
tensor<int32, [2]> var_49_begin_0 = const()[name = tensor<string, []>("op_49_begin_0"), val = tensor<int32, [2]>([0, 0])];
|
| 37 |
+
tensor<int32, [2]> var_49_end_0 = const()[name = tensor<string, []>("op_49_end_0"), val = tensor<int32, [2]>([1, 1])];
|
| 38 |
+
tensor<bool, [2]> var_49_end_mask_0 = const()[name = tensor<string, []>("op_49_end_mask_0"), val = tensor<bool, [2]>([true, false])];
|
| 39 |
+
tensor<bool, [2]> var_49_squeeze_mask_0 = const()[name = tensor<string, []>("op_49_squeeze_mask_0"), val = tensor<bool, [2]>([false, true])];
|
| 40 |
+
tensor<fp16, [1]> var_49_cast_fp16 = slice_by_index(begin = var_49_begin_0, end = var_49_end_0, end_mask = var_49_end_mask_0, squeeze_mask = var_49_squeeze_mask_0, x = audio_to_fp16)[name = tensor<string, []>("op_49_cast_fp16")];
|
| 41 |
+
tensor<int32, [1]> var_50_axes_0 = const()[name = tensor<string, []>("op_50_axes_0"), val = tensor<int32, [1]>([1])];
|
| 42 |
+
tensor<fp16, [1, 1]> var_50_cast_fp16 = expand_dims(axes = var_50_axes_0, x = var_49_cast_fp16)[name = tensor<string, []>("op_50_cast_fp16")];
|
| 43 |
+
tensor<int32, [2]> var_52_begin_0 = const()[name = tensor<string, []>("op_52_begin_0"), val = tensor<int32, [2]>([0, 1])];
|
| 44 |
+
tensor<int32, [2]> var_52_end_0 = const()[name = tensor<string, []>("op_52_end_0"), val = tensor<int32, [2]>([1, 0])];
|
| 45 |
+
tensor<bool, [2]> var_52_end_mask_0 = const()[name = tensor<string, []>("op_52_end_mask_0"), val = tensor<bool, [2]>([true, true])];
|
| 46 |
+
tensor<fp16, [1, ?]> var_52_cast_fp16 = slice_by_index(begin = var_52_begin_0, end = var_52_end_0, end_mask = var_52_end_mask_0, x = audio_to_fp16)[name = tensor<string, []>("op_52_cast_fp16")];
|
| 47 |
+
tensor<int32, [2]> var_54_begin_0 = const()[name = tensor<string, []>("op_54_begin_0"), val = tensor<int32, [2]>([0, 0])];
|
| 48 |
+
tensor<int32, [2]> var_54_end_0 = const()[name = tensor<string, []>("op_54_end_0"), val = tensor<int32, [2]>([1, -1])];
|
| 49 |
+
tensor<bool, [2]> var_54_end_mask_0 = const()[name = tensor<string, []>("op_54_end_mask_0"), val = tensor<bool, [2]>([true, false])];
|
| 50 |
+
tensor<fp16, [1, ?]> var_54_cast_fp16 = slice_by_index(begin = var_54_begin_0, end = var_54_end_0, end_mask = var_54_end_mask_0, x = audio_to_fp16)[name = tensor<string, []>("op_54_cast_fp16")];
|
| 51 |
+
tensor<fp16, []> var_55_to_fp16 = const()[name = tensor<string, []>("op_55_to_fp16"), val = tensor<fp16, []>(0x1.f0cp-1)];
|
| 52 |
+
tensor<fp16, [1, ?]> var_56_cast_fp16 = mul(x = var_54_cast_fp16, y = var_55_to_fp16)[name = tensor<string, []>("op_56_cast_fp16")];
|
| 53 |
+
tensor<fp16, [1, ?]> var_57_cast_fp16 = sub(x = var_52_cast_fp16, y = var_56_cast_fp16)[name = tensor<string, []>("op_57_cast_fp16")];
|
| 54 |
+
tensor<bool, []> x_3_interleave_0 = const()[name = tensor<string, []>("x_3_interleave_0"), val = tensor<bool, []>(false)];
|
| 55 |
+
tensor<fp16, [1, ?]> x_3_cast_fp16 = concat(axis = var_9, interleave = x_3_interleave_0, values = (var_50_cast_fp16, var_57_cast_fp16))[name = tensor<string, []>("x_3_cast_fp16")];
|
| 56 |
+
tensor<bool, [1, ?]> var_60 = logical_not(x = timemask)[name = tensor<string, []>("op_60")];
|
| 57 |
+
tensor<fp16, []> var_16_to_fp16 = const()[name = tensor<string, []>("op_16_to_fp16"), val = tensor<fp16, []>(0x0p+0)];
|
| 58 |
+
tensor<fp16, [1, ?]> input_1_cast_fp16 = select(a = var_16_to_fp16, b = x_3_cast_fp16, cond = var_60)[name = tensor<string, []>("input_1_cast_fp16")];
|
| 59 |
+
tensor<int32, [3]> concat_1x = const()[name = tensor<string, []>("concat_1x"), val = tensor<int32, [3]>([1, 1, -1])];
|
| 60 |
+
tensor<fp16, [1, 1, ?]> input_3_cast_fp16 = reshape(shape = concat_1x, x = input_1_cast_fp16)[name = tensor<string, []>("input_3_cast_fp16")];
|
| 61 |
+
tensor<int32, [6]> input_5_pad_0 = const()[name = tensor<string, []>("input_5_pad_0"), val = tensor<int32, [6]>([0, 0, 0, 0, 256, 256])];
|
| 62 |
+
tensor<string, []> input_5_mode_0 = const()[name = tensor<string, []>("input_5_mode_0"), val = tensor<string, []>("constant")];
|
| 63 |
+
tensor<fp16, []> const_3_to_fp16 = const()[name = tensor<string, []>("const_3_to_fp16"), val = tensor<fp16, []>(0x0p+0)];
|
| 64 |
+
tensor<fp16, [1, 1, ?]> input_5_cast_fp16 = pad(constant_val = const_3_to_fp16, mode = input_5_mode_0, pad = input_5_pad_0, x = input_3_cast_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
|
| 65 |
+
tensor<int32, [2]> concat_2x = const()[name = tensor<string, []>("concat_2x"), val = tensor<int32, [2]>([1, -1])];
|
| 66 |
+
tensor<fp16, [1, ?]> input_cast_fp16 = reshape(shape = concat_2x, x = input_5_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
|
| 67 |
+
tensor<int32, [1]> expand_dims_3 = const()[name = tensor<string, []>("expand_dims_3"), val = tensor<int32, [1]>([160])];
|
| 68 |
+
tensor<int32, [1]> expand_dims_4_axes_0 = const()[name = tensor<string, []>("expand_dims_4_axes_0"), val = tensor<int32, [1]>([1])];
|
| 69 |
+
tensor<fp16, [1, 1, ?]> expand_dims_4_cast_fp16 = expand_dims(axes = expand_dims_4_axes_0, x = input_cast_fp16)[name = tensor<string, []>("expand_dims_4_cast_fp16")];
|
| 70 |
+
tensor<string, []> conv_0_pad_type_0 = const()[name = tensor<string, []>("conv_0_pad_type_0"), val = tensor<string, []>("valid")];
|
| 71 |
+
tensor<int32, [2]> conv_0_pad_0 = const()[name = tensor<string, []>("conv_0_pad_0"), val = tensor<int32, [2]>([0, 0])];
|
| 72 |
+
tensor<int32, [1]> conv_0_dilations_0 = const()[name = tensor<string, []>("conv_0_dilations_0"), val = tensor<int32, [1]>([1])];
|
| 73 |
+
tensor<int32, []> conv_0_groups_0 = const()[name = tensor<string, []>("conv_0_groups_0"), val = tensor<int32, []>(1)];
|
| 74 |
+
tensor<fp16, [257, 1, 512]> expand_dims_1_to_fp16 = const()[name = tensor<string, []>("expand_dims_1_to_fp16"), val = tensor<fp16, [257, 1, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
|
| 75 |
+
tensor<fp16, [1, 257, ?]> conv_0_cast_fp16 = conv(dilations = conv_0_dilations_0, groups = conv_0_groups_0, pad = conv_0_pad_0, pad_type = conv_0_pad_type_0, strides = expand_dims_3, weight = expand_dims_1_to_fp16, x = expand_dims_4_cast_fp16)[name = tensor<string, []>("conv_0_cast_fp16")];
|
| 76 |
+
tensor<string, []> conv_1_pad_type_0 = const()[name = tensor<string, []>("conv_1_pad_type_0"), val = tensor<string, []>("valid")];
|
| 77 |
+
tensor<int32, [2]> conv_1_pad_0 = const()[name = tensor<string, []>("conv_1_pad_0"), val = tensor<int32, [2]>([0, 0])];
|
| 78 |
+
tensor<int32, [1]> conv_1_dilations_0 = const()[name = tensor<string, []>("conv_1_dilations_0"), val = tensor<int32, [1]>([1])];
|
| 79 |
+
tensor<int32, []> conv_1_groups_0 = const()[name = tensor<string, []>("conv_1_groups_0"), val = tensor<int32, []>(1)];
|
| 80 |
+
tensor<fp16, [257, 1, 512]> expand_dims_2_to_fp16 = const()[name = tensor<string, []>("expand_dims_2_to_fp16"), val = tensor<fp16, [257, 1, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(263296)))];
|
| 81 |
+
tensor<fp16, [1, 257, ?]> conv_1_cast_fp16 = conv(dilations = conv_1_dilations_0, groups = conv_1_groups_0, pad = conv_1_pad_0, pad_type = conv_1_pad_type_0, strides = expand_dims_3, weight = expand_dims_2_to_fp16, x = expand_dims_4_cast_fp16)[name = tensor<string, []>("conv_1_cast_fp16")];
|
| 82 |
+
tensor<int32, []> stack_0_axis_0 = const()[name = tensor<string, []>("stack_0_axis_0"), val = tensor<int32, []>(-1)];
|
| 83 |
+
tensor<fp16, [1, 257, ?, 2]> stack_0_cast_fp16 = stack(axis = stack_0_axis_0, values = (conv_0_cast_fp16, conv_1_cast_fp16))[name = tensor<string, []>("stack_0_cast_fp16")];
|
| 84 |
+
tensor<fp16, []> var_19_promoted_to_fp16 = const()[name = tensor<string, []>("op_19_promoted_to_fp16"), val = tensor<fp16, []>(0x1p+1)];
|
| 85 |
+
tensor<fp16, [1, 257, ?, 2]> var_75_cast_fp16 = pow(x = stack_0_cast_fp16, y = var_19_promoted_to_fp16)[name = tensor<string, []>("op_75_cast_fp16")];
|
| 86 |
+
tensor<int32, [1]> var_77_axes_0 = const()[name = tensor<string, []>("op_77_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 87 |
+
tensor<bool, []> var_77_keep_dims_0 = const()[name = tensor<string, []>("op_77_keep_dims_0"), val = tensor<bool, []>(false)];
|
| 88 |
+
tensor<fp16, [1, 257, ?]> var_77_cast_fp16 = reduce_sum(axes = var_77_axes_0, keep_dims = var_77_keep_dims_0, x = var_75_cast_fp16)[name = tensor<string, []>("op_77_cast_fp16")];
|
| 89 |
+
tensor<fp16, [1, 257, ?]> x_11_cast_fp16 = identity(x = var_77_cast_fp16)[name = tensor<string, []>("x_11_cast_fp16")];
|
| 90 |
+
tensor<bool, []> x_13_transpose_x_0 = const()[name = tensor<string, []>("x_13_transpose_x_0"), val = tensor<bool, []>(false)];
|
| 91 |
+
tensor<bool, []> x_13_transpose_y_0 = const()[name = tensor<string, []>("x_13_transpose_y_0"), val = tensor<bool, []>(false)];
|
| 92 |
+
tensor<fp16, [1, 80, 257]> const_4_to_fp16 = const()[name = tensor<string, []>("const_4_to_fp16"), val = tensor<fp16, [1, 80, 257]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(526528)))];
|
| 93 |
+
tensor<fp16, [1, 80, ?]> x_13_cast_fp16 = matmul(transpose_x = x_13_transpose_x_0, transpose_y = x_13_transpose_y_0, x = const_4_to_fp16, y = x_11_cast_fp16)[name = tensor<string, []>("x_13_cast_fp16")];
|
| 94 |
+
tensor<fp16, []> var_84_to_fp16 = const()[name = tensor<string, []>("op_84_to_fp16"), val = tensor<fp16, []>(0x1p-24)];
|
| 95 |
+
tensor<fp16, [1, 80, ?]> var_85_cast_fp16 = add(x = x_13_cast_fp16, y = var_84_to_fp16)[name = tensor<string, []>("op_85_cast_fp16")];
|
| 96 |
+
tensor<fp32, []> x_15_epsilon_0 = const()[name = tensor<string, []>("x_15_epsilon_0"), val = tensor<fp32, []>(0x1p-149)];
|
| 97 |
+
tensor<fp16, [1, 80, ?]> x_15_cast_fp16 = log(epsilon = x_15_epsilon_0, x = var_85_cast_fp16)[name = tensor<string, []>("x_15_cast_fp16")];
|
| 98 |
+
tensor<int32, [3]> var_87_shape_cast_fp16 = shape(x = x_15_cast_fp16)[name = tensor<string, []>("op_87_shape_cast_fp16")];
|
| 99 |
+
tensor<int32, []> gather_5 = const()[name = tensor<string, []>("gather_5"), val = tensor<int32, []>(1)];
|
| 100 |
+
tensor<int32, []> gather_6_axis_0 = const()[name = tensor<string, []>("gather_6_axis_0"), val = tensor<int32, []>(0)];
|
| 101 |
+
tensor<int32, []> gather_6_batch_dims_0 = const()[name = tensor<string, []>("gather_6_batch_dims_0"), val = tensor<int32, []>(0)];
|
| 102 |
+
tensor<bool, []> gather_6_validate_indices_0 = const()[name = tensor<string, []>("gather_6_validate_indices_0"), val = tensor<bool, []>(false)];
|
| 103 |
+
tensor<string, []> var_87_shape_cast_fp16_to_uint16_dtype_0 = const()[name = tensor<string, []>("op_87_shape_cast_fp16_to_uint16_dtype_0"), val = tensor<string, []>("uint16")];
|
| 104 |
+
tensor<uint16, []> select_6_to_uint16 = const()[name = tensor<string, []>("select_6_to_uint16"), val = tensor<uint16, []>(2)];
|
| 105 |
+
tensor<uint16, [3]> var_87_shape_cast_fp16_to_uint16 = cast(dtype = var_87_shape_cast_fp16_to_uint16_dtype_0, x = var_87_shape_cast_fp16)[name = tensor<string, []>("cast_24")];
|
| 106 |
+
tensor<uint16, []> gather_6_cast_uint16 = gather(axis = gather_6_axis_0, batch_dims = gather_6_batch_dims_0, indices = select_6_to_uint16, validate_indices = gather_6_validate_indices_0, x = var_87_shape_cast_fp16_to_uint16)[name = tensor<string, []>("gather_6_cast_uint16")];
|
| 107 |
+
tensor<string, []> gather_6_cast_uint16_to_int32_dtype_0 = const()[name = tensor<string, []>("gather_6_cast_uint16_to_int32_dtype_0"), val = tensor<string, []>("int32")];
|
| 108 |
+
tensor<int32, []> const_5 = const()[name = tensor<string, []>("const_5"), val = tensor<int32, []>(0)];
|
| 109 |
+
tensor<int32, []> const_6 = const()[name = tensor<string, []>("const_6"), val = tensor<int32, []>(1)];
|
| 110 |
+
tensor<int32, []> gather_6_cast_uint16_to_int32 = cast(dtype = gather_6_cast_uint16_to_int32_dtype_0, x = gather_6_cast_uint16)[name = tensor<string, []>("cast_23")];
|
| 111 |
+
tensor<int32, [?]> var_89 = range_1d(end = gather_6_cast_uint16_to_int32, start = const_5, step = const_6)[name = tensor<string, []>("op_89")];
|
| 112 |
+
tensor<int32, [1]> var_90_axes_0 = const()[name = tensor<string, []>("op_90_axes_0"), val = tensor<int32, [1]>([0])];
|
| 113 |
+
tensor<int32, [1, ?]> var_90 = expand_dims(axes = var_90_axes_0, x = var_89)[name = tensor<string, []>("op_90")];
|
| 114 |
+
tensor<int32, []> concat_3_axis_0 = const()[name = tensor<string, []>("concat_3_axis_0"), val = tensor<int32, []>(0)];
|
| 115 |
+
tensor<bool, []> concat_3_interleave_0 = const()[name = tensor<string, []>("concat_3_interleave_0"), val = tensor<bool, []>(false)];
|
| 116 |
+
tensor<int32, [2]> concat_3 = concat(axis = concat_3_axis_0, interleave = concat_3_interleave_0, values = (gather_5, gather_6_cast_uint16_to_int32))[name = tensor<string, []>("concat_3")];
|
| 117 |
+
tensor<int32, [2]> shape_8 = shape(x = var_90)[name = tensor<string, []>("shape_8")];
|
| 118 |
+
tensor<int32, [2]> real_div_0 = real_div(x = concat_3, y = shape_8)[name = tensor<string, []>("real_div_0")];
|
| 119 |
+
tensor<int32, [?, ?]> time_steps = tile(reps = real_div_0, x = var_90)[name = tensor<string, []>("time_steps")];
|
| 120 |
+
tensor<int32, [1]> var_93_axes_0 = const()[name = tensor<string, []>("op_93_axes_0"), val = tensor<int32, [1]>([1])];
|
| 121 |
+
tensor<int32, [1, 1]> var_93 = expand_dims(axes = var_93_axes_0, x = mel_length)[name = tensor<string, []>("op_93")];
|
| 122 |
+
tensor<bool, [?, ?]> valid_mask = less(x = time_steps, y = var_93)[name = tensor<string, []>("valid_mask")];
|
| 123 |
+
tensor<int32, [1]> var_95_axes_0 = const()[name = tensor<string, []>("op_95_axes_0"), val = tensor<int32, [1]>([1])];
|
| 124 |
+
tensor<bool, [?, 1, ?]> var_95 = expand_dims(axes = var_95_axes_0, x = valid_mask)[name = tensor<string, []>("op_95")];
|
| 125 |
+
tensor<fp16, [1, 80, ?]> var_96_cast_fp16 = select(a = x_15_cast_fp16, b = var_16_to_fp16, cond = var_95)[name = tensor<string, []>("op_96_cast_fp16")];
|
| 126 |
+
tensor<int32, [1]> x_mean_numerator_axes_0 = const()[name = tensor<string, []>("x_mean_numerator_axes_0"), val = tensor<int32, [1]>([2])];
|
| 127 |
+
tensor<bool, []> x_mean_numerator_keep_dims_0 = const()[name = tensor<string, []>("x_mean_numerator_keep_dims_0"), val = tensor<bool, []>(false)];
|
| 128 |
+
tensor<fp16, [1, 80]> x_mean_numerator_cast_fp16 = reduce_sum(axes = x_mean_numerator_axes_0, keep_dims = x_mean_numerator_keep_dims_0, x = var_96_cast_fp16)[name = tensor<string, []>("x_mean_numerator_cast_fp16")];
|
| 129 |
+
tensor<int32, [1]> x_mean_denominator_axes_0 = const()[name = tensor<string, []>("x_mean_denominator_axes_0"), val = tensor<int32, [1]>([1])];
|
| 130 |
+
tensor<bool, []> x_mean_denominator_keep_dims_0 = const()[name = tensor<string, []>("x_mean_denominator_keep_dims_0"), val = tensor<bool, []>(false)];
|
| 131 |
+
tensor<string, []> cast_6_to_fp16_dtype_0 = const()[name = tensor<string, []>("cast_6_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
|
| 132 |
+
tensor<fp16, [?, ?]> valid_mask_to_fp16 = cast(dtype = cast_6_to_fp16_dtype_0, x = valid_mask)[name = tensor<string, []>("cast_22")];
|
| 133 |
+
tensor<fp16, [?]> x_mean_denominator_cast_fp16 = reduce_sum(axes = x_mean_denominator_axes_0, keep_dims = x_mean_denominator_keep_dims_0, x = valid_mask_to_fp16)[name = tensor<string, []>("x_mean_denominator_cast_fp16")];
|
| 134 |
+
tensor<int32, [1]> var_101_axes_0 = const()[name = tensor<string, []>("op_101_axes_0"), val = tensor<int32, [1]>([1])];
|
| 135 |
+
tensor<fp16, [?, 1]> var_101_cast_fp16 = expand_dims(axes = var_101_axes_0, x = x_mean_denominator_cast_fp16)[name = tensor<string, []>("op_101_cast_fp16")];
|
| 136 |
+
tensor<fp16, [?, 80]> x_mean_cast_fp16 = real_div(x = x_mean_numerator_cast_fp16, y = var_101_cast_fp16)[name = tensor<string, []>("x_mean_cast_fp16")];
|
| 137 |
+
tensor<int32, [1]> var_104_axes_0 = const()[name = tensor<string, []>("op_104_axes_0"), val = tensor<int32, [1]>([2])];
|
| 138 |
+
tensor<fp16, [?, 80, 1]> var_104_cast_fp16 = expand_dims(axes = var_104_axes_0, x = x_mean_cast_fp16)[name = tensor<string, []>("op_104_cast_fp16")];
|
| 139 |
+
tensor<fp16, [?, 80, ?]> var_105_cast_fp16 = sub(x = x_15_cast_fp16, y = var_104_cast_fp16)[name = tensor<string, []>("op_105_cast_fp16")];
|
| 140 |
+
tensor<fp16, [?, 80, ?]> var_106_cast_fp16 = select(a = var_105_cast_fp16, b = var_16_to_fp16, cond = var_95)[name = tensor<string, []>("op_106_cast_fp16")];
|
| 141 |
+
tensor<fp16, []> var_19_promoted_1_to_fp16 = const()[name = tensor<string, []>("op_19_promoted_1_to_fp16"), val = tensor<fp16, []>(0x1p+1)];
|
| 142 |
+
tensor<fp16, [?, 80, ?]> var_107_cast_fp16 = pow(x = var_106_cast_fp16, y = var_19_promoted_1_to_fp16)[name = tensor<string, []>("op_107_cast_fp16")];
|
| 143 |
+
tensor<int32, [1]> var_109_axes_0 = const()[name = tensor<string, []>("op_109_axes_0"), val = tensor<int32, [1]>([2])];
|
| 144 |
+
tensor<bool, []> var_109_keep_dims_0 = const()[name = tensor<string, []>("op_109_keep_dims_0"), val = tensor<bool, []>(false)];
|
| 145 |
+
tensor<fp16, [?, 80]> var_109_cast_fp16 = reduce_sum(axes = var_109_axes_0, keep_dims = var_109_keep_dims_0, x = var_107_cast_fp16)[name = tensor<string, []>("op_109_cast_fp16")];
|
| 146 |
+
tensor<fp16, []> var_111_to_fp16 = const()[name = tensor<string, []>("op_111_to_fp16"), val = tensor<fp16, []>(0x1p+0)];
|
| 147 |
+
tensor<fp16, [?, 1]> var_112_cast_fp16 = sub(x = var_101_cast_fp16, y = var_111_to_fp16)[name = tensor<string, []>("op_112_cast_fp16")];
|
| 148 |
+
tensor<fp16, [?, 80]> var_113_cast_fp16 = real_div(x = var_109_cast_fp16, y = var_112_cast_fp16)[name = tensor<string, []>("op_113_cast_fp16")];
|
| 149 |
+
tensor<fp16, [?, 80]> x_std_1_cast_fp16 = sqrt(x = var_113_cast_fp16)[name = tensor<string, []>("x_std_1_cast_fp16")];
|
| 150 |
+
tensor<bool, [?, 80]> var_115_cast_fp16 = not_equal(x = x_std_1_cast_fp16, y = x_std_1_cast_fp16)[name = tensor<string, []>("op_115_cast_fp16")];
|
| 151 |
+
tensor<fp16, [?, 80]> x_std_3_cast_fp16 = select(a = var_16_to_fp16, b = x_std_1_cast_fp16, cond = var_115_cast_fp16)[name = tensor<string, []>("x_std_3_cast_fp16")];
|
| 152 |
+
tensor<fp16, []> var_25_to_fp16 = const()[name = tensor<string, []>("op_25_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
|
| 153 |
+
tensor<fp16, [?, 80]> x_std_cast_fp16 = add(x = x_std_3_cast_fp16, y = var_25_to_fp16)[name = tensor<string, []>("x_std_cast_fp16")];
|
| 154 |
+
tensor<int32, [1]> var_120_axes_0 = const()[name = tensor<string, []>("op_120_axes_0"), val = tensor<int32, [1]>([2])];
|
| 155 |
+
tensor<fp16, [?, 80, 1]> var_120_cast_fp16 = expand_dims(axes = var_120_axes_0, x = x_std_cast_fp16)[name = tensor<string, []>("op_120_cast_fp16")];
|
| 156 |
+
tensor<fp16, [?, 80, ?]> x_cast_fp16 = real_div(x = var_105_cast_fp16, y = var_120_cast_fp16)[name = tensor<string, []>("x_cast_fp16")];
|
| 157 |
+
tensor<int32, [3]> var_122_shape_cast_fp16 = shape(x = x_cast_fp16)[name = tensor<string, []>("op_122_shape_cast_fp16")];
|
| 158 |
+
tensor<int32, []> gather_7_axis_0 = const()[name = tensor<string, []>("gather_7_axis_0"), val = tensor<int32, []>(0)];
|
| 159 |
+
tensor<int32, []> gather_7_batch_dims_0 = const()[name = tensor<string, []>("gather_7_batch_dims_0"), val = tensor<int32, []>(0)];
|
| 160 |
+
tensor<bool, []> gather_7_validate_indices_0 = const()[name = tensor<string, []>("gather_7_validate_indices_0"), val = tensor<bool, []>(false)];
|
| 161 |
+
tensor<string, []> var_122_shape_cast_fp16_to_uint16_dtype_0 = const()[name = tensor<string, []>("op_122_shape_cast_fp16_to_uint16_dtype_0"), val = tensor<string, []>("uint16")];
|
| 162 |
+
tensor<uint16, []> select_7_to_uint16 = const()[name = tensor<string, []>("select_7_to_uint16"), val = tensor<uint16, []>(2)];
|
| 163 |
+
tensor<uint16, [3]> var_122_shape_cast_fp16_to_uint16 = cast(dtype = var_122_shape_cast_fp16_to_uint16_dtype_0, x = var_122_shape_cast_fp16)[name = tensor<string, []>("cast_21")];
|
| 164 |
+
tensor<uint16, []> gather_7_cast_uint16 = gather(axis = gather_7_axis_0, batch_dims = gather_7_batch_dims_0, indices = select_7_to_uint16, validate_indices = gather_7_validate_indices_0, x = var_122_shape_cast_fp16_to_uint16)[name = tensor<string, []>("gather_7_cast_uint16")];
|
| 165 |
+
tensor<string, []> gather_7_cast_uint16_to_int32_dtype_0 = const()[name = tensor<string, []>("gather_7_cast_uint16_to_int32_dtype_0"), val = tensor<string, []>("int32")];
|
| 166 |
+
tensor<int32, []> const_7 = const()[name = tensor<string, []>("const_7"), val = tensor<int32, []>(0)];
|
| 167 |
+
tensor<int32, []> const_8 = const()[name = tensor<string, []>("const_8"), val = tensor<int32, []>(1)];
|
| 168 |
+
tensor<int32, []> gather_7_cast_uint16_to_int32 = cast(dtype = gather_7_cast_uint16_to_int32_dtype_0, x = gather_7_cast_uint16)[name = tensor<string, []>("cast_20")];
|
| 169 |
+
tensor<int32, [?]> mask_1 = range_1d(end = gather_7_cast_uint16_to_int32, start = const_7, step = const_8)[name = tensor<string, []>("mask_1")];
|
| 170 |
+
tensor<int32, []> gather_8_axis_0 = const()[name = tensor<string, []>("gather_8_axis_0"), val = tensor<int32, []>(0)];
|
| 171 |
+
tensor<int32, []> gather_8_batch_dims_0 = const()[name = tensor<string, []>("gather_8_batch_dims_0"), val = tensor<int32, []>(0)];
|
| 172 |
+
tensor<bool, []> gather_8_validate_indices_0 = const()[name = tensor<string, []>("gather_8_validate_indices_0"), val = tensor<bool, []>(false)];
|
| 173 |
+
tensor<uint16, []> select_8_to_uint16 = const()[name = tensor<string, []>("select_8_to_uint16"), val = tensor<uint16, []>(0)];
|
| 174 |
+
tensor<uint16, []> gather_8_cast_uint16 = gather(axis = gather_8_axis_0, batch_dims = gather_8_batch_dims_0, indices = select_8_to_uint16, validate_indices = gather_8_validate_indices_0, x = var_122_shape_cast_fp16_to_uint16)[name = tensor<string, []>("gather_8_cast_uint16")];
|
| 175 |
+
tensor<string, []> gather_8_cast_uint16_to_int32_dtype_0 = const()[name = tensor<string, []>("gather_8_cast_uint16_to_int32_dtype_0"), val = tensor<string, []>("int32")];
|
| 176 |
+
tensor<int32, []> concat_4_axis_0 = const()[name = tensor<string, []>("concat_4_axis_0"), val = tensor<int32, []>(0)];
|
| 177 |
+
tensor<bool, []> concat_4_interleave_0 = const()[name = tensor<string, []>("concat_4_interleave_0"), val = tensor<bool, []>(false)];
|
| 178 |
+
tensor<int32, []> gather_8_cast_uint16_to_int32 = cast(dtype = gather_8_cast_uint16_to_int32_dtype_0, x = gather_8_cast_uint16)[name = tensor<string, []>("cast_19")];
|
| 179 |
+
tensor<int32, [2]> concat_4 = concat(axis = concat_4_axis_0, interleave = concat_4_interleave_0, values = (gather_8_cast_uint16_to_int32, var_9))[name = tensor<string, []>("concat_4")];
|
| 180 |
+
tensor<int32, [1]> expand_dims_0_axes_0 = const()[name = tensor<string, []>("expand_dims_0_axes_0"), val = tensor<int32, [1]>([0])];
|
| 181 |
+
tensor<int32, [1, ?]> expand_dims_0 = expand_dims(axes = expand_dims_0_axes_0, x = mask_1)[name = tensor<string, []>("expand_dims_0")];
|
| 182 |
+
tensor<int32, [?, ?]> var_126 = tile(reps = concat_4, x = expand_dims_0)[name = tensor<string, []>("op_126")];
|
| 183 |
+
tensor<bool, [?, ?]> mask = greater_equal(x = var_126, y = var_93)[name = tensor<string, []>("mask")];
|
| 184 |
+
tensor<int32, [1]> var_129_axes_0 = const()[name = tensor<string, []>("op_129_axes_0"), val = tensor<int32, [1]>([1])];
|
| 185 |
+
tensor<bool, [?, 1, ?]> var_129 = expand_dims(axes = var_129_axes_0, x = mask)[name = tensor<string, []>("op_129")];
|
| 186 |
+
tensor<fp16, []> cast_15_to_fp16 = const()[name = tensor<string, []>("cast_15_to_fp16"), val = tensor<fp16, []>(0x0p+0)];
|
| 187 |
+
tensor<fp16, [?, 80, ?]> processed_signal_cast_fp16 = select(a = cast_15_to_fp16, b = x_cast_fp16, cond = var_129)[name = tensor<string, []>("processed_signal_cast_fp16")];
|
| 188 |
+
tensor<string, []> processed_signal_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("processed_signal_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
|
| 189 |
+
tensor<fp32, [?, 80, ?]> mel_features = cast(dtype = processed_signal_cast_fp16_to_fp32_dtype_0, x = processed_signal_cast_fp16)[name = tensor<string, []>("cast_18")];
|
| 190 |
+
} -> (mel_features, mel_length);
|
| 191 |
+
}
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c062338de852a26607ce4101f74e6895de3a4134a57b07232bd72bfc6f1d7f1a
|
| 3 |
+
size 567712
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/metadata.json
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_id": "nvidia/parakeet-tdt_ctc-110m",
|
| 3 |
+
"model_type": "hybrid_rnnt_ctc",
|
| 4 |
+
"sample_rate": 16000,
|
| 5 |
+
"max_audio_seconds": 15.0,
|
| 6 |
+
"max_audio_samples": 240000,
|
| 7 |
+
"max_symbol_steps": 1,
|
| 8 |
+
"vocab_size": 1024,
|
| 9 |
+
"joint_extra_outputs": 5,
|
| 10 |
+
"encoder_dim": 512,
|
| 11 |
+
"decoder_dim": 640,
|
| 12 |
+
"decoder_hidden": 640,
|
| 13 |
+
"decoder_layers": 1,
|
| 14 |
+
"blank_id": 1024,
|
| 15 |
+
"checkpoint": {
|
| 16 |
+
"type": "pretrained",
|
| 17 |
+
"model_id": "nvidia/parakeet-tdt_ctc-110m"
|
| 18 |
+
},
|
| 19 |
+
"coreml": {
|
| 20 |
+
"compute_units": "CPU_ONLY",
|
| 21 |
+
"compute_precision": "FLOAT32"
|
| 22 |
+
},
|
| 23 |
+
"components": {
|
| 24 |
+
"preprocessor": {
|
| 25 |
+
"inputs": {
|
| 26 |
+
"audio_signal": [
|
| 27 |
+
1,
|
| 28 |
+
240000
|
| 29 |
+
],
|
| 30 |
+
"audio_length": [
|
| 31 |
+
1
|
| 32 |
+
]
|
| 33 |
+
},
|
| 34 |
+
"outputs": {
|
| 35 |
+
"mel": [
|
| 36 |
+
1,
|
| 37 |
+
80,
|
| 38 |
+
1501
|
| 39 |
+
],
|
| 40 |
+
"mel_length": [
|
| 41 |
+
1
|
| 42 |
+
]
|
| 43 |
+
},
|
| 44 |
+
"path": "parakeet_preprocessor.mlpackage"
|
| 45 |
+
},
|
| 46 |
+
"encoder": {
|
| 47 |
+
"inputs": {
|
| 48 |
+
"mel": [
|
| 49 |
+
1,
|
| 50 |
+
80,
|
| 51 |
+
1501
|
| 52 |
+
],
|
| 53 |
+
"mel_length": [
|
| 54 |
+
1
|
| 55 |
+
]
|
| 56 |
+
},
|
| 57 |
+
"outputs": {
|
| 58 |
+
"encoder": [
|
| 59 |
+
1,
|
| 60 |
+
512,
|
| 61 |
+
188
|
| 62 |
+
],
|
| 63 |
+
"encoder_length": [
|
| 64 |
+
1
|
| 65 |
+
]
|
| 66 |
+
},
|
| 67 |
+
"path": "parakeet_encoder.mlpackage"
|
| 68 |
+
},
|
| 69 |
+
"ctc_head": {
|
| 70 |
+
"inputs": {
|
| 71 |
+
"encoder": [
|
| 72 |
+
1,
|
| 73 |
+
512,
|
| 74 |
+
188
|
| 75 |
+
]
|
| 76 |
+
},
|
| 77 |
+
"outputs": {
|
| 78 |
+
"log_probs": [
|
| 79 |
+
1,
|
| 80 |
+
188,
|
| 81 |
+
1025
|
| 82 |
+
]
|
| 83 |
+
},
|
| 84 |
+
"path": "parakeet_ctc_head.mlpackage"
|
| 85 |
+
},
|
| 86 |
+
"mel_encoder": {
|
| 87 |
+
"inputs": {
|
| 88 |
+
"audio_signal": [
|
| 89 |
+
1,
|
| 90 |
+
240000
|
| 91 |
+
],
|
| 92 |
+
"audio_length": [
|
| 93 |
+
1
|
| 94 |
+
]
|
| 95 |
+
},
|
| 96 |
+
"outputs": {
|
| 97 |
+
"encoder": [
|
| 98 |
+
1,
|
| 99 |
+
512,
|
| 100 |
+
188
|
| 101 |
+
],
|
| 102 |
+
"encoder_length": [
|
| 103 |
+
1
|
| 104 |
+
]
|
| 105 |
+
},
|
| 106 |
+
"path": "parakeet_mel_encoder.mlpackage"
|
| 107 |
+
},
|
| 108 |
+
"decoder": {
|
| 109 |
+
"inputs": {
|
| 110 |
+
"targets": [
|
| 111 |
+
1,
|
| 112 |
+
1
|
| 113 |
+
],
|
| 114 |
+
"target_length": [
|
| 115 |
+
1
|
| 116 |
+
],
|
| 117 |
+
"h_in": [
|
| 118 |
+
1,
|
| 119 |
+
1,
|
| 120 |
+
640
|
| 121 |
+
],
|
| 122 |
+
"c_in": [
|
| 123 |
+
1,
|
| 124 |
+
1,
|
| 125 |
+
640
|
| 126 |
+
]
|
| 127 |
+
},
|
| 128 |
+
"outputs": {
|
| 129 |
+
"decoder": [
|
| 130 |
+
1,
|
| 131 |
+
640,
|
| 132 |
+
1
|
| 133 |
+
],
|
| 134 |
+
"h_out": [
|
| 135 |
+
1,
|
| 136 |
+
1,
|
| 137 |
+
640
|
| 138 |
+
],
|
| 139 |
+
"c_out": [
|
| 140 |
+
1,
|
| 141 |
+
1,
|
| 142 |
+
640
|
| 143 |
+
]
|
| 144 |
+
},
|
| 145 |
+
"path": "parakeet_decoder.mlpackage"
|
| 146 |
+
},
|
| 147 |
+
"joint": {
|
| 148 |
+
"inputs": {
|
| 149 |
+
"encoder": [
|
| 150 |
+
1,
|
| 151 |
+
512,
|
| 152 |
+
188
|
| 153 |
+
],
|
| 154 |
+
"decoder": [
|
| 155 |
+
1,
|
| 156 |
+
640,
|
| 157 |
+
1
|
| 158 |
+
]
|
| 159 |
+
},
|
| 160 |
+
"outputs": {
|
| 161 |
+
"logits": [
|
| 162 |
+
1,
|
| 163 |
+
188,
|
| 164 |
+
1,
|
| 165 |
+
1030
|
| 166 |
+
]
|
| 167 |
+
},
|
| 168 |
+
"path": "parakeet_joint.mlpackage"
|
| 169 |
+
},
|
| 170 |
+
"joint_decision": {
|
| 171 |
+
"inputs": {
|
| 172 |
+
"encoder": [
|
| 173 |
+
1,
|
| 174 |
+
512,
|
| 175 |
+
188
|
| 176 |
+
],
|
| 177 |
+
"decoder": [
|
| 178 |
+
1,
|
| 179 |
+
640,
|
| 180 |
+
1
|
| 181 |
+
]
|
| 182 |
+
},
|
| 183 |
+
"outputs": {
|
| 184 |
+
"token_id": [
|
| 185 |
+
1,
|
| 186 |
+
188,
|
| 187 |
+
1
|
| 188 |
+
],
|
| 189 |
+
"token_prob": [
|
| 190 |
+
1,
|
| 191 |
+
188,
|
| 192 |
+
1
|
| 193 |
+
],
|
| 194 |
+
"duration": [
|
| 195 |
+
1,
|
| 196 |
+
188,
|
| 197 |
+
1
|
| 198 |
+
]
|
| 199 |
+
},
|
| 200 |
+
"path": "parakeet_joint_decision.mlpackage"
|
| 201 |
+
},
|
| 202 |
+
"joint_decision_single_step": {
|
| 203 |
+
"inputs": {
|
| 204 |
+
"encoder_step": [
|
| 205 |
+
1,
|
| 206 |
+
512,
|
| 207 |
+
1
|
| 208 |
+
],
|
| 209 |
+
"decoder_step": [
|
| 210 |
+
1,
|
| 211 |
+
640,
|
| 212 |
+
1
|
| 213 |
+
]
|
| 214 |
+
},
|
| 215 |
+
"outputs": {
|
| 216 |
+
"token_id": [
|
| 217 |
+
1,
|
| 218 |
+
1,
|
| 219 |
+
1
|
| 220 |
+
],
|
| 221 |
+
"token_prob": [
|
| 222 |
+
1,
|
| 223 |
+
1,
|
| 224 |
+
1
|
| 225 |
+
],
|
| 226 |
+
"duration": [
|
| 227 |
+
1,
|
| 228 |
+
1,
|
| 229 |
+
1
|
| 230 |
+
],
|
| 231 |
+
"top_k_ids": [
|
| 232 |
+
1,
|
| 233 |
+
1,
|
| 234 |
+
1,
|
| 235 |
+
64
|
| 236 |
+
],
|
| 237 |
+
"top_k_logits": [
|
| 238 |
+
1,
|
| 239 |
+
1,
|
| 240 |
+
1,
|
| 241 |
+
64
|
| 242 |
+
]
|
| 243 |
+
},
|
| 244 |
+
"path": "parakeet_joint_decision_single_step.mlpackage"
|
| 245 |
+
}
|
| 246 |
+
}
|
| 247 |
+
}
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/vocab.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"0": "<unk>", "1": "▁t", "2": "▁th", "3": "▁a", "4": "in", "5": "re", "6": "▁the", "7": "▁w", "8": "▁s", "9": "▁o", "10": "er", "11": "ou", "12": "at", "13": "nd", "14": "it", "15": "▁h", "16": "▁c", "17": "▁b", "18": "is", "19": "en", "20": "on", "21": "ing", "22": "▁f", "23": "▁to", "24": "▁m", "25": "es", "26": "▁p", "27": "or", "28": "an", "29": "▁d", "30": "ll", "31": "▁I", "32": "ed", "33": "▁and", "34": "▁l", "35": "▁of", "36": "▁in", "37": "▁y", "38": "ar", "39": "▁g", "40": "▁you", "41": "as", "42": "om", "43": "▁n", "44": "ve", "45": "▁that", "46": "le", "47": "ic", "48": "us", "49": "ow", "50": "et", "51": "al", "52": "▁e", "53": "ut", "54": "▁it", "55": "ot", "56": "▁be", "57": "▁T", "58": "ion", "59": "▁is", "60": "▁wh", "61": "▁re", "62": "▁on", "63": "▁we", "64": "ent", "65": "▁A", "66": "ay", "67": "▁ha", "68": "▁Th", "69": "id", "70": "▁S", "71": "ac", "72": "gh", "73": "ver", "74": "ke", "75": "▁for", "76": "im", "77": "ly", "78": "ur", "79": "ld", "80": "▁he", "81": "▁st", "82": "all", "83": "ro", "84": "st", "85": "se", "86": "ct", "87": "ith", "88": "ir", "89": "am", "90": "▁this", "91": "if", "92": "▁W", "93": "oo", "94": "ri", "95": "▁was", "96": "ght", "97": "▁u", "98": "▁with", "99": "ad", "100": "ch", "101": "▁se", "102": "▁k", "103": "▁an", "104": "▁The", "105": "▁li", "106": "▁do", "107": "▁B", "108": "▁have", "109": "▁as", "110": "th", "111": "▁are", "112": "▁sh", "113": "ust", "114": "ce", "115": "ally", "116": "ill", "117": "▁H", "118": "▁j", "119": "ter", "120": "▁go", "121": "▁And", "122": "ation", "123": "▁C", "124": "▁so", "125": "ome", "126": "▁not", "127": "op", "128": "il", "129": "ore", "130": "▁ne", "131": "▁can", "132": "▁me", "133": "▁at", "134": "ould", "135": "ant", "136": "▁M", "137": "▁like", "138": "ere", "139": "▁they", "140": "ra", "141": "ers", "142": "▁ab", "143": "▁de", "144": "▁kn", "145": "ge", "146": "▁Y", "147": "▁ch", "148": "ul", "149": "pp", "150": "▁or", "151": "▁al", "152": "▁con", "153": "▁com", "154": 
"ess", "155": "▁su", "156": "out", "157": "▁your", "158": "▁So", "159": "ate", "160": "▁one", "161": "▁all", "162": "▁ex", "163": "est", "164": "▁fr", "165": "▁just", "166": "▁pro", "167": "▁know", "168": "▁O", "169": "ain", "170": "▁but", "171": "ol", "172": "ive", "173": "▁v", "174": "use", "175": "very", "176": "art", "177": "qu", "178": "▁my", "179": "el", "180": "▁N", "181": "nt", "182": "▁It", "183": "▁what", "184": "ab", "185": "▁P", "186": "▁wor", "187": "▁out", "188": "▁there", "189": "▁up", "190": "um", "191": "▁from", "192": "pe", "193": "▁tw", "194": "▁r", "195": "and", "196": "ight", "197": "ort", "198": "un", "199": "▁L", "200": "ist", "201": "▁about", "202": "ide", "203": "ig", "204": "ake", "205": "▁D", "206": "em", "207": "os", "208": "king", "209": "rou", "210": "ind", "211": "our", "212": "res", "213": "▁We", "214": "▁get", "215": "▁E", "216": "▁G", "217": "ack", "218": "▁le", "219": "ity", "220": "od", "221": "▁F", "222": "ard", "223": "▁pl", "224": "▁our", "225": "▁int", "226": "ment", "227": "▁will", "228": "ies", "229": "▁by", "230": "ink", "231": "ca", "232": "▁if", "233": "red", "234": "her", "235": "ie", "236": "▁us", "237": "▁some", "238": "▁don", "239": "ven", "240": "ood", "241": "ast", "242": "▁R", "243": "▁his", "244": "▁tim", "245": "▁tr", "246": "▁more", "247": "ich", "248": "ous", "249": "ame", "250": "▁going", "251": "▁had", "252": "▁them", "253": "ook", "254": "▁pe", "255": "▁Wh", "256": "▁You", "257": "▁But", "258": "ine", "259": "▁here", "260": "▁would", "261": "cause", "262": "right", "263": "so", "264": "ost", "265": "ure", "266": "▁has", "267": "ect", "268": "▁think", "269": "▁fe", "270": "ong", "271": "▁see", "272": "▁when", "273": "▁who", "274": "▁were", "275": "▁really", "276": "▁their", "277": "▁want", "278": "one", "279": "ople", "280": "▁then", "281": "▁time", "282": "▁sa", "283": "ap", "284": "▁te", "285": "▁He", "286": "▁ye", "287": "ck", "288": "▁her", "289": "▁thing", "290": "▁right", "291": "▁which", "292": "itt", 
"293": "ice", "294": "act", "295": "▁people", "296": "ty", "297": "▁two", "298": "▁J", "299": "▁im", "300": "ther", "301": "ci", "302": "ose", "303": "▁cl", "304": "▁qu", "305": "▁man", "306": "▁also", "307": "ree", "308": "▁en", "309": "ud", "310": "▁how", "311": "reat", "312": "ak", "313": "hing", "314": "ag", "315": "▁any", "316": "ff", "317": "ace", "318": "per", "319": "▁because", "320": "▁very", "321": "own", "322": "▁ad", "323": "▁act", "324": "▁been", "325": "▁now", "326": "▁ag", "327": "▁into", "328": "▁comp", "329": "ars", "330": "ions", "331": "are", "332": "ite", "333": "iv", "334": "▁these", "335": "ays", "336": "ep", "337": "▁This", "338": "▁she", "339": "ans", "340": "ah", "341": "een", "342": "▁over", "343": "ry", "344": "▁lo", "345": "age", "346": "▁pr", "347": "▁sp", "348": "ue", "349": "▁co", "350": "ick", "351": "ber", "352": "▁did", "353": "ip", "354": "ach", "355": "▁back", "356": "▁no", "357": "▁cont", "358": "▁other", "359": "▁every", "360": "pt", "361": "▁need", "362": "▁him", "363": "▁U", "364": "▁In", "365": "▁work", "366": "irst", "367": "▁part", "368": "▁look", "369": "ittle", "370": "ble", "371": "iz", "372": "▁un", "373": "▁make", "374": "omet", "375": "nder", "376": "ish", "377": "na", "378": "▁little", "379": "▁off", "380": "▁than", "381": "▁got", "382": "ually", "383": "▁per", "384": "▁good", "385": "▁way", "386": "▁could", "387": "▁ac", "388": "▁imp", "389": "able", "390": "▁where", "391": "iff", "392": "▁That", "393": "▁res", "394": "ount", "395": "pl", "396": "ance", "397": "▁first", "398": "▁ro", "399": "▁pre", "400": "ass", "401": "▁say", "402": "int", "403": "ated", "404": "ire", "405": "uch", "406": "ase", "407": "▁somet", "408": "ound", "409": "▁down", "410": "▁diff", "411": "sel", "412": "▁gu", "413": "▁am", "414": "ress", "415": "▁lot", "416": "ence", "417": "▁dis", "418": "orm", "419": "ix", "420": "▁po", "421": "ving", "422": "enty", "423": "▁K", "424": "▁spe", "425": "und", "426": "he", "427": "▁much", "428": "▁ar", 
"429": "round", "430": "▁app", "431": "co", "432": "ark", "433": "▁new", "434": "ater", "435": "ult", "436": "end", "437": "▁even", "438": "▁start", "439": "ations", "440": "rough", "441": "ile", "442": "fter", "443": "▁well", "444": "be", "445": "▁They", "446": "▁three", "447": "ign", "448": "ild", "449": "▁said", "450": "ough", "451": "ang", "452": "▁too", "453": "ade", "454": "▁bl", "455": "ens", "456": "▁inc", "457": "ia", "458": "▁those", "459": "▁mo", "460": "▁take", "461": "▁through", "462": "▁fl", "463": "▁kind", "464": "▁things", "465": "▁bet", "466": "▁only", "467": "▁St", "468": "▁let", "469": "cess", "470": "▁Ch", "471": "ary", "472": "vel", "473": "▁If", "474": "xt", "475": "other", "476": "av", "477": "ical", "478": "ord", "479": "▁again", "480": "▁something", "481": "onna", "482": "fore", "483": "▁may", "484": "ting", "485": "▁bu", "486": "▁differe", "487": "urn", "488": "▁gonna", "489": "▁does", "490": "uct", "491": "og", "492": "▁twenty", "493": "▁gr", "494": "▁Ye", "495": "wn", "496": "▁should", "497": "▁comm", "498": "ition", "499": "▁under", "500": "▁hel", "501": "ory", "502": "▁fo", "503": "▁use", "504": "igh", "505": "ife", "506": "▁actually", "507": "▁tal", "508": "▁call", "509": "ents", "510": "ious", "511": "ull", "512": "▁There", "513": "▁Yeah", "514": "▁most", "515": "▁ke", "516": "ors", "517": "ved", "518": "ys", "519": "▁sc", "520": "▁happ", "521": "ope", "522": "▁help", "523": "atch", "524": "▁What", "525": "▁rem", "526": "ple", "527": "▁Now", "528": "▁br", "529": "ool", "530": "oth", "531": "▁four", "532": "self", "533": "▁str", "534": "ne", "535": "thing", "536": "▁put", "537": "ial", "538": "▁great", "539": "ail", "540": "ub", "541": "ning", "542": "▁sm", "543": "▁feel", "544": "▁five", "545": "ody", "546": "undred", "547": "iss", "548": "ank", "549": "get", "550": "aking", "551": "▁many", "552": "▁hundred", "553": "▁years", "554": "▁being", "555": "▁come", "556": "▁mean", "557": "ily", "558": "▁different", "559": "▁after", "560": 
"▁ser", "561": "▁show", "562": "form", "563": "ful", "564": "oy", "565": "▁six", "566": "▁vide", "567": "▁V", "568": "▁its", "569": "▁point", "570": "▁day", "571": "▁des", "572": "ons", "573": "▁bit", "574": "▁bel", "575": "▁before", "576": "▁aw", "577": "▁end", "578": "▁Oh", "579": "▁still", "580": "ath", "581": "▁long", "582": "▁'", "583": "ise", "584": "ob", "585": "day", "586": "▁add", "587": "ft", "588": "ves", "589": "ces", "590": "ady", "591": "▁cr", "592": "▁around", "593": "▁try", "594": "les", "595": "vers", "596": "kay", "597": "ian", "598": "ates", "599": "▁find", "600": "ward", "601": "▁As", "602": "▁eight", "603": "lic", "604": "▁same", "605": "▁pos", "606": "▁em", "607": "▁made", "608": "▁supp", "609": "▁life", "610": "▁Be", "611": "pect", "612": "▁dec", "613": "▁play", "614": "ange", "615": "▁att", "616": "▁pers", "617": "ways", "618": "▁high", "619": "▁hand", "620": "▁next", "621": "▁cons", "622": "▁own", "623": "▁inv", "624": "ower", "625": "▁ind", "626": "ert", "627": "ng", "628": "ave", "629": "▁year", "630": "▁big", "631": "ating", "632": "▁world", "633": "▁rel", "634": "▁sure", "635": "▁tra", "636": "ew", "637": "ered", "638": "▁fin", "639": "▁Well", "640": "▁sl", "641": "▁doing", "642": "bs", "643": "▁set", "644": "▁rec", "645": "ual", "646": "cial", "647": "▁ph", "648": "erm", "649": "▁love", "650": "ph", "651": "▁real", "652": "▁last", "653": "ict", "654": "▁bo", "655": "▁ra", "656": "ible", "657": "▁wr", "658": "mer", "659": "▁count", "660": "ities", "661": "▁always", "662": "inet", "663": "ments", "664": "uc", "665": "▁might", "666": "▁inter", "667": "▁video", "668": "gin", "669": "▁tell", "670": "▁never", "671": "vent", "672": "▁import", "673": "ied", "674": "▁sy", "675": "▁How", "676": "ically", "677": "ought", "678": "▁thir", "679": "▁rep", "680": "ks", "681": "ib", "682": "▁fam", "683": "ject", "684": "▁bas", "685": "▁She", "686": "▁give", "687": "akes", "688": "▁ninet", "689": "▁reg", "690": "▁min", "691": "▁op", "692": "▁def", 
"693": "▁didn", "694": "te", "695": "▁cour", "696": "▁why", "697": "▁ent", "698": "▁place", "699": "▁ins", "700": "▁car", "701": "ather", "702": "▁person", "703": "ular", "704": "▁inst", "705": "▁prod", "706": "lect", "707": "▁Al", "708": "▁today", "709": "▁bec", "710": "▁sur", "711": "▁All", "712": "▁another", "713": "▁bus", "714": "▁keep", "715": "ell", "716": "ese", "717": "riend", "718": "▁quest", "719": "▁talk", "720": "als", "721": "ings", "722": "▁mon", "723": "cond", "724": "old", "725": "▁acc", "726": "▁la", "727": "▁num", "728": "ident", "729": "▁che", "730": "iness", "731": "▁turn", "732": "▁ear", "733": "▁No", "734": "ousand", "735": "▁better", "736": "ific", "737": "▁loo", "738": "▁gl", "739": "oc", "740": "▁important", "741": "ited", "742": "▁An", "743": "▁thousand", "744": "ility", "745": "llow", "746": "▁used", "747": "▁gen", "748": "▁sim", "749": "li", "750": "▁happen", "751": "▁Un", "752": "▁Let", "753": "air", "754": "ock", "755": "ably", "756": "gg", "757": "▁watch", "758": "▁For", "759": "▁sw", "760": "ren", "761": "ute", "762": "ever", "763": "▁pol", "764": "▁sch", "765": "▁When", "766": "▁such", "767": "▁fif", "768": "▁home", "769": "▁cle", "770": "▁contin", "771": "ouse", "772": "▁friend", "773": "uring", "774": "▁Okay", "775": "gr", "776": "▁able", "777": "▁stud", "778": "▁eff", "779": "hip", "780": "body", "781": "▁top", "782": "ness", "783": "▁exper", "784": "▁pret", "785": "▁both", "786": "▁done", "787": "cri", "788": "▁mark", "789": "▁while", "790": "▁old", "791": "ros", "792": "ont", "793": "▁second", "794": "ative", "795": "▁thought", "796": "▁best", "797": "▁found", "798": "iew", "799": "▁belie", "800": "▁each", "801": "erest", "802": "▁tri", "803": "▁eas", "804": "▁ca", "805": "▁fact", "806": "▁care", "807": "▁fun", "808": "atter", "809": "ures", "810": "▁head", "811": "▁lear", "812": "▁water", "813": "▁hard", "814": "▁few", "815": "▁side", "816": "ween", "817": "▁exp", "818": "▁away", "819": "its", "820": "▁ext", "821": "lud", 
"822": "▁run", "823": "▁trans", "824": "ince", "825": "▁sk", "826": "▁open", "827": "cus", "828": "▁between", "829": "▁called", "830": "▁wee", "831": "▁pretty", "832": "ason", "833": "▁far", "834": "ember", "835": "omm", "836": "▁interest", "837": "any", "838": "ner", "839": "uff", "840": "▁pres", "841": "▁cur", "842": "▁child", "843": "ee", "844": "▁toget", "845": "▁together", "846": "olog", "847": "▁God", "848": "ond", "849": "▁char", "850": "▁looking", "851": "stem", "852": "az", "853": "cent", "854": "▁ob", "855": "▁ass", "856": "land", "857": "▁doesn", "858": "▁business", "859": "▁course", "860": "▁ten", "861": "ps", "862": "arch", "863": "ced", "864": "ms", "865": "ize", "866": "nce", "867": "▁ref", "868": "▁name", "869": "ross", "870": "▁grow", "871": "oney", "872": "▁went", "873": "ics", "874": "teen", "875": "▁cou", "876": "▁prob", "877": "▁ret", "878": "▁guys", "879": "▁came", "880": "ash", "881": "led", "882": "▁Eur", "883": "ues", "884": "▁ide", "885": "gan", "886": "▁everything", "887": "▁getting", "888": "▁ask", "889": "▁cor", "890": "▁build", "891": "▁sign", "892": "▁small", "893": "uck", "894": "▁el", "895": "▁col", "896": "▁Is", "897": "ational", "898": "stand", "899": "cy", "900": "▁conf", "901": "der", "902": "▁bre", "903": "▁cap", "904": "▁mod", "905": "ets", "906": "ike", "907": "▁number", "908": "▁comple", "909": "ertain", "910": "▁ever", "911": "▁coll", "912": "▁hum", "913": "▁Europe", "914": "▁cre", "915": "▁met", "916": "▁exam", "917": "▁move", "918": "▁pass", "919": "▁left", "920": "▁system", "921": "▁includ", "922": "▁Thank", "923": "cept", "924": "▁wom", "925": "▁product", "926": "ten", "927": "▁rest", "928": "▁probably", "929": "▁dri", "930": "▁Do", "931": "▁gener", "932": "▁anything", "933": "▁lar", "934": "▁My", "935": "▁school", "936": "▁lead", "937": "▁sub", "938": "▁ty", "939": "▁plan", "940": "▁seem", "941": "▁whole", "942": "irect", "943": "▁light", "944": "▁must", "945": "▁mom", "946": "▁opp", "947": "▁support", "948": 
"▁family", "949": "ices", "950": "amp", "951": "▁proble", "952": "▁dr", "953": "ready", "954": "▁using", "955": "ense", "956": "▁prov", "957": "ush", "958": "ax", "959": "▁power", "960": "▁Re", "961": "alth", "962": "▁ev", "963": "▁stand", "964": "��war", "965": "ts", "966": "▁", "967": "e", "968": "t", "969": "o", "970": "a", "971": "n", "972": "i", "973": "s", "974": "r", "975": "h", "976": "l", "977": "d", "978": "u", "979": "c", "980": "m", "981": "y", "982": "g", "983": "w", "984": "f", "985": "p", "986": ".", "987": "b", "988": ",", "989": "v", "990": "k", "991": "'", "992": "I", "993": "T", "994": "A", "995": "S", "996": "x", "997": "W", "998": "j", "999": "B", "1000": "C", "1001": "H", "1002": "?", "1003": "M", "1004": "O", "1005": "Y", "1006": "N", "1007": "P", "1008": "E", "1009": "q", "1010": "L", "1011": "D", "1012": "z", "1013": "G", "1014": "F", "1015": "R", "1016": "!", "1017": "J", "1018": "U", "1019": "K", "1020": "V", "1021": "Q", "1022": "Z", "1023": "X"}
|
convert/parakeet-tdt-ctc-110m/coreml/convert-parakeet.py
ADDED
|
@@ -0,0 +1,697 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""CLI for exporting Parakeet TDT-CTC 110M Hybrid components to CoreML."""
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
from dataclasses import asdict
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Dict, Optional, Tuple
|
| 9 |
+
|
| 10 |
+
import coremltools as ct
|
| 11 |
+
import numpy as np
|
| 12 |
+
import soundfile as sf
|
| 13 |
+
import torch
|
| 14 |
+
import typer
|
| 15 |
+
|
| 16 |
+
import nemo.collections.asr as nemo_asr
|
| 17 |
+
|
| 18 |
+
from individual_components import (
|
| 19 |
+
CTCHeadWrapper,
|
| 20 |
+
DecoderWrapper,
|
| 21 |
+
EncoderWrapper,
|
| 22 |
+
ExportSettings,
|
| 23 |
+
JointWrapper,
|
| 24 |
+
JointDecisionWrapper,
|
| 25 |
+
JointDecisionSingleStep,
|
| 26 |
+
PreprocessorWrapper,
|
| 27 |
+
MelEncoderWrapper,
|
| 28 |
+
_coreml_convert,
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
DEFAULT_MODEL_ID = "nvidia/parakeet-tdt_ctc-110m"
|
| 32 |
+
AUTHOR = "Fluid Inference"
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def _compute_length(seconds: float, sample_rate: int) -> int:
|
| 36 |
+
return int(round(seconds * sample_rate))
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def _prepare_audio(
|
| 40 |
+
validation_audio: Optional[Path],
|
| 41 |
+
sample_rate: int,
|
| 42 |
+
max_samples: int,
|
| 43 |
+
seed: Optional[int],
|
| 44 |
+
) -> torch.Tensor:
|
| 45 |
+
if validation_audio is None:
|
| 46 |
+
if seed is not None:
|
| 47 |
+
torch.manual_seed(seed)
|
| 48 |
+
audio = torch.randn(1, max_samples, dtype=torch.float32)
|
| 49 |
+
return audio
|
| 50 |
+
|
| 51 |
+
data, sr = sf.read(str(validation_audio), dtype="float32")
|
| 52 |
+
if sr != sample_rate:
|
| 53 |
+
raise typer.BadParameter(
|
| 54 |
+
f"Validation audio sample rate {sr} does not match model rate {sample_rate}"
|
| 55 |
+
)
|
| 56 |
+
|
| 57 |
+
if data.ndim > 1:
|
| 58 |
+
data = data[:, 0]
|
| 59 |
+
|
| 60 |
+
if data.size == 0:
|
| 61 |
+
raise typer.BadParameter("Validation audio is empty")
|
| 62 |
+
|
| 63 |
+
if data.size < max_samples:
|
| 64 |
+
pad_width = max_samples - data.size
|
| 65 |
+
data = np.pad(data, (0, pad_width))
|
| 66 |
+
elif data.size > max_samples:
|
| 67 |
+
data = data[:max_samples]
|
| 68 |
+
|
| 69 |
+
audio = torch.from_numpy(data).unsqueeze(0).to(dtype=torch.float32)
|
| 70 |
+
return audio
|
| 71 |
+
|
| 72 |
+
|
| 73 |
+
def _save_mlpackage(model: ct.models.MLModel, path: Path, description: str) -> None:
    """Stamp metadata on a converted CoreML model and persist it as an .mlpackage."""
    model.short_description = description
    model.author = AUTHOR
    try:
        # Best effort: request iOS 17+ so MLProgram ops and ANE kernels apply.
        model.minimum_deployment_target = ct.target.iOS17
    except Exception:
        pass  # older coremltools builds may not expose this attribute
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    model.save(str(destination))
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def _tensor_shape(tensor: torch.Tensor) -> Tuple[int, ...]:
|
| 86 |
+
return tuple(int(dim) for dim in tensor.shape)
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def _parse_compute_units(name: str) -> ct.ComputeUnit:
    """Map a human-friendly compute-units string onto ``ct.ComputeUnit``.

    Accepted (case-insensitive): ALL, CPU_ONLY, CPU_AND_GPU, CPU_AND_NE
    (CPU_AND_NEURALENGINE is accepted as an alias).
    """
    mapping = {
        "ALL": ct.ComputeUnit.ALL,
        "CPU_ONLY": ct.ComputeUnit.CPU_ONLY,
        "CPU_AND_GPU": ct.ComputeUnit.CPU_AND_GPU,
        "CPU_AND_NE": ct.ComputeUnit.CPU_AND_NE,
        "CPU_AND_NEURALENGINE": ct.ComputeUnit.CPU_AND_NE,
    }
    key = str(name).strip().upper()
    if key in mapping:
        return mapping[key]
    raise typer.BadParameter(
        f"Unknown compute units '{name}'. Choose from: " + ", ".join(mapping.keys())
    )
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def _parse_compute_precision(name: Optional[str]) -> Optional[ct.precision]:
    """Translate a precision string into ``ct.precision``.

    Accepted (case-insensitive): FLOAT32, FLOAT16. ``None`` or an
    empty/whitespace string yields ``None`` (i.e. the tool's default).
    """
    if name is None:
        return None
    normalized = str(name).strip().upper()
    if not normalized:
        return None
    mapping = {
        "FLOAT32": ct.precision.FLOAT32,
        "FLOAT16": ct.precision.FLOAT16,
    }
    if normalized in mapping:
        return mapping[normalized]
    raise typer.BadParameter(
        f"Unknown compute precision '{name}'. Choose from: " + ", ".join(mapping.keys())
    )
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
# Fixed export choices: CPU_ONLY + FP32, min target iOS17
|
| 131 |
+
|
| 132 |
+
|
| 133 |
+
app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False)
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
@app.command()
def convert(
    nemo_path: Optional[Path] = typer.Option(
        None,
        "--nemo-path",
        exists=True,
        resolve_path=True,
        help="Path to parakeet-tdt_ctc-110m .nemo checkpoint (skip to auto-download)",
    ),
    model_id: str = typer.Option(
        DEFAULT_MODEL_ID,
        "--model-id",
        help="Model identifier to download when --nemo-path is omitted",
    ),
    output_dir: Path = typer.Option(Path("parakeet_110m_coreml"), help="Directory where mlpackages and metadata will be written"),
    preprocessor_cu: str = typer.Option(
        "CPU_ONLY",
        "--preprocessor-cu",
        help="Compute units for preprocessor (default CPU_ONLY)",
    ),
    mel_encoder_cu: str = typer.Option(
        "CPU_ONLY",
        "--mel-encoder-cu",
        help="Compute units for fused mel+encoder (default CPU_ONLY)",
    ),
    compute_precision: Optional[str] = typer.Option(
        None,
        "--compute-precision",
        help="Export precision: FLOAT32 (default) or FLOAT16 to shrink non-quantized weights.",
    ),
) -> None:
    """Export all Parakeet TDT-CTC 110M Hybrid sub-modules to CoreML with a fixed 15-second window.

    This exports both CTC and TDT components from the hybrid model.

    Pipeline: load/download the NeMo hybrid checkpoint, run a reference forward
    pass to capture tensor shapes, torch.jit.trace each wrapper, convert each
    trace to a .mlpackage, then write vocab.json and metadata.json describing
    every exported component's input/output shapes.
    """
    # Runtime CoreML contract keeps U=1 so the prediction net matches the streaming decoder.
    export_settings = ExportSettings(
        output_dir=output_dir,
        compute_units=ct.ComputeUnit.CPU_ONLY,  # Default: CPU-only for all components
        deployment_target=ct.target.iOS17,  # iOS 17+ features and kernels
        compute_precision=_parse_compute_precision(compute_precision),
        max_audio_seconds=15.0,
        max_symbol_steps=1,
    )

    typer.echo("Export configuration:")
    typer.echo(asdict(export_settings))

    output_dir.mkdir(parents=True, exist_ok=True)
    pre_cu = _parse_compute_units(preprocessor_cu)
    melenc_cu = _parse_compute_units(mel_encoder_cu)

    if nemo_path is not None:
        typer.echo(f"Loading NeMo model from {nemo_path}…")
        # 110M is a hybrid model: EncDecHybridRNNTCTCBPEModel
        asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(
            str(nemo_path), map_location="cpu"
        )
        checkpoint_meta = {
            "type": "file",
            "path": str(nemo_path),
        }
    else:
        typer.echo(f"Downloading NeMo model via {model_id}…")
        # 110M is a hybrid model: EncDecHybridRNNTCTCBPEModel
        asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
            model_id, map_location="cpu"
        )
        checkpoint_meta = {
            "type": "pretrained",
            "model_id": model_id,
        }
    asr_model.eval()

    sample_rate = int(asr_model.cfg.preprocessor.sample_rate)
    max_samples = _compute_length(export_settings.max_audio_seconds, sample_rate)

    # Look for a bundled 15s 16kHz audio file
    default_audio = (Path(__file__).parent / "audio" / "yc_first_minute_16k_15s.wav").resolve()
    if default_audio.exists():
        typer.echo(f"Using trace audio: {default_audio}")
        audio_tensor = _prepare_audio(default_audio, sample_rate, max_samples, seed=None)
    else:
        typer.echo("No trace audio found, using random noise for tracing")
        audio_tensor = _prepare_audio(None, sample_rate, max_samples, seed=42)
    audio_length = torch.tensor([max_samples], dtype=torch.int32)

    preprocessor = PreprocessorWrapper(asr_model.preprocessor.eval())
    encoder = EncoderWrapper(asr_model.encoder.eval())
    decoder = DecoderWrapper(asr_model.decoder.eval())
    joint = JointWrapper(asr_model.joint.eval())
    # CTC head for hybrid model
    ctc_head = CTCHeadWrapper(asr_model.ctc_decoder.eval())

    # Flip the decoder into RNNT export mode for tracing; restored in `finally`.
    decoder_export_flag = getattr(asr_model.decoder, "_rnnt_export", False)
    asr_model.decoder._rnnt_export = True

    try:
        with torch.inference_mode():
            mel_ref, mel_length_ref = preprocessor(audio_tensor, audio_length)
            mel_length_ref = mel_length_ref.to(dtype=torch.int32)
            encoder_ref, encoder_length_ref = encoder(mel_ref, mel_length_ref)
            encoder_length_ref = encoder_length_ref.to(dtype=torch.int32)
            # CTC log probs
            ctc_log_probs_ref = ctc_head(encoder_ref)

        # Clone Tensors to drop the inference tensor flag before tracing
        mel_ref = mel_ref.clone()
        mel_length_ref = mel_length_ref.clone()
        encoder_ref = encoder_ref.clone()
        encoder_length_ref = encoder_length_ref.clone()
        ctc_log_probs_ref = ctc_log_probs_ref.clone()

        vocab_size = int(asr_model.tokenizer.vocab_size)
        num_extra = int(asr_model.joint.num_extra_outputs)
        decoder_hidden = int(asr_model.decoder.pred_hidden)
        decoder_layers = int(asr_model.decoder.pred_rnn_layers)

        typer.echo(f"Model info:")
        typer.echo(f"  Vocab size: {vocab_size}")
        typer.echo(f"  Num extra (duration bins): {num_extra}")
        typer.echo(f"  Decoder hidden: {decoder_hidden}")
        typer.echo(f"  Decoder layers: {decoder_layers}")
        typer.echo(f"  Encoder output shape: {_tensor_shape(encoder_ref)}")

        # Dummy decoder inputs: a single blank token with zeroed LSTM state,
        # matching the U=1 streaming contract above.
        targets = torch.full(
            (1, export_settings.max_symbol_steps),
            fill_value=asr_model.decoder.blank_idx,
            dtype=torch.int32,
        )
        target_lengths = torch.tensor(
            [export_settings.max_symbol_steps], dtype=torch.int32
        )
        zero_state = torch.zeros(
            decoder_layers,
            1,
            decoder_hidden,
            dtype=torch.float32,
        )

        with torch.inference_mode():
            decoder_ref, h_ref, c_ref = decoder(targets, target_lengths, zero_state, zero_state)
            joint_ref = joint(encoder_ref, decoder_ref)

        decoder_ref = decoder_ref.clone()
        h_ref = h_ref.clone()
        c_ref = c_ref.clone()
        joint_ref = joint_ref.clone()

        typer.echo(f"  Decoder output shape: {_tensor_shape(decoder_ref)}")
        typer.echo(f"  Joint output shape: {_tensor_shape(joint_ref)}")
        typer.echo(f"  CTC log probs shape: {_tensor_shape(ctc_log_probs_ref)}")

        typer.echo("Tracing and converting preprocessor…")
        # Ensure tracing happens on CPU explicitly
        preprocessor = preprocessor.cpu()
        audio_tensor = audio_tensor.cpu()
        audio_length = audio_length.cpu()
        traced_preprocessor = torch.jit.trace(
            preprocessor, (audio_tensor, audio_length), strict=False
        )
        traced_preprocessor.eval()
        preprocessor_inputs = [
            # Allow variable-length audio up to the fixed 15s window using RangeDim
            ct.TensorType(
                name="audio",
                shape=(1, ct.RangeDim(1, max_samples)),
                dtype=np.float32,
            ),
            ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
        ]
        preprocessor_outputs = [
            ct.TensorType(name="mel_features", dtype=np.float32),
            ct.TensorType(name="mel_length", dtype=np.int32),
        ]
        # Preprocessor compute units (parametrized; default CPU_ONLY)
        preprocessor_model = _coreml_convert(
            traced_preprocessor,
            preprocessor_inputs,
            preprocessor_outputs,
            export_settings,
            compute_units_override=pre_cu,
        )
        preprocessor_path = output_dir / "parakeet_preprocessor.mlpackage"
        _save_mlpackage(
            preprocessor_model,
            preprocessor_path,
            "Parakeet 110M preprocessor (15 s window)",
        )

        typer.echo("Tracing and converting encoder…")
        traced_encoder = torch.jit.trace(
            encoder, (mel_ref, mel_length_ref), strict=False
        )
        traced_encoder.eval()
        encoder_inputs = [
            ct.TensorType(name="mel_features", shape=_tensor_shape(mel_ref), dtype=np.float32),
            ct.TensorType(name="mel_length", shape=(1,), dtype=np.int32),
        ]
        encoder_outputs = [
            ct.TensorType(name="encoder_output", dtype=np.float32),
            ct.TensorType(name="encoder_length", dtype=np.int32),
        ]
        # Encoder: CPU only
        encoder_model = _coreml_convert(
            traced_encoder,
            encoder_inputs,
            encoder_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        encoder_path = output_dir / "parakeet_encoder.mlpackage"
        _save_mlpackage(
            encoder_model,
            encoder_path,
            "Parakeet 110M encoder (15 s window)",
        )

        # CTC Head for hybrid model
        typer.echo("Tracing and converting CTC head…")
        traced_ctc_head = torch.jit.trace(
            ctc_head, (encoder_ref,), strict=False
        )
        traced_ctc_head.eval()
        ctc_head_inputs = [
            ct.TensorType(name="encoder_output", shape=_tensor_shape(encoder_ref), dtype=np.float32),
        ]
        ctc_head_outputs = [
            ct.TensorType(name="ctc_logits", dtype=np.float32),
        ]
        ctc_head_model = _coreml_convert(
            traced_ctc_head,
            ctc_head_inputs,
            ctc_head_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        ctc_head_path = output_dir / "parakeet_ctc_head.mlpackage"
        _save_mlpackage(
            ctc_head_model,
            ctc_head_path,
            "Parakeet 110M CTC decoder head",
        )

        # Optional fused export: Preprocessor + Encoder
        typer.echo("Tracing and converting fused mel+encoder…")
        mel_encoder = MelEncoderWrapper(preprocessor, encoder)
        traced_mel_encoder = torch.jit.trace(
            mel_encoder, (audio_tensor, audio_length), strict=False
        )
        traced_mel_encoder.eval()
        mel_encoder_inputs = [
            # Keep fixed 15s window for fused Mel+Encoder
            ct.TensorType(name="audio", shape=(1, max_samples), dtype=np.float32),
            ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
        ]
        mel_encoder_outputs = [
            ct.TensorType(name="encoder_output", dtype=np.float32),
            ct.TensorType(name="encoder_length", dtype=np.int32),
        ]
        # Fused mel+encoder compute units (parametrized; default CPU_ONLY)
        mel_encoder_model = _coreml_convert(
            traced_mel_encoder,
            mel_encoder_inputs,
            mel_encoder_outputs,
            export_settings,
            compute_units_override=melenc_cu,
        )
        mel_encoder_path = output_dir / "parakeet_mel_encoder.mlpackage"
        _save_mlpackage(
            mel_encoder_model,
            mel_encoder_path,
            "Parakeet 110M fused Mel+Encoder (15 s window)",
        )

        typer.echo("Tracing and converting decoder…")
        traced_decoder = torch.jit.trace(
            decoder,
            (targets, target_lengths, zero_state, zero_state),
            strict=False,
        )
        traced_decoder.eval()
        decoder_inputs = [
            ct.TensorType(name="targets", shape=_tensor_shape(targets), dtype=np.int32),
            ct.TensorType(name="target_length", shape=(1,), dtype=np.int32),
            ct.TensorType(name="h_in", shape=_tensor_shape(zero_state), dtype=np.float32),
            ct.TensorType(name="c_in", shape=_tensor_shape(zero_state), dtype=np.float32),
        ]
        decoder_outputs = [
            ct.TensorType(name="decoder", dtype=np.float32),
            ct.TensorType(name="h_out", dtype=np.float32),
            ct.TensorType(name="c_out", dtype=np.float32),
        ]
        # Decoder: CPU only
        decoder_model = _coreml_convert(
            traced_decoder,
            decoder_inputs,
            decoder_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        decoder_path = output_dir / "parakeet_decoder.mlpackage"
        _save_mlpackage(
            decoder_model,
            decoder_path,
            "Parakeet 110M decoder (RNNT prediction network)",
        )

        typer.echo("Tracing and converting joint…")
        traced_joint = torch.jit.trace(
            joint,
            (encoder_ref, decoder_ref),
            strict=False,
        )
        traced_joint.eval()
        joint_inputs = [
            ct.TensorType(name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32),
            ct.TensorType(name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32),
        ]
        joint_outputs = [
            ct.TensorType(name="logits", dtype=np.float32),
        ]
        # Joint: CPU only
        joint_model = _coreml_convert(
            traced_joint,
            joint_inputs,
            joint_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        joint_path = output_dir / "parakeet_joint.mlpackage"
        _save_mlpackage(
            joint_model,
            joint_path,
            "Parakeet 110M joint network (RNNT)",
        )

        # Joint + decision head (split logits, softmax, argmax)
        typer.echo("Tracing and converting joint decision head…")
        vocab_size = int(asr_model.tokenizer.vocab_size)
        num_extra = int(asr_model.joint.num_extra_outputs)
        joint_decision = JointDecisionWrapper(joint, vocab_size=vocab_size, num_extra=num_extra)
        traced_joint_decision = torch.jit.trace(
            joint_decision,
            (encoder_ref, decoder_ref),
            strict=False,
        )
        traced_joint_decision.eval()
        joint_decision_inputs = [
            ct.TensorType(name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32),
            ct.TensorType(name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32),
        ]
        joint_decision_outputs = [
            ct.TensorType(name="token_id", dtype=np.int32),
            ct.TensorType(name="token_prob", dtype=np.float32),
            ct.TensorType(name="duration", dtype=np.int32),
        ]
        # JointDecision: CPU only
        joint_decision_model = _coreml_convert(
            traced_joint_decision,
            joint_decision_inputs,
            joint_decision_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        joint_decision_path = output_dir / "parakeet_joint_decision.mlpackage"
        _save_mlpackage(
            joint_decision_model,
            joint_decision_path,
            "Parakeet 110M joint + decision head (split, softmax, argmax)",
        )

        # Single-step JointDecision for [1,512,1] x [1,640,1] -> [1,1,1]
        # Note: 110M encoder dim is 512 (not 1024 like 0.6B)
        typer.echo("Tracing and converting single-step joint decision…")
        jd_single = JointDecisionSingleStep(joint, vocab_size=vocab_size, num_extra=num_extra)
        # Create single-step slices from refs
        enc_step = encoder_ref[:, :, :1].contiguous()
        dec_step = decoder_ref[:, :, :1].contiguous()
        traced_jd_single = torch.jit.trace(
            jd_single,
            (enc_step, dec_step),
            strict=False,
        )
        traced_jd_single.eval()
        jd_single_inputs = [
            ct.TensorType(name="encoder_step", shape=(1, enc_step.shape[1], 1), dtype=np.float32),
            ct.TensorType(name="decoder_step", shape=(1, dec_step.shape[1], 1), dtype=np.float32),
        ]
        jd_single_outputs = [
            ct.TensorType(name="token_id", dtype=np.int32),
            ct.TensorType(name="token_prob", dtype=np.float32),
            ct.TensorType(name="duration", dtype=np.int32),
            ct.TensorType(name="top_k_ids", dtype=np.int32),
            ct.TensorType(name="top_k_logits", dtype=np.float32),
        ]
        # Single-step JointDecision: CPU only
        jd_single_model = _coreml_convert(
            traced_jd_single,
            jd_single_inputs,
            jd_single_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        jd_single_path = output_dir / "parakeet_joint_decision_single_step.mlpackage"
        _save_mlpackage(
            jd_single_model,
            jd_single_path,
            "Parakeet 110M single-step joint decision (current frame)",
        )

        # Export vocabulary
        typer.echo("Exporting vocabulary…")
        vocab_path = output_dir / "vocab.json"
        vocab_dict = {
            "vocab_size": vocab_size,
            "blank_id": int(asr_model.decoder.blank_idx),
            "tokens": asr_model.tokenizer.vocab,
        }
        vocab_path.write_text(json.dumps(vocab_dict, indent=2, ensure_ascii=False))

        # Machine-readable manifest of every exported component and its shapes.
        metadata: Dict[str, object] = {
            "model_id": model_id,
            "model_type": "hybrid_rnnt_ctc",
            "sample_rate": sample_rate,
            "max_audio_seconds": export_settings.max_audio_seconds,
            "max_audio_samples": max_samples,
            "max_symbol_steps": export_settings.max_symbol_steps,
            "vocab_size": vocab_size,
            "joint_extra_outputs": num_extra,
            "encoder_dim": int(encoder_ref.shape[1]),  # 512 for 110M
            "decoder_dim": int(decoder_ref.shape[1]),  # 640
            "decoder_hidden": decoder_hidden,
            "decoder_layers": decoder_layers,
            "blank_id": int(asr_model.decoder.blank_idx),
            "checkpoint": checkpoint_meta,
            "coreml": {
                "compute_units": export_settings.compute_units.name,
                "compute_precision": (
                    export_settings.compute_precision.name
                    if export_settings.compute_precision is not None
                    else "FLOAT32"
                ),
            },
            "components": {
                "preprocessor": {
                    "inputs": {
                        "audio_signal": list(_tensor_shape(audio_tensor)),
                        "audio_length": [1],
                    },
                    "outputs": {
                        "mel": list(_tensor_shape(mel_ref)),
                        "mel_length": [1],
                    },
                    "path": preprocessor_path.name,
                },
                "encoder": {
                    "inputs": {
                        "mel": list(_tensor_shape(mel_ref)),
                        "mel_length": [1],
                    },
                    "outputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "encoder_length": [1],
                    },
                    "path": encoder_path.name,
                },
                "ctc_head": {
                    "inputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                    },
                    "outputs": {
                        "log_probs": list(_tensor_shape(ctc_log_probs_ref)),
                    },
                    "path": ctc_head_path.name,
                },
                "mel_encoder": {
                    "inputs": {
                        "audio_signal": [1, max_samples],
                        "audio_length": [1],
                    },
                    "outputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "encoder_length": [1],
                    },
                    "path": mel_encoder_path.name,
                },
                "decoder": {
                    "inputs": {
                        "targets": list(_tensor_shape(targets)),
                        "target_length": [1],
                        "h_in": list(_tensor_shape(zero_state)),
                        "c_in": list(_tensor_shape(zero_state)),
                    },
                    "outputs": {
                        "decoder": list(_tensor_shape(decoder_ref)),
                        "h_out": list(_tensor_shape(h_ref)),
                        "c_out": list(_tensor_shape(c_ref)),
                    },
                    "path": decoder_path.name,
                },
                "joint": {
                    "inputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "decoder": list(_tensor_shape(decoder_ref)),
                    },
                    "outputs": {
                        "logits": list(_tensor_shape(joint_ref)),
                    },
                    "path": joint_path.name,
                },
                "joint_decision": {
                    "inputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "decoder": list(_tensor_shape(decoder_ref)),
                    },
                    "outputs": {
                        "token_id": [
                            _tensor_shape(encoder_ref)[0],
                            _tensor_shape(encoder_ref)[2],
                            _tensor_shape(decoder_ref)[2],
                        ],
                        "token_prob": [
                            _tensor_shape(encoder_ref)[0],
                            _tensor_shape(encoder_ref)[2],
                            _tensor_shape(decoder_ref)[2],
                        ],
                        "duration": [
                            _tensor_shape(encoder_ref)[0],
                            _tensor_shape(encoder_ref)[2],
                            _tensor_shape(decoder_ref)[2],
                        ],
                    },
                    "path": joint_decision_path.name,
                },
                "joint_decision_single_step": {
                    "inputs": {
                        "encoder_step": [1, int(encoder_ref.shape[1]), 1],
                        "decoder_step": [1, int(decoder_ref.shape[1]), 1],
                    },
                    "outputs": {
                        "token_id": [1, 1, 1],
                        "token_prob": [1, 1, 1],
                        "duration": [1, 1, 1],
                        "top_k_ids": [1, 1, 1, 64],
                        "top_k_logits": [1, 1, 1, 64],
                    },
                    "path": jd_single_path.name,
                },
            },
        }

        metadata_path = output_dir / "metadata.json"
        metadata_path.write_text(json.dumps(metadata, indent=2))
        typer.echo(f"Export complete. Metadata written to {metadata_path}")

    finally:
        # Always restore the decoder's original export flag, even on failure.
        asr_model.decoder._rnnt_export = decoder_export_flag
|
| 694 |
+
|
| 695 |
+
|
| 696 |
+
# Run the Typer CLI when executed as a script.
if __name__ == "__main__":
    app()
|
convert/parakeet-tdt-ctc-110m/coreml/hybrid_earnings_benchmark.json
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"approach" : "single-encoder",
|
| 3 |
+
"model" : "parakeet-tdt-ctc-110m-hybrid",
|
| 4 |
+
"results" : [
|
| 5 |
+
{
|
| 6 |
+
"audioLength" : 15,
|
| 7 |
+
"ctcDetections" : [
|
| 8 |
+
{
|
| 9 |
+
"endTime" : 6.0800000000000001,
|
| 10 |
+
"inReference" : true,
|
| 11 |
+
"score" : -8.3699999999999992,
|
| 12 |
+
"source" : "ctc",
|
| 13 |
+
"startTime" : 4.96,
|
| 14 |
+
"word" : "LATAM Airlines"
|
| 15 |
+
}
|
| 16 |
+
],
|
| 17 |
+
"dictFound" : 1,
|
| 18 |
+
"dictTotal" : 1,
|
| 19 |
+
"fileId" : "4329526_chunk0",
|
| 20 |
+
"hypothesis" : "goodday everyone and welcome to latam airlines group earnings release confonference call just as a reminder this conference is being recorded lat tam airlines group eararnings releaseed for the",
|
| 21 |
+
"processingTime" : 0.070000000000000007,
|
| 22 |
+
"reference" : "good day everyone and welcome to latam airlines group earnings release conference call just as a reminder this conference is being recorded latam airlines group earnings released for the",
|
| 23 |
+
"wer" : 24.140000000000001
|
| 24 |
+
}
|
| 25 |
+
],
|
| 26 |
+
"summary" : {
|
| 27 |
+
"avgWer" : 24.140000000000001,
|
| 28 |
+
"dictPass" : 1,
|
| 29 |
+
"dictRate" : 100,
|
| 30 |
+
"dictTotal" : 1,
|
| 31 |
+
"totalAudioDuration" : 15,
|
| 32 |
+
"totalProcessingTime" : 0.070000000000000007,
|
| 33 |
+
"totalTests" : 1
|
| 34 |
+
}
|
| 35 |
+
}
|
convert/parakeet-tdt-ctc-110m/coreml/individual_components.py
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Export Parakeet TDT-CTC 110M Hybrid RNNT components into CoreML and validate outputs."""
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from dataclasses import dataclass
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
from typing import Optional, Tuple
|
| 8 |
+
|
| 9 |
+
import coremltools as ct
|
| 10 |
+
import torch
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass
class ExportSettings:
    """Configuration for one CoreML export run."""

    # Directory where .mlpackage files and metadata are written.
    output_dir: Path
    # Default CoreML compute units recorded for the export.
    compute_units: ct.ComputeUnit
    # Minimum deployment target (e.g. ct.target.iOS17); None uses the tool default.
    # Fixed: annotate with the enum type `ct.target`, not the member `ct.target.iOS17`,
    # which is a value and not a valid type for Optional[...].
    deployment_target: Optional[ct.target]
    # Export precision (FLOAT32/FLOAT16); None uses the converter default.
    compute_precision: Optional[ct.precision]
    # Length of the fixed audio window, in seconds.
    max_audio_seconds: float
    # Number of symbol steps (U) traced for the prediction network.
    max_symbol_steps: int
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@dataclass
class ValidationSettings:
    """Options controlling validation of exported models."""

    # Audio file to validate with; None presumably falls back to synthetic input
    # (NOTE(review): confirm against the validation code path).
    audio_path: Optional[Path]
    # Duration of validation audio in seconds.
    seconds: float
    # RNG seed for generated audio; None for no fixed seed.
    seed: Optional[int]
    # Relative tolerance for output comparison.
    rtol: float
    # Absolute tolerance for output comparison.
    atol: float
    # When True, validation is skipped.
    skip: bool
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
@dataclass
class ValidationDiff:
    """Per-output difference summary between two model implementations."""

    # Name of the compared output tensor.
    name: str
    # Largest absolute elementwise difference observed.
    max_abs_diff: float
    # Largest relative elementwise difference observed.
    max_rel_diff: float
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@dataclass
class ValidationResult:
    """Summary of one validation run."""

    # Identifies the validation input origin (NOTE(review): presumably an audio
    # path or a synthetic-input label — confirm with the producer).
    source: str
    # Number of audio samples fed to the model.
    audio_num_samples: int
    # Duration of the validation audio in seconds.
    audio_seconds: float
    # Number of decoder target tokens used.
    token_length: int
    # Absolute tolerance used for comparisons.
    atol: float
    # Relative tolerance used for comparisons.
    rtol: float
    # Per-output difference summaries.
    diffs: Tuple[ValidationDiff, ...]
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
class PreprocessorWrapper(torch.nn.Module):
    """Adapts the NeMo preprocessor to a plain (audio, length) -> (mel, mel_length) call."""

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(self, audio_signal: torch.Tensor, length: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        # The wrapped module takes keyword arguments and an int64 length tensor.
        length_i64 = length.to(dtype=torch.long)
        features, feature_lengths = self.module(input_signal=audio_signal, length=length_i64)
        return features, feature_lengths
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
class EncoderWrapper(torch.nn.Module):
    """Adapts the NeMo encoder to a plain (features, length) -> (encoded, encoded_lengths) call."""

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(self, features: torch.Tensor, length: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        # The wrapped module takes keyword arguments and an int64 length tensor.
        length_i64 = length.to(dtype=torch.long)
        encoded, encoded_lengths = self.module(audio_signal=features, length=length_i64)
        return encoded, encoded_lengths
|
| 69 |
+
|
| 70 |
+
|
| 71 |
+
class DecoderWrapper(torch.nn.Module):
    """Adapts the RNNT prediction network to flat (targets, lengths, h, c) tensors."""

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(
        self,
        targets: torch.Tensor,
        target_lengths: torch.Tensor,
        h_in: torch.Tensor,
        c_in: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        # The wrapped module takes int64 ids/lengths plus a [h, c] state list,
        # and returns (output, <unused>, new_state).
        decoder_output, _, next_state = self.module(
            targets=targets.to(dtype=torch.long),
            target_length=target_lengths.to(dtype=torch.long),
            states=[h_in, c_in],
        )
        h_out, c_out = next_state[0], next_state[1]
        return decoder_output, h_out, c_out
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
class JointWrapper(torch.nn.Module):
    """Joint network for 110M hybrid model.

    Note: The 110M model has encoder_dim=512 and decoder_dim=640.
    The joint network projects both to 640, then combines them.
    """

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor) -> torch.Tensor:
        # Inputs arrive channels-first: encoder [B, D_enc, T], decoder [B, D_dec, U].
        # The projection layers expect channels-last, so move the feature axis.
        enc_btd = encoder_outputs.transpose(1, 2)  # [B, T, D_enc]
        dec_bud = decoder_outputs.transpose(1, 2)  # [B, U, D_dec]

        enc_proj = self.module.enc(enc_btd)    # [B, T, 640]
        dec_proj = self.module.pred(dec_bud)   # [B, U, 640]

        # Broadcast-add across T and U explicitly to avoid converter ambiguity.
        combined = enc_proj.unsqueeze(2) + dec_proj.unsqueeze(1)  # [B, T, U, 640]
        activated = self.module.joint_net[0](combined)  # ReLU
        dropped = self.module.joint_net[1](activated)   # Dropout (no-op in eval)
        return self.module.joint_net[2](dropped)        # Linear -> [B, T, U, vocab+1+durations]
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
class CTCHeadWrapper(torch.nn.Module):
    """CTC decoder head for 110M hybrid model.

    Takes encoder output and produces log probabilities over vocabulary.
    The NeMo CTC decoder (ConvASRDecoder) uses Conv1d so it expects [B, D, T] format.
    """

    def __init__(self, module: torch.nn.Module) -> None:
        super().__init__()
        self.module = module

    def forward(self, encoder_outputs: torch.Tensor) -> torch.Tensor:
        # encoder_outputs is already [B, D_enc, T], which the Conv1d-based CTC
        # decoder consumes directly; it emits log probabilities [B, T, vocab+1].
        return self.module(encoder_output=encoder_outputs)
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
class MelEncoderWrapper(torch.nn.Module):
|
| 140 |
+
"""Fused wrapper: waveform -> mel -> encoder.
|
| 141 |
+
|
| 142 |
+
Inputs:
|
| 143 |
+
- audio_signal: [B, S]
|
| 144 |
+
- audio_length: [B]
|
| 145 |
+
|
| 146 |
+
Outputs:
|
| 147 |
+
- encoder: [B, D, T_enc]
|
| 148 |
+
- encoder_length: [B]
|
| 149 |
+
"""
|
| 150 |
+
def __init__(self, preprocessor: PreprocessorWrapper, encoder: EncoderWrapper) -> None:
|
| 151 |
+
super().__init__()
|
| 152 |
+
self.preprocessor = preprocessor
|
| 153 |
+
self.encoder = encoder
|
| 154 |
+
|
| 155 |
+
def forward(self, audio_signal: torch.Tensor, audio_length: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
|
| 156 |
+
mel, mel_length = self.preprocessor(audio_signal, audio_length)
|
| 157 |
+
encoded, enc_len = self.encoder(mel, mel_length.to(dtype=torch.int32))
|
| 158 |
+
return encoded, enc_len
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
class JointDecisionWrapper(torch.nn.Module):
|
| 162 |
+
"""Joint + decision head: outputs label id, label prob, duration frames.
|
| 163 |
+
|
| 164 |
+
Splits joint logits into token logits and duration logits, applies softmax
|
| 165 |
+
over tokens, argmax for both heads, and gathers probability of the chosen token.
|
| 166 |
+
|
| 167 |
+
Inputs:
|
| 168 |
+
- encoder_outputs: [B, D, T]
|
| 169 |
+
- decoder_outputs: [B, D, U]
|
| 170 |
+
|
| 171 |
+
Returns:
|
| 172 |
+
- token_id: [B, T, U] int32
|
| 173 |
+
- token_prob: [B, T, U] float32
|
| 174 |
+
- duration: [B, T, U] int32 (frames; for v3 bins=[0,1,2,3,4])
|
| 175 |
+
"""
|
| 176 |
+
def __init__(self, joint: JointWrapper, vocab_size: int, num_extra: int) -> None:
|
| 177 |
+
super().__init__()
|
| 178 |
+
self.joint = joint
|
| 179 |
+
self.vocab_with_blank = int(vocab_size) + 1
|
| 180 |
+
self.num_extra = int(num_extra)
|
| 181 |
+
|
| 182 |
+
def forward(self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor):
|
| 183 |
+
logits = self.joint(encoder_outputs, decoder_outputs)
|
| 184 |
+
token_logits = logits[..., : self.vocab_with_blank]
|
| 185 |
+
duration_logits = logits[..., -self.num_extra :]
|
| 186 |
+
|
| 187 |
+
# Token selection
|
| 188 |
+
token_ids = torch.argmax(token_logits, dim=-1).to(dtype=torch.int32)
|
| 189 |
+
token_probs_all = torch.softmax(token_logits, dim=-1)
|
| 190 |
+
# gather expects int64 (long) indices; cast only for gather
|
| 191 |
+
token_prob = torch.gather(
|
| 192 |
+
token_probs_all, dim=-1, index=token_ids.long().unsqueeze(-1)
|
| 193 |
+
).squeeze(-1)
|
| 194 |
+
|
| 195 |
+
# Duration prediction (bins are identity mapping to frames for v3)
|
| 196 |
+
duration = torch.argmax(duration_logits, dim=-1).to(dtype=torch.int32)
|
| 197 |
+
return token_ids, token_prob, duration
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
class JointDecisionSingleStep(torch.nn.Module):
|
| 201 |
+
"""Single-step variant for streaming: encoder_step [1, 512, 1] -> [1,1,1].
|
| 202 |
+
|
| 203 |
+
Note: For 110M model, encoder_dim is 512 (not 1024 like 0.6B).
|
| 204 |
+
|
| 205 |
+
Inputs:
|
| 206 |
+
- encoder_step: [B=1, D=512, T=1]
|
| 207 |
+
- decoder_step: [B=1, D=640, U=1]
|
| 208 |
+
|
| 209 |
+
Returns:
|
| 210 |
+
- token_id: [1, 1, 1] int32
|
| 211 |
+
- token_prob: [1, 1, 1] float32
|
| 212 |
+
- duration: [1, 1, 1] int32
|
| 213 |
+
- top_k_ids: [1, 1, 1, K] int32
|
| 214 |
+
- top_k_logits: [1, 1, 1, K] float32
|
| 215 |
+
"""
|
| 216 |
+
def __init__(self, joint: JointWrapper, vocab_size: int, num_extra: int, top_k: int = 64) -> None:
|
| 217 |
+
super().__init__()
|
| 218 |
+
self.joint = joint
|
| 219 |
+
self.vocab_with_blank = int(vocab_size) + 1
|
| 220 |
+
self.num_extra = int(num_extra)
|
| 221 |
+
# Emit top-K candidates to enable host-side re-ranking with contextual biasing
|
| 222 |
+
self.top_k = int(top_k)
|
| 223 |
+
|
| 224 |
+
def forward(self, encoder_step: torch.Tensor, decoder_step: torch.Tensor):
|
| 225 |
+
# Reuse JointWrapper which expects [B, D, T] and [B, D, U]
|
| 226 |
+
logits = self.joint(encoder_step, decoder_step) # [1, 1, 1, V+extra]
|
| 227 |
+
token_logits = logits[..., : self.vocab_with_blank]
|
| 228 |
+
duration_logits = logits[..., -self.num_extra :]
|
| 229 |
+
|
| 230 |
+
token_ids = torch.argmax(token_logits, dim=-1, keepdim=False).to(dtype=torch.int32)
|
| 231 |
+
token_probs_all = torch.softmax(token_logits, dim=-1)
|
| 232 |
+
token_prob = torch.gather(
|
| 233 |
+
token_probs_all, dim=-1, index=token_ids.long().unsqueeze(-1)
|
| 234 |
+
).squeeze(-1)
|
| 235 |
+
duration = torch.argmax(duration_logits, dim=-1, keepdim=False).to(dtype=torch.int32)
|
| 236 |
+
|
| 237 |
+
# Also expose top-K candidates for host-side re-ranking.
|
| 238 |
+
# Shapes preserved as [1, 1, 1, K] to match CoreML broadcasting expectations.
|
| 239 |
+
# Note: topk expects last dimension; original shape is [1, 1, 1, V].
|
| 240 |
+
topk_logits, topk_ids_long = torch.topk(token_logits, k=min(self.top_k, token_logits.shape[-1]), dim=-1)
|
| 241 |
+
topk_ids = topk_ids_long.to(dtype=torch.int32)
|
| 242 |
+
return token_ids, token_prob, duration, topk_ids, topk_logits
|
| 243 |
+
|
| 244 |
+
|
| 245 |
+
def _coreml_convert(
|
| 246 |
+
traced: torch.jit.ScriptModule,
|
| 247 |
+
inputs,
|
| 248 |
+
outputs,
|
| 249 |
+
settings: ExportSettings,
|
| 250 |
+
compute_units_override: Optional[ct.ComputeUnit] = None,
|
| 251 |
+
) -> ct.models.MLModel:
|
| 252 |
+
cu = compute_units_override if compute_units_override is not None else settings.compute_units
|
| 253 |
+
kwargs = {
|
| 254 |
+
"convert_to": "mlprogram",
|
| 255 |
+
"inputs": inputs,
|
| 256 |
+
"outputs": outputs,
|
| 257 |
+
"compute_units": cu,
|
| 258 |
+
}
|
| 259 |
+
print("Converting:", traced.__class__.__name__)
|
| 260 |
+
print("Conversion kwargs:", kwargs)
|
| 261 |
+
if settings.deployment_target is not None:
|
| 262 |
+
kwargs["minimum_deployment_target"] = settings.deployment_target
|
| 263 |
+
if settings.compute_precision is not None:
|
| 264 |
+
kwargs["compute_precision"] = settings.compute_precision
|
| 265 |
+
return ct.convert(traced, **kwargs)
|
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/metadata.json
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_id": "nvidia/parakeet-tdt_ctc-110m",
|
| 3 |
+
"model_type": "hybrid_rnnt_ctc",
|
| 4 |
+
"sample_rate": 16000,
|
| 5 |
+
"max_audio_seconds": 15.0,
|
| 6 |
+
"max_audio_samples": 240000,
|
| 7 |
+
"max_symbol_steps": 1,
|
| 8 |
+
"vocab_size": 1024,
|
| 9 |
+
"joint_extra_outputs": 5,
|
| 10 |
+
"encoder_dim": 512,
|
| 11 |
+
"decoder_dim": 640,
|
| 12 |
+
"decoder_hidden": 640,
|
| 13 |
+
"decoder_layers": 1,
|
| 14 |
+
"blank_id": 1024,
|
| 15 |
+
"checkpoint": {
|
| 16 |
+
"type": "pretrained",
|
| 17 |
+
"model_id": "nvidia/parakeet-tdt_ctc-110m"
|
| 18 |
+
},
|
| 19 |
+
"coreml": {
|
| 20 |
+
"compute_units": "CPU_ONLY",
|
| 21 |
+
"compute_precision": "FLOAT32"
|
| 22 |
+
},
|
| 23 |
+
"components": {
|
| 24 |
+
"preprocessor": {
|
| 25 |
+
"inputs": {
|
| 26 |
+
"audio_signal": [
|
| 27 |
+
1,
|
| 28 |
+
240000
|
| 29 |
+
],
|
| 30 |
+
"audio_length": [
|
| 31 |
+
1
|
| 32 |
+
]
|
| 33 |
+
},
|
| 34 |
+
"outputs": {
|
| 35 |
+
"mel": [
|
| 36 |
+
1,
|
| 37 |
+
80,
|
| 38 |
+
1501
|
| 39 |
+
],
|
| 40 |
+
"mel_length": [
|
| 41 |
+
1
|
| 42 |
+
]
|
| 43 |
+
},
|
| 44 |
+
"path": "parakeet_preprocessor.mlpackage"
|
| 45 |
+
},
|
| 46 |
+
"encoder": {
|
| 47 |
+
"inputs": {
|
| 48 |
+
"mel": [
|
| 49 |
+
1,
|
| 50 |
+
80,
|
| 51 |
+
1501
|
| 52 |
+
],
|
| 53 |
+
"mel_length": [
|
| 54 |
+
1
|
| 55 |
+
]
|
| 56 |
+
},
|
| 57 |
+
"outputs": {
|
| 58 |
+
"encoder": [
|
| 59 |
+
1,
|
| 60 |
+
512,
|
| 61 |
+
188
|
| 62 |
+
],
|
| 63 |
+
"encoder_length": [
|
| 64 |
+
1
|
| 65 |
+
]
|
| 66 |
+
},
|
| 67 |
+
"path": "parakeet_encoder.mlpackage"
|
| 68 |
+
},
|
| 69 |
+
"ctc_head": {
|
| 70 |
+
"inputs": {
|
| 71 |
+
"encoder": [
|
| 72 |
+
1,
|
| 73 |
+
512,
|
| 74 |
+
188
|
| 75 |
+
]
|
| 76 |
+
},
|
| 77 |
+
"outputs": {
|
| 78 |
+
"log_probs": [
|
| 79 |
+
1,
|
| 80 |
+
188,
|
| 81 |
+
1025
|
| 82 |
+
]
|
| 83 |
+
},
|
| 84 |
+
"path": "parakeet_ctc_head.mlpackage"
|
| 85 |
+
},
|
| 86 |
+
"mel_encoder": {
|
| 87 |
+
"inputs": {
|
| 88 |
+
"audio_signal": [
|
| 89 |
+
1,
|
| 90 |
+
240000
|
| 91 |
+
],
|
| 92 |
+
"audio_length": [
|
| 93 |
+
1
|
| 94 |
+
]
|
| 95 |
+
},
|
| 96 |
+
"outputs": {
|
| 97 |
+
"encoder": [
|
| 98 |
+
1,
|
| 99 |
+
512,
|
| 100 |
+
188
|
| 101 |
+
],
|
| 102 |
+
"encoder_length": [
|
| 103 |
+
1
|
| 104 |
+
]
|
| 105 |
+
},
|
| 106 |
+
"path": "parakeet_mel_encoder.mlpackage"
|
| 107 |
+
},
|
| 108 |
+
"decoder": {
|
| 109 |
+
"inputs": {
|
| 110 |
+
"targets": [
|
| 111 |
+
1,
|
| 112 |
+
1
|
| 113 |
+
],
|
| 114 |
+
"target_length": [
|
| 115 |
+
1
|
| 116 |
+
],
|
| 117 |
+
"h_in": [
|
| 118 |
+
1,
|
| 119 |
+
1,
|
| 120 |
+
640
|
| 121 |
+
],
|
| 122 |
+
"c_in": [
|
| 123 |
+
1,
|
| 124 |
+
1,
|
| 125 |
+
640
|
| 126 |
+
]
|
| 127 |
+
},
|
| 128 |
+
"outputs": {
|
| 129 |
+
"decoder": [
|
| 130 |
+
1,
|
| 131 |
+
640,
|
| 132 |
+
1
|
| 133 |
+
],
|
| 134 |
+
"h_out": [
|
| 135 |
+
1,
|
| 136 |
+
1,
|
| 137 |
+
640
|
| 138 |
+
],
|
| 139 |
+
"c_out": [
|
| 140 |
+
1,
|
| 141 |
+
1,
|
| 142 |
+
640
|
| 143 |
+
]
|
| 144 |
+
},
|
| 145 |
+
"path": "parakeet_decoder.mlpackage"
|
| 146 |
+
},
|
| 147 |
+
"joint": {
|
| 148 |
+
"inputs": {
|
| 149 |
+
"encoder": [
|
| 150 |
+
1,
|
| 151 |
+
512,
|
| 152 |
+
188
|
| 153 |
+
],
|
| 154 |
+
"decoder": [
|
| 155 |
+
1,
|
| 156 |
+
640,
|
| 157 |
+
1
|
| 158 |
+
]
|
| 159 |
+
},
|
| 160 |
+
"outputs": {
|
| 161 |
+
"logits": [
|
| 162 |
+
1,
|
| 163 |
+
188,
|
| 164 |
+
1,
|
| 165 |
+
1030
|
| 166 |
+
]
|
| 167 |
+
},
|
| 168 |
+
"path": "parakeet_joint.mlpackage"
|
| 169 |
+
},
|
| 170 |
+
"joint_decision": {
|
| 171 |
+
"inputs": {
|
| 172 |
+
"encoder": [
|
| 173 |
+
1,
|
| 174 |
+
512,
|
| 175 |
+
188
|
| 176 |
+
],
|
| 177 |
+
"decoder": [
|
| 178 |
+
1,
|
| 179 |
+
640,
|
| 180 |
+
1
|
| 181 |
+
]
|
| 182 |
+
},
|
| 183 |
+
"outputs": {
|
| 184 |
+
"token_id": [
|
| 185 |
+
1,
|
| 186 |
+
188,
|
| 187 |
+
1
|
| 188 |
+
],
|
| 189 |
+
"token_prob": [
|
| 190 |
+
1,
|
| 191 |
+
188,
|
| 192 |
+
1
|
| 193 |
+
],
|
| 194 |
+
"duration": [
|
| 195 |
+
1,
|
| 196 |
+
188,
|
| 197 |
+
1
|
| 198 |
+
]
|
| 199 |
+
},
|
| 200 |
+
"path": "parakeet_joint_decision.mlpackage"
|
| 201 |
+
},
|
| 202 |
+
"joint_decision_single_step": {
|
| 203 |
+
"inputs": {
|
| 204 |
+
"encoder_step": [
|
| 205 |
+
1,
|
| 206 |
+
512,
|
| 207 |
+
1
|
| 208 |
+
],
|
| 209 |
+
"decoder_step": [
|
| 210 |
+
1,
|
| 211 |
+
640,
|
| 212 |
+
1
|
| 213 |
+
]
|
| 214 |
+
},
|
| 215 |
+
"outputs": {
|
| 216 |
+
"token_id": [
|
| 217 |
+
1,
|
| 218 |
+
1,
|
| 219 |
+
1
|
| 220 |
+
],
|
| 221 |
+
"token_prob": [
|
| 222 |
+
1,
|
| 223 |
+
1,
|
| 224 |
+
1
|
| 225 |
+
],
|
| 226 |
+
"duration": [
|
| 227 |
+
1,
|
| 228 |
+
1,
|
| 229 |
+
1
|
| 230 |
+
],
|
| 231 |
+
"top_k_ids": [
|
| 232 |
+
1,
|
| 233 |
+
1,
|
| 234 |
+
1,
|
| 235 |
+
64
|
| 236 |
+
],
|
| 237 |
+
"top_k_logits": [
|
| 238 |
+
1,
|
| 239 |
+
1,
|
| 240 |
+
1,
|
| 241 |
+
64
|
| 242 |
+
]
|
| 243 |
+
},
|
| 244 |
+
"path": "parakeet_joint_decision_single_step.mlpackage"
|
| 245 |
+
}
|
| 246 |
+
}
|
| 247 |
+
}
|
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_ctc_head.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6459b9564e0630f2eec300eb732fceccbc1d2d16f12cb0694ce310d84bfbecf2
|
| 3 |
+
size 3366
|
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_ctc_head.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fb9bead064427ffcb7529c0e3f378e421b4dde8e6d81447b6d1ca3352ca850e1
|
| 3 |
+
size 1051842
|
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_ctc_head.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"6651E3CE-C3ED-4267-AAC3-5A772FC3515A": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Specification",
|
| 7 |
+
"name": "model.mlmodel",
|
| 8 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 9 |
+
},
|
| 10 |
+
"A3F7798B-67CA-418C-B8BB-58731D3A413F": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Weights",
|
| 13 |
+
"name": "weights",
|
| 14 |
+
"path": "com.apple.CoreML/weights"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "6651E3CE-C3ED-4267-AAC3-5A772FC3515A"
|
| 18 |
+
}
|
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a05548eb455c5cd564782b125a5f9279a789be1f4141e5f044453ea79cd68b47
|
| 3 |
+
size 6729
|
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dd90b58597ee2c172c672dffe13b1110898ba07394c1a15efc96cc8c6b18411b
|
| 3 |
+
size 7871040
|
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_decoder.mlpackage/Manifest.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"fileFormatVersion": "1.0.0",
|
| 3 |
+
"itemInfoEntries": {
|
| 4 |
+
"83E7B87A-4EBE-48BF-BF3C-EE74DEA4C7AF": {
|
| 5 |
+
"author": "com.apple.CoreML",
|
| 6 |
+
"description": "CoreML Model Specification",
|
| 7 |
+
"name": "model.mlmodel",
|
| 8 |
+
"path": "com.apple.CoreML/model.mlmodel"
|
| 9 |
+
},
|
| 10 |
+
"98BF03AC-26AF-410B-95AC-C9B99B3B240C": {
|
| 11 |
+
"author": "com.apple.CoreML",
|
| 12 |
+
"description": "CoreML Model Weights",
|
| 13 |
+
"name": "weights",
|
| 14 |
+
"path": "com.apple.CoreML/weights"
|
| 15 |
+
}
|
| 16 |
+
},
|
| 17 |
+
"rootModelIdentifier": "83E7B87A-4EBE-48BF-BF3C-EE74DEA4C7AF"
|
| 18 |
+
}
|
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_encoder.mlpackage/Data/com.apple.CoreML/model.mlmodel
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:70d7747b57beba0248fabb6cbfa5d276e3604d0d7e234f4ccb578ea0a4d25110
|
| 3 |
+
size 508107
|
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_encoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cecf7994b2758397d992802a4f6e5d656e3a1aeb7bbedc2aa430b1316d62474c
|
| 3 |
+
size 215143424
|