alexwengg commited on
Commit
26cfae7
·
verified ·
1 Parent(s): 6e7e587

Upload 401 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +20 -0
  2. cli/CtcEarningsBenchmark.swift +1048 -0
  3. cli/HybridEarningsBenchmark.swift +554 -0
  4. convert/.DS_Store +0 -0
  5. convert/parakeet-tdt-ctc-110m/convert_tdt_decoder.py +323 -0
  6. convert/parakeet-tdt-ctc-110m/coreml/audio/yc_first_minute_16k_15s.wav +3 -0
  7. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/analytics/coremldata.bin +3 -0
  8. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/coremldata.bin +3 -0
  9. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/metadata.json +66 -0
  10. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/model.mil +24 -0
  11. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/weights/weight.bin +3 -0
  12. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/analytics/coremldata.bin +3 -0
  13. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/coremldata.bin +3 -0
  14. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/metadata.json +118 -0
  15. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/model.mil +45 -0
  16. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/weights/weight.bin +3 -0
  17. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/analytics/coremldata.bin +3 -0
  18. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/coremldata.bin +3 -0
  19. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/metadata.json +105 -0
  20. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/model.mil +0 -0
  21. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/weights/weight.bin +3 -0
  22. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/analytics/coremldata.bin +3 -0
  23. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/coremldata.bin +3 -0
  24. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/metadata.json +102 -0
  25. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/model.mil +58 -0
  26. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/weights/weight.bin +3 -0
  27. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/analytics/coremldata.bin +3 -0
  28. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/coremldata.bin +3 -0
  29. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/metadata.json +123 -0
  30. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/model.mil +69 -0
  31. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/weights/weight.bin +3 -0
  32. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/analytics/coremldata.bin +3 -0
  33. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/coremldata.bin +3 -0
  34. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/metadata.json +112 -0
  35. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/model.mil +191 -0
  36. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/weights/weight.bin +3 -0
  37. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/metadata.json +247 -0
  38. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/vocab.json +1 -0
  39. convert/parakeet-tdt-ctc-110m/coreml/convert-parakeet.py +697 -0
  40. convert/parakeet-tdt-ctc-110m/coreml/hybrid_earnings_benchmark.json +35 -0
  41. convert/parakeet-tdt-ctc-110m/coreml/individual_components.py +265 -0
  42. convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/metadata.json +247 -0
  43. convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_ctc_head.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  44. convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_ctc_head.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  45. convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_ctc_head.mlpackage/Manifest.json +18 -0
  46. convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  47. convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  48. convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_decoder.mlpackage/Manifest.json +18 -0
  49. convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_encoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  50. convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_encoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
.gitattributes CHANGED
@@ -33,3 +33,23 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ convert/parakeet-tdt-ctc-110m/coreml/audio/yc_first_minute_16k_15s.wav filter=lfs diff=lfs merge=lfs -text
37
+ convert/parakeet-tdt-v2-0.6b/coreml/audio/yc_first_minute_16k_15s.wav filter=lfs diff=lfs merge=lfs -text
38
+ convert/parakeet-tdt-v2-0.6b/coreml/audio/yc_first_minute_16k.wav filter=lfs diff=lfs merge=lfs -text
39
+ convert/parakeet-tdt-v2-0.6b/coreml/audio/yc_first_minute.wav filter=lfs diff=lfs merge=lfs -text
40
+ convert/parakeet-tdt-v2-0.6b/coreml/context/Efficient[[:space:]]Sequence[[:space:]]Transduction[[:space:]]by[[:space:]]Jointly[[:space:]]Predicting[[:space:]]Tokens[[:space:]]and[[:space:]]Durations.pdf filter=lfs diff=lfs merge=lfs -text
41
+ convert/parakeet-tdt-v2-0.6b/coreml/context/FAST[[:space:]]CONFORMER[[:space:]]WITH[[:space:]]LINEARLY[[:space:]]SCALABLE[[:space:]]ATTENTION.pdf filter=lfs diff=lfs merge=lfs -text
42
+ convert/parakeet-tdt-v2-0.6b/coreml/plots/compare-components/mel_composite.png filter=lfs diff=lfs merge=lfs -text
43
+ convert/parakeet-tdt-v2-0.6b/coreml/plots/quantize/cpu_and_ne/all_components_compile.png filter=lfs diff=lfs merge=lfs -text
44
+ convert/parakeet-tdt-v2-0.6b/coreml/plots/quantize/cpu_and_ne/all_components_compression.png filter=lfs diff=lfs merge=lfs -text
45
+ convert/parakeet-tdt-v2-0.6b/coreml/plots/quantize/cpu_and_ne/all_components_quality.png filter=lfs diff=lfs merge=lfs -text
46
+ parakeet-tdt-ctc-110m/coreml/audio/yc_first_minute_16k_15s.wav filter=lfs diff=lfs merge=lfs -text
47
+ parakeet-tdt-v2-0.6b/coreml/audio/yc_first_minute_16k_15s.wav filter=lfs diff=lfs merge=lfs -text
48
+ parakeet-tdt-v2-0.6b/coreml/audio/yc_first_minute_16k.wav filter=lfs diff=lfs merge=lfs -text
49
+ parakeet-tdt-v2-0.6b/coreml/audio/yc_first_minute.wav filter=lfs diff=lfs merge=lfs -text
50
+ parakeet-tdt-v2-0.6b/coreml/context/Efficient[[:space:]]Sequence[[:space:]]Transduction[[:space:]]by[[:space:]]Jointly[[:space:]]Predicting[[:space:]]Tokens[[:space:]]and[[:space:]]Durations.pdf filter=lfs diff=lfs merge=lfs -text
51
+ parakeet-tdt-v2-0.6b/coreml/context/FAST[[:space:]]CONFORMER[[:space:]]WITH[[:space:]]LINEARLY[[:space:]]SCALABLE[[:space:]]ATTENTION.pdf filter=lfs diff=lfs merge=lfs -text
52
+ parakeet-tdt-v2-0.6b/coreml/plots/compare-components/mel_composite.png filter=lfs diff=lfs merge=lfs -text
53
+ parakeet-tdt-v2-0.6b/coreml/plots/quantize/cpu_and_ne/all_components_compile.png filter=lfs diff=lfs merge=lfs -text
54
+ parakeet-tdt-v2-0.6b/coreml/plots/quantize/cpu_and_ne/all_components_compression.png filter=lfs diff=lfs merge=lfs -text
55
+ parakeet-tdt-v2-0.6b/coreml/plots/quantize/cpu_and_ne/all_components_quality.png filter=lfs diff=lfs merge=lfs -text
cli/CtcEarningsBenchmark.swift ADDED
@@ -0,0 +1,1048 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #if os(macOS)
2
+ import AVFoundation
3
+ import CoreML
4
+ import FluidAudio
5
+ import Foundation
6
+
7
+ /// Earnings22 benchmark using TDT for transcription + CTC for keyword spotting.
8
+ /// TDT provides low WER transcription, CTC provides high recall dictionary detection.
9
+ public enum CtcEarningsBenchmark {
10
+
11
+ private enum KeywordMode: String {
12
+ case chunk
13
+ case file
14
+ }
15
+
16
+ /// Default CTC model directory
17
+ private static func defaultCtcModelPath() -> String? {
18
+ let appSupport = FileManager.default.urls(
19
+ for: .applicationSupportDirectory, in: .userDomainMask
20
+ ).first!
21
+ let modelPath = appSupport.appendingPathComponent("FluidAudio/Models/parakeet-ctc-110m-coreml")
22
+ if FileManager.default.fileExists(atPath: modelPath.path) {
23
+ return modelPath.path
24
+ }
25
+ return nil
26
+ }
27
+
28
+ /// Default data directory (from download command)
29
+ private static func defaultDataDir() -> String? {
30
+ let dataDir = DatasetDownloader.getEarnings22Directory().appendingPathComponent("test-dataset")
31
+ if FileManager.default.fileExists(atPath: dataDir.path) {
32
+ return dataDir.path
33
+ }
34
+ return nil
35
+ }
36
+
37
+ public static func runCLI(arguments: [String]) async {
38
+ // Check for help
39
+ if arguments.contains("--help") || arguments.contains("-h") {
40
+ printUsage()
41
+ return
42
+ }
43
+
44
+ // Parse arguments
45
+ var dataDir: String? = nil
46
+ var outputFile = "ctc_earnings_benchmark.json"
47
+ var maxFiles: Int? = nil
48
+ var ctcModelPath: String? = nil
49
+ // Note: Using v2 by default because v3 has issues with certain audio files
50
+ // (returns empty transcription for ~7 files in Earnings22 dataset)
51
+ var tdtVersion: AsrModelVersion = .v2
52
+ var autoDownload = false
53
+ var keywordMode: KeywordMode = .chunk
54
+
55
+ var i = 0
56
+ while i < arguments.count {
57
+ switch arguments[i] {
58
+ case "--data-dir":
59
+ if i + 1 < arguments.count {
60
+ dataDir = arguments[i + 1]
61
+ i += 1
62
+ }
63
+ case "--output", "-o":
64
+ if i + 1 < arguments.count {
65
+ outputFile = arguments[i + 1]
66
+ i += 1
67
+ }
68
+ case "--max-files":
69
+ if i + 1 < arguments.count {
70
+ maxFiles = Int(arguments[i + 1])
71
+ i += 1
72
+ }
73
+ case "--ctc-model":
74
+ if i + 1 < arguments.count {
75
+ ctcModelPath = arguments[i + 1]
76
+ i += 1
77
+ }
78
+ case "--tdt-version":
79
+ if i + 1 < arguments.count {
80
+ if arguments[i + 1] == "v2" || arguments[i + 1] == "2" {
81
+ tdtVersion = .v2
82
+ }
83
+ i += 1
84
+ }
85
+ case "--auto-download":
86
+ autoDownload = true
87
+ case "--keyword-mode":
88
+ if i + 1 < arguments.count, let mode = parseKeywordMode(arguments[i + 1]) {
89
+ keywordMode = mode
90
+ i += 1
91
+ }
92
+ default:
93
+ break
94
+ }
95
+ i += 1
96
+ }
97
+
98
+ // Use defaults if not specified
99
+ if dataDir == nil {
100
+ dataDir = defaultDataDir()
101
+ }
102
+ if ctcModelPath == nil {
103
+ ctcModelPath = defaultCtcModelPath()
104
+ }
105
+
106
+ // Handle auto-download for dataset
107
+ if autoDownload && dataDir == nil {
108
+ print("📥 Downloading earnings22-kws dataset...")
109
+ await DatasetDownloader.downloadEarnings22KWS(force: false)
110
+ dataDir = defaultDataDir()
111
+ }
112
+
113
+ // Handle auto-download for CTC models
114
+ if autoDownload && ctcModelPath == nil {
115
+ print("📥 Downloading CTC models...")
116
+ do {
117
+ _ = try await CtcModels.download()
118
+ ctcModelPath = defaultCtcModelPath()
119
+ } catch {
120
+ print("ERROR: Failed to download CTC models: \(error)")
121
+ }
122
+ }
123
+
124
+ print("Earnings Benchmark (TDT transcription + CTC keyword spotting)")
125
+ print(" Data directory: \(dataDir ?? "not found")")
126
+ print(" Output file: \(outputFile)")
127
+ print(" TDT version: \(tdtVersion == .v2 ? "v2" : "v3")")
128
+ print(" CTC model: \(ctcModelPath ?? "not found")")
129
+ print(" Keyword mode: \(keywordMode.rawValue)")
130
+
131
+ guard let finalDataDir = dataDir else {
132
+ print("ERROR: Data directory not found")
133
+ print("💡 Download with: fluidaudio download --dataset earnings22-kws")
134
+ print(" Or specify: --data-dir <path>")
135
+ printUsage()
136
+ return
137
+ }
138
+
139
+ guard let modelPath = ctcModelPath else {
140
+ print("ERROR: CTC model not found")
141
+ print("💡 Download parakeet-ctc-110m-coreml model to:")
142
+ print(" ~/Library/Application Support/FluidAudio/Models/parakeet-ctc-110m-coreml/")
143
+ print(" Or specify: --ctc-model <path>")
144
+ printUsage()
145
+ return
146
+ }
147
+
148
+ let dataDirResolved = finalDataDir
149
+
150
+ do {
151
+ // Load TDT models for transcription
152
+ print("Loading TDT models (\(tdtVersion == .v2 ? "v2" : "v3")) for transcription...")
153
+ let tdtModels = try await AsrModels.downloadAndLoad(version: tdtVersion)
154
+ let asrManager = AsrManager(config: .default)
155
+ try await asrManager.initialize(models: tdtModels)
156
+ print("TDT models loaded successfully")
157
+
158
+ // Load CTC models for keyword spotting
159
+ print("Loading CTC models from: \(modelPath)")
160
+ let modelDir = URL(fileURLWithPath: modelPath)
161
+ let ctcModels = try await CtcModels.loadDirect(from: modelDir)
162
+ print("Loaded CTC vocabulary with \(ctcModels.vocabulary.count) tokens")
163
+
164
+ // Create keyword spotter
165
+ let vocabSize = ctcModels.vocabulary.count
166
+ let blankId = vocabSize // Blank is at index = vocab_size
167
+ let spotter = CtcKeywordSpotter(models: ctcModels, blankId: blankId)
168
+ print("Created CTC spotter with blankId=\(blankId)")
169
+
170
+ // Collect test files
171
+ let dataDirURL = URL(fileURLWithPath: dataDirResolved)
172
+ let fileIds = try collectFileIds(from: dataDirURL, maxFiles: maxFiles)
173
+ let keywordIndex = try buildKeywordIndex(dataDir: dataDirURL, keywordMode: keywordMode)
174
+
175
+ if fileIds.isEmpty {
176
+ print("ERROR: No test files found in \(dataDirResolved)")
177
+ return
178
+ }
179
+
180
+ print("Processing \(fileIds.count) test files...")
181
+
182
+ var results: [[String: Any]] = []
183
+ var totalWer = 0.0
184
+ var totalKeywordReference = 0
185
+ var totalKeywordPredicted = 0
186
+ var totalKeywordTruePositives = 0
187
+ var totalKeywordFalsePositives = 0
188
+ var totalKeywordFalseNegatives = 0
189
+ var totalAudioDuration = 0.0
190
+ var totalProcessingTime = 0.0
191
+
192
+ for (index, fileId) in fileIds.enumerated() {
193
+ print("[\(index + 1)/\(fileIds.count)] \(fileId)")
194
+
195
+ if let result = try await processFile(
196
+ fileId: fileId,
197
+ dataDir: dataDirURL,
198
+ asrManager: asrManager,
199
+ ctcModels: ctcModels,
200
+ spotter: spotter,
201
+ keywordMode: keywordMode,
202
+ keywordIndex: keywordIndex
203
+ ) {
204
+ results.append(result)
205
+ totalWer += result["wer"] as? Double ?? 0
206
+ totalKeywordReference += result["keywordReference"] as? Int ?? 0
207
+ totalKeywordPredicted += result["keywordPredicted"] as? Int ?? 0
208
+ totalKeywordTruePositives += result["keywordTruePositives"] as? Int ?? 0
209
+ totalKeywordFalsePositives += result["keywordFalsePositives"] as? Int ?? 0
210
+ totalKeywordFalseNegatives += result["keywordFalseNegatives"] as? Int ?? 0
211
+ totalAudioDuration += result["audioLength"] as? Double ?? 0
212
+ totalProcessingTime += result["processingTime"] as? Double ?? 0
213
+
214
+ let wer = result["wer"] as? Double ?? 0
215
+ let precision = result["keywordPrecision"] as? Double ?? 0
216
+ let recall = result["keywordRecall"] as? Double ?? 0
217
+ let fscore = result["keywordFscore"] as? Double ?? 0
218
+ print(
219
+ " WER: \(String(format: "%.1f", wer))%, " +
220
+ "KW P/R/F: \(String(format: "%.2f", precision))/" +
221
+ "\(String(format: "%.2f", recall))/" +
222
+ "\(String(format: "%.2f", fscore))"
223
+ )
224
+ }
225
+ }
226
+
227
+ // Calculate summary
228
+ let avgWer = results.isEmpty ? 0.0 : totalWer / Double(results.count)
229
+ let keywordPrecision =
230
+ totalKeywordPredicted > 0
231
+ ? Double(totalKeywordTruePositives) / Double(totalKeywordPredicted)
232
+ : 0
233
+ let keywordRecall =
234
+ totalKeywordReference > 0
235
+ ? Double(totalKeywordTruePositives) / Double(totalKeywordReference)
236
+ : 0
237
+ let keywordFscore =
238
+ (keywordPrecision + keywordRecall) > 0
239
+ ? 2 * keywordPrecision * keywordRecall / (keywordPrecision + keywordRecall)
240
+ : 0
241
+
242
+ // Print summary
243
+ print("\n" + String(repeating: "=", count: 60))
244
+ print("EARNINGS22 BENCHMARK (TDT + CTC)")
245
+ print(String(repeating: "=", count: 60))
246
+ print("Model: \(modelPath)")
247
+ print("Total tests: \(results.count)")
248
+ print("Average WER: \(String(format: "%.2f", avgWer))%")
249
+ print(
250
+ "Keyword Precision/Recall/F1: " +
251
+ "\(String(format: "%.2f", keywordPrecision))/" +
252
+ "\(String(format: "%.2f", keywordRecall))/" +
253
+ "\(String(format: "%.2f", keywordFscore))"
254
+ )
255
+ print("Total audio: \(String(format: "%.1f", totalAudioDuration))s")
256
+ print("Total processing: \(String(format: "%.1f", totalProcessingTime))s")
257
+ if totalProcessingTime > 0 {
258
+ print("RTFx: \(String(format: "%.2f", totalAudioDuration / totalProcessingTime))x")
259
+ }
260
+ print(String(repeating: "=", count: 60))
261
+
262
+ // Sort results by WER descending (worst first)
263
+ let sortedResults = results.sorted { r1, r2 in
264
+ let wer1 = r1["wer"] as? Double ?? 0
265
+ let wer2 = r2["wer"] as? Double ?? 0
266
+ return wer1 > wer2
267
+ }
268
+
269
+ // Save to JSON
270
+ let summaryDict: [String: Any] = [
271
+ "totalTests": results.count,
272
+ "avgWer": round(avgWer * 100) / 100,
273
+ "keywordTruePositives": totalKeywordTruePositives,
274
+ "keywordFalsePositives": totalKeywordFalsePositives,
275
+ "keywordFalseNegatives": totalKeywordFalseNegatives,
276
+ "keywordPredicted": totalKeywordPredicted,
277
+ "keywordReference": totalKeywordReference,
278
+ "keywordPrecision": round(keywordPrecision * 1000) / 1000,
279
+ "keywordRecall": round(keywordRecall * 1000) / 1000,
280
+ "keywordFscore": round(keywordFscore * 1000) / 1000,
281
+ "totalAudioDuration": round(totalAudioDuration * 100) / 100,
282
+ "totalProcessingTime": round(totalProcessingTime * 100) / 100,
283
+ ]
284
+
285
+ let output: [String: Any] = [
286
+ "model": modelPath,
287
+ "keywordMode": keywordMode.rawValue,
288
+ "summary": summaryDict,
289
+ "results": sortedResults,
290
+ ]
291
+
292
+ let jsonData = try JSONSerialization.data(withJSONObject: output, options: [.prettyPrinted, .sortedKeys])
293
+ try jsonData.write(to: URL(fileURLWithPath: outputFile))
294
+ print("\nResults written to: \(outputFile)")
295
+
296
+ } catch {
297
+ print("ERROR: Benchmark failed: \(error)")
298
+ }
299
+ }
300
+
301
+ private static func collectFileIds(from dataDir: URL, maxFiles: Int?) throws -> [String] {
302
+ var fileIds: [String] = []
303
+ let suffix = ".dictionary.txt"
304
+
305
+ let fileManager = FileManager.default
306
+ let contents = try fileManager.contentsOfDirectory(at: dataDir, includingPropertiesForKeys: nil)
307
+
308
+ for url in contents.sorted(by: { $0.path < $1.path }) {
309
+ let name = url.lastPathComponent
310
+ if name.hasSuffix(suffix) {
311
+ let data = try? Data(contentsOf: url)
312
+ if let data = data, !data.isEmpty {
313
+ let fileId = String(name.dropLast(suffix.count))
314
+ fileIds.append(fileId)
315
+ }
316
+ }
317
+ }
318
+
319
+ if let maxFiles = maxFiles {
320
+ return Array(fileIds.prefix(maxFiles))
321
+ }
322
+ return fileIds
323
+ }
324
+
325
+ private static func processFile(
326
+ fileId: String,
327
+ dataDir: URL,
328
+ asrManager: AsrManager,
329
+ ctcModels: CtcModels,
330
+ spotter: CtcKeywordSpotter,
331
+ keywordMode: KeywordMode,
332
+ keywordIndex: [String: [String]]
333
+ ) async throws -> [String: Any]? {
334
+ let wavFile = dataDir.appendingPathComponent("\(fileId).wav")
335
+ let dictionaryFile = dataDir.appendingPathComponent("\(fileId).dictionary.txt")
336
+ let textFile = dataDir.appendingPathComponent("\(fileId).text.txt")
337
+
338
+ let fm = FileManager.default
339
+ guard fm.fileExists(atPath: wavFile.path),
340
+ fm.fileExists(atPath: dictionaryFile.path)
341
+ else {
342
+ return nil
343
+ }
344
+
345
+ // Load dictionary words (chunk or file keywords)
346
+ let dictionaryWords = try loadDictionaryWords(
347
+ fileId: fileId,
348
+ dictionaryFile: dictionaryFile,
349
+ keywordMode: keywordMode,
350
+ keywordIndex: keywordIndex
351
+ )
352
+
353
+ // Load reference text
354
+ let referenceRaw =
355
+ (try? String(contentsOf: textFile, encoding: .utf8))?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
356
+
357
+ // Get audio samples
358
+ let audioFile = try AVAudioFile(forReading: wavFile)
359
+ let audioLength = Double(audioFile.length) / audioFile.processingFormat.sampleRate
360
+ let format = audioFile.processingFormat
361
+ let frameCount = AVAudioFrameCount(audioFile.length)
362
+
363
+ guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount) else {
364
+ throw NSError(
365
+ domain: "CtcEarningsBenchmark", code: 1,
366
+ userInfo: [NSLocalizedDescriptionKey: "Failed to create audio buffer"])
367
+ }
368
+ try audioFile.read(into: buffer)
369
+
370
+ // Resample to 16kHz
371
+ let converter = AudioConverter()
372
+ let samples = try converter.resampleBuffer(buffer)
373
+
374
+ let startTime = Date()
375
+
376
+ // 1. TDT transcription for low WER
377
+ let tdtResult = try await asrManager.transcribe(wavFile)
378
+
379
+ // Skip files where TDT returns empty (some audio files cause model issues)
380
+ if tdtResult.text.isEmpty {
381
+ print(" SKIPPED: TDT returned empty transcription")
382
+ return nil
383
+ }
384
+
385
+ // 2. Build custom vocabulary for CTC keyword spotting
386
+ var vocabTerms: [CustomVocabularyTerm] = []
387
+ for word in dictionaryWords {
388
+ let tokenIds = tokenize(word, vocabulary: ctcModels.vocabulary)
389
+ if !tokenIds.isEmpty {
390
+ let term = CustomVocabularyTerm(
391
+ text: word,
392
+ weight: nil,
393
+ aliases: nil,
394
+ tokenIds: nil,
395
+ ctcTokenIds: tokenIds
396
+ )
397
+ vocabTerms.append(term)
398
+ }
399
+ }
400
+ let customVocab = CustomVocabularyContext(terms: vocabTerms)
401
+
402
+ // 3. CTC keyword spotting for high recall dictionary detection
403
+ let spotResult = try await spotter.spotKeywordsWithLogProbs(
404
+ audioSamples: samples,
405
+ customVocabulary: customVocab,
406
+ minScore: nil
407
+ )
408
+
409
+ // 4. Post-process: Use VocabularyRescorer with Argmax-style parameters
410
+ // Argmax uses cbw=3.0 (context-biasing weight) for boosting vocab terms
411
+ let useRescorer = ProcessInfo.processInfo.environment["NO_CTC_RESCORING"] != "1"
412
+ let hypothesis: String
413
+ if useRescorer {
414
+ let rescorerConfig = VocabularyRescorer.Config(
415
+ minScoreAdvantage: 1.0, // Lower threshold - rely more on CTC scoring
416
+ minVocabScore: -15.0, // Permissive to include more detections
417
+ maxOriginalScoreForReplacement: -2.0, // Don't replace very confident words
418
+ vocabBoostWeight: 3.0 // Argmax cbw=3.0
419
+ )
420
+ let rescorer = VocabularyRescorer(
421
+ spotter: spotter,
422
+ vocabulary: customVocab,
423
+ config: rescorerConfig
424
+ )
425
+ let rescoreResult = rescorer.rescore(transcript: tdtResult.text, spotResult: spotResult)
426
+ hypothesis = rescoreResult.text
427
+ } else {
428
+ hypothesis = tdtResult.text // Baseline: no CTC corrections
429
+ }
430
+
431
+ let processingTime = Date().timeIntervalSince(startTime)
432
+
433
+ // Normalize texts
434
+ let referenceNormalized = TextNormalizer.normalize(referenceRaw)
435
+ let hypothesisNormalized = TextNormalizer.normalize(hypothesis)
436
+
437
+ // Keyword sets for precision/recall
438
+ let referenceKeywords = keywordsInText(referenceNormalized, dictionaryWords: dictionaryWords)
439
+ let predictedKeywords = keywordsInText(hypothesisNormalized, dictionaryWords: dictionaryWords)
440
+ let truePositives = referenceKeywords.intersection(predictedKeywords)
441
+ let falsePositives = predictedKeywords.subtracting(referenceKeywords)
442
+ let falseNegatives = referenceKeywords.subtracting(predictedKeywords)
443
+ let keywordPrecision = predictedKeywords.isEmpty ? 0 : Double(truePositives.count) / Double(predictedKeywords.count)
444
+ let keywordRecall = referenceKeywords.isEmpty ? 0 : Double(truePositives.count) / Double(referenceKeywords.count)
445
+ let keywordFscore =
446
+ (keywordPrecision + keywordRecall) > 0
447
+ ? 2 * keywordPrecision * keywordRecall / (keywordPrecision + keywordRecall)
448
+ : 0
449
+
450
+ let referenceWords = referenceNormalized.components(separatedBy: CharacterSet.whitespacesAndNewlines).filter {
451
+ !$0.isEmpty
452
+ }
453
+ let hypothesisWords = hypothesisNormalized.components(separatedBy: CharacterSet.whitespacesAndNewlines).filter {
454
+ !$0.isEmpty
455
+ }
456
+
457
+ // Calculate WER
458
+ let wer: Double
459
+ if referenceWords.isEmpty {
460
+ wer = hypothesisWords.isEmpty ? 0.0 : 1.0
461
+ } else {
462
+ wer = calculateWER(reference: referenceWords, hypothesis: hypothesisWords)
463
+ }
464
+
465
+ // Count dictionary detections (debug only)
466
+ let minCtcScore: Float = -15.0 // Permissive threshold for detection
467
+ var detectionDetails: [[String: Any]] = []
468
+ var ctcFoundWords: Set<String> = []
469
+
470
+ // 1. CTC detections
471
+ for detection in spotResult.detections {
472
+ let inRef = referenceKeywords.contains(detection.term.text.lowercased())
473
+ let detail: [String: Any] = [
474
+ "word": detection.term.text,
475
+ "score": round(Double(detection.score) * 100) / 100,
476
+ "startTime": round(detection.startTime * 100) / 100,
477
+ "endTime": round(detection.endTime * 100) / 100,
478
+ "source": "ctc",
479
+ "inReference": inRef,
480
+ ]
481
+ detectionDetails.append(detail)
482
+
483
+ if detection.score >= minCtcScore { // Use >= to include edge cases
484
+ ctcFoundWords.insert(detection.term.text.lowercased())
485
+ }
486
+ }
487
+
488
+ // 2. Fallback: check hypothesis for dictionary words not found by CTC
489
+ let hypothesisLower = hypothesis.lowercased()
490
+ for word in dictionaryWords {
491
+ let wordLower = word.lowercased()
492
+ if !ctcFoundWords.contains(wordLower) {
493
+ // Check if word appears as whole word in hypothesis (avoid substring false positives)
494
+ let pattern = "\\b\(NSRegularExpression.escapedPattern(for: wordLower))\\b"
495
+ if let regex = try? NSRegularExpression(pattern: pattern, options: []),
496
+ regex.firstMatch(
497
+ in: hypothesisLower, options: [],
498
+ range: NSRange(hypothesisLower.startIndex..., in: hypothesisLower)) != nil
499
+ {
500
+ ctcFoundWords.insert(wordLower)
501
+ let inRef = referenceKeywords.contains(wordLower)
502
+ let detail: [String: Any] = [
503
+ "word": word,
504
+ "score": 0.0,
505
+ "startTime": 0.0,
506
+ "endTime": 0.0,
507
+ "source": "hypothesis",
508
+ "inReference": inRef,
509
+ ]
510
+ detectionDetails.append(detail)
511
+ }
512
+ }
513
+ }
514
+
515
+ let result: [String: Any] = [
516
+ "fileId": fileId,
517
+ "reference": referenceNormalized,
518
+ "hypothesis": hypothesisNormalized,
519
+ "wer": round(wer * 10000) / 100,
520
+ "dictFound": predictedKeywords.count,
521
+ "dictTotal": referenceKeywords.count,
522
+ "keywordPredicted": predictedKeywords.count,
523
+ "keywordReference": referenceKeywords.count,
524
+ "keywordTruePositives": truePositives.count,
525
+ "keywordFalsePositives": falsePositives.count,
526
+ "keywordFalseNegatives": falseNegatives.count,
527
+ "keywordPrecision": round(keywordPrecision * 1000) / 1000,
528
+ "keywordRecall": round(keywordRecall * 1000) / 1000,
529
+ "keywordFscore": round(keywordFscore * 1000) / 1000,
530
+ "audioLength": round(audioLength * 100) / 100,
531
+ "processingTime": round(processingTime * 1000) / 1000,
532
+ "ctcDetections": detectionDetails,
533
+ ]
534
+ return result
535
+ }
536
+
537
+ /// Simple tokenization using vocabulary lookup
538
+ private static func tokenize(_ text: String, vocabulary: [Int: String]) -> [Int] {
539
+ // Build reverse vocabulary (token -> id)
540
+ var tokenToId: [String: Int] = [:]
541
+ for (id, token) in vocabulary {
542
+ tokenToId[token] = id
543
+ }
544
+
545
+ let normalizedText = text.lowercased()
546
+ var result: [Int] = []
547
+ var position = normalizedText.startIndex
548
+ var isWordStart = true
549
+
550
+ while position < normalizedText.endIndex {
551
+ var matched = false
552
+ let remaining = normalizedText.distance(from: position, to: normalizedText.endIndex)
553
+ var matchLength = min(20, remaining)
554
+
555
+ while matchLength > 0 {
556
+ let endPos = normalizedText.index(position, offsetBy: matchLength)
557
+ let substring = String(normalizedText[position..<endPos])
558
+
559
+ // Try with SentencePiece prefix for word start
560
+ let withPrefix = isWordStart ? "▁" + substring : substring
561
+
562
+ if let tokenId = tokenToId[withPrefix] {
563
+ result.append(tokenId)
564
+ position = endPos
565
+ isWordStart = false
566
+ matched = true
567
+ break
568
+ } else if let tokenId = tokenToId[substring] {
569
+ result.append(tokenId)
570
+ position = endPos
571
+ isWordStart = false
572
+ matched = true
573
+ break
574
+ }
575
+
576
+ matchLength -= 1
577
+ }
578
+
579
+ if !matched {
580
+ let char = normalizedText[position]
581
+ if char == " " {
582
+ isWordStart = true
583
+ position = normalizedText.index(after: position)
584
+ } else {
585
+ // Unknown character - skip
586
+ position = normalizedText.index(after: position)
587
+ isWordStart = false
588
+ }
589
+ }
590
+ }
591
+
592
+ return result
593
+ }
594
+
595
+ /// Apply CTC keyword corrections to TDT transcription using multiple strategies:
596
+ /// 1. Fuzzy matching (for words that are phonetically similar)
597
+ /// 2. Context pattern matching (for "this is X" type patterns)
598
+ /// 3. Proper noun replacement (for names after common patterns)
599
+ private static func applyKeywordCorrections(
600
+ tdtResult: ASRResult,
601
+ detections: [CtcKeywordSpotter.KeywordDetection],
602
+ minScore: Float
603
+ ) -> String {
604
+ // Filter detections by score
605
+ let validDetections = detections.filter { $0.score >= minScore }
606
+ guard !validDetections.isEmpty else {
607
+ return tdtResult.text
608
+ }
609
+
610
+ var text = tdtResult.text
611
+ var usedDetections: Set<String> = []
612
+
613
+ // PASS 1: Fuzzy matching for phonetically similar words
614
+ for detection in validDetections {
615
+ let keyword = detection.term.text
616
+ let keywordLower = keyword.lowercased()
617
+ let keywordParts = keywordLower.components(separatedBy: " ").filter { !$0.isEmpty }
618
+
619
+ let words = text.components(separatedBy: .whitespacesAndNewlines).filter { !$0.isEmpty }
620
+
621
+ // Handle multi-word keywords
622
+ if keywordParts.count > 1 {
623
+ for i in 0..<(words.count - keywordParts.count + 1) {
624
+ var allMatch = true
625
+ var matchedWords: [String] = []
626
+
627
+ for j in 0..<keywordParts.count {
628
+ let wordClean = words[i + j].trimmingCharacters(in: .punctuationCharacters).lowercased()
629
+ if isSimilar(wordClean, keywordParts[j]) {
630
+ matchedWords.append(words[i + j])
631
+ } else {
632
+ allMatch = false
633
+ break
634
+ }
635
+ }
636
+
637
+ if allMatch && !matchedWords.isEmpty {
638
+ let originalPhrase = matchedWords.joined(separator: " ")
639
+ let replacement = matchCase(keyword, to: matchedWords[0])
640
+ text = text.replacingOccurrences(of: originalPhrase, with: replacement)
641
+ usedDetections.insert(keyword)
642
+ break
643
+ }
644
+ }
645
+ } else {
646
+ // Single word keyword
647
+ for word in words {
648
+ let wordClean = word.trimmingCharacters(in: .punctuationCharacters).lowercased()
649
+ guard !wordClean.isEmpty else { continue }
650
+
651
+ if isSimilar(wordClean, keywordLower) && wordClean != keywordLower {
652
+ let replacement = matchCase(keyword, to: word)
653
+ text = text.replacingOccurrences(of: word, with: replacement)
654
+ usedDetections.insert(keyword)
655
+ break
656
+ }
657
+ }
658
+ }
659
+ }
660
+
661
+ // PASS 2: Context pattern matching - specifically for "this is X" pattern
662
+ // Only replace if keyword is NOT already in the text
663
+ for detection in validDetections {
664
+ let keyword = detection.term.text
665
+ guard !usedDetections.contains(keyword) else { continue }
666
+
667
+ let keywordLower = keyword.lowercased()
668
+
669
+ // Skip if keyword already exists in text (case-insensitive)
670
+ if text.lowercased().contains(keywordLower) {
671
+ usedDetections.insert(keyword) // Mark as handled
672
+ continue
673
+ }
674
+
675
+ // Check if keyword looks like a proper noun (starts with uppercase)
676
+ let isProperNoun =
677
+ keyword.first?.isUppercase == true
678
+ && keyword.count >= 3
679
+ && !stopWords.contains(keywordLower)
680
+
681
+ guard isProperNoun else { continue }
682
+
683
+ // Look for "this is X" pattern specifically for names
684
+ let thisIsPattern = try? NSRegularExpression(pattern: "this is ([A-Z][a-z]+)", options: [])
685
+ if let regex = thisIsPattern {
686
+ let textRange = NSRange(text.startIndex..., in: text)
687
+ if let match = regex.firstMatch(in: text, options: [], range: textRange),
688
+ match.numberOfRanges > 1,
689
+ let captureRange = Range(match.range(at: 1), in: text)
690
+ {
691
+ let capturedWord = String(text[captureRange])
692
+ let capturedLower = capturedWord.lowercased()
693
+
694
+ // Skip if captured word is already a detected keyword
695
+ let isOtherKeyword = validDetections.contains { det in
696
+ det.term.text.lowercased() == capturedLower
697
+ }
698
+
699
+ if !isOtherKeyword && !stopWords.contains(capturedLower) {
700
+ // Similar length check
701
+ if abs(capturedWord.count - keyword.count) <= 3 {
702
+ text = text.replacingOccurrences(of: capturedWord, with: keyword)
703
+ usedDetections.insert(keyword)
704
+ }
705
+ }
706
+ }
707
+ }
708
+ }
709
+
710
+ return text
711
+ }
712
+
713
+ /// Build word timings by merging subword tokens (tokens starting with "▁" begin new words)
714
+ private static func buildWordTimings(
715
+ from tokenTimings: [TokenTiming]
716
+ ) -> [(word: String, startTime: Double, endTime: Double)] {
717
+ var wordTimings: [(word: String, startTime: Double, endTime: Double)] = []
718
+ var currentWord = ""
719
+ var wordStart: Double = 0
720
+ var wordEnd: Double = 0
721
+
722
+ for timing in tokenTimings {
723
+ let token = timing.token
724
+
725
+ // Skip special tokens
726
+ if token.isEmpty || token == "<blank>" || token == "<pad>" {
727
+ continue
728
+ }
729
+
730
+ // Check if this starts a new word (has ▁ prefix or is first token)
731
+ let startsNewWord = token.hasPrefix("▁") || currentWord.isEmpty
732
+
733
+ if startsNewWord && !currentWord.isEmpty {
734
+ // Save previous word
735
+ wordTimings.append((word: currentWord, startTime: wordStart, endTime: wordEnd))
736
+ currentWord = ""
737
+ }
738
+
739
+ if startsNewWord {
740
+ currentWord = token.hasPrefix("▁") ? String(token.dropFirst()) : token
741
+ wordStart = timing.startTime
742
+ } else {
743
+ currentWord += token
744
+ }
745
+ wordEnd = timing.endTime
746
+ }
747
+
748
+ // Save final word
749
+ if !currentWord.isEmpty {
750
+ wordTimings.append((word: currentWord, startTime: wordStart, endTime: wordEnd))
751
+ }
752
+
753
+ return wordTimings
754
+ }
755
+
756
+ /// Common English words that should never be replaced by keyword matching
757
+ private static let stopWords: Set<String> = [
758
+ // Pronouns
759
+ "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them",
760
+ "my", "your", "his", "its", "our", "their", "mine", "yours", "hers", "ours", "theirs",
761
+ "this", "that", "these", "those", "who", "whom", "what", "which", "whose",
762
+ // Common verbs
763
+ "is", "are", "was", "were", "be", "been", "being", "am",
764
+ "have", "has", "had", "having", "do", "does", "did", "doing", "done",
765
+ "will", "would", "shall", "should", "may", "might", "must", "can", "could",
766
+ "get", "got", "getting", "go", "goes", "went", "going", "gone",
767
+ "come", "came", "coming", "see", "saw", "seen", "know", "knew", "known",
768
+ "think", "thought", "make", "made", "take", "took", "taken", "give", "gave", "given",
769
+ "say", "said", "tell", "told", "ask", "asked", "use", "used", "want", "wanted",
770
+ "need", "needed", "try", "tried", "let", "put", "keep", "kept", "look", "looked",
771
+ // Articles and determiners
772
+ "a", "an", "the", "some", "any", "no", "every", "each", "all", "both", "few", "many",
773
+ "much", "more", "most", "other", "another", "such",
774
+ // Prepositions
775
+ "in", "on", "at", "to", "for", "of", "with", "by", "from", "up", "down", "out",
776
+ "about", "into", "over", "after", "before", "between", "under", "through", "during",
777
+ // Conjunctions
778
+ "and", "or", "but", "so", "yet", "nor", "if", "then", "than", "because", "while",
779
+ "although", "unless", "since", "when", "where", "as",
780
+ // Adverbs
781
+ "not", "very", "just", "also", "only", "even", "still", "already", "always", "never",
782
+ "often", "sometimes", "usually", "really", "well", "now", "here", "there", "how", "why",
783
+ // Common words
784
+ "yes", "no", "okay", "ok", "thank", "thanks", "please", "sorry", "hello", "hi", "bye",
785
+ "good", "great", "bad", "new", "old", "first", "last", "long", "short", "big", "small",
786
+ "high", "low", "right", "left", "next", "back", "same", "different", "own", "able",
787
+ "way", "thing", "things", "time", "times", "year", "years", "day", "days", "week", "weeks",
788
+ "part", "place", "case", "point", "fact", "end", "kind", "lot", "set",
789
+ // Numbers
790
+ "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
791
+ "hundred", "thousand", "million", "billion",
792
+ ]
793
+
794
+ /// Check if two words are similar (edit distance / length ratio)
795
+ private static func isSimilar(_ a: String, _ b: String) -> Bool {
796
+ // Never match stop words - they're too common to be proper nouns
797
+ if stopWords.contains(a) || stopWords.contains(b) {
798
+ return false
799
+ }
800
+
801
+ let maxLen = max(a.count, b.count)
802
+ let minLen = min(a.count, b.count)
803
+ guard maxLen > 0, minLen >= 3 else { return false }
804
+
805
+ // Allow more length difference for longer words
806
+ let lenDiff = abs(a.count - b.count)
807
+ if lenDiff > max(3, maxLen / 2) { return false }
808
+
809
+ // Calculate edit distance
810
+ let distance = editDistance(a, b)
811
+
812
+ // More aggressive threshold: allow up to 40% of max length as edits
813
+ let threshold = max(2, Int(Double(maxLen) * 0.4))
814
+
815
+ // Also check if one is substring of other (handles "Erik" in "Ririek")
816
+ if a.contains(b) || b.contains(a) {
817
+ return true
818
+ }
819
+
820
+ // Check common prefix/suffix (handles "Heri" vs "Harry")
821
+ let commonPrefix = commonPrefixLength(a, b)
822
+ let commonSuffix = commonSuffixLength(a, b)
823
+ if commonPrefix >= 2 || commonSuffix >= 2 {
824
+ return distance <= threshold + 1
825
+ }
826
+
827
+ return distance <= threshold
828
+ }
829
+
830
+ /// Get length of common prefix
831
+ private static func commonPrefixLength(_ a: String, _ b: String) -> Int {
832
+ let aChars = Array(a)
833
+ let bChars = Array(b)
834
+ var count = 0
835
+ for i in 0..<min(aChars.count, bChars.count) {
836
+ if aChars[i] == bChars[i] {
837
+ count += 1
838
+ } else {
839
+ break
840
+ }
841
+ }
842
+ return count
843
+ }
844
+
845
+ /// Get length of common suffix
846
+ private static func commonSuffixLength(_ a: String, _ b: String) -> Int {
847
+ let aChars = Array(a.reversed())
848
+ let bChars = Array(b.reversed())
849
+ var count = 0
850
+ for i in 0..<min(aChars.count, bChars.count) {
851
+ if aChars[i] == bChars[i] {
852
+ count += 1
853
+ } else {
854
+ break
855
+ }
856
+ }
857
+ return count
858
+ }
859
+
860
+ /// Simple edit distance calculation
861
+ private static func editDistance(_ a: String, _ b: String) -> Int {
862
+ let a = Array(a)
863
+ let b = Array(b)
864
+ let m = a.count
865
+ let n = b.count
866
+
867
+ if m == 0 { return n }
868
+ if n == 0 { return m }
869
+
870
+ var dp = Array(repeating: Array(repeating: 0, count: n + 1), count: m + 1)
871
+
872
+ for i in 0...m { dp[i][0] = i }
873
+ for j in 0...n { dp[0][j] = j }
874
+
875
+ for i in 1...m {
876
+ for j in 1...n {
877
+ if a[i - 1] == b[j - 1] {
878
+ dp[i][j] = dp[i - 1][j - 1]
879
+ } else {
880
+ dp[i][j] = 1 + min(dp[i - 1][j - 1], min(dp[i - 1][j], dp[i][j - 1]))
881
+ }
882
+ }
883
+ }
884
+
885
+ return dp[m][n]
886
+ }
887
+
888
+ /// Match the case pattern of the original word
889
+ private static func matchCase(_ keyword: String, to original: String) -> String {
890
+ let origClean = original.trimmingCharacters(in: .punctuationCharacters)
891
+
892
+ // Check case pattern
893
+ if origClean.first?.isUppercase == true {
894
+ // Capitalize first letter
895
+ return keyword.prefix(1).uppercased() + keyword.dropFirst()
896
+ }
897
+ return keyword
898
+ }
899
+
900
+ private static func calculateWER(reference: [String], hypothesis: [String]) -> Double {
901
+ if reference.isEmpty {
902
+ return hypothesis.isEmpty ? 0.0 : 1.0
903
+ }
904
+
905
+ let m = reference.count
906
+ let n = hypothesis.count
907
+ var dp = Array(repeating: Array(repeating: 0, count: n + 1), count: m + 1)
908
+
909
+ for i in 0...m { dp[i][0] = i }
910
+ for j in 0...n { dp[0][j] = j }
911
+
912
+ for i in 1...m {
913
+ for j in 1...n {
914
+ if reference[i - 1] == hypothesis[j - 1] {
915
+ dp[i][j] = dp[i - 1][j - 1]
916
+ } else {
917
+ dp[i][j] = min(dp[i - 1][j - 1], min(dp[i - 1][j], dp[i][j - 1])) + 1
918
+ }
919
+ }
920
+ }
921
+
922
+ return Double(dp[m][n]) / Double(m)
923
+ }
924
+
925
+ private static func printUsage() {
926
+ print(
927
+ """
928
+ CTC Earnings Benchmark (TDT + CTC keyword spotting)
929
+
930
+ Usage: fluidaudio ctc-earnings-benchmark [options]
931
+
932
+ Options:
933
+ --data-dir <path> Path to earnings test dataset (auto-detected if downloaded)
934
+ --ctc-model <path> Path to CTC model directory (auto-detected if in standard location)
935
+ --max-files <n> Maximum number of files to process
936
+ --output, -o <path> Output JSON file (default: ctc_earnings_benchmark.json)
937
+ --auto-download Download earnings22-kws dataset if not found
938
+ --keyword-mode <mode> Keyword mode: chunk or file (default: chunk)
939
+
940
+ Default locations:
941
+ Dataset: ~/Library/Application Support/FluidAudio/earnings22-kws/test-dataset/
942
+ CTC Model: ~/Library/Application Support/FluidAudio/Models/parakeet-ctc-110m-coreml/
943
+
944
+ Setup:
945
+ 1. Download dataset: fluidaudio download --dataset earnings22-kws
946
+ 2. Place CTC model in standard location
947
+ 3. Run: fluidaudio ctc-earnings-benchmark
948
+
949
+ Examples:
950
+ # Run with auto-detected paths
951
+ fluidaudio ctc-earnings-benchmark
952
+
953
+ # Run with auto-download
954
+ fluidaudio ctc-earnings-benchmark --auto-download
955
+
956
+ # Run with explicit paths
957
+ fluidaudio ctc-earnings-benchmark \\
958
+ --data-dir /path/to/test-dataset \\
959
+ --ctc-model /path/to/parakeet-ctc-110m-coreml \\
960
+ --max-files 100
961
+ """)
962
+ }
963
+
964
+ private static func parseKeywordMode(_ value: String) -> KeywordMode? {
965
+ switch value.lowercased() {
966
+ case "chunk", "chunk-keywords":
967
+ return .chunk
968
+ case "file", "file-keywords":
969
+ return .file
970
+ default:
971
+ return nil
972
+ }
973
+ }
974
+
975
+ private static func parentId(from fileId: String) -> String {
976
+ guard let range = fileId.range(of: "_chunk") else {
977
+ return fileId
978
+ }
979
+ return String(fileId[..<range.lowerBound])
980
+ }
981
+
982
+ private static func buildKeywordIndex(dataDir: URL, keywordMode: KeywordMode) throws -> [String: [String]] {
983
+ guard keywordMode == .file else {
984
+ return [:]
985
+ }
986
+
987
+ var index: [String: Set<String>] = [:]
988
+ let suffix = ".dictionary.txt"
989
+ let fileManager = FileManager.default
990
+ let contents = try fileManager.contentsOfDirectory(at: dataDir, includingPropertiesForKeys: nil)
991
+
992
+ for url in contents {
993
+ let name = url.lastPathComponent
994
+ guard name.hasSuffix(suffix) else { continue }
995
+ let fileId = String(name.dropLast(suffix.count))
996
+ let parent = parentId(from: fileId)
997
+ let words = try loadDictionaryWords(from: url)
998
+ var set = index[parent] ?? Set<String>()
999
+ set.formUnion(words)
1000
+ index[parent] = set
1001
+ }
1002
+
1003
+ return index.mapValues { Array($0).sorted() }
1004
+ }
1005
+
1006
+ private static func loadDictionaryWords(
1007
+ fileId: String,
1008
+ dictionaryFile: URL,
1009
+ keywordMode: KeywordMode,
1010
+ keywordIndex: [String: [String]]
1011
+ ) throws -> [String] {
1012
+ switch keywordMode {
1013
+ case .chunk:
1014
+ return try loadDictionaryWords(from: dictionaryFile)
1015
+ case .file:
1016
+ let parent = parentId(from: fileId)
1017
+ if let words = keywordIndex[parent] {
1018
+ return words
1019
+ }
1020
+ return try loadDictionaryWords(from: dictionaryFile)
1021
+ }
1022
+ }
1023
+
1024
+ private static func loadDictionaryWords(from url: URL) throws -> [String] {
1025
+ let dictionaryContent = try String(contentsOf: url, encoding: .utf8)
1026
+ return dictionaryContent
1027
+ .components(separatedBy: .newlines)
1028
+ .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
1029
+ .filter { !$0.isEmpty }
1030
+ }
1031
+
1032
+ private static func keywordsInText(_ text: String, dictionaryWords: [String]) -> Set<String> {
1033
+ let textLower = text.lowercased()
1034
+ var result: Set<String> = []
1035
+
1036
+ for word in dictionaryWords {
1037
+ let wordLower = word.lowercased()
1038
+ let pattern = "\\b\(NSRegularExpression.escapedPattern(for: wordLower))\\b"
1039
+ guard let regex = try? NSRegularExpression(pattern: pattern, options: []) else { continue }
1040
+ let range = NSRange(textLower.startIndex..., in: textLower)
1041
+ if regex.firstMatch(in: textLower, options: [], range: range) != nil {
1042
+ result.insert(wordLower)
1043
+ }
1044
+ }
1045
+ return result
1046
+ }
1047
+ }
1048
+ #endif
cli/HybridEarningsBenchmark.swift ADDED
@@ -0,0 +1,554 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #if os(macOS)
2
+ import AVFoundation
3
+ import FluidAudio
4
+ import Foundation
5
+
6
+ /// Earnings22 benchmark using ONLY the Hybrid 110M model (single encoder).
7
+ /// CTC head provides both transcription AND keyword spotting from the same encoder.
8
+ public enum HybridEarningsBenchmark {
9
+
10
+ private enum KeywordMode: String {
11
+ case chunk
12
+ case file
13
+ }
14
+
15
+ public static func runCLI(arguments: [String]) async {
16
+ if arguments.contains("--help") || arguments.contains("-h") {
17
+ printUsage()
18
+ return
19
+ }
20
+
21
+ // Parse arguments
22
+ var outputFile = "hybrid_earnings_benchmark.json"
23
+ var maxFiles: Int? = nil
24
+ var decodingMode: HybridDecodingMode = .tdt
25
+ var useRescoring = false
26
+ var keywordMode: KeywordMode = .chunk
27
+
28
+ var i = 0
29
+ while i < arguments.count {
30
+ switch arguments[i] {
31
+ case "--output", "-o":
32
+ if i + 1 < arguments.count {
33
+ outputFile = arguments[i + 1]
34
+ i += 1
35
+ }
36
+ case "--max-files":
37
+ if i + 1 < arguments.count {
38
+ maxFiles = Int(arguments[i + 1])
39
+ i += 1
40
+ }
41
+ case "--ctc":
42
+ decodingMode = .ctc
43
+ case "--tdt":
44
+ decodingMode = .tdt
45
+ case "--rescore":
46
+ useRescoring = true
47
+ case "--keyword-mode":
48
+ if i + 1 < arguments.count, let mode = parseKeywordMode(arguments[i + 1]) {
49
+ keywordMode = mode
50
+ i += 1
51
+ }
52
+ default:
53
+ break
54
+ }
55
+ i += 1
56
+ }
57
+
58
+ let dataDir = DatasetDownloader.getEarnings22Directory().appendingPathComponent("test-dataset")
59
+ guard FileManager.default.fileExists(atPath: dataDir.path) else {
60
+ print("ERROR: Earnings dataset not found at \(dataDir.path)")
61
+ print("Download with: fluidaudio download --dataset earnings22-kws")
62
+ return
63
+ }
64
+
65
+ let modeStr = decodingMode == .ctc ? "CTC" : "TDT"
66
+ let rescoringStr = useRescoring ? " + Rescoring" : ""
67
+ print("Hybrid 110M Earnings Benchmark (Decoding: \(modeStr)\(rescoringStr))")
68
+ print(" Output file: \(outputFile)")
69
+ print(" Decoding mode: \(modeStr)")
70
+ print(" Rescoring: \(useRescoring ? "enabled" : "disabled")")
71
+ print(" Keyword mode: \(keywordMode.rawValue)")
72
+
73
+ do {
74
+ // Load Hybrid 110M model (single encoder with CTC head)
75
+ print("Loading Hybrid 110M model...")
76
+ let hybridModels = try await HybridAsrModels.downloadAndLoad()
77
+ let hybridManager = HybridAsrManager(models: hybridModels, decodingMode: decodingMode)
78
+ let spotter = HybridKeywordSpotter(vocabulary: hybridModels.vocabulary, blankId: hybridModels.blankId)
79
+ print(" Vocab size: \(hybridModels.vocabSize)")
80
+
81
+ // Collect test files
82
+ let fileIds = try collectFileIds(from: dataDir, maxFiles: maxFiles)
83
+ let keywordIndex = try buildKeywordIndex(dataDir: dataDir, keywordMode: keywordMode)
84
+
85
+ if fileIds.isEmpty {
86
+ print("ERROR: No test files found")
87
+ return
88
+ }
89
+
90
+ print("Processing \(fileIds.count) test files...")
91
+
92
+ var results: [[String: Any]] = []
93
+ var totalWer = 0.0
94
+ var totalKeywordReference = 0
95
+ var totalKeywordPredicted = 0
96
+ var totalKeywordTruePositives = 0
97
+ var totalKeywordFalsePositives = 0
98
+ var totalKeywordFalseNegatives = 0
99
+ var totalAudioDuration = 0.0
100
+ var totalProcessingTime = 0.0
101
+
102
+ for (index, fileId) in fileIds.enumerated() {
103
+ print("[\(index + 1)/\(fileIds.count)] \(fileId)")
104
+
105
+ if let result = try await processFile(
106
+ fileId: fileId,
107
+ dataDir: dataDir,
108
+ hybridManager: hybridManager,
109
+ spotter: spotter,
110
+ useRescoring: useRescoring,
111
+ keywordMode: keywordMode,
112
+ keywordIndex: keywordIndex
113
+ ) {
114
+ results.append(result)
115
+ totalWer += result["wer"] as? Double ?? 0
116
+ totalKeywordReference += result["keywordReference"] as? Int ?? 0
117
+ totalKeywordPredicted += result["keywordPredicted"] as? Int ?? 0
118
+ totalKeywordTruePositives += result["keywordTruePositives"] as? Int ?? 0
119
+ totalKeywordFalsePositives += result["keywordFalsePositives"] as? Int ?? 0
120
+ totalKeywordFalseNegatives += result["keywordFalseNegatives"] as? Int ?? 0
121
+ totalAudioDuration += result["audioLength"] as? Double ?? 0
122
+ totalProcessingTime += result["processingTime"] as? Double ?? 0
123
+
124
+ let wer = result["wer"] as? Double ?? 0
125
+ let precision = result["keywordPrecision"] as? Double ?? 0
126
+ let recall = result["keywordRecall"] as? Double ?? 0
127
+ let fscore = result["keywordFscore"] as? Double ?? 0
128
+ print(
129
+ " WER: \(String(format: "%.1f", wer))%, " +
130
+ "KW P/R/F: \(String(format: "%.2f", precision))/" +
131
+ "\(String(format: "%.2f", recall))/" +
132
+ "\(String(format: "%.2f", fscore))"
133
+ )
134
+ }
135
+ }
136
+
137
+ // Calculate summary
138
+ let avgWer = results.isEmpty ? 0.0 : totalWer / Double(results.count)
139
+ let keywordPrecision =
140
+ totalKeywordPredicted > 0
141
+ ? Double(totalKeywordTruePositives) / Double(totalKeywordPredicted)
142
+ : 0
143
+ let keywordRecall =
144
+ totalKeywordReference > 0
145
+ ? Double(totalKeywordTruePositives) / Double(totalKeywordReference)
146
+ : 0
147
+ let keywordFscore =
148
+ (keywordPrecision + keywordRecall) > 0
149
+ ? 2 * keywordPrecision * keywordRecall / (keywordPrecision + keywordRecall)
150
+ : 0
151
+
152
+ // Print summary
153
+ print("\n" + String(repeating: "=", count: 60))
154
+ print("HYBRID 110M BENCHMARK (\(modeStr)\(rescoringStr))")
155
+ print(String(repeating: "=", count: 60))
156
+ print("Model: parakeet-tdt-ctc-110m-hybrid")
157
+ print("Decoding: \(modeStr), Rescoring: \(useRescoring ? "yes" : "no")")
158
+ print("Total tests: \(results.count)")
159
+ print("Average WER: \(String(format: "%.2f", avgWer))%")
160
+ print(
161
+ "Keyword Precision/Recall/F1: " +
162
+ "\(String(format: "%.2f", keywordPrecision))/" +
163
+ "\(String(format: "%.2f", keywordRecall))/" +
164
+ "\(String(format: "%.2f", keywordFscore))"
165
+ )
166
+ print("Total audio: \(String(format: "%.1f", totalAudioDuration))s")
167
+ print("Total processing: \(String(format: "%.1f", totalProcessingTime))s")
168
+ if totalProcessingTime > 0 {
169
+ print("RTFx: \(String(format: "%.2f", totalAudioDuration / totalProcessingTime))x")
170
+ }
171
+ print(String(repeating: "=", count: 60))
172
+
173
+ // Sort results by WER descending (worst first)
174
+ let sortedResults = results.sorted { r1, r2 in
175
+ let wer1 = r1["wer"] as? Double ?? 0
176
+ let wer2 = r2["wer"] as? Double ?? 0
177
+ return wer1 > wer2
178
+ }
179
+
180
+ // Save to JSON
181
+ let summaryDict: [String: Any] = [
182
+ "totalTests": results.count,
183
+ "avgWer": round(avgWer * 100) / 100,
184
+ "keywordTruePositives": totalKeywordTruePositives,
185
+ "keywordFalsePositives": totalKeywordFalsePositives,
186
+ "keywordFalseNegatives": totalKeywordFalseNegatives,
187
+ "keywordPredicted": totalKeywordPredicted,
188
+ "keywordReference": totalKeywordReference,
189
+ "keywordPrecision": round(keywordPrecision * 1000) / 1000,
190
+ "keywordRecall": round(keywordRecall * 1000) / 1000,
191
+ "keywordFscore": round(keywordFscore * 1000) / 1000,
192
+ "totalAudioDuration": round(totalAudioDuration * 100) / 100,
193
+ "totalProcessingTime": round(totalProcessingTime * 100) / 100,
194
+ ]
195
+
196
+ let output: [String: Any] = [
197
+ "model": "parakeet-tdt-ctc-110m-hybrid",
198
+ "approach": "single-encoder",
199
+ "decodingMode": modeStr,
200
+ "rescoring": useRescoring,
201
+ "keywordMode": keywordMode.rawValue,
202
+ "summary": summaryDict,
203
+ "results": sortedResults,
204
+ ]
205
+
206
+ let jsonData = try JSONSerialization.data(withJSONObject: output, options: [.prettyPrinted, .sortedKeys])
207
+ try jsonData.write(to: URL(fileURLWithPath: outputFile))
208
+ print("\nResults written to: \(outputFile)")
209
+
210
+ } catch {
211
+ print("ERROR: \(error)")
212
+ }
213
+ }
214
+
215
+ private static func collectFileIds(from dataDir: URL, maxFiles: Int?) throws -> [String] {
216
+ var fileIds: [String] = []
217
+ let suffix = ".dictionary.txt"
218
+
219
+ let fileManager = FileManager.default
220
+ let contents = try fileManager.contentsOfDirectory(at: dataDir, includingPropertiesForKeys: nil)
221
+
222
+ for url in contents.sorted(by: { $0.path < $1.path }) {
223
+ let name = url.lastPathComponent
224
+ if name.hasSuffix(suffix) {
225
+ let data = try? Data(contentsOf: url)
226
+ if let data = data, !data.isEmpty {
227
+ let fileId = String(name.dropLast(suffix.count))
228
+ fileIds.append(fileId)
229
+ }
230
+ }
231
+ }
232
+
233
+ if let maxFiles = maxFiles {
234
+ return Array(fileIds.prefix(maxFiles))
235
+ }
236
+ return fileIds
237
+ }
238
+
239
+ private static func processFile(
240
+ fileId: String,
241
+ dataDir: URL,
242
+ hybridManager: HybridAsrManager,
243
+ spotter: HybridKeywordSpotter,
244
+ useRescoring: Bool,
245
+ keywordMode: KeywordMode,
246
+ keywordIndex: [String: [String]]
247
+ ) async throws -> [String: Any]? {
248
+ let wavFile = dataDir.appendingPathComponent("\(fileId).wav")
249
+ let dictionaryFile = dataDir.appendingPathComponent("\(fileId).dictionary.txt")
250
+ let textFile = dataDir.appendingPathComponent("\(fileId).text.txt")
251
+
252
+ let fm = FileManager.default
253
+ guard fm.fileExists(atPath: wavFile.path),
254
+ fm.fileExists(atPath: dictionaryFile.path)
255
+ else {
256
+ return nil
257
+ }
258
+
259
+ // Load dictionary words (chunk or file keywords)
260
+ let dictionaryWords = try loadDictionaryWords(
261
+ fileId: fileId,
262
+ dictionaryFile: dictionaryFile,
263
+ keywordMode: keywordMode,
264
+ keywordIndex: keywordIndex
265
+ )
266
+
267
+ // Load reference text
268
+ let referenceRaw =
269
+ (try? String(contentsOf: textFile, encoding: .utf8))?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
270
+
271
+ // Get audio samples
272
+ let audioFile = try AVAudioFile(forReading: wavFile)
273
+ let audioLength = Double(audioFile.length) / audioFile.processingFormat.sampleRate
274
+ let format = audioFile.processingFormat
275
+ let frameCount = AVAudioFrameCount(audioFile.length)
276
+
277
+ guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount) else {
278
+ return nil
279
+ }
280
+ try audioFile.read(into: buffer)
281
+
282
+ // Resample to 16kHz
283
+ let converter = AudioConverter()
284
+ let samples = try converter.resampleBuffer(buffer)
285
+
286
+ // Build custom vocabulary for keyword spotting
287
+ var vocabTerms: [CustomVocabularyTerm] = []
288
+ for word in dictionaryWords {
289
+ let term = CustomVocabularyTerm(
290
+ text: word,
291
+ weight: nil,
292
+ aliases: nil,
293
+ tokenIds: nil,
294
+ ctcTokenIds: nil
295
+ )
296
+ vocabTerms.append(term)
297
+ }
298
+ let customVocab = CustomVocabularyContext(terms: vocabTerms)
299
+
300
+ // Run Hybrid 110M using new API (TDT transcription + CTC keyword detection)
301
+ let rescorerConfig: HybridTextRescorer.Config? = useRescoring ? .default : nil
302
+ let hybridResult = try await hybridManager.transcribeHybrid(
303
+ audioSamples: samples,
304
+ customVocabulary: customVocab,
305
+ rescorerConfig: rescorerConfig
306
+ )
307
+
308
+ // Skip if empty transcription
309
+ if hybridResult.text.isEmpty {
310
+ print(" SKIPPED: Empty transcription")
311
+ return nil
312
+ }
313
+
314
+ let detections = hybridResult.keywordDetections
315
+ let processingTime = hybridResult.processingTime
316
+
317
+ // Use hybrid transcription as hypothesis (may be rescored if enabled)
318
+ let hypothesis = hybridResult.text
319
+
320
+ // Normalize texts
321
+ let referenceNormalized = TextNormalizer.normalize(referenceRaw)
322
+ let hypothesisNormalized = TextNormalizer.normalize(hypothesis)
323
+
324
+ // Keyword sets for precision/recall
325
+ let referenceKeywords = keywordsInText(referenceNormalized, dictionaryWords: dictionaryWords)
326
+ let predictedKeywords = keywordsInText(hypothesisNormalized, dictionaryWords: dictionaryWords)
327
+ let truePositives = referenceKeywords.intersection(predictedKeywords)
328
+ let falsePositives = predictedKeywords.subtracting(referenceKeywords)
329
+ let falseNegatives = referenceKeywords.subtracting(predictedKeywords)
330
+ let keywordPrecision = predictedKeywords.isEmpty ? 0 : Double(truePositives.count) / Double(predictedKeywords.count)
331
+ let keywordRecall = referenceKeywords.isEmpty ? 0 : Double(truePositives.count) / Double(referenceKeywords.count)
332
+ let keywordFscore =
333
+ (keywordPrecision + keywordRecall) > 0
334
+ ? 2 * keywordPrecision * keywordRecall / (keywordPrecision + keywordRecall)
335
+ : 0
336
+
337
+ let referenceWords = referenceNormalized.components(separatedBy: CharacterSet.whitespacesAndNewlines).filter {
338
+ !$0.isEmpty
339
+ }
340
+ let hypothesisWords = hypothesisNormalized.components(separatedBy: CharacterSet.whitespacesAndNewlines).filter {
341
+ !$0.isEmpty
342
+ }
343
+
344
+ // Calculate WER
345
+ let wer: Double
346
+ if referenceWords.isEmpty {
347
+ wer = hypothesisWords.isEmpty ? 0.0 : 1.0
348
+ } else {
349
+ wer = calculateWER(reference: referenceWords, hypothesis: hypothesisWords)
350
+ }
351
+
352
+ // Count dictionary detections for debugging
353
+ let minCtcScore: Float = -15.0
354
+ var detectionDetails: [[String: Any]] = []
355
+ var foundWords: Set<String> = []
356
+
357
+ // CTC detections
358
+ for detection in detections {
359
+ let inRef = referenceKeywords.contains(detection.term.text.lowercased())
360
+ let detail: [String: Any] = [
361
+ "word": detection.term.text,
362
+ "score": round(Double(detection.score) * 100) / 100,
363
+ "startTime": round(detection.startTime * 100) / 100,
364
+ "endTime": round(detection.endTime * 100) / 100,
365
+ "source": "ctc",
366
+ "inReference": inRef,
367
+ ]
368
+ detectionDetails.append(detail)
369
+
370
+ if detection.score >= minCtcScore {
371
+ foundWords.insert(detection.term.text.lowercased())
372
+ }
373
+ }
374
+
375
+ // Fallback: check hypothesis for dictionary words not found by CTC
376
+ let hypothesisLower = hypothesis.lowercased()
377
+ for word in dictionaryWords {
378
+ let wordLower = word.lowercased()
379
+ if !foundWords.contains(wordLower) {
380
+ let pattern = "\\b\(NSRegularExpression.escapedPattern(for: wordLower))\\b"
381
+ if let regex = try? NSRegularExpression(pattern: pattern, options: []),
382
+ regex.firstMatch(
383
+ in: hypothesisLower, options: [],
384
+ range: NSRange(hypothesisLower.startIndex..., in: hypothesisLower)) != nil
385
+ {
386
+ foundWords.insert(wordLower)
387
+ let inRef = referenceKeywords.contains(wordLower)
388
+ let detail: [String: Any] = [
389
+ "word": word,
390
+ "score": 0.0,
391
+ "startTime": 0.0,
392
+ "endTime": 0.0,
393
+ "source": "hypothesis",
394
+ "inReference": inRef,
395
+ ]
396
+ detectionDetails.append(detail)
397
+ }
398
+ }
399
+ }
400
+
401
+ let result: [String: Any] = [
402
+ "fileId": fileId,
403
+ "reference": referenceNormalized,
404
+ "hypothesis": hypothesisNormalized,
405
+ "wer": round(wer * 10000) / 100,
406
+ "dictFound": predictedKeywords.count,
407
+ "dictTotal": referenceKeywords.count,
408
+ "keywordPredicted": predictedKeywords.count,
409
+ "keywordReference": referenceKeywords.count,
410
+ "keywordTruePositives": truePositives.count,
411
+ "keywordFalsePositives": falsePositives.count,
412
+ "keywordFalseNegatives": falseNegatives.count,
413
+ "keywordPrecision": round(keywordPrecision * 1000) / 1000,
414
+ "keywordRecall": round(keywordRecall * 1000) / 1000,
415
+ "keywordFscore": round(keywordFscore * 1000) / 1000,
416
+ "audioLength": round(audioLength * 100) / 100,
417
+ "processingTime": round(processingTime * 1000) / 1000,
418
+ "ctcDetections": detectionDetails,
419
+ ]
420
+ return result
421
+ }
422
+
423
+ private static func calculateWER(reference: [String], hypothesis: [String]) -> Double {
424
+ if reference.isEmpty {
425
+ return hypothesis.isEmpty ? 0.0 : 1.0
426
+ }
427
+
428
+ let m = reference.count
429
+ let n = hypothesis.count
430
+ var dp = Array(repeating: Array(repeating: 0, count: n + 1), count: m + 1)
431
+
432
+ for i in 0...m { dp[i][0] = i }
433
+ for j in 0...n { dp[0][j] = j }
434
+
435
+ for i in 1...m {
436
+ for j in 1...n {
437
+ if reference[i - 1] == hypothesis[j - 1] {
438
+ dp[i][j] = dp[i - 1][j - 1]
439
+ } else {
440
+ dp[i][j] = min(dp[i - 1][j - 1], min(dp[i - 1][j], dp[i][j - 1])) + 1
441
+ }
442
+ }
443
+ }
444
+
445
+ return Double(dp[m][n]) / Double(m)
446
+ }
447
+
448
+ private static func printUsage() {
449
+ print(
450
+ """
451
+ Hybrid 110M Earnings Benchmark (Single Encoder)
452
+
453
+ Usage: fluidaudio hybrid-earnings-benchmark [options]
454
+
455
+ This benchmark uses ONLY the Hybrid 110M model:
456
+ - Single encoder provides CTC log-probs
457
+ - CTC greedy decode for transcription
458
+ - CTC keyword spotting from same encoder output
459
+
460
+ Options:
461
+ --max-files <n> Maximum number of files to process
462
+ --output, -o <path> Output JSON file (default: hybrid_earnings_benchmark.json)
463
+ --keyword-mode <mode> Keyword mode: chunk or file (default: chunk)
464
+
465
+ Compare with:
466
+ fluidaudio ctc-earnings-benchmark (Canary-CTC + TDT 0.6B, two encoders)
467
+ """)
468
+ }
469
+
470
+ private static func parseKeywordMode(_ value: String) -> KeywordMode? {
471
+ switch value.lowercased() {
472
+ case "chunk", "chunk-keywords":
473
+ return .chunk
474
+ case "file", "file-keywords":
475
+ return .file
476
+ default:
477
+ return nil
478
+ }
479
+ }
480
+
481
+ private static func parentId(from fileId: String) -> String {
482
+ guard let range = fileId.range(of: "_chunk") else {
483
+ return fileId
484
+ }
485
+ return String(fileId[..<range.lowerBound])
486
+ }
487
+
488
+ private static func buildKeywordIndex(dataDir: URL, keywordMode: KeywordMode) throws -> [String: [String]] {
489
+ guard keywordMode == .file else {
490
+ return [:]
491
+ }
492
+
493
+ var index: [String: Set<String>] = [:]
494
+ let suffix = ".dictionary.txt"
495
+ let fileManager = FileManager.default
496
+ let contents = try fileManager.contentsOfDirectory(at: dataDir, includingPropertiesForKeys: nil)
497
+
498
+ for url in contents {
499
+ let name = url.lastPathComponent
500
+ guard name.hasSuffix(suffix) else { continue }
501
+ let fileId = String(name.dropLast(suffix.count))
502
+ let parent = parentId(from: fileId)
503
+ let words = try loadDictionaryWords(from: url)
504
+ var set = index[parent] ?? Set<String>()
505
+ set.formUnion(words)
506
+ index[parent] = set
507
+ }
508
+
509
+ return index.mapValues { Array($0).sorted() }
510
+ }
511
+
512
+ private static func loadDictionaryWords(
513
+ fileId: String,
514
+ dictionaryFile: URL,
515
+ keywordMode: KeywordMode,
516
+ keywordIndex: [String: [String]]
517
+ ) throws -> [String] {
518
+ switch keywordMode {
519
+ case .chunk:
520
+ return try loadDictionaryWords(from: dictionaryFile)
521
+ case .file:
522
+ let parent = parentId(from: fileId)
523
+ if let words = keywordIndex[parent] {
524
+ return words
525
+ }
526
+ return try loadDictionaryWords(from: dictionaryFile)
527
+ }
528
+ }
529
+
530
+ private static func loadDictionaryWords(from url: URL) throws -> [String] {
531
+ let dictionaryContent = try String(contentsOf: url, encoding: .utf8)
532
+ return dictionaryContent
533
+ .components(separatedBy: .newlines)
534
+ .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
535
+ .filter { !$0.isEmpty }
536
+ }
537
+
538
+ private static func keywordsInText(_ text: String, dictionaryWords: [String]) -> Set<String> {
539
+ let textLower = text.lowercased()
540
+ var result: Set<String> = []
541
+
542
+ for word in dictionaryWords {
543
+ let wordLower = word.lowercased()
544
+ let pattern = "\\b\(NSRegularExpression.escapedPattern(for: wordLower))\\b"
545
+ guard let regex = try? NSRegularExpression(pattern: pattern, options: []) else { continue }
546
+ let range = NSRange(textLower.startIndex..., in: textLower)
547
+ if regex.firstMatch(in: textLower, options: [], range: range) != nil {
548
+ result.insert(wordLower)
549
+ }
550
+ }
551
+ return result
552
+ }
553
+ }
554
+ #endif
convert/.DS_Store ADDED
Binary file (10.2 kB). View file
 
convert/parakeet-tdt-ctc-110m/convert_tdt_decoder.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Convert Parakeet TDT-CTC 110M decoder components to CoreML.
4
+
5
+ This script exports the TDT decoder (prediction network) and joint network
6
+ with the SAME format as the working 0.6B model:
7
+ - JointDecision outputs token_id, token_prob, duration (argmax done inside)
8
+ - Uses shape [1, dim, 1] for encoder/decoder steps
9
+ - Matches the interface expected by TdtDecoderV3
10
+ """
11
+
12
+ import argparse
13
+ import os
14
+ import torch
15
+ import torch.nn.functional as F
16
+ import coremltools as ct
17
+ import numpy as np
18
+ from pathlib import Path
19
+
20
+ # NeMo imports
21
+ import nemo.collections.asr as nemo_asr
22
+
23
+
24
+ def get_model_config(model):
25
+ """Extract model configuration."""
26
+ encoder_dim = None
27
+ pred_hidden = 640 # Default for parakeet models
28
+ num_layers = 1
29
+ vocab_size = 1024
30
+ num_durations = 5
31
+
32
+ # Get encoder dimension
33
+ if hasattr(model, 'encoder'):
34
+ encoder = model.encoder
35
+ if hasattr(encoder, 'd_model'):
36
+ encoder_dim = encoder.d_model
37
+ elif hasattr(encoder, '_feat_out'):
38
+ encoder_dim = encoder._feat_out
39
+
40
+ # Get decoder config
41
+ if hasattr(model, 'decoder'):
42
+ decoder = model.decoder
43
+ if hasattr(decoder, 'pred_hidden'):
44
+ pred_hidden = decoder.pred_hidden
45
+ if hasattr(decoder, 'pred_rnn_layers'):
46
+ num_layers = decoder.pred_rnn_layers
47
+
48
+ # Get joint config
49
+ if hasattr(model, 'joint'):
50
+ joint = model.joint
51
+ if hasattr(joint, 'num_extra_outputs'):
52
+ num_durations = joint.num_extra_outputs
53
+ if hasattr(joint, 'num_classes'):
54
+ vocab_size = joint.num_classes - num_durations
55
+
56
+ return {
57
+ 'encoder_dim': encoder_dim,
58
+ 'pred_hidden': pred_hidden,
59
+ 'num_layers': num_layers,
60
+ 'vocab_size': vocab_size,
61
+ 'num_durations': num_durations,
62
+ }
63
+
64
+
65
+ class DecoderWrapper(torch.nn.Module):
66
+ """
67
+ Wrapper for the RNNT/TDT decoder (prediction network).
68
+
69
+ Matches 0.6B format:
70
+ - Input: targets[1,1], target_lengths[1], h_in[num_layers,1,pred_hidden], c_in[...]
71
+ - Output: decoder_output[1,pred_hidden,2], h_out[...], c_out[...]
72
+ """
73
+
74
+ def __init__(self, decoder, pred_hidden):
75
+ super().__init__()
76
+ self.decoder = decoder
77
+ self.pred_hidden = pred_hidden
78
+
79
+ def forward(self, targets, target_lengths, h_in, c_in):
80
+ """
81
+ Args:
82
+ targets: [1, 1] - previous token ID
83
+ target_lengths: [1] - always 1
84
+ h_in: [num_layers, 1, pred_hidden]
85
+ c_in: [num_layers, 1, pred_hidden]
86
+ Returns:
87
+ decoder_output: [1, pred_hidden, 2] - prediction network output (transposed)
88
+ h_out: [num_layers, 1, pred_hidden]
89
+ c_out: [num_layers, 1, pred_hidden]
90
+ """
91
+ state = (h_in, c_in)
92
+ # pred_output shape: [batch, time, pred_hidden] = [1, 1, pred_hidden]
93
+ pred_output, new_state = self.decoder.predict(targets, state=state, add_sos=False)
94
+ h_out, c_out = new_state
95
+
96
+ # Transpose to [batch, pred_hidden, time] and concat two time steps
97
+ # (0.6B outputs [1, 640, 2] - we match this by duplicating)
98
+ pred_transposed = pred_output.transpose(1, 2) # [1, pred_hidden, 1]
99
+ decoder_output = torch.cat([pred_transposed, pred_transposed], dim=2) # [1, pred_hidden, 2]
100
+
101
+ return decoder_output, h_out, c_out
102
+
103
+
104
+ class JointWrapper(torch.nn.Module):
105
+ """
106
+ Wrapper for the TDT joint network with internal argmax.
107
+
108
+ Matches 0.6B format:
109
+ - Input: encoder_step[1,encoder_dim,1], decoder_step[1,pred_hidden,1]
110
+ - Output: token_id[1,1,1], token_prob[1,1,1], duration[1,1,1]
111
+ """
112
+
113
+ def __init__(self, joint, vocab_size, num_durations=5):
114
+ super().__init__()
115
+ self.joint = joint
116
+ self.vocab_size = vocab_size
117
+ self.num_durations = num_durations
118
+
119
+ def forward(self, encoder_step, decoder_step):
120
+ """
121
+ Args:
122
+ encoder_step: [1, encoder_dim, 1]
123
+ decoder_step: [1, pred_hidden, 1]
124
+ Returns:
125
+ token_id: [1, 1, 1] - argmax token ID
126
+ token_prob: [1, 1, 1] - probability of selected token
127
+ duration: [1, 1, 1] - argmax duration bin
128
+ """
129
+ # Transpose to [batch, 1, dim] for joint network
130
+ enc = encoder_step.transpose(1, 2) # [1, 1, encoder_dim]
131
+ dec = decoder_step.transpose(1, 2) # [1, 1, pred_hidden]
132
+
133
+ # Run joint network
134
+ # Joint output: [1, 1, 1, vocab_size + 1 (blank) + num_durations]
135
+ joint_out = self.joint.joint(enc, dec)
136
+
137
+ # Debug: print shape on first call
138
+ if not hasattr(self, '_debug_printed'):
139
+ self._debug_printed = True
140
+ print(f" Joint output shape: {joint_out.shape}")
141
+ print(f" Expected: vocab={self.vocab_size} + blank=1 + durations={self.num_durations} = {self.vocab_size + 1 + self.num_durations}")
142
+
143
+ # Split: token logits include vocab + blank, durations are separate
144
+ # vocab_size = 1024 tokens (0-1023), blank = index 1024, durations = indices 1025+
145
+ num_tokens = self.vocab_size + 1 # Include blank at vocab_size
146
+ logits = joint_out[..., :num_tokens] # [1, 1, 1, vocab_size + 1]
147
+ duration_logits = joint_out[..., num_tokens:] # [1, 1, 1, num_durations]
148
+
149
+ # Apply softmax and get probabilities
150
+ probs = F.softmax(logits, dim=-1)
151
+
152
+ # Argmax for token
153
+ token_id = torch.argmax(logits, dim=-1, keepdim=True) # [1, 1, 1, 1]
154
+ token_id = token_id.squeeze(-1) # [1, 1, 1]
155
+
156
+ # Get probability of selected token
157
+ token_prob = torch.gather(probs, -1, token_id.unsqueeze(-1)) # [1, 1, 1, 1]
158
+ token_prob = token_prob.squeeze(-1) # [1, 1, 1]
159
+
160
+ # Argmax for duration
161
+ duration = torch.argmax(duration_logits, dim=-1, keepdim=False) # [1, 1, 1]
162
+
163
+ return token_id.int(), token_prob, duration.int()
164
+
165
+
166
+ def convert_decoder(model, config, output_dir: Path):
167
+ """Convert decoder to CoreML."""
168
+ print(f"Converting Decoder...")
169
+ print(f" pred_hidden={config['pred_hidden']}, num_layers={config['num_layers']}")
170
+
171
+ wrapper = DecoderWrapper(model.decoder, config['pred_hidden'])
172
+ wrapper.eval()
173
+
174
+ # Create example inputs
175
+ targets = torch.zeros(1, 1, dtype=torch.long)
176
+ target_lengths = torch.ones(1, dtype=torch.long)
177
+ h_in = torch.zeros(config['num_layers'], 1, config['pred_hidden'])
178
+ c_in = torch.zeros(config['num_layers'], 1, config['pred_hidden'])
179
+
180
+ # Trace the model
181
+ with torch.no_grad():
182
+ traced = torch.jit.trace(wrapper, (targets, target_lengths, h_in, c_in))
183
+
184
+ # Convert to CoreML
185
+ mlmodel = ct.convert(
186
+ traced,
187
+ inputs=[
188
+ ct.TensorType(name="targets", shape=(1, 1), dtype=np.int32),
189
+ ct.TensorType(name="target_lengths", shape=(1,), dtype=np.int32),
190
+ ct.TensorType(name="h_in", shape=(config['num_layers'], 1, config['pred_hidden']), dtype=np.float32),
191
+ ct.TensorType(name="c_in", shape=(config['num_layers'], 1, config['pred_hidden']), dtype=np.float32),
192
+ ],
193
+ outputs=[
194
+ ct.TensorType(name="decoder_output"),
195
+ ct.TensorType(name="h_out"),
196
+ ct.TensorType(name="c_out"),
197
+ ],
198
+ minimum_deployment_target=ct.target.iOS17,
199
+ compute_precision=ct.precision.FLOAT16,
200
+ )
201
+
202
+ # Add metadata
203
+ mlmodel.author = "Fluid Inference"
204
+ mlmodel.short_description = "Hybrid TDT Decoder (110M)"
205
+
206
+ # Save
207
+ output_path = output_dir / "Decoder.mlpackage"
208
+ mlmodel.save(str(output_path))
209
+ print(f" Saved to {output_path}")
210
+
211
+ return mlmodel
212
+
213
+
214
+ def convert_joint(model, config, output_dir: Path):
215
+ """Convert joint network to CoreML."""
216
+ print(f"Converting JointDecision...")
217
+ print(f" encoder_dim={config['encoder_dim']}, pred_hidden={config['pred_hidden']}")
218
+ print(f" vocab_size={config['vocab_size']}, num_durations={config['num_durations']}")
219
+
220
+ wrapper = JointWrapper(
221
+ model.joint,
222
+ vocab_size=config['vocab_size'],
223
+ num_durations=config['num_durations']
224
+ )
225
+ wrapper.eval()
226
+
227
+ # Create example inputs - shape [1, dim, 1]
228
+ encoder_step = torch.randn(1, config['encoder_dim'], 1)
229
+ decoder_step = torch.randn(1, config['pred_hidden'], 1)
230
+
231
+ # Trace the model
232
+ with torch.no_grad():
233
+ traced = torch.jit.trace(wrapper, (encoder_step, decoder_step))
234
+
235
+ # Convert to CoreML
236
+ mlmodel = ct.convert(
237
+ traced,
238
+ inputs=[
239
+ ct.TensorType(name="encoder_step", shape=(1, config['encoder_dim'], 1), dtype=np.float32),
240
+ ct.TensorType(name="decoder_step", shape=(1, config['pred_hidden'], 1), dtype=np.float32),
241
+ ],
242
+ outputs=[
243
+ ct.TensorType(name="token_id"),
244
+ ct.TensorType(name="token_prob"),
245
+ ct.TensorType(name="duration"),
246
+ ],
247
+ minimum_deployment_target=ct.target.iOS17,
248
+ compute_precision=ct.precision.FLOAT16,
249
+ )
250
+
251
+ # Add metadata
252
+ mlmodel.author = "Fluid Inference"
253
+ mlmodel.short_description = "Hybrid Joint Decision (110M)"
254
+
255
+ # Save
256
+ output_path = output_dir / "JointDecision.mlpackage"
257
+ mlmodel.save(str(output_path))
258
+ print(f" Saved to {output_path}")
259
+
260
+ return mlmodel
261
+
262
+
263
+ def main():
264
+ parser = argparse.ArgumentParser(description="Convert TDT decoder to CoreML (0.6B format)")
265
+ parser.add_argument(
266
+ "--model-name",
267
+ default="nvidia/parakeet-tdt_ctc-110m",
268
+ help="NeMo model name or path"
269
+ )
270
+ parser.add_argument(
271
+ "--output-dir",
272
+ type=Path,
273
+ default=Path("./output"),
274
+ help="Output directory for CoreML models"
275
+ )
276
+ args = parser.parse_args()
277
+
278
+ # Create output directory
279
+ args.output_dir.mkdir(parents=True, exist_ok=True)
280
+
281
+ # Load model
282
+ print(f"Loading model: {args.model_name}")
283
+ model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(args.model_name)
284
+ model.eval()
285
+
286
+ # Get model configuration
287
+ config = get_model_config(model)
288
+
289
+ # Auto-detect encoder dim if not found
290
+ if config['encoder_dim'] is None:
291
+ print("Auto-detecting encoder dimension...")
292
+ dummy_audio = torch.randn(1, 16000)
293
+ dummy_length = torch.tensor([16000])
294
+ with torch.no_grad():
295
+ enc_out, enc_len = model.encoder(
296
+ audio_signal=dummy_audio,
297
+ length=dummy_length
298
+ )
299
+ config['encoder_dim'] = enc_out.shape[-1]
300
+
301
+ print(f"\nModel config:")
302
+ for k, v in config.items():
303
+ print(f" {k}: {v}")
304
+
305
+ # Convert components
306
+ print()
307
+ convert_decoder(model, config, args.output_dir)
308
+ convert_joint(model, config, args.output_dir)
309
+
310
+ print("\nConversion complete!")
311
+ print(f"Models saved to: {args.output_dir}")
312
+ print("\nNext steps:")
313
+ print("1. Compile to .mlmodelc:")
314
+ print(f" cd {args.output_dir}")
315
+ print(" xcrun coremlcompiler compile Decoder.mlpackage .")
316
+ print(" xcrun coremlcompiler compile JointDecision.mlpackage .")
317
+ print("2. Copy to model cache:")
318
+ print(" cp -r Decoder.mlmodelc JointDecision.mlmodelc ~/Library/Application\\ Support/FluidAudio/Models/parakeet-ctc-110m-coreml/")
319
+ print("3. Test with: swift run fluidaudio hybrid-earnings-benchmark --max-files 1")
320
+
321
+
322
+ if __name__ == "__main__":
323
+ main()
convert/parakeet-tdt-ctc-110m/coreml/audio/yc_first_minute_16k_15s.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c79c8bc763b4efccb3e12f199ec0a59aa2edc5e9e4d21ca70fde8f36762d4147
3
+ size 480078
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc681823d92eca3dbece3a30c975afa7251eedae0e718b07ffbf1a8b4313b87e
3
+ size 243
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ebec8fc38c063de4b2159e21b1f981309fa5947c24d7e4883aca20f7c15fbb9
3
+ size 377
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/metadata.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "metadataOutputVersion" : "3.0",
4
+ "shortDescription" : "Parakeet 110M CTC decoder head",
5
+ "outputSchema" : [
6
+ {
7
+ "hasShapeFlexibility" : "0",
8
+ "isOptional" : "0",
9
+ "dataType" : "Float32",
10
+ "formattedType" : "MultiArray (Float32 1 × 188 × 1025)",
11
+ "shortDescription" : "",
12
+ "shape" : "[1, 188, 1025]",
13
+ "name" : "ctc_logits",
14
+ "type" : "MultiArray"
15
+ }
16
+ ],
17
+ "storagePrecision" : "Float16",
18
+ "modelParameters" : [
19
+
20
+ ],
21
+ "author" : "Fluid Inference",
22
+ "specificationVersion" : 8,
23
+ "mlProgramOperationTypeHistogram" : {
24
+ "Ios17.cast" : 2,
25
+ "Ios17.conv" : 1,
26
+ "Ios17.transpose" : 1,
27
+ "Ios16.softmax" : 1,
28
+ "Ios17.log" : 1
29
+ },
30
+ "computePrecision" : "Mixed (Float16, Float32, Int32)",
31
+ "isUpdatable" : "0",
32
+ "stateSchema" : [
33
+
34
+ ],
35
+ "availability" : {
36
+ "macOS" : "14.0",
37
+ "tvOS" : "17.0",
38
+ "visionOS" : "1.0",
39
+ "watchOS" : "10.0",
40
+ "iOS" : "17.0",
41
+ "macCatalyst" : "17.0"
42
+ },
43
+ "modelType" : {
44
+ "name" : "MLModelType_mlProgram"
45
+ },
46
+ "inputSchema" : [
47
+ {
48
+ "hasShapeFlexibility" : "0",
49
+ "isOptional" : "0",
50
+ "dataType" : "Float32",
51
+ "formattedType" : "MultiArray (Float32 1 × 512 × 188)",
52
+ "shortDescription" : "",
53
+ "shape" : "[1, 512, 188]",
54
+ "name" : "encoder_output",
55
+ "type" : "MultiArray"
56
+ }
57
+ ],
58
+ "userDefinedMetadata" : {
59
+ "com.github.apple.coremltools.source_dialect" : "TorchScript",
60
+ "com.github.apple.coremltools.source" : "torch==2.9.0",
61
+ "com.github.apple.coremltools.version" : "8.3.0"
62
+ },
63
+ "generatedClassName" : "parakeet_ctc_head",
64
+ "method" : "predict"
65
+ }
66
+ ]
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/model.mil ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
3
+ {
4
+ func main<ios17>(tensor<fp32, [1, 512, 188]> encoder_output) {
5
+ tensor<int32, []> var_4 = const()[name = tensor<string, []>("op_4"), val = tensor<int32, []>(-1)];
6
+ tensor<string, []> var_18_pad_type_0 = const()[name = tensor<string, []>("op_18_pad_type_0"), val = tensor<string, []>("valid")];
7
+ tensor<int32, [1]> var_18_strides_0 = const()[name = tensor<string, []>("op_18_strides_0"), val = tensor<int32, [1]>([1])];
8
+ tensor<int32, [2]> var_18_pad_0 = const()[name = tensor<string, []>("op_18_pad_0"), val = tensor<int32, [2]>([0, 0])];
9
+ tensor<int32, [1]> var_18_dilations_0 = const()[name = tensor<string, []>("op_18_dilations_0"), val = tensor<int32, [1]>([1])];
10
+ tensor<int32, []> var_18_groups_0 = const()[name = tensor<string, []>("op_18_groups_0"), val = tensor<int32, []>(1)];
11
+ tensor<string, []> encoder_output_to_fp16_dtype_0 = const()[name = tensor<string, []>("encoder_output_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
12
+ tensor<fp16, [1025, 512, 1]> module_decoder_layers_0_weight_to_fp16 = const()[name = tensor<string, []>("module_decoder_layers_0_weight_to_fp16"), val = tensor<fp16, [1025, 512, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
13
+ tensor<fp16, [1025]> module_decoder_layers_0_bias_to_fp16 = const()[name = tensor<string, []>("module_decoder_layers_0_bias_to_fp16"), val = tensor<fp16, [1025]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1049728)))];
14
+ tensor<fp16, [1, 512, 188]> encoder_output_to_fp16 = cast(dtype = encoder_output_to_fp16_dtype_0, x = encoder_output)[name = tensor<string, []>("cast_1")];
15
+ tensor<fp16, [1, 1025, 188]> var_18_cast_fp16 = conv(bias = module_decoder_layers_0_bias_to_fp16, dilations = var_18_dilations_0, groups = var_18_groups_0, pad = var_18_pad_0, pad_type = var_18_pad_type_0, strides = var_18_strides_0, weight = module_decoder_layers_0_weight_to_fp16, x = encoder_output_to_fp16)[name = tensor<string, []>("op_18_cast_fp16")];
16
+ tensor<int32, [3]> input_perm_0 = const()[name = tensor<string, []>("input_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
17
+ tensor<fp16, [1, 188, 1025]> input_cast_fp16 = transpose(perm = input_perm_0, x = var_18_cast_fp16)[name = tensor<string, []>("transpose_0")];
18
+ tensor<fp16, [1, 188, 1025]> out_objects_softmax_cast_fp16 = softmax(axis = var_4, x = input_cast_fp16)[name = tensor<string, []>("out_objects_softmax_cast_fp16")];
19
+ tensor<fp32, []> out_objects_epsilon_0 = const()[name = tensor<string, []>("out_objects_epsilon_0"), val = tensor<fp32, []>(0x1p-149)];
20
+ tensor<fp16, [1, 188, 1025]> out_objects_cast_fp16 = log(epsilon = out_objects_epsilon_0, x = out_objects_softmax_cast_fp16)[name = tensor<string, []>("out_objects_cast_fp16")];
21
+ tensor<string, []> out_objects_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("out_objects_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
22
+ tensor<fp32, [1, 188, 1025]> ctc_logits = cast(dtype = out_objects_cast_fp16_to_fp32_dtype_0, x = out_objects_cast_fp16)[name = tensor<string, []>("cast_0")];
23
+ } -> (ctc_logits);
24
+ }
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb9bead064427ffcb7529c0e3f378e421b4dde8e6d81447b6d1ca3352ca850e1
3
+ size 1051842
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:990455f6431342750254f66edf27bfb41be62a7ba17a18e1dd6afd4f5f56e9eb
3
+ size 243
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29009727821ad8551ab5fe9271e93c597d92a9714f64b94aa533a9ceb6e22b93
3
+ size 498
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/metadata.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "metadataOutputVersion" : "3.0",
4
+ "shortDescription" : "Parakeet 110M decoder (RNNT prediction network)",
5
+ "outputSchema" : [
6
+ {
7
+ "hasShapeFlexibility" : "0",
8
+ "isOptional" : "0",
9
+ "dataType" : "Float32",
10
+ "formattedType" : "MultiArray (Float32 1 × 640 × 1)",
11
+ "shortDescription" : "",
12
+ "shape" : "[1, 640, 1]",
13
+ "name" : "decoder",
14
+ "type" : "MultiArray"
15
+ },
16
+ {
17
+ "hasShapeFlexibility" : "0",
18
+ "isOptional" : "0",
19
+ "dataType" : "Float32",
20
+ "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
21
+ "shortDescription" : "",
22
+ "shape" : "[1, 1, 640]",
23
+ "name" : "h_out",
24
+ "type" : "MultiArray"
25
+ },
26
+ {
27
+ "hasShapeFlexibility" : "0",
28
+ "isOptional" : "0",
29
+ "dataType" : "Float32",
30
+ "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
31
+ "shortDescription" : "",
32
+ "shape" : "[1, 1, 640]",
33
+ "name" : "c_out",
34
+ "type" : "MultiArray"
35
+ }
36
+ ],
37
+ "storagePrecision" : "Float16",
38
+ "modelParameters" : [
39
+
40
+ ],
41
+ "author" : "Fluid Inference",
42
+ "specificationVersion" : 8,
43
+ "mlProgramOperationTypeHistogram" : {
44
+ "Ios17.squeeze" : 2,
45
+ "Ios17.gather" : 1,
46
+ "Ios17.cast" : 6,
47
+ "Ios17.lstm" : 1,
48
+ "Ios17.transpose" : 2,
49
+ "Identity" : 1,
50
+ "Ios17.expandDims" : 2
51
+ },
52
+ "computePrecision" : "Mixed (Float16, Float32, Int16, Int32)",
53
+ "isUpdatable" : "0",
54
+ "stateSchema" : [
55
+
56
+ ],
57
+ "availability" : {
58
+ "macOS" : "14.0",
59
+ "tvOS" : "17.0",
60
+ "visionOS" : "1.0",
61
+ "watchOS" : "10.0",
62
+ "iOS" : "17.0",
63
+ "macCatalyst" : "17.0"
64
+ },
65
+ "modelType" : {
66
+ "name" : "MLModelType_mlProgram"
67
+ },
68
+ "inputSchema" : [
69
+ {
70
+ "hasShapeFlexibility" : "0",
71
+ "isOptional" : "0",
72
+ "dataType" : "Int32",
73
+ "formattedType" : "MultiArray (Int32 1 × 1)",
74
+ "shortDescription" : "",
75
+ "shape" : "[1, 1]",
76
+ "name" : "targets",
77
+ "type" : "MultiArray"
78
+ },
79
+ {
80
+ "hasShapeFlexibility" : "0",
81
+ "isOptional" : "0",
82
+ "dataType" : "Int32",
83
+ "formattedType" : "MultiArray (Int32 1)",
84
+ "shortDescription" : "",
85
+ "shape" : "[1]",
86
+ "name" : "target_length",
87
+ "type" : "MultiArray"
88
+ },
89
+ {
90
+ "hasShapeFlexibility" : "0",
91
+ "isOptional" : "0",
92
+ "dataType" : "Float32",
93
+ "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
94
+ "shortDescription" : "",
95
+ "shape" : "[1, 1, 640]",
96
+ "name" : "h_in",
97
+ "type" : "MultiArray"
98
+ },
99
+ {
100
+ "hasShapeFlexibility" : "0",
101
+ "isOptional" : "0",
102
+ "dataType" : "Float32",
103
+ "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
104
+ "shortDescription" : "",
105
+ "shape" : "[1, 1, 640]",
106
+ "name" : "c_in",
107
+ "type" : "MultiArray"
108
+ }
109
+ ],
110
+ "userDefinedMetadata" : {
111
+ "com.github.apple.coremltools.version" : "8.3.0",
112
+ "com.github.apple.coremltools.source_dialect" : "TorchScript",
113
+ "com.github.apple.coremltools.source" : "torch==2.9.0"
114
+ },
115
+ "generatedClassName" : "parakeet_decoder",
116
+ "method" : "predict"
117
+ }
118
+ ]
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/model.mil ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
3
+ {
4
+ func main<ios17>(tensor<fp32, [1, 1, 640]> c_in, tensor<fp32, [1, 1, 640]> h_in, tensor<int32, [1]> target_length, tensor<int32, [1, 1]> targets) {
5
+ tensor<int32, []> y_axis_0 = const()[name = tensor<string, []>("y_axis_0"), val = tensor<int32, []>(0)];
6
+ tensor<int32, []> y_batch_dims_0 = const()[name = tensor<string, []>("y_batch_dims_0"), val = tensor<int32, []>(0)];
7
+ tensor<bool, []> y_validate_indices_0 = const()[name = tensor<string, []>("y_validate_indices_0"), val = tensor<bool, []>(false)];
8
+ tensor<fp16, [1025, 640]> module_prediction_embed_weight_to_fp16 = const()[name = tensor<string, []>("module_prediction_embed_weight_to_fp16"), val = tensor<fp16, [1025, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
9
+ tensor<string, []> targets_to_int16_dtype_0 = const()[name = tensor<string, []>("targets_to_int16_dtype_0"), val = tensor<string, []>("int16")];
10
+ tensor<int16, [1, 1]> targets_to_int16 = cast(dtype = targets_to_int16_dtype_0, x = targets)[name = tensor<string, []>("cast_8")];
11
+ tensor<fp16, [1, 1, 640]> y_cast_fp16_cast_uint16 = gather(axis = y_axis_0, batch_dims = y_batch_dims_0, indices = targets_to_int16, validate_indices = y_validate_indices_0, x = module_prediction_embed_weight_to_fp16)[name = tensor<string, []>("y_cast_fp16_cast_uint16")];
12
+ tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
13
+ tensor<int32, [1]> input_lstm_h0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_h0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
14
+ tensor<string, []> h_in_to_fp16_dtype_0 = const()[name = tensor<string, []>("h_in_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
15
+ tensor<fp16, [1, 1, 640]> h_in_to_fp16 = cast(dtype = h_in_to_fp16_dtype_0, x = h_in)[name = tensor<string, []>("cast_7")];
16
+ tensor<fp16, [1, 640]> input_lstm_h0_squeeze_cast_fp16 = squeeze(axes = input_lstm_h0_squeeze_axes_0, x = h_in_to_fp16)[name = tensor<string, []>("input_lstm_h0_squeeze_cast_fp16")];
17
+ tensor<int32, [1]> input_lstm_c0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_c0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
18
+ tensor<string, []> c_in_to_fp16_dtype_0 = const()[name = tensor<string, []>("c_in_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
19
+ tensor<fp16, [1, 1, 640]> c_in_to_fp16 = cast(dtype = c_in_to_fp16_dtype_0, x = c_in)[name = tensor<string, []>("cast_6")];
20
+ tensor<fp16, [1, 640]> input_lstm_c0_squeeze_cast_fp16 = squeeze(axes = input_lstm_c0_squeeze_axes_0, x = c_in_to_fp16)[name = tensor<string, []>("input_lstm_c0_squeeze_cast_fp16")];
21
+ tensor<string, []> input_direction_0 = const()[name = tensor<string, []>("input_direction_0"), val = tensor<string, []>("forward")];
22
+ tensor<bool, []> input_output_sequence_0 = const()[name = tensor<string, []>("input_output_sequence_0"), val = tensor<bool, []>(true)];
23
+ tensor<string, []> input_recurrent_activation_0 = const()[name = tensor<string, []>("input_recurrent_activation_0"), val = tensor<string, []>("sigmoid")];
24
+ tensor<string, []> input_cell_activation_0 = const()[name = tensor<string, []>("input_cell_activation_0"), val = tensor<string, []>("tanh")];
25
+ tensor<string, []> input_activation_0 = const()[name = tensor<string, []>("input_activation_0"), val = tensor<string, []>("tanh")];
26
+ tensor<fp16, [2560, 640]> concat_1_to_fp16 = const()[name = tensor<string, []>("concat_1_to_fp16"), val = tensor<fp16, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1312128)))];
27
+ tensor<fp16, [2560, 640]> concat_2_to_fp16 = const()[name = tensor<string, []>("concat_2_to_fp16"), val = tensor<fp16, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4588992)))];
28
+ tensor<fp16, [2560]> concat_0_to_fp16 = const()[name = tensor<string, []>("concat_0_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7865856)))];
29
+ tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = y_cast_fp16_cast_uint16)[name = tensor<string, []>("transpose_2")];
30
+ tensor<fp16, [1, 1, 640]> input_cast_fp16_0, tensor<fp16, [1, 640]> input_cast_fp16_1, tensor<fp16, [1, 640]> input_cast_fp16_2 = lstm(activation = input_activation_0, bias = concat_0_to_fp16, cell_activation = input_cell_activation_0, direction = input_direction_0, initial_c = input_lstm_c0_squeeze_cast_fp16, initial_h = input_lstm_h0_squeeze_cast_fp16, output_sequence = input_output_sequence_0, recurrent_activation = input_recurrent_activation_0, weight_hh = concat_2_to_fp16, weight_ih = concat_1_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
31
+ tensor<int32, [1]> obj_3_axes_0 = const()[name = tensor<string, []>("obj_3_axes_0"), val = tensor<int32, [1]>([0])];
32
+ tensor<fp16, [1, 1, 640]> obj_3_cast_fp16 = expand_dims(axes = obj_3_axes_0, x = input_cast_fp16_1)[name = tensor<string, []>("obj_3_cast_fp16")];
33
+ tensor<string, []> obj_3_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("obj_3_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
34
+ tensor<int32, [1]> obj_axes_0 = const()[name = tensor<string, []>("obj_axes_0"), val = tensor<int32, [1]>([0])];
35
+ tensor<fp16, [1, 1, 640]> obj_cast_fp16 = expand_dims(axes = obj_axes_0, x = input_cast_fp16_2)[name = tensor<string, []>("obj_cast_fp16")];
36
+ tensor<string, []> obj_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("obj_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
37
+ tensor<int32, [3]> transpose_0_perm_0 = const()[name = tensor<string, []>("transpose_0_perm_0"), val = tensor<int32, [3]>([1, 2, 0])];
38
+ tensor<string, []> transpose_0_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("transpose_0_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
39
+ tensor<fp16, [1, 640, 1]> transpose_0_cast_fp16 = transpose(perm = transpose_0_perm_0, x = input_cast_fp16_0)[name = tensor<string, []>("transpose_1")];
40
+ tensor<fp32, [1, 640, 1]> decoder = cast(dtype = transpose_0_cast_fp16_to_fp32_dtype_0, x = transpose_0_cast_fp16)[name = tensor<string, []>("cast_3")];
41
+ tensor<fp32, [1, 1, 640]> c_out = cast(dtype = obj_cast_fp16_to_fp32_dtype_0, x = obj_cast_fp16)[name = tensor<string, []>("cast_4")];
42
+ tensor<fp32, [1, 1, 640]> h_out = cast(dtype = obj_3_cast_fp16_to_fp32_dtype_0, x = obj_3_cast_fp16)[name = tensor<string, []>("cast_5")];
43
+ tensor<int32, [1]> target_length_tmp = identity(x = target_length)[name = tensor<string, []>("target_length_tmp")];
44
+ } -> (decoder, h_out, c_out);
45
+ }
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd90b58597ee2c172c672dffe13b1110898ba07394c1a15efc96cc8c6b18411b
3
+ size 7871040
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7ae65e2af616df46066b7efca2d7c19941666ac0685f4ed005666890a052b0d
3
+ size 243
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0713c2d6ac5f8f6fb9582be250351ebd8efc925f71f4261191165f1406f2ee5d
3
+ size 437
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/metadata.json ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "metadataOutputVersion" : "3.0",
4
+ "shortDescription" : "Parakeet 110M encoder (15 s window)",
5
+ "outputSchema" : [
6
+ {
7
+ "hasShapeFlexibility" : "0",
8
+ "isOptional" : "0",
9
+ "dataType" : "Float32",
10
+ "formattedType" : "MultiArray (Float32 1 × 512 × 188)",
11
+ "shortDescription" : "",
12
+ "shape" : "[1, 512, 188]",
13
+ "name" : "encoder_output",
14
+ "type" : "MultiArray"
15
+ },
16
+ {
17
+ "hasShapeFlexibility" : "0",
18
+ "isOptional" : "0",
19
+ "dataType" : "Int32",
20
+ "formattedType" : "MultiArray (Int32 1)",
21
+ "shortDescription" : "",
22
+ "shape" : "[1]",
23
+ "name" : "encoder_length",
24
+ "type" : "MultiArray"
25
+ }
26
+ ],
27
+ "storagePrecision" : "Float16",
28
+ "modelParameters" : [
29
+
30
+ ],
31
+ "author" : "Fluid Inference",
32
+ "specificationVersion" : 8,
33
+ "mlProgramOperationTypeHistogram" : {
34
+ "Ios17.logicalAnd" : 2,
35
+ "Ios17.reshape" : 103,
36
+ "Ios16.softmax" : 17,
37
+ "Ios17.matmul" : 51,
38
+ "Ios17.transpose" : 123,
39
+ "Split" : 17,
40
+ "Ios17.expandDims" : 17,
41
+ "Select" : 51,
42
+ "Ios17.add" : 128,
43
+ "Tile" : 8,
44
+ "Ios17.sliceByIndex" : 34,
45
+ "Ios16.sigmoid" : 17,
46
+ "Pad" : 34,
47
+ "Ios17.logicalNot" : 2,
48
+ "Ios17.layerNorm" : 85,
49
+ "Ios16.silu" : 51,
50
+ "Ios17.less" : 5,
51
+ "Ios17.sub" : 3,
52
+ "Ios17.conv" : 56,
53
+ "Ios16.relu" : 3,
54
+ "Ios17.linear" : 137,
55
+ "Ios17.cast" : 11,
56
+ "Ios17.floorDiv" : 3,
57
+ "Ios17.mul" : 77
58
+ },
59
+ "computePrecision" : "Mixed (Float16, Float32, Int32)",
60
+ "isUpdatable" : "0",
61
+ "stateSchema" : [
62
+
63
+ ],
64
+ "availability" : {
65
+ "macOS" : "14.0",
66
+ "tvOS" : "17.0",
67
+ "visionOS" : "1.0",
68
+ "watchOS" : "10.0",
69
+ "iOS" : "17.0",
70
+ "macCatalyst" : "17.0"
71
+ },
72
+ "modelType" : {
73
+ "name" : "MLModelType_mlProgram"
74
+ },
75
+ "inputSchema" : [
76
+ {
77
+ "hasShapeFlexibility" : "0",
78
+ "isOptional" : "0",
79
+ "dataType" : "Float32",
80
+ "formattedType" : "MultiArray (Float32 1 × 80 × 1501)",
81
+ "shortDescription" : "",
82
+ "shape" : "[1, 80, 1501]",
83
+ "name" : "mel_features",
84
+ "type" : "MultiArray"
85
+ },
86
+ {
87
+ "hasShapeFlexibility" : "0",
88
+ "isOptional" : "0",
89
+ "dataType" : "Int32",
90
+ "formattedType" : "MultiArray (Int32 1)",
91
+ "shortDescription" : "",
92
+ "shape" : "[1]",
93
+ "name" : "mel_length",
94
+ "type" : "MultiArray"
95
+ }
96
+ ],
97
+ "userDefinedMetadata" : {
98
+ "com.github.apple.coremltools.source_dialect" : "TorchScript",
99
+ "com.github.apple.coremltools.source" : "torch==2.9.0",
100
+ "com.github.apple.coremltools.version" : "8.3.0"
101
+ },
102
+ "generatedClassName" : "parakeet_encoder",
103
+ "method" : "predict"
104
+ }
105
+ ]
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/model.mil ADDED
The diff for this file is too large to render. See raw diff
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cecf7994b2758397d992802a4f6e5d656e3a1aeb7bbedc2aa430b1316d62474c
3
+ size 215143424
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:983ba26dd9276b8d2d4f75f3475aefb1817c542df87dbd0fdac95bd63647494f
3
+ size 243
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0800e3bdf4ecb1bd46fd27e1826d33125cd574f9ae1e15dd9ff70ea42944ca2d
3
+ size 476
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/metadata.json ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "metadataOutputVersion" : "3.0",
4
+ "shortDescription" : "Parakeet 110M joint + decision head (split, softmax, argmax)",
5
+ "outputSchema" : [
6
+ {
7
+ "hasShapeFlexibility" : "0",
8
+ "isOptional" : "0",
9
+ "dataType" : "Int32",
10
+ "formattedType" : "MultiArray (Int32 1 × 188 × 1)",
11
+ "shortDescription" : "",
12
+ "shape" : "[1, 188, 1]",
13
+ "name" : "token_id",
14
+ "type" : "MultiArray"
15
+ },
16
+ {
17
+ "hasShapeFlexibility" : "0",
18
+ "isOptional" : "0",
19
+ "dataType" : "Float32",
20
+ "formattedType" : "MultiArray (Float32 1 × 188 × 1)",
21
+ "shortDescription" : "",
22
+ "shape" : "[1, 188, 1]",
23
+ "name" : "token_prob",
24
+ "type" : "MultiArray"
25
+ },
26
+ {
27
+ "hasShapeFlexibility" : "0",
28
+ "isOptional" : "0",
29
+ "dataType" : "Int32",
30
+ "formattedType" : "MultiArray (Int32 1 × 188 × 1)",
31
+ "shortDescription" : "",
32
+ "shape" : "[1, 188, 1]",
33
+ "name" : "duration",
34
+ "type" : "MultiArray"
35
+ }
36
+ ],
37
+ "storagePrecision" : "Float16",
38
+ "modelParameters" : [
39
+
40
+ ],
41
+ "author" : "Fluid Inference",
42
+ "specificationVersion" : 8,
43
+ "mlProgramOperationTypeHistogram" : {
44
+ "Ios17.reduceArgmax" : 2,
45
+ "Ios17.squeeze" : 1,
46
+ "Ios17.cast" : 4,
47
+ "Ios17.linear" : 3,
48
+ "Ios17.transpose" : 2,
49
+ "Ios17.sliceByIndex" : 2,
50
+ "Ios17.add" : 1,
51
+ "Ios16.relu" : 1,
52
+ "Ios16.softmax" : 1,
53
+ "Ios17.gatherAlongAxis" : 1,
54
+ "Ios17.expandDims" : 3
55
+ },
56
+ "computePrecision" : "Mixed (Float16, Float32, Int16, Int32)",
57
+ "isUpdatable" : "0",
58
+ "stateSchema" : [
59
+
60
+ ],
61
+ "availability" : {
62
+ "macOS" : "14.0",
63
+ "tvOS" : "17.0",
64
+ "visionOS" : "1.0",
65
+ "watchOS" : "10.0",
66
+ "iOS" : "17.0",
67
+ "macCatalyst" : "17.0"
68
+ },
69
+ "modelType" : {
70
+ "name" : "MLModelType_mlProgram"
71
+ },
72
+ "inputSchema" : [
73
+ {
74
+ "hasShapeFlexibility" : "0",
75
+ "isOptional" : "0",
76
+ "dataType" : "Float32",
77
+ "formattedType" : "MultiArray (Float32 1 × 512 × 188)",
78
+ "shortDescription" : "",
79
+ "shape" : "[1, 512, 188]",
80
+ "name" : "encoder",
81
+ "type" : "MultiArray"
82
+ },
83
+ {
84
+ "hasShapeFlexibility" : "0",
85
+ "isOptional" : "0",
86
+ "dataType" : "Float32",
87
+ "formattedType" : "MultiArray (Float32 1 × 640 × 1)",
88
+ "shortDescription" : "",
89
+ "shape" : "[1, 640, 1]",
90
+ "name" : "decoder",
91
+ "type" : "MultiArray"
92
+ }
93
+ ],
94
+ "userDefinedMetadata" : {
95
+ "com.github.apple.coremltools.version" : "8.3.0",
96
+ "com.github.apple.coremltools.source_dialect" : "TorchScript",
97
+ "com.github.apple.coremltools.source" : "torch==2.9.0"
98
+ },
99
+ "generatedClassName" : "parakeet_joint_decision",
100
+ "method" : "predict"
101
+ }
102
+ ]
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/model.mil ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
3
+ {
4
+ func main<ios17>(tensor<fp32, [1, 640, 1]> decoder, tensor<fp32, [1, 512, 188]> encoder) {
5
+ tensor<int32, [3]> input_1_perm_0 = const()[name = tensor<string, []>("input_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
6
+ tensor<string, []> encoder_to_fp16_dtype_0 = const()[name = tensor<string, []>("encoder_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
7
+ tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
8
+ tensor<string, []> decoder_to_fp16_dtype_0 = const()[name = tensor<string, []>("decoder_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
9
+ tensor<fp16, [640, 512]> joint_module_enc_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_weight_to_fp16"), val = tensor<fp16, [640, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
10
+ tensor<fp16, [640]> joint_module_enc_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(655488)))];
11
+ tensor<fp16, [1, 512, 188]> encoder_to_fp16 = cast(dtype = encoder_to_fp16_dtype_0, x = encoder)[name = tensor<string, []>("cast_6")];
12
+ tensor<fp16, [1, 188, 512]> input_1_cast_fp16 = transpose(perm = input_1_perm_0, x = encoder_to_fp16)[name = tensor<string, []>("transpose_1")];
13
+ tensor<fp16, [1, 188, 640]> linear_0_cast_fp16 = linear(bias = joint_module_enc_bias_to_fp16, weight = joint_module_enc_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("linear_0_cast_fp16")];
14
+ tensor<fp16, [640, 640]> joint_module_pred_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_weight_to_fp16"), val = tensor<fp16, [640, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(656832)))];
15
+ tensor<fp16, [640]> joint_module_pred_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1476096)))];
16
+ tensor<fp16, [1, 640, 1]> decoder_to_fp16 = cast(dtype = decoder_to_fp16_dtype_0, x = decoder)[name = tensor<string, []>("cast_5")];
17
+ tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = decoder_to_fp16)[name = tensor<string, []>("transpose_0")];
18
+ tensor<fp16, [1, 1, 640]> linear_1_cast_fp16 = linear(bias = joint_module_pred_bias_to_fp16, weight = joint_module_pred_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("linear_1_cast_fp16")];
19
+ tensor<int32, [1]> var_23_axes_0 = const()[name = tensor<string, []>("op_23_axes_0"), val = tensor<int32, [1]>([2])];
20
+ tensor<fp16, [1, 188, 1, 640]> var_23_cast_fp16 = expand_dims(axes = var_23_axes_0, x = linear_0_cast_fp16)[name = tensor<string, []>("op_23_cast_fp16")];
21
+ tensor<int32, [1]> var_24_axes_0 = const()[name = tensor<string, []>("op_24_axes_0"), val = tensor<int32, [1]>([1])];
22
+ tensor<fp16, [1, 1, 1, 640]> var_24_cast_fp16 = expand_dims(axes = var_24_axes_0, x = linear_1_cast_fp16)[name = tensor<string, []>("op_24_cast_fp16")];
23
+ tensor<fp16, [1, 188, 1, 640]> input_5_cast_fp16 = add(x = var_23_cast_fp16, y = var_24_cast_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
24
+ tensor<fp16, [1, 188, 1, 640]> input_7_cast_fp16 = relu(x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
25
+ tensor<fp16, [1030, 640]> joint_module_joint_net_2_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_weight_to_fp16"), val = tensor<fp16, [1030, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1477440)))];
26
+ tensor<fp16, [1030]> joint_module_joint_net_2_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_bias_to_fp16"), val = tensor<fp16, [1030]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2795904)))];
27
+ tensor<fp16, [1, 188, 1, 1030]> linear_2_cast_fp16 = linear(bias = joint_module_joint_net_2_bias_to_fp16, weight = joint_module_joint_net_2_weight_to_fp16, x = input_7_cast_fp16)[name = tensor<string, []>("linear_2_cast_fp16")];
28
+ tensor<int32, [4]> token_logits_begin_0 = const()[name = tensor<string, []>("token_logits_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
29
+ tensor<int32, [4]> token_logits_end_0 = const()[name = tensor<string, []>("token_logits_end_0"), val = tensor<int32, [4]>([1, 188, 1, 1025])];
30
+ tensor<bool, [4]> token_logits_end_mask_0 = const()[name = tensor<string, []>("token_logits_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
31
+ tensor<fp16, [1, 188, 1, 1025]> token_logits_cast_fp16 = slice_by_index(begin = token_logits_begin_0, end = token_logits_end_0, end_mask = token_logits_end_mask_0, x = linear_2_cast_fp16)[name = tensor<string, []>("token_logits_cast_fp16")];
32
+ tensor<int32, [4]> duration_logits_begin_0 = const()[name = tensor<string, []>("duration_logits_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1025])];
33
+ tensor<int32, [4]> duration_logits_end_0 = const()[name = tensor<string, []>("duration_logits_end_0"), val = tensor<int32, [4]>([1, 188, 1, 1030])];
34
+ tensor<bool, [4]> duration_logits_end_mask_0 = const()[name = tensor<string, []>("duration_logits_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
35
+ tensor<fp16, [1, 188, 1, 5]> duration_logits_cast_fp16 = slice_by_index(begin = duration_logits_begin_0, end = duration_logits_end_0, end_mask = duration_logits_end_mask_0, x = linear_2_cast_fp16)[name = tensor<string, []>("duration_logits_cast_fp16")];
36
+ tensor<int32, []> var_43_axis_0 = const()[name = tensor<string, []>("op_43_axis_0"), val = tensor<int32, []>(-1)];
37
+ tensor<bool, []> var_43_keep_dims_0 = const()[name = tensor<string, []>("op_43_keep_dims_0"), val = tensor<bool, []>(false)];
38
+ tensor<string, []> var_43_output_dtype_0 = const()[name = tensor<string, []>("op_43_output_dtype_0"), val = tensor<string, []>("int32")];
39
+ tensor<int32, [1, 188, 1]> token_id = reduce_argmax(axis = var_43_axis_0, keep_dims = var_43_keep_dims_0, output_dtype = var_43_output_dtype_0, x = token_logits_cast_fp16)[name = tensor<string, []>("op_43_cast_fp16")];
40
+ tensor<int32, []> var_49 = const()[name = tensor<string, []>("op_49"), val = tensor<int32, []>(-1)];
41
+ tensor<fp16, [1, 188, 1, 1025]> token_probs_all_cast_fp16 = softmax(axis = var_49, x = token_logits_cast_fp16)[name = tensor<string, []>("token_probs_all_cast_fp16")];
42
+ tensor<int32, [1]> var_58_axes_0 = const()[name = tensor<string, []>("op_58_axes_0"), val = tensor<int32, [1]>([-1])];
43
+ tensor<int32, [1, 188, 1, 1]> var_58 = expand_dims(axes = var_58_axes_0, x = token_id)[name = tensor<string, []>("op_58")];
44
+ tensor<int32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<int32, []>(-1)];
45
+ tensor<bool, []> var_61_validate_indices_0 = const()[name = tensor<string, []>("op_61_validate_indices_0"), val = tensor<bool, []>(false)];
46
+ tensor<string, []> var_58_to_int16_dtype_0 = const()[name = tensor<string, []>("op_58_to_int16_dtype_0"), val = tensor<string, []>("int16")];
47
+ tensor<int16, [1, 188, 1, 1]> var_58_to_int16 = cast(dtype = var_58_to_int16_dtype_0, x = var_58)[name = tensor<string, []>("cast_4")];
48
+ tensor<fp16, [1, 188, 1, 1]> var_61_cast_fp16_cast_int16 = gather_along_axis(axis = var_59, indices = var_58_to_int16, validate_indices = var_61_validate_indices_0, x = token_probs_all_cast_fp16)[name = tensor<string, []>("op_61_cast_fp16_cast_int16")];
49
+ tensor<int32, [1]> var_63_axes_0 = const()[name = tensor<string, []>("op_63_axes_0"), val = tensor<int32, [1]>([-1])];
50
+ tensor<fp16, [1, 188, 1]> var_63_cast_fp16 = squeeze(axes = var_63_axes_0, x = var_61_cast_fp16_cast_int16)[name = tensor<string, []>("op_63_cast_fp16")];
51
+ tensor<string, []> var_63_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
52
+ tensor<int32, []> var_66_axis_0 = const()[name = tensor<string, []>("op_66_axis_0"), val = tensor<int32, []>(-1)];
53
+ tensor<bool, []> var_66_keep_dims_0 = const()[name = tensor<string, []>("op_66_keep_dims_0"), val = tensor<bool, []>(false)];
54
+ tensor<string, []> var_66_output_dtype_0 = const()[name = tensor<string, []>("op_66_output_dtype_0"), val = tensor<string, []>("int32")];
55
+ tensor<int32, [1, 188, 1]> duration = reduce_argmax(axis = var_66_axis_0, keep_dims = var_66_keep_dims_0, output_dtype = var_66_output_dtype_0, x = duration_logits_cast_fp16)[name = tensor<string, []>("op_66_cast_fp16")];
56
+ tensor<fp32, [1, 188, 1]> token_prob = cast(dtype = var_63_cast_fp16_to_fp32_dtype_0, x = var_63_cast_fp16)[name = tensor<string, []>("cast_3")];
57
+ } -> (token_id, token_prob, duration);
58
+ }
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3f771cb65b190f1873e39629676ed79b65a8361522f451b37bdba8b1106e6ff
3
+ size 2798028
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7c11c6bb985fab7f835ba687a575f1eb04f4c93b0783155d634adbc49f0e797
3
+ size 243
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1af2cb9bcc13eec83ce006e4f1c2cf158393745cd9187428333fbcb6917da244
3
+ size 535
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/metadata.json ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "metadataOutputVersion" : "3.0",
4
+ "shortDescription" : "Parakeet 110M single-step joint decision (current frame)",
5
+ "outputSchema" : [
6
+ {
7
+ "hasShapeFlexibility" : "0",
8
+ "isOptional" : "0",
9
+ "dataType" : "Int32",
10
+ "formattedType" : "MultiArray (Int32 1 × 1 × 1)",
11
+ "shortDescription" : "",
12
+ "shape" : "[1, 1, 1]",
13
+ "name" : "token_id",
14
+ "type" : "MultiArray"
15
+ },
16
+ {
17
+ "hasShapeFlexibility" : "0",
18
+ "isOptional" : "0",
19
+ "dataType" : "Float32",
20
+ "formattedType" : "MultiArray (Float32 1 × 1 × 1)",
21
+ "shortDescription" : "",
22
+ "shape" : "[1, 1, 1]",
23
+ "name" : "token_prob",
24
+ "type" : "MultiArray"
25
+ },
26
+ {
27
+ "hasShapeFlexibility" : "0",
28
+ "isOptional" : "0",
29
+ "dataType" : "Int32",
30
+ "formattedType" : "MultiArray (Int32 1 × 1 × 1)",
31
+ "shortDescription" : "",
32
+ "shape" : "[1, 1, 1]",
33
+ "name" : "duration",
34
+ "type" : "MultiArray"
35
+ },
36
+ {
37
+ "hasShapeFlexibility" : "0",
38
+ "isOptional" : "0",
39
+ "dataType" : "Int32",
40
+ "formattedType" : "MultiArray (Int32 1 × 1 × 1 × 64)",
41
+ "shortDescription" : "",
42
+ "shape" : "[1, 1, 1, 64]",
43
+ "name" : "top_k_ids",
44
+ "type" : "MultiArray"
45
+ },
46
+ {
47
+ "hasShapeFlexibility" : "0",
48
+ "isOptional" : "0",
49
+ "dataType" : "Float32",
50
+ "formattedType" : "MultiArray (Float32 1 × 1 × 1 × 64)",
51
+ "shortDescription" : "",
52
+ "shape" : "[1, 1, 1, 64]",
53
+ "name" : "top_k_logits",
54
+ "type" : "MultiArray"
55
+ }
56
+ ],
57
+ "storagePrecision" : "Float16",
58
+ "modelParameters" : [
59
+
60
+ ],
61
+ "author" : "Fluid Inference",
62
+ "specificationVersion" : 8,
63
+ "mlProgramOperationTypeHistogram" : {
64
+ "Ios17.reduceArgmax" : 2,
65
+ "Ios17.linear" : 3,
66
+ "Ios17.transpose" : 2,
67
+ "Ios17.sliceByIndex" : 2,
68
+ "Ios17.add" : 1,
69
+ "Ios17.topk" : 1,
70
+ "Ios16.relu" : 1,
71
+ "Ios16.softmax" : 1,
72
+ "Ios17.expandDims" : 3,
73
+ "Ios17.squeeze" : 1,
74
+ "Ios17.cast" : 6,
75
+ "Ios17.gatherAlongAxis" : 1
76
+ },
77
+ "computePrecision" : "Mixed (Float16, Float32, Int16, Int32, UInt16)",
78
+ "isUpdatable" : "0",
79
+ "stateSchema" : [
80
+
81
+ ],
82
+ "availability" : {
83
+ "macOS" : "14.0",
84
+ "tvOS" : "17.0",
85
+ "visionOS" : "1.0",
86
+ "watchOS" : "10.0",
87
+ "iOS" : "17.0",
88
+ "macCatalyst" : "17.0"
89
+ },
90
+ "modelType" : {
91
+ "name" : "MLModelType_mlProgram"
92
+ },
93
+ "inputSchema" : [
94
+ {
95
+ "hasShapeFlexibility" : "0",
96
+ "isOptional" : "0",
97
+ "dataType" : "Float32",
98
+ "formattedType" : "MultiArray (Float32 1 × 512 × 1)",
99
+ "shortDescription" : "",
100
+ "shape" : "[1, 512, 1]",
101
+ "name" : "encoder_step",
102
+ "type" : "MultiArray"
103
+ },
104
+ {
105
+ "hasShapeFlexibility" : "0",
106
+ "isOptional" : "0",
107
+ "dataType" : "Float32",
108
+ "formattedType" : "MultiArray (Float32 1 × 640 × 1)",
109
+ "shortDescription" : "",
110
+ "shape" : "[1, 640, 1]",
111
+ "name" : "decoder_step",
112
+ "type" : "MultiArray"
113
+ }
114
+ ],
115
+ "userDefinedMetadata" : {
116
+ "com.github.apple.coremltools.source_dialect" : "TorchScript",
117
+ "com.github.apple.coremltools.source" : "torch==2.9.0",
118
+ "com.github.apple.coremltools.version" : "8.3.0"
119
+ },
120
+ "generatedClassName" : "parakeet_joint_decision_single_step",
121
+ "method" : "predict"
122
+ }
123
+ ]
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/model.mil ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
3
+ {
4
+ func main<ios17>(tensor<fp32, [1, 640, 1]> decoder_step, tensor<fp32, [1, 512, 1]> encoder_step) {
5
+ tensor<int32, [3]> input_1_perm_0 = const()[name = tensor<string, []>("input_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
6
+ tensor<string, []> encoder_step_to_fp16_dtype_0 = const()[name = tensor<string, []>("encoder_step_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
7
+ tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
8
+ tensor<string, []> decoder_step_to_fp16_dtype_0 = const()[name = tensor<string, []>("decoder_step_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
9
+ tensor<fp16, [640, 512]> joint_module_enc_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_weight_to_fp16"), val = tensor<fp16, [640, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
10
+ tensor<fp16, [640]> joint_module_enc_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(655488)))];
11
+ tensor<fp16, [1, 512, 1]> encoder_step_to_fp16 = cast(dtype = encoder_step_to_fp16_dtype_0, x = encoder_step)[name = tensor<string, []>("cast_9")];
12
+ tensor<fp16, [1, 1, 512]> input_1_cast_fp16 = transpose(perm = input_1_perm_0, x = encoder_step_to_fp16)[name = tensor<string, []>("transpose_1")];
13
+ tensor<fp16, [1, 1, 640]> linear_0_cast_fp16 = linear(bias = joint_module_enc_bias_to_fp16, weight = joint_module_enc_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("linear_0_cast_fp16")];
14
+ tensor<fp16, [640, 640]> joint_module_pred_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_weight_to_fp16"), val = tensor<fp16, [640, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(656832)))];
15
+ tensor<fp16, [640]> joint_module_pred_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1476096)))];
16
+ tensor<fp16, [1, 640, 1]> decoder_step_to_fp16 = cast(dtype = decoder_step_to_fp16_dtype_0, x = decoder_step)[name = tensor<string, []>("cast_8")];
17
+ tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = decoder_step_to_fp16)[name = tensor<string, []>("transpose_0")];
18
+ tensor<fp16, [1, 1, 640]> linear_1_cast_fp16 = linear(bias = joint_module_pred_bias_to_fp16, weight = joint_module_pred_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("linear_1_cast_fp16")];
19
+ tensor<int32, [1]> var_23_axes_0 = const()[name = tensor<string, []>("op_23_axes_0"), val = tensor<int32, [1]>([2])];
20
+ tensor<fp16, [1, 1, 1, 640]> var_23_cast_fp16 = expand_dims(axes = var_23_axes_0, x = linear_0_cast_fp16)[name = tensor<string, []>("op_23_cast_fp16")];
21
+ tensor<int32, [1]> var_24_axes_0 = const()[name = tensor<string, []>("op_24_axes_0"), val = tensor<int32, [1]>([1])];
22
+ tensor<fp16, [1, 1, 1, 640]> var_24_cast_fp16 = expand_dims(axes = var_24_axes_0, x = linear_1_cast_fp16)[name = tensor<string, []>("op_24_cast_fp16")];
23
+ tensor<fp16, [1, 1, 1, 640]> input_5_cast_fp16 = add(x = var_23_cast_fp16, y = var_24_cast_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
24
+ tensor<fp16, [1, 1, 1, 640]> input_7_cast_fp16 = relu(x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
25
+ tensor<fp16, [1030, 640]> joint_module_joint_net_2_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_weight_to_fp16"), val = tensor<fp16, [1030, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1477440)))];
26
+ tensor<fp16, [1030]> joint_module_joint_net_2_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_bias_to_fp16"), val = tensor<fp16, [1030]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2795904)))];
27
+ tensor<fp16, [1, 1, 1, 1030]> linear_2_cast_fp16 = linear(bias = joint_module_joint_net_2_bias_to_fp16, weight = joint_module_joint_net_2_weight_to_fp16, x = input_7_cast_fp16)[name = tensor<string, []>("linear_2_cast_fp16")];
28
+ tensor<int32, [4]> token_logits_begin_0 = const()[name = tensor<string, []>("token_logits_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
29
+ tensor<int32, [4]> token_logits_end_0 = const()[name = tensor<string, []>("token_logits_end_0"), val = tensor<int32, [4]>([1, 1, 1, 1025])];
30
+ tensor<bool, [4]> token_logits_end_mask_0 = const()[name = tensor<string, []>("token_logits_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
31
+ tensor<fp16, [1, 1, 1, 1025]> token_logits_cast_fp16 = slice_by_index(begin = token_logits_begin_0, end = token_logits_end_0, end_mask = token_logits_end_mask_0, x = linear_2_cast_fp16)[name = tensor<string, []>("token_logits_cast_fp16")];
32
+ tensor<int32, [4]> duration_logits_begin_0 = const()[name = tensor<string, []>("duration_logits_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1025])];
33
+ tensor<int32, [4]> duration_logits_end_0 = const()[name = tensor<string, []>("duration_logits_end_0"), val = tensor<int32, [4]>([1, 1, 1, 1030])];
34
+ tensor<bool, [4]> duration_logits_end_mask_0 = const()[name = tensor<string, []>("duration_logits_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
35
+ tensor<fp16, [1, 1, 1, 5]> duration_logits_cast_fp16 = slice_by_index(begin = duration_logits_begin_0, end = duration_logits_end_0, end_mask = duration_logits_end_mask_0, x = linear_2_cast_fp16)[name = tensor<string, []>("duration_logits_cast_fp16")];
36
+ tensor<int32, []> var_43_axis_0 = const()[name = tensor<string, []>("op_43_axis_0"), val = tensor<int32, []>(-1)];
37
+ tensor<bool, []> var_43_keep_dims_0 = const()[name = tensor<string, []>("op_43_keep_dims_0"), val = tensor<bool, []>(false)];
38
+ tensor<string, []> var_43_output_dtype_0 = const()[name = tensor<string, []>("op_43_output_dtype_0"), val = tensor<string, []>("int32")];
39
+ tensor<int32, [1, 1, 1]> token_id = reduce_argmax(axis = var_43_axis_0, keep_dims = var_43_keep_dims_0, output_dtype = var_43_output_dtype_0, x = token_logits_cast_fp16)[name = tensor<string, []>("op_43_cast_fp16")];
40
+ tensor<int32, []> var_49 = const()[name = tensor<string, []>("op_49"), val = tensor<int32, []>(-1)];
41
+ tensor<fp16, [1, 1, 1, 1025]> token_probs_all_cast_fp16 = softmax(axis = var_49, x = token_logits_cast_fp16)[name = tensor<string, []>("token_probs_all_cast_fp16")];
42
+ tensor<int32, [1]> var_58_axes_0 = const()[name = tensor<string, []>("op_58_axes_0"), val = tensor<int32, [1]>([-1])];
43
+ tensor<int32, [1, 1, 1, 1]> var_58 = expand_dims(axes = var_58_axes_0, x = token_id)[name = tensor<string, []>("op_58")];
44
+ tensor<int32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<int32, []>(-1)];
45
+ tensor<bool, []> var_61_validate_indices_0 = const()[name = tensor<string, []>("op_61_validate_indices_0"), val = tensor<bool, []>(false)];
46
+ tensor<string, []> var_58_to_int16_dtype_0 = const()[name = tensor<string, []>("op_58_to_int16_dtype_0"), val = tensor<string, []>("int16")];
47
+ tensor<int16, [1, 1, 1, 1]> var_58_to_int16 = cast(dtype = var_58_to_int16_dtype_0, x = var_58)[name = tensor<string, []>("cast_7")];
48
+ tensor<fp16, [1, 1, 1, 1]> var_61_cast_fp16_cast_int16 = gather_along_axis(axis = var_59, indices = var_58_to_int16, validate_indices = var_61_validate_indices_0, x = token_probs_all_cast_fp16)[name = tensor<string, []>("op_61_cast_fp16_cast_int16")];
49
+ tensor<int32, [1]> var_63_axes_0 = const()[name = tensor<string, []>("op_63_axes_0"), val = tensor<int32, [1]>([-1])];
50
+ tensor<fp16, [1, 1, 1]> var_63_cast_fp16 = squeeze(axes = var_63_axes_0, x = var_61_cast_fp16_cast_int16)[name = tensor<string, []>("op_63_cast_fp16")];
51
+ tensor<string, []> var_63_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
52
+ tensor<int32, []> var_66_axis_0 = const()[name = tensor<string, []>("op_66_axis_0"), val = tensor<int32, []>(-1)];
53
+ tensor<bool, []> var_66_keep_dims_0 = const()[name = tensor<string, []>("op_66_keep_dims_0"), val = tensor<bool, []>(false)];
54
+ tensor<string, []> var_66_output_dtype_0 = const()[name = tensor<string, []>("op_66_output_dtype_0"), val = tensor<string, []>("int32")];
55
+ tensor<int32, [1, 1, 1]> duration = reduce_argmax(axis = var_66_axis_0, keep_dims = var_66_keep_dims_0, output_dtype = var_66_output_dtype_0, x = duration_logits_cast_fp16)[name = tensor<string, []>("op_66_cast_fp16")];
56
+ tensor<int32, []> var_72 = const()[name = tensor<string, []>("op_72"), val = tensor<int32, []>(64)];
57
+ tensor<int32, []> var_76_axis_0 = const()[name = tensor<string, []>("op_76_axis_0"), val = tensor<int32, []>(-1)];
58
+ tensor<bool, []> var_76_ascending_0 = const()[name = tensor<string, []>("op_76_ascending_0"), val = tensor<bool, []>(false)];
59
+ tensor<bool, []> var_76_sort_0 = const()[name = tensor<string, []>("op_76_sort_0"), val = tensor<bool, []>(true)];
60
+ tensor<bool, []> var_76_return_indices_0 = const()[name = tensor<string, []>("op_76_return_indices_0"), val = tensor<bool, []>(true)];
61
+ tensor<string, []> var_76_cast_fp16_cast_int16_output_indices_dtype_0 = const()[name = tensor<string, []>("op_76_cast_fp16_cast_int16_output_indices_dtype_0"), val = tensor<string, []>("uint16")];
62
+ tensor<fp16, [1, 1, 1, 64]> var_76_cast_fp16_cast_int16_0, tensor<uint16, [1, 1, 1, 64]> var_76_cast_fp16_cast_int16_1 = topk(ascending = var_76_ascending_0, axis = var_76_axis_0, k = var_72, output_indices_dtype = var_76_cast_fp16_cast_int16_output_indices_dtype_0, return_indices = var_76_return_indices_0, sort = var_76_sort_0, x = token_logits_cast_fp16)[name = tensor<string, []>("op_76_cast_fp16_cast_int16")];
63
+ tensor<string, []> var_76_cast_fp16_cast_int16_1_to_int32_dtype_0 = const()[name = tensor<string, []>("op_76_cast_fp16_cast_int16_1_to_int32_dtype_0"), val = tensor<string, []>("int32")];
64
+ tensor<string, []> var_76_cast_fp16_0_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_76_cast_fp16_0_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
65
+ tensor<fp32, [1, 1, 1, 64]> top_k_logits = cast(dtype = var_76_cast_fp16_0_to_fp32_dtype_0, x = var_76_cast_fp16_cast_int16_0)[name = tensor<string, []>("cast_4")];
66
+ tensor<int32, [1, 1, 1, 64]> top_k_ids = cast(dtype = var_76_cast_fp16_cast_int16_1_to_int32_dtype_0, x = var_76_cast_fp16_cast_int16_1)[name = tensor<string, []>("cast_5")];
67
+ tensor<fp32, [1, 1, 1]> token_prob = cast(dtype = var_63_cast_fp16_to_fp32_dtype_0, x = var_63_cast_fp16)[name = tensor<string, []>("cast_6")];
68
+ } -> (token_id, token_prob, duration, top_k_ids, top_k_logits);
69
+ }
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3f771cb65b190f1873e39629676ed79b65a8361522f451b37bdba8b1106e6ff
3
+ size 2798028
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1ac15543fbb9301fba5f018b147e44d767479dec352aaa91dfe7bcf65949693
3
+ size 243
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4940877938cc1b6d8830bbdd68ac8a49377cc57d75b61308883da5235b6a1914
3
+ size 439
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/metadata.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "metadataOutputVersion" : "3.0",
4
+ "shortDescription" : "Parakeet 110M preprocessor (15 s window)",
5
+ "outputSchema" : [
6
+ {
7
+ "hasShapeFlexibility" : "0",
8
+ "isOptional" : "0",
9
+ "dataType" : "Float32",
10
+ "formattedType" : "MultiArray (Float32)",
11
+ "shortDescription" : "",
12
+ "shape" : "[]",
13
+ "name" : "mel_features",
14
+ "type" : "MultiArray"
15
+ },
16
+ {
17
+ "hasShapeFlexibility" : "0",
18
+ "isOptional" : "0",
19
+ "dataType" : "Int32",
20
+ "formattedType" : "MultiArray (Int32 1)",
21
+ "shortDescription" : "",
22
+ "shape" : "[1]",
23
+ "name" : "mel_length",
24
+ "type" : "MultiArray"
25
+ }
26
+ ],
27
+ "storagePrecision" : "Float16",
28
+ "modelParameters" : [
29
+
30
+ ],
31
+ "author" : "Fluid Inference",
32
+ "specificationVersion" : 8,
33
+ "mlProgramOperationTypeHistogram" : {
34
+ "Range1d" : 3,
35
+ "Ios17.equal" : 1,
36
+ "Ios17.notEqual" : 1,
37
+ "Ios17.reshape" : 2,
38
+ "Identity" : 1,
39
+ "Ios17.matmul" : 1,
40
+ "Select" : 6,
41
+ "Ios17.expandDims" : 12,
42
+ "Ios17.add" : 3,
43
+ "Tile" : 2,
44
+ "Ios17.sliceByIndex" : 3,
45
+ "Ios16.reduceSum" : 4,
46
+ "Shape" : 4,
47
+ "Ios17.gather" : 4,
48
+ "Ios17.logicalNot" : 1,
49
+ "Pad" : 1,
50
+ "Ios17.log" : 1,
51
+ "Ios17.less" : 2,
52
+ "Ios17.sub" : 4,
53
+ "Ios17.conv" : 2,
54
+ "Ios17.pow" : 2,
55
+ "Ios17.cast" : 10,
56
+ "Ios17.concat" : 3,
57
+ "Stack" : 1,
58
+ "Ios17.floorDiv" : 1,
59
+ "Ios17.realDiv" : 4,
60
+ "Ios17.sqrt" : 1,
61
+ "Ios17.greaterEqual" : 1,
62
+ "Ios17.mul" : 1
63
+ },
64
+ "computePrecision" : "Mixed (Float16, Float32, Int16, Int32, UInt16)",
65
+ "isUpdatable" : "0",
66
+ "stateSchema" : [
67
+
68
+ ],
69
+ "availability" : {
70
+ "macOS" : "14.0",
71
+ "tvOS" : "17.0",
72
+ "visionOS" : "1.0",
73
+ "watchOS" : "10.0",
74
+ "iOS" : "17.0",
75
+ "macCatalyst" : "17.0"
76
+ },
77
+ "modelType" : {
78
+ "name" : "MLModelType_mlProgram"
79
+ },
80
+ "inputSchema" : [
81
+ {
82
+ "dataType" : "Float32",
83
+ "hasShapeFlexibility" : "1",
84
+ "isOptional" : "0",
85
+ "shapeFlexibility" : "1 × 1...240000",
86
+ "shapeRange" : "[[1, 1], [1, 240000]]",
87
+ "formattedType" : "MultiArray (Float32 1 × 1)",
88
+ "type" : "MultiArray",
89
+ "shape" : "[1, 1]",
90
+ "name" : "audio",
91
+ "shortDescription" : ""
92
+ },
93
+ {
94
+ "hasShapeFlexibility" : "0",
95
+ "isOptional" : "0",
96
+ "dataType" : "Int32",
97
+ "formattedType" : "MultiArray (Int32 1)",
98
+ "shortDescription" : "",
99
+ "shape" : "[1]",
100
+ "name" : "audio_length",
101
+ "type" : "MultiArray"
102
+ }
103
+ ],
104
+ "userDefinedMetadata" : {
105
+ "com.github.apple.coremltools.source_dialect" : "TorchScript",
106
+ "com.github.apple.coremltools.source" : "torch==2.9.0",
107
+ "com.github.apple.coremltools.version" : "8.3.0"
108
+ },
109
+ "generatedClassName" : "parakeet_preprocessor",
110
+ "method" : "predict"
111
+ }
112
+ ]
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/model.mil ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
3
+ {
4
+ func main<ios17>(tensor<fp32, [1, ?]> audio, tensor<int32, [1]> audio_length) [FlexibleShapeInformation = tuple<tuple<tensor<string, []>, dict<tensor<string, []>, tensor<int32, [?]>>>, tuple<tensor<string, []>, dict<tensor<string, []>, list<tensor<int32, [2]>, ?>>>>((("DefaultShapes", {{"audio", [1, 1]}}), ("RangeDims", {{"audio", [[1, 1], [1, 240000]]}})))] {
5
+ tensor<int32, []> var_9 = const()[name = tensor<string, []>("op_9"), val = tensor<int32, []>(1)];
6
+ tensor<int32, []> var_10 = const()[name = tensor<string, []>("op_10"), val = tensor<int32, []>(160)];
7
+ tensor<int32, []> var_12 = const()[name = tensor<string, []>("op_12"), val = tensor<int32, []>(0)];
8
+ tensor<int32, []> var_34 = const()[name = tensor<string, []>("op_34"), val = tensor<int32, []>(512)];
9
+ tensor<int32, [1]> var_35 = add(x = audio_length, y = var_34)[name = tensor<string, []>("op_35")];
10
+ tensor<int32, []> var_36 = const()[name = tensor<string, []>("op_36"), val = tensor<int32, []>(512)];
11
+ tensor<int32, [1]> var_37 = sub(x = var_35, y = var_36)[name = tensor<string, []>("op_37")];
12
+ tensor<int32, [1]> floor_div_0 = floor_div(x = var_37, y = var_10)[name = tensor<string, []>("floor_div_0")];
13
+ tensor<bool, [1]> var_40 = equal(x = audio_length, y = var_12)[name = tensor<string, []>("op_40")];
14
+ tensor<int32, [1]> var_41 = const()[name = tensor<string, []>("op_41"), val = tensor<int32, [1]>([0])];
15
+ tensor<int32, [1]> mel_length = select(a = var_41, b = floor_div_0, cond = var_40)[name = tensor<string, []>("seq_len")];
16
+ tensor<string, []> audio_to_fp16_dtype_0 = const()[name = tensor<string, []>("audio_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
17
+ tensor<fp16, [1, ?]> audio_to_fp16 = cast(dtype = audio_to_fp16_dtype_0, x = audio)[name = tensor<string, []>("cast_27")];
18
+ tensor<int32, [2]> var_43_shape_cast_fp16 = shape(x = audio_to_fp16)[name = tensor<string, []>("op_43_shape_cast_fp16")];
19
+ tensor<int32, []> gather_0_axis_0 = const()[name = tensor<string, []>("gather_0_axis_0"), val = tensor<int32, []>(0)];
20
+ tensor<int32, []> gather_0_batch_dims_0 = const()[name = tensor<string, []>("gather_0_batch_dims_0"), val = tensor<int32, []>(0)];
21
+ tensor<bool, []> gather_0_validate_indices_0 = const()[name = tensor<string, []>("gather_0_validate_indices_0"), val = tensor<bool, []>(false)];
22
+ tensor<string, []> var_43_shape_cast_fp16_to_int16_dtype_0 = const()[name = tensor<string, []>("op_43_shape_cast_fp16_to_int16_dtype_0"), val = tensor<string, []>("int16")];
23
+ tensor<uint16, []> select_0_to_uint16 = const()[name = tensor<string, []>("select_0_to_uint16"), val = tensor<uint16, []>(1)];
24
+ tensor<int16, [2]> var_43_shape_cast_fp16_to_int16 = cast(dtype = var_43_shape_cast_fp16_to_int16_dtype_0, x = var_43_shape_cast_fp16)[name = tensor<string, []>("cast_26")];
25
+ tensor<int16, []> gather_0_cast_uint16 = gather(axis = gather_0_axis_0, batch_dims = gather_0_batch_dims_0, indices = select_0_to_uint16, validate_indices = gather_0_validate_indices_0, x = var_43_shape_cast_fp16_to_int16)[name = tensor<string, []>("gather_0_cast_uint16")];
26
+ tensor<string, []> gather_0_cast_uint16_to_int32_dtype_0 = const()[name = tensor<string, []>("gather_0_cast_uint16_to_int32_dtype_0"), val = tensor<string, []>("int32")];
27
+ tensor<int32, []> const_0 = const()[name = tensor<string, []>("const_0"), val = tensor<int32, []>(0)];
28
+ tensor<int32, []> const_1 = const()[name = tensor<string, []>("const_1"), val = tensor<int32, []>(1)];
29
+ tensor<int32, []> gather_0_cast_uint16_to_int32 = cast(dtype = gather_0_cast_uint16_to_int32_dtype_0, x = gather_0_cast_uint16)[name = tensor<string, []>("cast_25")];
30
+ tensor<int32, [?]> var_44 = range_1d(end = gather_0_cast_uint16_to_int32, start = const_0, step = const_1)[name = tensor<string, []>("op_44")];
31
+ tensor<int32, [1]> var_45_axes_0 = const()[name = tensor<string, []>("op_45_axes_0"), val = tensor<int32, [1]>([0])];
32
+ tensor<int32, [1, ?]> var_45 = expand_dims(axes = var_45_axes_0, x = var_44)[name = tensor<string, []>("op_45")];
33
+ tensor<int32, [1]> var_46_axes_0 = const()[name = tensor<string, []>("op_46_axes_0"), val = tensor<int32, [1]>([1])];
34
+ tensor<int32, [1, 1]> var_46 = expand_dims(axes = var_46_axes_0, x = audio_length)[name = tensor<string, []>("op_46")];
35
+ tensor<bool, [1, ?]> timemask = less(x = var_45, y = var_46)[name = tensor<string, []>("timemask")];
36
+ tensor<int32, [2]> var_49_begin_0 = const()[name = tensor<string, []>("op_49_begin_0"), val = tensor<int32, [2]>([0, 0])];
37
+ tensor<int32, [2]> var_49_end_0 = const()[name = tensor<string, []>("op_49_end_0"), val = tensor<int32, [2]>([1, 1])];
38
+ tensor<bool, [2]> var_49_end_mask_0 = const()[name = tensor<string, []>("op_49_end_mask_0"), val = tensor<bool, [2]>([true, false])];
39
+ tensor<bool, [2]> var_49_squeeze_mask_0 = const()[name = tensor<string, []>("op_49_squeeze_mask_0"), val = tensor<bool, [2]>([false, true])];
40
+ tensor<fp16, [1]> var_49_cast_fp16 = slice_by_index(begin = var_49_begin_0, end = var_49_end_0, end_mask = var_49_end_mask_0, squeeze_mask = var_49_squeeze_mask_0, x = audio_to_fp16)[name = tensor<string, []>("op_49_cast_fp16")];
41
+ tensor<int32, [1]> var_50_axes_0 = const()[name = tensor<string, []>("op_50_axes_0"), val = tensor<int32, [1]>([1])];
42
+ tensor<fp16, [1, 1]> var_50_cast_fp16 = expand_dims(axes = var_50_axes_0, x = var_49_cast_fp16)[name = tensor<string, []>("op_50_cast_fp16")];
43
+ tensor<int32, [2]> var_52_begin_0 = const()[name = tensor<string, []>("op_52_begin_0"), val = tensor<int32, [2]>([0, 1])];
44
+ tensor<int32, [2]> var_52_end_0 = const()[name = tensor<string, []>("op_52_end_0"), val = tensor<int32, [2]>([1, 0])];
45
+ tensor<bool, [2]> var_52_end_mask_0 = const()[name = tensor<string, []>("op_52_end_mask_0"), val = tensor<bool, [2]>([true, true])];
46
+ tensor<fp16, [1, ?]> var_52_cast_fp16 = slice_by_index(begin = var_52_begin_0, end = var_52_end_0, end_mask = var_52_end_mask_0, x = audio_to_fp16)[name = tensor<string, []>("op_52_cast_fp16")];
47
+ tensor<int32, [2]> var_54_begin_0 = const()[name = tensor<string, []>("op_54_begin_0"), val = tensor<int32, [2]>([0, 0])];
48
+ tensor<int32, [2]> var_54_end_0 = const()[name = tensor<string, []>("op_54_end_0"), val = tensor<int32, [2]>([1, -1])];
49
+ tensor<bool, [2]> var_54_end_mask_0 = const()[name = tensor<string, []>("op_54_end_mask_0"), val = tensor<bool, [2]>([true, false])];
50
+ tensor<fp16, [1, ?]> var_54_cast_fp16 = slice_by_index(begin = var_54_begin_0, end = var_54_end_0, end_mask = var_54_end_mask_0, x = audio_to_fp16)[name = tensor<string, []>("op_54_cast_fp16")];
51
+ tensor<fp16, []> var_55_to_fp16 = const()[name = tensor<string, []>("op_55_to_fp16"), val = tensor<fp16, []>(0x1.f0cp-1)];
52
+ tensor<fp16, [1, ?]> var_56_cast_fp16 = mul(x = var_54_cast_fp16, y = var_55_to_fp16)[name = tensor<string, []>("op_56_cast_fp16")];
53
+ tensor<fp16, [1, ?]> var_57_cast_fp16 = sub(x = var_52_cast_fp16, y = var_56_cast_fp16)[name = tensor<string, []>("op_57_cast_fp16")];
54
+ tensor<bool, []> x_3_interleave_0 = const()[name = tensor<string, []>("x_3_interleave_0"), val = tensor<bool, []>(false)];
55
+ tensor<fp16, [1, ?]> x_3_cast_fp16 = concat(axis = var_9, interleave = x_3_interleave_0, values = (var_50_cast_fp16, var_57_cast_fp16))[name = tensor<string, []>("x_3_cast_fp16")];
56
+ tensor<bool, [1, ?]> var_60 = logical_not(x = timemask)[name = tensor<string, []>("op_60")];
57
+ tensor<fp16, []> var_16_to_fp16 = const()[name = tensor<string, []>("op_16_to_fp16"), val = tensor<fp16, []>(0x0p+0)];
58
+ tensor<fp16, [1, ?]> input_1_cast_fp16 = select(a = var_16_to_fp16, b = x_3_cast_fp16, cond = var_60)[name = tensor<string, []>("input_1_cast_fp16")];
59
+ tensor<int32, [3]> concat_1x = const()[name = tensor<string, []>("concat_1x"), val = tensor<int32, [3]>([1, 1, -1])];
60
+ tensor<fp16, [1, 1, ?]> input_3_cast_fp16 = reshape(shape = concat_1x, x = input_1_cast_fp16)[name = tensor<string, []>("input_3_cast_fp16")];
61
+ tensor<int32, [6]> input_5_pad_0 = const()[name = tensor<string, []>("input_5_pad_0"), val = tensor<int32, [6]>([0, 0, 0, 0, 256, 256])];
62
+ tensor<string, []> input_5_mode_0 = const()[name = tensor<string, []>("input_5_mode_0"), val = tensor<string, []>("constant")];
63
+ tensor<fp16, []> const_3_to_fp16 = const()[name = tensor<string, []>("const_3_to_fp16"), val = tensor<fp16, []>(0x0p+0)];
64
+ tensor<fp16, [1, 1, ?]> input_5_cast_fp16 = pad(constant_val = const_3_to_fp16, mode = input_5_mode_0, pad = input_5_pad_0, x = input_3_cast_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
65
+ tensor<int32, [2]> concat_2x = const()[name = tensor<string, []>("concat_2x"), val = tensor<int32, [2]>([1, -1])];
66
+ tensor<fp16, [1, ?]> input_cast_fp16 = reshape(shape = concat_2x, x = input_5_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
67
+ tensor<int32, [1]> expand_dims_3 = const()[name = tensor<string, []>("expand_dims_3"), val = tensor<int32, [1]>([160])];
68
+ tensor<int32, [1]> expand_dims_4_axes_0 = const()[name = tensor<string, []>("expand_dims_4_axes_0"), val = tensor<int32, [1]>([1])];
69
+ tensor<fp16, [1, 1, ?]> expand_dims_4_cast_fp16 = expand_dims(axes = expand_dims_4_axes_0, x = input_cast_fp16)[name = tensor<string, []>("expand_dims_4_cast_fp16")];
70
+ tensor<string, []> conv_0_pad_type_0 = const()[name = tensor<string, []>("conv_0_pad_type_0"), val = tensor<string, []>("valid")];
71
+ tensor<int32, [2]> conv_0_pad_0 = const()[name = tensor<string, []>("conv_0_pad_0"), val = tensor<int32, [2]>([0, 0])];
72
+ tensor<int32, [1]> conv_0_dilations_0 = const()[name = tensor<string, []>("conv_0_dilations_0"), val = tensor<int32, [1]>([1])];
73
+ tensor<int32, []> conv_0_groups_0 = const()[name = tensor<string, []>("conv_0_groups_0"), val = tensor<int32, []>(1)];
74
+ tensor<fp16, [257, 1, 512]> expand_dims_1_to_fp16 = const()[name = tensor<string, []>("expand_dims_1_to_fp16"), val = tensor<fp16, [257, 1, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
75
+ tensor<fp16, [1, 257, ?]> conv_0_cast_fp16 = conv(dilations = conv_0_dilations_0, groups = conv_0_groups_0, pad = conv_0_pad_0, pad_type = conv_0_pad_type_0, strides = expand_dims_3, weight = expand_dims_1_to_fp16, x = expand_dims_4_cast_fp16)[name = tensor<string, []>("conv_0_cast_fp16")];
76
+ tensor<string, []> conv_1_pad_type_0 = const()[name = tensor<string, []>("conv_1_pad_type_0"), val = tensor<string, []>("valid")];
77
+ tensor<int32, [2]> conv_1_pad_0 = const()[name = tensor<string, []>("conv_1_pad_0"), val = tensor<int32, [2]>([0, 0])];
78
+ tensor<int32, [1]> conv_1_dilations_0 = const()[name = tensor<string, []>("conv_1_dilations_0"), val = tensor<int32, [1]>([1])];
79
+ tensor<int32, []> conv_1_groups_0 = const()[name = tensor<string, []>("conv_1_groups_0"), val = tensor<int32, []>(1)];
80
+ tensor<fp16, [257, 1, 512]> expand_dims_2_to_fp16 = const()[name = tensor<string, []>("expand_dims_2_to_fp16"), val = tensor<fp16, [257, 1, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(263296)))];
81
+ tensor<fp16, [1, 257, ?]> conv_1_cast_fp16 = conv(dilations = conv_1_dilations_0, groups = conv_1_groups_0, pad = conv_1_pad_0, pad_type = conv_1_pad_type_0, strides = expand_dims_3, weight = expand_dims_2_to_fp16, x = expand_dims_4_cast_fp16)[name = tensor<string, []>("conv_1_cast_fp16")];
82
+ tensor<int32, []> stack_0_axis_0 = const()[name = tensor<string, []>("stack_0_axis_0"), val = tensor<int32, []>(-1)];
83
+ tensor<fp16, [1, 257, ?, 2]> stack_0_cast_fp16 = stack(axis = stack_0_axis_0, values = (conv_0_cast_fp16, conv_1_cast_fp16))[name = tensor<string, []>("stack_0_cast_fp16")];
84
+ tensor<fp16, []> var_19_promoted_to_fp16 = const()[name = tensor<string, []>("op_19_promoted_to_fp16"), val = tensor<fp16, []>(0x1p+1)];
85
+ tensor<fp16, [1, 257, ?, 2]> var_75_cast_fp16 = pow(x = stack_0_cast_fp16, y = var_19_promoted_to_fp16)[name = tensor<string, []>("op_75_cast_fp16")];
86
+ tensor<int32, [1]> var_77_axes_0 = const()[name = tensor<string, []>("op_77_axes_0"), val = tensor<int32, [1]>([-1])];
87
+ tensor<bool, []> var_77_keep_dims_0 = const()[name = tensor<string, []>("op_77_keep_dims_0"), val = tensor<bool, []>(false)];
88
+ tensor<fp16, [1, 257, ?]> var_77_cast_fp16 = reduce_sum(axes = var_77_axes_0, keep_dims = var_77_keep_dims_0, x = var_75_cast_fp16)[name = tensor<string, []>("op_77_cast_fp16")];
89
+ tensor<fp16, [1, 257, ?]> x_11_cast_fp16 = identity(x = var_77_cast_fp16)[name = tensor<string, []>("x_11_cast_fp16")];
90
+ tensor<bool, []> x_13_transpose_x_0 = const()[name = tensor<string, []>("x_13_transpose_x_0"), val = tensor<bool, []>(false)];
91
+ tensor<bool, []> x_13_transpose_y_0 = const()[name = tensor<string, []>("x_13_transpose_y_0"), val = tensor<bool, []>(false)];
92
+ tensor<fp16, [1, 80, 257]> const_4_to_fp16 = const()[name = tensor<string, []>("const_4_to_fp16"), val = tensor<fp16, [1, 80, 257]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(526528)))];
93
+ tensor<fp16, [1, 80, ?]> x_13_cast_fp16 = matmul(transpose_x = x_13_transpose_x_0, transpose_y = x_13_transpose_y_0, x = const_4_to_fp16, y = x_11_cast_fp16)[name = tensor<string, []>("x_13_cast_fp16")];
94
+ tensor<fp16, []> var_84_to_fp16 = const()[name = tensor<string, []>("op_84_to_fp16"), val = tensor<fp16, []>(0x1p-24)];
95
+ tensor<fp16, [1, 80, ?]> var_85_cast_fp16 = add(x = x_13_cast_fp16, y = var_84_to_fp16)[name = tensor<string, []>("op_85_cast_fp16")];
96
+ tensor<fp32, []> x_15_epsilon_0 = const()[name = tensor<string, []>("x_15_epsilon_0"), val = tensor<fp32, []>(0x1p-149)];
97
+ tensor<fp16, [1, 80, ?]> x_15_cast_fp16 = log(epsilon = x_15_epsilon_0, x = var_85_cast_fp16)[name = tensor<string, []>("x_15_cast_fp16")];
98
+ tensor<int32, [3]> var_87_shape_cast_fp16 = shape(x = x_15_cast_fp16)[name = tensor<string, []>("op_87_shape_cast_fp16")];
99
+ tensor<int32, []> gather_5 = const()[name = tensor<string, []>("gather_5"), val = tensor<int32, []>(1)];
100
+ tensor<int32, []> gather_6_axis_0 = const()[name = tensor<string, []>("gather_6_axis_0"), val = tensor<int32, []>(0)];
101
+ tensor<int32, []> gather_6_batch_dims_0 = const()[name = tensor<string, []>("gather_6_batch_dims_0"), val = tensor<int32, []>(0)];
102
+ tensor<bool, []> gather_6_validate_indices_0 = const()[name = tensor<string, []>("gather_6_validate_indices_0"), val = tensor<bool, []>(false)];
103
+ tensor<string, []> var_87_shape_cast_fp16_to_uint16_dtype_0 = const()[name = tensor<string, []>("op_87_shape_cast_fp16_to_uint16_dtype_0"), val = tensor<string, []>("uint16")];
104
+ tensor<uint16, []> select_6_to_uint16 = const()[name = tensor<string, []>("select_6_to_uint16"), val = tensor<uint16, []>(2)];
105
+ tensor<uint16, [3]> var_87_shape_cast_fp16_to_uint16 = cast(dtype = var_87_shape_cast_fp16_to_uint16_dtype_0, x = var_87_shape_cast_fp16)[name = tensor<string, []>("cast_24")];
106
+ tensor<uint16, []> gather_6_cast_uint16 = gather(axis = gather_6_axis_0, batch_dims = gather_6_batch_dims_0, indices = select_6_to_uint16, validate_indices = gather_6_validate_indices_0, x = var_87_shape_cast_fp16_to_uint16)[name = tensor<string, []>("gather_6_cast_uint16")];
107
+ tensor<string, []> gather_6_cast_uint16_to_int32_dtype_0 = const()[name = tensor<string, []>("gather_6_cast_uint16_to_int32_dtype_0"), val = tensor<string, []>("int32")];
108
+ tensor<int32, []> const_5 = const()[name = tensor<string, []>("const_5"), val = tensor<int32, []>(0)];
109
+ tensor<int32, []> const_6 = const()[name = tensor<string, []>("const_6"), val = tensor<int32, []>(1)];
110
+ tensor<int32, []> gather_6_cast_uint16_to_int32 = cast(dtype = gather_6_cast_uint16_to_int32_dtype_0, x = gather_6_cast_uint16)[name = tensor<string, []>("cast_23")];
111
+ tensor<int32, [?]> var_89 = range_1d(end = gather_6_cast_uint16_to_int32, start = const_5, step = const_6)[name = tensor<string, []>("op_89")];
112
+ tensor<int32, [1]> var_90_axes_0 = const()[name = tensor<string, []>("op_90_axes_0"), val = tensor<int32, [1]>([0])];
113
+ tensor<int32, [1, ?]> var_90 = expand_dims(axes = var_90_axes_0, x = var_89)[name = tensor<string, []>("op_90")];
114
+ tensor<int32, []> concat_3_axis_0 = const()[name = tensor<string, []>("concat_3_axis_0"), val = tensor<int32, []>(0)];
115
+ tensor<bool, []> concat_3_interleave_0 = const()[name = tensor<string, []>("concat_3_interleave_0"), val = tensor<bool, []>(false)];
116
+ tensor<int32, [2]> concat_3 = concat(axis = concat_3_axis_0, interleave = concat_3_interleave_0, values = (gather_5, gather_6_cast_uint16_to_int32))[name = tensor<string, []>("concat_3")];
117
+ tensor<int32, [2]> shape_8 = shape(x = var_90)[name = tensor<string, []>("shape_8")];
118
+ tensor<int32, [2]> real_div_0 = real_div(x = concat_3, y = shape_8)[name = tensor<string, []>("real_div_0")];
119
+ tensor<int32, [?, ?]> time_steps = tile(reps = real_div_0, x = var_90)[name = tensor<string, []>("time_steps")];
120
+ tensor<int32, [1]> var_93_axes_0 = const()[name = tensor<string, []>("op_93_axes_0"), val = tensor<int32, [1]>([1])];
121
+ tensor<int32, [1, 1]> var_93 = expand_dims(axes = var_93_axes_0, x = mel_length)[name = tensor<string, []>("op_93")];
122
+ tensor<bool, [?, ?]> valid_mask = less(x = time_steps, y = var_93)[name = tensor<string, []>("valid_mask")];
123
+ tensor<int32, [1]> var_95_axes_0 = const()[name = tensor<string, []>("op_95_axes_0"), val = tensor<int32, [1]>([1])];
124
+ tensor<bool, [?, 1, ?]> var_95 = expand_dims(axes = var_95_axes_0, x = valid_mask)[name = tensor<string, []>("op_95")];
125
+ tensor<fp16, [1, 80, ?]> var_96_cast_fp16 = select(a = x_15_cast_fp16, b = var_16_to_fp16, cond = var_95)[name = tensor<string, []>("op_96_cast_fp16")];
126
+ tensor<int32, [1]> x_mean_numerator_axes_0 = const()[name = tensor<string, []>("x_mean_numerator_axes_0"), val = tensor<int32, [1]>([2])];
127
+ tensor<bool, []> x_mean_numerator_keep_dims_0 = const()[name = tensor<string, []>("x_mean_numerator_keep_dims_0"), val = tensor<bool, []>(false)];
128
+ tensor<fp16, [1, 80]> x_mean_numerator_cast_fp16 = reduce_sum(axes = x_mean_numerator_axes_0, keep_dims = x_mean_numerator_keep_dims_0, x = var_96_cast_fp16)[name = tensor<string, []>("x_mean_numerator_cast_fp16")];
129
+ tensor<int32, [1]> x_mean_denominator_axes_0 = const()[name = tensor<string, []>("x_mean_denominator_axes_0"), val = tensor<int32, [1]>([1])];
130
+ tensor<bool, []> x_mean_denominator_keep_dims_0 = const()[name = tensor<string, []>("x_mean_denominator_keep_dims_0"), val = tensor<bool, []>(false)];
131
+ tensor<string, []> cast_6_to_fp16_dtype_0 = const()[name = tensor<string, []>("cast_6_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
132
+ tensor<fp16, [?, ?]> valid_mask_to_fp16 = cast(dtype = cast_6_to_fp16_dtype_0, x = valid_mask)[name = tensor<string, []>("cast_22")];
133
+ tensor<fp16, [?]> x_mean_denominator_cast_fp16 = reduce_sum(axes = x_mean_denominator_axes_0, keep_dims = x_mean_denominator_keep_dims_0, x = valid_mask_to_fp16)[name = tensor<string, []>("x_mean_denominator_cast_fp16")];
134
+ tensor<int32, [1]> var_101_axes_0 = const()[name = tensor<string, []>("op_101_axes_0"), val = tensor<int32, [1]>([1])];
135
+ tensor<fp16, [?, 1]> var_101_cast_fp16 = expand_dims(axes = var_101_axes_0, x = x_mean_denominator_cast_fp16)[name = tensor<string, []>("op_101_cast_fp16")];
136
+ tensor<fp16, [?, 80]> x_mean_cast_fp16 = real_div(x = x_mean_numerator_cast_fp16, y = var_101_cast_fp16)[name = tensor<string, []>("x_mean_cast_fp16")];
137
+ tensor<int32, [1]> var_104_axes_0 = const()[name = tensor<string, []>("op_104_axes_0"), val = tensor<int32, [1]>([2])];
138
+ tensor<fp16, [?, 80, 1]> var_104_cast_fp16 = expand_dims(axes = var_104_axes_0, x = x_mean_cast_fp16)[name = tensor<string, []>("op_104_cast_fp16")];
139
+ tensor<fp16, [?, 80, ?]> var_105_cast_fp16 = sub(x = x_15_cast_fp16, y = var_104_cast_fp16)[name = tensor<string, []>("op_105_cast_fp16")];
140
+ tensor<fp16, [?, 80, ?]> var_106_cast_fp16 = select(a = var_105_cast_fp16, b = var_16_to_fp16, cond = var_95)[name = tensor<string, []>("op_106_cast_fp16")];
141
+ tensor<fp16, []> var_19_promoted_1_to_fp16 = const()[name = tensor<string, []>("op_19_promoted_1_to_fp16"), val = tensor<fp16, []>(0x1p+1)];
142
+ tensor<fp16, [?, 80, ?]> var_107_cast_fp16 = pow(x = var_106_cast_fp16, y = var_19_promoted_1_to_fp16)[name = tensor<string, []>("op_107_cast_fp16")];
143
+ tensor<int32, [1]> var_109_axes_0 = const()[name = tensor<string, []>("op_109_axes_0"), val = tensor<int32, [1]>([2])];
144
+ tensor<bool, []> var_109_keep_dims_0 = const()[name = tensor<string, []>("op_109_keep_dims_0"), val = tensor<bool, []>(false)];
145
+ tensor<fp16, [?, 80]> var_109_cast_fp16 = reduce_sum(axes = var_109_axes_0, keep_dims = var_109_keep_dims_0, x = var_107_cast_fp16)[name = tensor<string, []>("op_109_cast_fp16")];
146
+ tensor<fp16, []> var_111_to_fp16 = const()[name = tensor<string, []>("op_111_to_fp16"), val = tensor<fp16, []>(0x1p+0)];
147
+ tensor<fp16, [?, 1]> var_112_cast_fp16 = sub(x = var_101_cast_fp16, y = var_111_to_fp16)[name = tensor<string, []>("op_112_cast_fp16")];
148
+ tensor<fp16, [?, 80]> var_113_cast_fp16 = real_div(x = var_109_cast_fp16, y = var_112_cast_fp16)[name = tensor<string, []>("op_113_cast_fp16")];
149
+ tensor<fp16, [?, 80]> x_std_1_cast_fp16 = sqrt(x = var_113_cast_fp16)[name = tensor<string, []>("x_std_1_cast_fp16")];
150
+ tensor<bool, [?, 80]> var_115_cast_fp16 = not_equal(x = x_std_1_cast_fp16, y = x_std_1_cast_fp16)[name = tensor<string, []>("op_115_cast_fp16")];
151
+ tensor<fp16, [?, 80]> x_std_3_cast_fp16 = select(a = var_16_to_fp16, b = x_std_1_cast_fp16, cond = var_115_cast_fp16)[name = tensor<string, []>("x_std_3_cast_fp16")];
152
+ tensor<fp16, []> var_25_to_fp16 = const()[name = tensor<string, []>("op_25_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
153
+ tensor<fp16, [?, 80]> x_std_cast_fp16 = add(x = x_std_3_cast_fp16, y = var_25_to_fp16)[name = tensor<string, []>("x_std_cast_fp16")];
154
+ tensor<int32, [1]> var_120_axes_0 = const()[name = tensor<string, []>("op_120_axes_0"), val = tensor<int32, [1]>([2])];
155
+ tensor<fp16, [?, 80, 1]> var_120_cast_fp16 = expand_dims(axes = var_120_axes_0, x = x_std_cast_fp16)[name = tensor<string, []>("op_120_cast_fp16")];
156
+ tensor<fp16, [?, 80, ?]> x_cast_fp16 = real_div(x = var_105_cast_fp16, y = var_120_cast_fp16)[name = tensor<string, []>("x_cast_fp16")];
157
+ tensor<int32, [3]> var_122_shape_cast_fp16 = shape(x = x_cast_fp16)[name = tensor<string, []>("op_122_shape_cast_fp16")];
158
+ tensor<int32, []> gather_7_axis_0 = const()[name = tensor<string, []>("gather_7_axis_0"), val = tensor<int32, []>(0)];
159
+ tensor<int32, []> gather_7_batch_dims_0 = const()[name = tensor<string, []>("gather_7_batch_dims_0"), val = tensor<int32, []>(0)];
160
+ tensor<bool, []> gather_7_validate_indices_0 = const()[name = tensor<string, []>("gather_7_validate_indices_0"), val = tensor<bool, []>(false)];
161
+ tensor<string, []> var_122_shape_cast_fp16_to_uint16_dtype_0 = const()[name = tensor<string, []>("op_122_shape_cast_fp16_to_uint16_dtype_0"), val = tensor<string, []>("uint16")];
162
+ tensor<uint16, []> select_7_to_uint16 = const()[name = tensor<string, []>("select_7_to_uint16"), val = tensor<uint16, []>(2)];
163
+ tensor<uint16, [3]> var_122_shape_cast_fp16_to_uint16 = cast(dtype = var_122_shape_cast_fp16_to_uint16_dtype_0, x = var_122_shape_cast_fp16)[name = tensor<string, []>("cast_21")];
164
+ tensor<uint16, []> gather_7_cast_uint16 = gather(axis = gather_7_axis_0, batch_dims = gather_7_batch_dims_0, indices = select_7_to_uint16, validate_indices = gather_7_validate_indices_0, x = var_122_shape_cast_fp16_to_uint16)[name = tensor<string, []>("gather_7_cast_uint16")];
165
+ tensor<string, []> gather_7_cast_uint16_to_int32_dtype_0 = const()[name = tensor<string, []>("gather_7_cast_uint16_to_int32_dtype_0"), val = tensor<string, []>("int32")];
166
+ tensor<int32, []> const_7 = const()[name = tensor<string, []>("const_7"), val = tensor<int32, []>(0)];
167
+ tensor<int32, []> const_8 = const()[name = tensor<string, []>("const_8"), val = tensor<int32, []>(1)];
168
+ tensor<int32, []> gather_7_cast_uint16_to_int32 = cast(dtype = gather_7_cast_uint16_to_int32_dtype_0, x = gather_7_cast_uint16)[name = tensor<string, []>("cast_20")];
169
+ tensor<int32, [?]> mask_1 = range_1d(end = gather_7_cast_uint16_to_int32, start = const_7, step = const_8)[name = tensor<string, []>("mask_1")];
170
+ tensor<int32, []> gather_8_axis_0 = const()[name = tensor<string, []>("gather_8_axis_0"), val = tensor<int32, []>(0)];
171
+ tensor<int32, []> gather_8_batch_dims_0 = const()[name = tensor<string, []>("gather_8_batch_dims_0"), val = tensor<int32, []>(0)];
172
+ tensor<bool, []> gather_8_validate_indices_0 = const()[name = tensor<string, []>("gather_8_validate_indices_0"), val = tensor<bool, []>(false)];
173
+ tensor<uint16, []> select_8_to_uint16 = const()[name = tensor<string, []>("select_8_to_uint16"), val = tensor<uint16, []>(0)];
174
+ tensor<uint16, []> gather_8_cast_uint16 = gather(axis = gather_8_axis_0, batch_dims = gather_8_batch_dims_0, indices = select_8_to_uint16, validate_indices = gather_8_validate_indices_0, x = var_122_shape_cast_fp16_to_uint16)[name = tensor<string, []>("gather_8_cast_uint16")];
175
+ tensor<string, []> gather_8_cast_uint16_to_int32_dtype_0 = const()[name = tensor<string, []>("gather_8_cast_uint16_to_int32_dtype_0"), val = tensor<string, []>("int32")];
176
+ tensor<int32, []> concat_4_axis_0 = const()[name = tensor<string, []>("concat_4_axis_0"), val = tensor<int32, []>(0)];
177
+ tensor<bool, []> concat_4_interleave_0 = const()[name = tensor<string, []>("concat_4_interleave_0"), val = tensor<bool, []>(false)];
178
+ tensor<int32, []> gather_8_cast_uint16_to_int32 = cast(dtype = gather_8_cast_uint16_to_int32_dtype_0, x = gather_8_cast_uint16)[name = tensor<string, []>("cast_19")];
179
+ tensor<int32, [2]> concat_4 = concat(axis = concat_4_axis_0, interleave = concat_4_interleave_0, values = (gather_8_cast_uint16_to_int32, var_9))[name = tensor<string, []>("concat_4")];
180
+ tensor<int32, [1]> expand_dims_0_axes_0 = const()[name = tensor<string, []>("expand_dims_0_axes_0"), val = tensor<int32, [1]>([0])];
181
+ tensor<int32, [1, ?]> expand_dims_0 = expand_dims(axes = expand_dims_0_axes_0, x = mask_1)[name = tensor<string, []>("expand_dims_0")];
182
+ tensor<int32, [?, ?]> var_126 = tile(reps = concat_4, x = expand_dims_0)[name = tensor<string, []>("op_126")];
183
+ tensor<bool, [?, ?]> mask = greater_equal(x = var_126, y = var_93)[name = tensor<string, []>("mask")];
184
+ tensor<int32, [1]> var_129_axes_0 = const()[name = tensor<string, []>("op_129_axes_0"), val = tensor<int32, [1]>([1])];
185
+ tensor<bool, [?, 1, ?]> var_129 = expand_dims(axes = var_129_axes_0, x = mask)[name = tensor<string, []>("op_129")];
186
+ tensor<fp16, []> cast_15_to_fp16 = const()[name = tensor<string, []>("cast_15_to_fp16"), val = tensor<fp16, []>(0x0p+0)];
187
+ tensor<fp16, [?, 80, ?]> processed_signal_cast_fp16 = select(a = cast_15_to_fp16, b = x_cast_fp16, cond = var_129)[name = tensor<string, []>("processed_signal_cast_fp16")];
188
+ tensor<string, []> processed_signal_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("processed_signal_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
189
+ tensor<fp32, [?, 80, ?]> mel_features = cast(dtype = processed_signal_cast_fp16_to_fp32_dtype_0, x = processed_signal_cast_fp16)[name = tensor<string, []>("cast_18")];
190
+ } -> (mel_features, mel_length);
191
+ }
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c062338de852a26607ce4101f74e6895de3a4134a57b07232bd72bfc6f1d7f1a
3
+ size 567712
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/metadata.json ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "nvidia/parakeet-tdt_ctc-110m",
3
+ "model_type": "hybrid_rnnt_ctc",
4
+ "sample_rate": 16000,
5
+ "max_audio_seconds": 15.0,
6
+ "max_audio_samples": 240000,
7
+ "max_symbol_steps": 1,
8
+ "vocab_size": 1024,
9
+ "joint_extra_outputs": 5,
10
+ "encoder_dim": 512,
11
+ "decoder_dim": 640,
12
+ "decoder_hidden": 640,
13
+ "decoder_layers": 1,
14
+ "blank_id": 1024,
15
+ "checkpoint": {
16
+ "type": "pretrained",
17
+ "model_id": "nvidia/parakeet-tdt_ctc-110m"
18
+ },
19
+ "coreml": {
20
+ "compute_units": "CPU_ONLY",
21
+ "compute_precision": "FLOAT32"
22
+ },
23
+ "components": {
24
+ "preprocessor": {
25
+ "inputs": {
26
+ "audio_signal": [
27
+ 1,
28
+ 240000
29
+ ],
30
+ "audio_length": [
31
+ 1
32
+ ]
33
+ },
34
+ "outputs": {
35
+ "mel": [
36
+ 1,
37
+ 80,
38
+ 1501
39
+ ],
40
+ "mel_length": [
41
+ 1
42
+ ]
43
+ },
44
+ "path": "parakeet_preprocessor.mlpackage"
45
+ },
46
+ "encoder": {
47
+ "inputs": {
48
+ "mel": [
49
+ 1,
50
+ 80,
51
+ 1501
52
+ ],
53
+ "mel_length": [
54
+ 1
55
+ ]
56
+ },
57
+ "outputs": {
58
+ "encoder": [
59
+ 1,
60
+ 512,
61
+ 188
62
+ ],
63
+ "encoder_length": [
64
+ 1
65
+ ]
66
+ },
67
+ "path": "parakeet_encoder.mlpackage"
68
+ },
69
+ "ctc_head": {
70
+ "inputs": {
71
+ "encoder": [
72
+ 1,
73
+ 512,
74
+ 188
75
+ ]
76
+ },
77
+ "outputs": {
78
+ "log_probs": [
79
+ 1,
80
+ 188,
81
+ 1025
82
+ ]
83
+ },
84
+ "path": "parakeet_ctc_head.mlpackage"
85
+ },
86
+ "mel_encoder": {
87
+ "inputs": {
88
+ "audio_signal": [
89
+ 1,
90
+ 240000
91
+ ],
92
+ "audio_length": [
93
+ 1
94
+ ]
95
+ },
96
+ "outputs": {
97
+ "encoder": [
98
+ 1,
99
+ 512,
100
+ 188
101
+ ],
102
+ "encoder_length": [
103
+ 1
104
+ ]
105
+ },
106
+ "path": "parakeet_mel_encoder.mlpackage"
107
+ },
108
+ "decoder": {
109
+ "inputs": {
110
+ "targets": [
111
+ 1,
112
+ 1
113
+ ],
114
+ "target_length": [
115
+ 1
116
+ ],
117
+ "h_in": [
118
+ 1,
119
+ 1,
120
+ 640
121
+ ],
122
+ "c_in": [
123
+ 1,
124
+ 1,
125
+ 640
126
+ ]
127
+ },
128
+ "outputs": {
129
+ "decoder": [
130
+ 1,
131
+ 640,
132
+ 1
133
+ ],
134
+ "h_out": [
135
+ 1,
136
+ 1,
137
+ 640
138
+ ],
139
+ "c_out": [
140
+ 1,
141
+ 1,
142
+ 640
143
+ ]
144
+ },
145
+ "path": "parakeet_decoder.mlpackage"
146
+ },
147
+ "joint": {
148
+ "inputs": {
149
+ "encoder": [
150
+ 1,
151
+ 512,
152
+ 188
153
+ ],
154
+ "decoder": [
155
+ 1,
156
+ 640,
157
+ 1
158
+ ]
159
+ },
160
+ "outputs": {
161
+ "logits": [
162
+ 1,
163
+ 188,
164
+ 1,
165
+ 1030
166
+ ]
167
+ },
168
+ "path": "parakeet_joint.mlpackage"
169
+ },
170
+ "joint_decision": {
171
+ "inputs": {
172
+ "encoder": [
173
+ 1,
174
+ 512,
175
+ 188
176
+ ],
177
+ "decoder": [
178
+ 1,
179
+ 640,
180
+ 1
181
+ ]
182
+ },
183
+ "outputs": {
184
+ "token_id": [
185
+ 1,
186
+ 188,
187
+ 1
188
+ ],
189
+ "token_prob": [
190
+ 1,
191
+ 188,
192
+ 1
193
+ ],
194
+ "duration": [
195
+ 1,
196
+ 188,
197
+ 1
198
+ ]
199
+ },
200
+ "path": "parakeet_joint_decision.mlpackage"
201
+ },
202
+ "joint_decision_single_step": {
203
+ "inputs": {
204
+ "encoder_step": [
205
+ 1,
206
+ 512,
207
+ 1
208
+ ],
209
+ "decoder_step": [
210
+ 1,
211
+ 640,
212
+ 1
213
+ ]
214
+ },
215
+ "outputs": {
216
+ "token_id": [
217
+ 1,
218
+ 1,
219
+ 1
220
+ ],
221
+ "token_prob": [
222
+ 1,
223
+ 1,
224
+ 1
225
+ ],
226
+ "duration": [
227
+ 1,
228
+ 1,
229
+ 1
230
+ ],
231
+ "top_k_ids": [
232
+ 1,
233
+ 1,
234
+ 1,
235
+ 64
236
+ ],
237
+ "top_k_logits": [
238
+ 1,
239
+ 1,
240
+ 1,
241
+ 64
242
+ ]
243
+ },
244
+ "path": "parakeet_joint_decision_single_step.mlpackage"
245
+ }
246
+ }
247
+ }
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"0": "<unk>", "1": "▁t", "2": "▁th", "3": "▁a", "4": "in", "5": "re", "6": "▁the", "7": "▁w", "8": "▁s", "9": "▁o", "10": "er", "11": "ou", "12": "at", "13": "nd", "14": "it", "15": "▁h", "16": "▁c", "17": "▁b", "18": "is", "19": "en", "20": "on", "21": "ing", "22": "▁f", "23": "▁to", "24": "▁m", "25": "es", "26": "▁p", "27": "or", "28": "an", "29": "▁d", "30": "ll", "31": "▁I", "32": "ed", "33": "▁and", "34": "▁l", "35": "▁of", "36": "▁in", "37": "▁y", "38": "ar", "39": "▁g", "40": "▁you", "41": "as", "42": "om", "43": "▁n", "44": "ve", "45": "▁that", "46": "le", "47": "ic", "48": "us", "49": "ow", "50": "et", "51": "al", "52": "▁e", "53": "ut", "54": "▁it", "55": "ot", "56": "▁be", "57": "▁T", "58": "ion", "59": "▁is", "60": "▁wh", "61": "▁re", "62": "▁on", "63": "▁we", "64": "ent", "65": "▁A", "66": "ay", "67": "▁ha", "68": "▁Th", "69": "id", "70": "▁S", "71": "ac", "72": "gh", "73": "ver", "74": "ke", "75": "▁for", "76": "im", "77": "ly", "78": "ur", "79": "ld", "80": "▁he", "81": "▁st", "82": "all", "83": "ro", "84": "st", "85": "se", "86": "ct", "87": "ith", "88": "ir", "89": "am", "90": "▁this", "91": "if", "92": "▁W", "93": "oo", "94": "ri", "95": "▁was", "96": "ght", "97": "▁u", "98": "▁with", "99": "ad", "100": "ch", "101": "▁se", "102": "▁k", "103": "▁an", "104": "▁The", "105": "▁li", "106": "▁do", "107": "▁B", "108": "▁have", "109": "▁as", "110": "th", "111": "▁are", "112": "▁sh", "113": "ust", "114": "ce", "115": "ally", "116": "ill", "117": "▁H", "118": "▁j", "119": "ter", "120": "▁go", "121": "▁And", "122": "ation", "123": "▁C", "124": "▁so", "125": "ome", "126": "▁not", "127": "op", "128": "il", "129": "ore", "130": "▁ne", "131": "▁can", "132": "▁me", "133": "▁at", "134": "ould", "135": "ant", "136": "▁M", "137": "▁like", "138": "ere", "139": "▁they", "140": "ra", "141": "ers", "142": "▁ab", "143": "▁de", "144": "▁kn", "145": "ge", "146": "▁Y", "147": "▁ch", "148": "ul", "149": "pp", "150": "▁or", "151": "▁al", "152": "▁con", "153": "▁com", 
"154": "ess", "155": "▁su", "156": "out", "157": "▁your", "158": "▁So", "159": "ate", "160": "▁one", "161": "▁all", "162": "▁ex", "163": "est", "164": "▁fr", "165": "▁just", "166": "▁pro", "167": "▁know", "168": "▁O", "169": "ain", "170": "▁but", "171": "ol", "172": "ive", "173": "▁v", "174": "use", "175": "very", "176": "art", "177": "qu", "178": "▁my", "179": "el", "180": "▁N", "181": "nt", "182": "▁It", "183": "▁what", "184": "ab", "185": "▁P", "186": "▁wor", "187": "▁out", "188": "▁there", "189": "▁up", "190": "um", "191": "▁from", "192": "pe", "193": "▁tw", "194": "▁r", "195": "and", "196": "ight", "197": "ort", "198": "un", "199": "▁L", "200": "ist", "201": "▁about", "202": "ide", "203": "ig", "204": "ake", "205": "▁D", "206": "em", "207": "os", "208": "king", "209": "rou", "210": "ind", "211": "our", "212": "res", "213": "▁We", "214": "▁get", "215": "▁E", "216": "▁G", "217": "ack", "218": "▁le", "219": "ity", "220": "od", "221": "▁F", "222": "ard", "223": "▁pl", "224": "▁our", "225": "▁int", "226": "ment", "227": "▁will", "228": "ies", "229": "▁by", "230": "ink", "231": "ca", "232": "▁if", "233": "red", "234": "her", "235": "ie", "236": "▁us", "237": "▁some", "238": "▁don", "239": "ven", "240": "ood", "241": "ast", "242": "▁R", "243": "▁his", "244": "▁tim", "245": "▁tr", "246": "▁more", "247": "ich", "248": "ous", "249": "ame", "250": "▁going", "251": "▁had", "252": "▁them", "253": "ook", "254": "▁pe", "255": "▁Wh", "256": "▁You", "257": "▁But", "258": "ine", "259": "▁here", "260": "▁would", "261": "cause", "262": "right", "263": "so", "264": "ost", "265": "ure", "266": "▁has", "267": "ect", "268": "▁think", "269": "▁fe", "270": "ong", "271": "▁see", "272": "▁when", "273": "▁who", "274": "▁were", "275": "▁really", "276": "▁their", "277": "▁want", "278": "one", "279": "ople", "280": "▁then", "281": "▁time", "282": "▁sa", "283": "ap", "284": "▁te", "285": "▁He", "286": "▁ye", "287": "ck", "288": "▁her", "289": "▁thing", "290": "▁right", "291": "▁which", "292": 
"itt", "293": "ice", "294": "act", "295": "▁people", "296": "ty", "297": "▁two", "298": "▁J", "299": "▁im", "300": "ther", "301": "ci", "302": "ose", "303": "▁cl", "304": "▁qu", "305": "▁man", "306": "▁also", "307": "ree", "308": "▁en", "309": "ud", "310": "▁how", "311": "reat", "312": "ak", "313": "hing", "314": "ag", "315": "▁any", "316": "ff", "317": "ace", "318": "per", "319": "▁because", "320": "▁very", "321": "own", "322": "▁ad", "323": "▁act", "324": "▁been", "325": "▁now", "326": "▁ag", "327": "▁into", "328": "▁comp", "329": "ars", "330": "ions", "331": "are", "332": "ite", "333": "iv", "334": "▁these", "335": "ays", "336": "ep", "337": "▁This", "338": "▁she", "339": "ans", "340": "ah", "341": "een", "342": "▁over", "343": "ry", "344": "▁lo", "345": "age", "346": "▁pr", "347": "▁sp", "348": "ue", "349": "▁co", "350": "ick", "351": "ber", "352": "▁did", "353": "ip", "354": "ach", "355": "▁back", "356": "▁no", "357": "▁cont", "358": "▁other", "359": "▁every", "360": "pt", "361": "▁need", "362": "▁him", "363": "▁U", "364": "▁In", "365": "▁work", "366": "irst", "367": "▁part", "368": "▁look", "369": "ittle", "370": "ble", "371": "iz", "372": "▁un", "373": "▁make", "374": "omet", "375": "nder", "376": "ish", "377": "na", "378": "▁little", "379": "▁off", "380": "▁than", "381": "▁got", "382": "ually", "383": "▁per", "384": "▁good", "385": "▁way", "386": "▁could", "387": "▁ac", "388": "▁imp", "389": "able", "390": "▁where", "391": "iff", "392": "▁That", "393": "▁res", "394": "ount", "395": "pl", "396": "ance", "397": "▁first", "398": "▁ro", "399": "▁pre", "400": "ass", "401": "▁say", "402": "int", "403": "ated", "404": "ire", "405": "uch", "406": "ase", "407": "▁somet", "408": "ound", "409": "▁down", "410": "▁diff", "411": "sel", "412": "▁gu", "413": "▁am", "414": "ress", "415": "▁lot", "416": "ence", "417": "▁dis", "418": "orm", "419": "ix", "420": "▁po", "421": "ving", "422": "enty", "423": "▁K", "424": "▁spe", "425": "und", "426": "he", "427": "▁much", "428": 
"▁ar", "429": "round", "430": "▁app", "431": "co", "432": "ark", "433": "▁new", "434": "ater", "435": "ult", "436": "end", "437": "▁even", "438": "▁start", "439": "ations", "440": "rough", "441": "ile", "442": "fter", "443": "▁well", "444": "be", "445": "▁They", "446": "▁three", "447": "ign", "448": "ild", "449": "▁said", "450": "ough", "451": "ang", "452": "▁too", "453": "ade", "454": "▁bl", "455": "ens", "456": "▁inc", "457": "ia", "458": "▁those", "459": "▁mo", "460": "▁take", "461": "▁through", "462": "▁fl", "463": "▁kind", "464": "▁things", "465": "▁bet", "466": "▁only", "467": "▁St", "468": "▁let", "469": "cess", "470": "▁Ch", "471": "ary", "472": "vel", "473": "▁If", "474": "xt", "475": "other", "476": "av", "477": "ical", "478": "ord", "479": "▁again", "480": "▁something", "481": "onna", "482": "fore", "483": "▁may", "484": "ting", "485": "▁bu", "486": "▁differe", "487": "urn", "488": "▁gonna", "489": "▁does", "490": "uct", "491": "og", "492": "▁twenty", "493": "▁gr", "494": "▁Ye", "495": "wn", "496": "▁should", "497": "▁comm", "498": "ition", "499": "▁under", "500": "▁hel", "501": "ory", "502": "▁fo", "503": "▁use", "504": "igh", "505": "ife", "506": "▁actually", "507": "▁tal", "508": "▁call", "509": "ents", "510": "ious", "511": "ull", "512": "▁There", "513": "▁Yeah", "514": "▁most", "515": "▁ke", "516": "ors", "517": "ved", "518": "ys", "519": "▁sc", "520": "▁happ", "521": "ope", "522": "▁help", "523": "atch", "524": "▁What", "525": "▁rem", "526": "ple", "527": "▁Now", "528": "▁br", "529": "ool", "530": "oth", "531": "▁four", "532": "self", "533": "▁str", "534": "ne", "535": "thing", "536": "▁put", "537": "ial", "538": "▁great", "539": "ail", "540": "ub", "541": "ning", "542": "▁sm", "543": "▁feel", "544": "▁five", "545": "ody", "546": "undred", "547": "iss", "548": "ank", "549": "get", "550": "aking", "551": "▁many", "552": "▁hundred", "553": "▁years", "554": "▁being", "555": "▁come", "556": "▁mean", "557": "ily", "558": "▁different", "559": "▁after", 
"560": "▁ser", "561": "▁show", "562": "form", "563": "ful", "564": "oy", "565": "▁six", "566": "▁vide", "567": "▁V", "568": "▁its", "569": "▁point", "570": "▁day", "571": "▁des", "572": "ons", "573": "▁bit", "574": "▁bel", "575": "▁before", "576": "▁aw", "577": "▁end", "578": "▁Oh", "579": "▁still", "580": "ath", "581": "▁long", "582": "▁'", "583": "ise", "584": "ob", "585": "day", "586": "▁add", "587": "ft", "588": "ves", "589": "ces", "590": "ady", "591": "▁cr", "592": "▁around", "593": "▁try", "594": "les", "595": "vers", "596": "kay", "597": "ian", "598": "ates", "599": "▁find", "600": "ward", "601": "▁As", "602": "▁eight", "603": "lic", "604": "▁same", "605": "▁pos", "606": "▁em", "607": "▁made", "608": "▁supp", "609": "▁life", "610": "▁Be", "611": "pect", "612": "▁dec", "613": "▁play", "614": "ange", "615": "▁att", "616": "▁pers", "617": "ways", "618": "▁high", "619": "▁hand", "620": "▁next", "621": "▁cons", "622": "▁own", "623": "▁inv", "624": "ower", "625": "▁ind", "626": "ert", "627": "ng", "628": "ave", "629": "▁year", "630": "▁big", "631": "ating", "632": "▁world", "633": "▁rel", "634": "▁sure", "635": "▁tra", "636": "ew", "637": "ered", "638": "▁fin", "639": "▁Well", "640": "▁sl", "641": "▁doing", "642": "bs", "643": "▁set", "644": "▁rec", "645": "ual", "646": "cial", "647": "▁ph", "648": "erm", "649": "▁love", "650": "ph", "651": "▁real", "652": "▁last", "653": "ict", "654": "▁bo", "655": "▁ra", "656": "ible", "657": "▁wr", "658": "mer", "659": "▁count", "660": "ities", "661": "▁always", "662": "inet", "663": "ments", "664": "uc", "665": "▁might", "666": "▁inter", "667": "▁video", "668": "gin", "669": "▁tell", "670": "▁never", "671": "vent", "672": "▁import", "673": "ied", "674": "▁sy", "675": "▁How", "676": "ically", "677": "ought", "678": "▁thir", "679": "▁rep", "680": "ks", "681": "ib", "682": "▁fam", "683": "ject", "684": "▁bas", "685": "▁She", "686": "▁give", "687": "akes", "688": "▁ninet", "689": "▁reg", "690": "▁min", "691": "▁op", "692": 
"▁def", "693": "▁didn", "694": "te", "695": "▁cour", "696": "▁why", "697": "▁ent", "698": "▁place", "699": "▁ins", "700": "▁car", "701": "ather", "702": "▁person", "703": "ular", "704": "▁inst", "705": "▁prod", "706": "lect", "707": "▁Al", "708": "▁today", "709": "▁bec", "710": "▁sur", "711": "▁All", "712": "▁another", "713": "▁bus", "714": "▁keep", "715": "ell", "716": "ese", "717": "riend", "718": "▁quest", "719": "▁talk", "720": "als", "721": "ings", "722": "▁mon", "723": "cond", "724": "old", "725": "▁acc", "726": "▁la", "727": "▁num", "728": "ident", "729": "▁che", "730": "iness", "731": "▁turn", "732": "▁ear", "733": "▁No", "734": "ousand", "735": "▁better", "736": "ific", "737": "▁loo", "738": "▁gl", "739": "oc", "740": "▁important", "741": "ited", "742": "▁An", "743": "▁thousand", "744": "ility", "745": "llow", "746": "▁used", "747": "▁gen", "748": "▁sim", "749": "li", "750": "▁happen", "751": "▁Un", "752": "▁Let", "753": "air", "754": "ock", "755": "ably", "756": "gg", "757": "▁watch", "758": "▁For", "759": "▁sw", "760": "ren", "761": "ute", "762": "ever", "763": "▁pol", "764": "▁sch", "765": "▁When", "766": "▁such", "767": "▁fif", "768": "▁home", "769": "▁cle", "770": "▁contin", "771": "ouse", "772": "▁friend", "773": "uring", "774": "▁Okay", "775": "gr", "776": "▁able", "777": "▁stud", "778": "▁eff", "779": "hip", "780": "body", "781": "▁top", "782": "ness", "783": "▁exper", "784": "▁pret", "785": "▁both", "786": "▁done", "787": "cri", "788": "▁mark", "789": "▁while", "790": "▁old", "791": "ros", "792": "ont", "793": "▁second", "794": "ative", "795": "▁thought", "796": "▁best", "797": "▁found", "798": "iew", "799": "▁belie", "800": "▁each", "801": "erest", "802": "▁tri", "803": "▁eas", "804": "▁ca", "805": "▁fact", "806": "▁care", "807": "▁fun", "808": "atter", "809": "ures", "810": "▁head", "811": "▁lear", "812": "▁water", "813": "▁hard", "814": "▁few", "815": "▁side", "816": "ween", "817": "▁exp", "818": "▁away", "819": "its", "820": "▁ext", "821": 
"lud", "822": "▁run", "823": "▁trans", "824": "ince", "825": "▁sk", "826": "▁open", "827": "cus", "828": "▁between", "829": "▁called", "830": "▁wee", "831": "▁pretty", "832": "ason", "833": "▁far", "834": "ember", "835": "omm", "836": "▁interest", "837": "any", "838": "ner", "839": "uff", "840": "▁pres", "841": "▁cur", "842": "▁child", "843": "ee", "844": "▁toget", "845": "▁together", "846": "olog", "847": "▁God", "848": "ond", "849": "▁char", "850": "▁looking", "851": "stem", "852": "az", "853": "cent", "854": "▁ob", "855": "▁ass", "856": "land", "857": "▁doesn", "858": "▁business", "859": "▁course", "860": "▁ten", "861": "ps", "862": "arch", "863": "ced", "864": "ms", "865": "ize", "866": "nce", "867": "▁ref", "868": "▁name", "869": "ross", "870": "▁grow", "871": "oney", "872": "▁went", "873": "ics", "874": "teen", "875": "▁cou", "876": "▁prob", "877": "▁ret", "878": "▁guys", "879": "▁came", "880": "ash", "881": "led", "882": "▁Eur", "883": "ues", "884": "▁ide", "885": "gan", "886": "▁everything", "887": "▁getting", "888": "▁ask", "889": "▁cor", "890": "▁build", "891": "▁sign", "892": "▁small", "893": "uck", "894": "▁el", "895": "▁col", "896": "▁Is", "897": "ational", "898": "stand", "899": "cy", "900": "▁conf", "901": "der", "902": "▁bre", "903": "▁cap", "904": "▁mod", "905": "ets", "906": "ike", "907": "▁number", "908": "▁comple", "909": "ertain", "910": "▁ever", "911": "▁coll", "912": "▁hum", "913": "▁Europe", "914": "▁cre", "915": "▁met", "916": "▁exam", "917": "▁move", "918": "▁pass", "919": "▁left", "920": "▁system", "921": "▁includ", "922": "▁Thank", "923": "cept", "924": "▁wom", "925": "▁product", "926": "ten", "927": "▁rest", "928": "▁probably", "929": "▁dri", "930": "▁Do", "931": "▁gener", "932": "▁anything", "933": "▁lar", "934": "▁My", "935": "▁school", "936": "▁lead", "937": "▁sub", "938": "▁ty", "939": "▁plan", "940": "▁seem", "941": "▁whole", "942": "irect", "943": "▁light", "944": "▁must", "945": "▁mom", "946": "▁opp", "947": "▁support", "948": 
"▁family", "949": "ices", "950": "amp", "951": "▁proble", "952": "▁dr", "953": "ready", "954": "▁using", "955": "ense", "956": "▁prov", "957": "ush", "958": "ax", "959": "▁power", "960": "▁Re", "961": "alth", "962": "▁ev", "963": "▁stand", "964": "��war", "965": "ts", "966": "▁", "967": "e", "968": "t", "969": "o", "970": "a", "971": "n", "972": "i", "973": "s", "974": "r", "975": "h", "976": "l", "977": "d", "978": "u", "979": "c", "980": "m", "981": "y", "982": "g", "983": "w", "984": "f", "985": "p", "986": ".", "987": "b", "988": ",", "989": "v", "990": "k", "991": "'", "992": "I", "993": "T", "994": "A", "995": "S", "996": "x", "997": "W", "998": "j", "999": "B", "1000": "C", "1001": "H", "1002": "?", "1003": "M", "1004": "O", "1005": "Y", "1006": "N", "1007": "P", "1008": "E", "1009": "q", "1010": "L", "1011": "D", "1012": "z", "1013": "G", "1014": "F", "1015": "R", "1016": "!", "1017": "J", "1018": "U", "1019": "K", "1020": "V", "1021": "Q", "1022": "Z", "1023": "X"}
convert/parakeet-tdt-ctc-110m/coreml/convert-parakeet.py ADDED
@@ -0,0 +1,697 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """CLI for exporting Parakeet TDT-CTC 110M Hybrid components to CoreML."""
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from dataclasses import asdict
7
+ from pathlib import Path
8
+ from typing import Dict, Optional, Tuple
9
+
10
+ import coremltools as ct
11
+ import numpy as np
12
+ import soundfile as sf
13
+ import torch
14
+ import typer
15
+
16
+ import nemo.collections.asr as nemo_asr
17
+
18
+ from individual_components import (
19
+ CTCHeadWrapper,
20
+ DecoderWrapper,
21
+ EncoderWrapper,
22
+ ExportSettings,
23
+ JointWrapper,
24
+ JointDecisionWrapper,
25
+ JointDecisionSingleStep,
26
+ PreprocessorWrapper,
27
+ MelEncoderWrapper,
28
+ _coreml_convert,
29
+ )
30
+
31
+ DEFAULT_MODEL_ID = "nvidia/parakeet-tdt_ctc-110m"
32
+ AUTHOR = "Fluid Inference"
33
+
34
+
35
+ def _compute_length(seconds: float, sample_rate: int) -> int:
36
+ return int(round(seconds * sample_rate))
37
+
38
+
39
+ def _prepare_audio(
40
+ validation_audio: Optional[Path],
41
+ sample_rate: int,
42
+ max_samples: int,
43
+ seed: Optional[int],
44
+ ) -> torch.Tensor:
45
+ if validation_audio is None:
46
+ if seed is not None:
47
+ torch.manual_seed(seed)
48
+ audio = torch.randn(1, max_samples, dtype=torch.float32)
49
+ return audio
50
+
51
+ data, sr = sf.read(str(validation_audio), dtype="float32")
52
+ if sr != sample_rate:
53
+ raise typer.BadParameter(
54
+ f"Validation audio sample rate {sr} does not match model rate {sample_rate}"
55
+ )
56
+
57
+ if data.ndim > 1:
58
+ data = data[:, 0]
59
+
60
+ if data.size == 0:
61
+ raise typer.BadParameter("Validation audio is empty")
62
+
63
+ if data.size < max_samples:
64
+ pad_width = max_samples - data.size
65
+ data = np.pad(data, (0, pad_width))
66
+ elif data.size > max_samples:
67
+ data = data[:max_samples]
68
+
69
+ audio = torch.from_numpy(data).unsqueeze(0).to(dtype=torch.float32)
70
+ return audio
71
+
72
+
73
def _save_mlpackage(model: ct.models.MLModel, path: Path, description: str) -> None:
    """Stamp metadata on a converted CoreML model and write it to ``path``.

    Sets the short description and the module-level ``AUTHOR``, requests an
    iOS 17+ deployment target, creates parent directories as needed, and saves
    the .mlpackage.
    """
    # Best-effort iOS 17+ target for MLProgram ops and ANE readiness; some
    # coremltools versions may not accept the assignment, so failures are ignored.
    try:
        model.minimum_deployment_target = ct.target.iOS17
    except Exception:
        pass
    model.short_description = description
    model.author = AUTHOR
    destination = str(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    model.save(destination)
83
+
84
+
85
+ def _tensor_shape(tensor: torch.Tensor) -> Tuple[int, ...]:
86
+ return tuple(int(dim) for dim in tensor.shape)
87
+
88
+
89
def _parse_compute_units(name: str) -> ct.ComputeUnit:
    """Map a human-friendly string onto a ``ct.ComputeUnit`` member.

    Accepted (case-insensitive): ALL, CPU_ONLY, CPU_AND_GPU, CPU_AND_NE,
    with CPU_AND_NEURALENGINE accepted as an alias for CPU_AND_NE.

    Raises:
        typer.BadParameter: if ``name`` matches none of the accepted spellings.
    """
    table = {
        "ALL": ct.ComputeUnit.ALL,
        "CPU_ONLY": ct.ComputeUnit.CPU_ONLY,
        "CPU_AND_GPU": ct.ComputeUnit.CPU_AND_GPU,
        "CPU_AND_NE": ct.ComputeUnit.CPU_AND_NE,
        "CPU_AND_NEURALENGINE": ct.ComputeUnit.CPU_AND_NE,
    }
    key = str(name).strip().upper()
    unit = table.get(key)
    if unit is None:
        raise typer.BadParameter(
            f"Unknown compute units '{name}'. Choose from: " + ", ".join(table.keys())
        )
    return unit
107
+
108
+
109
def _parse_compute_precision(name: Optional[str]) -> Optional[ct.precision]:
    """Map a precision string onto ``ct.precision``, or None for the tool default.

    Accepted (case-insensitive): FLOAT32, FLOAT16. ``None`` or a blank/empty
    string selects the converter's default precision (returns None).

    Raises:
        typer.BadParameter: for any other value.
    """
    if name is None:
        return None
    normalized = str(name).strip().upper()
    if not normalized:
        # Empty after trimming: fall back to the tool default.
        return None
    table = {
        "FLOAT32": ct.precision.FLOAT32,
        "FLOAT16": ct.precision.FLOAT16,
    }
    precision = table.get(normalized)
    if precision is None:
        raise typer.BadParameter(
            f"Unknown compute precision '{name}'. Choose from: " + ", ".join(table.keys())
        )
    return precision
128
+
129
+
130
+ # Fixed export choices: CPU_ONLY + FP32, min target iOS17
131
+
132
+
133
+ app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False)
134
+
135
+
136
+ @app.command()
137
+ def convert(
138
+ nemo_path: Optional[Path] = typer.Option(
139
+ None,
140
+ "--nemo-path",
141
+ exists=True,
142
+ resolve_path=True,
143
+ help="Path to parakeet-tdt_ctc-110m .nemo checkpoint (skip to auto-download)",
144
+ ),
145
+ model_id: str = typer.Option(
146
+ DEFAULT_MODEL_ID,
147
+ "--model-id",
148
+ help="Model identifier to download when --nemo-path is omitted",
149
+ ),
150
+ output_dir: Path = typer.Option(Path("parakeet_110m_coreml"), help="Directory where mlpackages and metadata will be written"),
151
+ preprocessor_cu: str = typer.Option(
152
+ "CPU_ONLY",
153
+ "--preprocessor-cu",
154
+ help="Compute units for preprocessor (default CPU_ONLY)",
155
+ ),
156
+ mel_encoder_cu: str = typer.Option(
157
+ "CPU_ONLY",
158
+ "--mel-encoder-cu",
159
+ help="Compute units for fused mel+encoder (default CPU_ONLY)",
160
+ ),
161
+ compute_precision: Optional[str] = typer.Option(
162
+ None,
163
+ "--compute-precision",
164
+ help="Export precision: FLOAT32 (default) or FLOAT16 to shrink non-quantized weights.",
165
+ ),
166
+ ) -> None:
167
+ """Export all Parakeet TDT-CTC 110M Hybrid sub-modules to CoreML with a fixed 15-second window.
168
+
169
+ This exports both CTC and TDT components from the hybrid model.
170
+ """
171
+ # Runtime CoreML contract keeps U=1 so the prediction net matches the streaming decoder.
172
+ export_settings = ExportSettings(
173
+ output_dir=output_dir,
174
+ compute_units=ct.ComputeUnit.CPU_ONLY, # Default: CPU-only for all components
175
+ deployment_target=ct.target.iOS17, # iOS 17+ features and kernels
176
+ compute_precision=_parse_compute_precision(compute_precision),
177
+ max_audio_seconds=15.0,
178
+ max_symbol_steps=1,
179
+ )
180
+
181
+ typer.echo("Export configuration:")
182
+ typer.echo(asdict(export_settings))
183
+
184
+ output_dir.mkdir(parents=True, exist_ok=True)
185
+ pre_cu = _parse_compute_units(preprocessor_cu)
186
+ melenc_cu = _parse_compute_units(mel_encoder_cu)
187
+
188
+ if nemo_path is not None:
189
+ typer.echo(f"Loading NeMo model from {nemo_path}…")
190
+ # 110M is a hybrid model: EncDecHybridRNNTCTCBPEModel
191
+ asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(
192
+ str(nemo_path), map_location="cpu"
193
+ )
194
+ checkpoint_meta = {
195
+ "type": "file",
196
+ "path": str(nemo_path),
197
+ }
198
+ else:
199
+ typer.echo(f"Downloading NeMo model via {model_id}…")
200
+ # 110M is a hybrid model: EncDecHybridRNNTCTCBPEModel
201
+ asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
202
+ model_id, map_location="cpu"
203
+ )
204
+ checkpoint_meta = {
205
+ "type": "pretrained",
206
+ "model_id": model_id,
207
+ }
208
+ asr_model.eval()
209
+
210
+ sample_rate = int(asr_model.cfg.preprocessor.sample_rate)
211
+ max_samples = _compute_length(export_settings.max_audio_seconds, sample_rate)
212
+
213
+ # Look for a bundled 15s 16kHz audio file
214
+ default_audio = (Path(__file__).parent / "audio" / "yc_first_minute_16k_15s.wav").resolve()
215
+ if default_audio.exists():
216
+ typer.echo(f"Using trace audio: {default_audio}")
217
+ audio_tensor = _prepare_audio(default_audio, sample_rate, max_samples, seed=None)
218
+ else:
219
+ typer.echo("No trace audio found, using random noise for tracing")
220
+ audio_tensor = _prepare_audio(None, sample_rate, max_samples, seed=42)
221
+ audio_length = torch.tensor([max_samples], dtype=torch.int32)
222
+
223
+ preprocessor = PreprocessorWrapper(asr_model.preprocessor.eval())
224
+ encoder = EncoderWrapper(asr_model.encoder.eval())
225
+ decoder = DecoderWrapper(asr_model.decoder.eval())
226
+ joint = JointWrapper(asr_model.joint.eval())
227
+ # CTC head for hybrid model
228
+ ctc_head = CTCHeadWrapper(asr_model.ctc_decoder.eval())
229
+
230
+ decoder_export_flag = getattr(asr_model.decoder, "_rnnt_export", False)
231
+ asr_model.decoder._rnnt_export = True
232
+
233
+ try:
234
+ with torch.inference_mode():
235
+ mel_ref, mel_length_ref = preprocessor(audio_tensor, audio_length)
236
+ mel_length_ref = mel_length_ref.to(dtype=torch.int32)
237
+ encoder_ref, encoder_length_ref = encoder(mel_ref, mel_length_ref)
238
+ encoder_length_ref = encoder_length_ref.to(dtype=torch.int32)
239
+ # CTC log probs
240
+ ctc_log_probs_ref = ctc_head(encoder_ref)
241
+
242
+ # Clone Tensors to drop the inference tensor flag before tracing
243
+ mel_ref = mel_ref.clone()
244
+ mel_length_ref = mel_length_ref.clone()
245
+ encoder_ref = encoder_ref.clone()
246
+ encoder_length_ref = encoder_length_ref.clone()
247
+ ctc_log_probs_ref = ctc_log_probs_ref.clone()
248
+
249
+ vocab_size = int(asr_model.tokenizer.vocab_size)
250
+ num_extra = int(asr_model.joint.num_extra_outputs)
251
+ decoder_hidden = int(asr_model.decoder.pred_hidden)
252
+ decoder_layers = int(asr_model.decoder.pred_rnn_layers)
253
+
254
+ typer.echo(f"Model info:")
255
+ typer.echo(f" Vocab size: {vocab_size}")
256
+ typer.echo(f" Num extra (duration bins): {num_extra}")
257
+ typer.echo(f" Decoder hidden: {decoder_hidden}")
258
+ typer.echo(f" Decoder layers: {decoder_layers}")
259
+ typer.echo(f" Encoder output shape: {_tensor_shape(encoder_ref)}")
260
+
261
+ targets = torch.full(
262
+ (1, export_settings.max_symbol_steps),
263
+ fill_value=asr_model.decoder.blank_idx,
264
+ dtype=torch.int32,
265
+ )
266
+ target_lengths = torch.tensor(
267
+ [export_settings.max_symbol_steps], dtype=torch.int32
268
+ )
269
+ zero_state = torch.zeros(
270
+ decoder_layers,
271
+ 1,
272
+ decoder_hidden,
273
+ dtype=torch.float32,
274
+ )
275
+
276
+ with torch.inference_mode():
277
+ decoder_ref, h_ref, c_ref = decoder(targets, target_lengths, zero_state, zero_state)
278
+ joint_ref = joint(encoder_ref, decoder_ref)
279
+
280
+ decoder_ref = decoder_ref.clone()
281
+ h_ref = h_ref.clone()
282
+ c_ref = c_ref.clone()
283
+ joint_ref = joint_ref.clone()
284
+
285
+ typer.echo(f" Decoder output shape: {_tensor_shape(decoder_ref)}")
286
+ typer.echo(f" Joint output shape: {_tensor_shape(joint_ref)}")
287
+ typer.echo(f" CTC log probs shape: {_tensor_shape(ctc_log_probs_ref)}")
288
+
289
+ typer.echo("Tracing and converting preprocessor…")
290
+ # Ensure tracing happens on CPU explicitly
291
+ preprocessor = preprocessor.cpu()
292
+ audio_tensor = audio_tensor.cpu()
293
+ audio_length = audio_length.cpu()
294
+ traced_preprocessor = torch.jit.trace(
295
+ preprocessor, (audio_tensor, audio_length), strict=False
296
+ )
297
+ traced_preprocessor.eval()
298
+ preprocessor_inputs = [
299
+ # Allow variable-length audio up to the fixed 15s window using RangeDim
300
+ ct.TensorType(
301
+ name="audio",
302
+ shape=(1, ct.RangeDim(1, max_samples)),
303
+ dtype=np.float32,
304
+ ),
305
+ ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
306
+ ]
307
+ preprocessor_outputs = [
308
+ ct.TensorType(name="mel_features", dtype=np.float32),
309
+ ct.TensorType(name="mel_length", dtype=np.int32),
310
+ ]
311
+ # Preprocessor compute units (parametrized; default CPU_ONLY)
312
+ preprocessor_model = _coreml_convert(
313
+ traced_preprocessor,
314
+ preprocessor_inputs,
315
+ preprocessor_outputs,
316
+ export_settings,
317
+ compute_units_override=pre_cu,
318
+ )
319
+ preprocessor_path = output_dir / "parakeet_preprocessor.mlpackage"
320
+ _save_mlpackage(
321
+ preprocessor_model,
322
+ preprocessor_path,
323
+ "Parakeet 110M preprocessor (15 s window)",
324
+ )
325
+
326
+ typer.echo("Tracing and converting encoder…")
327
+ traced_encoder = torch.jit.trace(
328
+ encoder, (mel_ref, mel_length_ref), strict=False
329
+ )
330
+ traced_encoder.eval()
331
+ encoder_inputs = [
332
+ ct.TensorType(name="mel_features", shape=_tensor_shape(mel_ref), dtype=np.float32),
333
+ ct.TensorType(name="mel_length", shape=(1,), dtype=np.int32),
334
+ ]
335
+ encoder_outputs = [
336
+ ct.TensorType(name="encoder_output", dtype=np.float32),
337
+ ct.TensorType(name="encoder_length", dtype=np.int32),
338
+ ]
339
+ # Encoder: CPU only
340
+ encoder_model = _coreml_convert(
341
+ traced_encoder,
342
+ encoder_inputs,
343
+ encoder_outputs,
344
+ export_settings,
345
+ compute_units_override=ct.ComputeUnit.CPU_ONLY,
346
+ )
347
+ encoder_path = output_dir / "parakeet_encoder.mlpackage"
348
+ _save_mlpackage(
349
+ encoder_model,
350
+ encoder_path,
351
+ "Parakeet 110M encoder (15 s window)",
352
+ )
353
+
354
+ # CTC Head for hybrid model
355
+ typer.echo("Tracing and converting CTC head…")
356
+ traced_ctc_head = torch.jit.trace(
357
+ ctc_head, (encoder_ref,), strict=False
358
+ )
359
+ traced_ctc_head.eval()
360
+ ctc_head_inputs = [
361
+ ct.TensorType(name="encoder_output", shape=_tensor_shape(encoder_ref), dtype=np.float32),
362
+ ]
363
+ ctc_head_outputs = [
364
+ ct.TensorType(name="ctc_logits", dtype=np.float32),
365
+ ]
366
+ ctc_head_model = _coreml_convert(
367
+ traced_ctc_head,
368
+ ctc_head_inputs,
369
+ ctc_head_outputs,
370
+ export_settings,
371
+ compute_units_override=ct.ComputeUnit.CPU_ONLY,
372
+ )
373
+ ctc_head_path = output_dir / "parakeet_ctc_head.mlpackage"
374
+ _save_mlpackage(
375
+ ctc_head_model,
376
+ ctc_head_path,
377
+ "Parakeet 110M CTC decoder head",
378
+ )
379
+
380
+ # Optional fused export: Preprocessor + Encoder
381
+ typer.echo("Tracing and converting fused mel+encoder…")
382
+ mel_encoder = MelEncoderWrapper(preprocessor, encoder)
383
+ traced_mel_encoder = torch.jit.trace(
384
+ mel_encoder, (audio_tensor, audio_length), strict=False
385
+ )
386
+ traced_mel_encoder.eval()
387
+ mel_encoder_inputs = [
388
+ # Keep fixed 15s window for fused Mel+Encoder
389
+ ct.TensorType(name="audio", shape=(1, max_samples), dtype=np.float32),
390
+ ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
391
+ ]
392
+ mel_encoder_outputs = [
393
+ ct.TensorType(name="encoder_output", dtype=np.float32),
394
+ ct.TensorType(name="encoder_length", dtype=np.int32),
395
+ ]
396
+ # Fused mel+encoder compute units (parametrized; default CPU_ONLY)
397
+ mel_encoder_model = _coreml_convert(
398
+ traced_mel_encoder,
399
+ mel_encoder_inputs,
400
+ mel_encoder_outputs,
401
+ export_settings,
402
+ compute_units_override=melenc_cu,
403
+ )
404
+ mel_encoder_path = output_dir / "parakeet_mel_encoder.mlpackage"
405
+ _save_mlpackage(
406
+ mel_encoder_model,
407
+ mel_encoder_path,
408
+ "Parakeet 110M fused Mel+Encoder (15 s window)",
409
+ )
410
+
411
+ typer.echo("Tracing and converting decoder…")
412
+ traced_decoder = torch.jit.trace(
413
+ decoder,
414
+ (targets, target_lengths, zero_state, zero_state),
415
+ strict=False,
416
+ )
417
+ traced_decoder.eval()
418
+ decoder_inputs = [
419
+ ct.TensorType(name="targets", shape=_tensor_shape(targets), dtype=np.int32),
420
+ ct.TensorType(name="target_length", shape=(1,), dtype=np.int32),
421
+ ct.TensorType(name="h_in", shape=_tensor_shape(zero_state), dtype=np.float32),
422
+ ct.TensorType(name="c_in", shape=_tensor_shape(zero_state), dtype=np.float32),
423
+ ]
424
+ decoder_outputs = [
425
+ ct.TensorType(name="decoder", dtype=np.float32),
426
+ ct.TensorType(name="h_out", dtype=np.float32),
427
+ ct.TensorType(name="c_out", dtype=np.float32),
428
+ ]
429
+ # Decoder: CPU only
430
+ decoder_model = _coreml_convert(
431
+ traced_decoder,
432
+ decoder_inputs,
433
+ decoder_outputs,
434
+ export_settings,
435
+ compute_units_override=ct.ComputeUnit.CPU_ONLY,
436
+ )
437
+ decoder_path = output_dir / "parakeet_decoder.mlpackage"
438
+ _save_mlpackage(
439
+ decoder_model,
440
+ decoder_path,
441
+ "Parakeet 110M decoder (RNNT prediction network)",
442
+ )
443
+
444
+ typer.echo("Tracing and converting joint…")
445
+ traced_joint = torch.jit.trace(
446
+ joint,
447
+ (encoder_ref, decoder_ref),
448
+ strict=False,
449
+ )
450
+ traced_joint.eval()
451
+ joint_inputs = [
452
+ ct.TensorType(name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32),
453
+ ct.TensorType(name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32),
454
+ ]
455
+ joint_outputs = [
456
+ ct.TensorType(name="logits", dtype=np.float32),
457
+ ]
458
+ # Joint: CPU only
459
+ joint_model = _coreml_convert(
460
+ traced_joint,
461
+ joint_inputs,
462
+ joint_outputs,
463
+ export_settings,
464
+ compute_units_override=ct.ComputeUnit.CPU_ONLY,
465
+ )
466
+ joint_path = output_dir / "parakeet_joint.mlpackage"
467
+ _save_mlpackage(
468
+ joint_model,
469
+ joint_path,
470
+ "Parakeet 110M joint network (RNNT)",
471
+ )
472
+
473
+ # Joint + decision head (split logits, softmax, argmax)
474
+ typer.echo("Tracing and converting joint decision head…")
475
+ vocab_size = int(asr_model.tokenizer.vocab_size)
476
+ num_extra = int(asr_model.joint.num_extra_outputs)
477
+ joint_decision = JointDecisionWrapper(joint, vocab_size=vocab_size, num_extra=num_extra)
478
+ traced_joint_decision = torch.jit.trace(
479
+ joint_decision,
480
+ (encoder_ref, decoder_ref),
481
+ strict=False,
482
+ )
483
+ traced_joint_decision.eval()
484
+ joint_decision_inputs = [
485
+ ct.TensorType(name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32),
486
+ ct.TensorType(name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32),
487
+ ]
488
+ joint_decision_outputs = [
489
+ ct.TensorType(name="token_id", dtype=np.int32),
490
+ ct.TensorType(name="token_prob", dtype=np.float32),
491
+ ct.TensorType(name="duration", dtype=np.int32),
492
+ ]
493
+ # JointDecision: CPU only
494
+ joint_decision_model = _coreml_convert(
495
+ traced_joint_decision,
496
+ joint_decision_inputs,
497
+ joint_decision_outputs,
498
+ export_settings,
499
+ compute_units_override=ct.ComputeUnit.CPU_ONLY,
500
+ )
501
+ joint_decision_path = output_dir / "parakeet_joint_decision.mlpackage"
502
+ _save_mlpackage(
503
+ joint_decision_model,
504
+ joint_decision_path,
505
+ "Parakeet 110M joint + decision head (split, softmax, argmax)",
506
+ )
507
+
508
+ # Single-step JointDecision for [1,512,1] x [1,640,1] -> [1,1,1]
509
+ # Note: 110M encoder dim is 512 (not 1024 like 0.6B)
510
+ typer.echo("Tracing and converting single-step joint decision…")
511
+ jd_single = JointDecisionSingleStep(joint, vocab_size=vocab_size, num_extra=num_extra)
512
+ # Create single-step slices from refs
513
+ enc_step = encoder_ref[:, :, :1].contiguous()
514
+ dec_step = decoder_ref[:, :, :1].contiguous()
515
+ traced_jd_single = torch.jit.trace(
516
+ jd_single,
517
+ (enc_step, dec_step),
518
+ strict=False,
519
+ )
520
+ traced_jd_single.eval()
521
+ jd_single_inputs = [
522
+ ct.TensorType(name="encoder_step", shape=(1, enc_step.shape[1], 1), dtype=np.float32),
523
+ ct.TensorType(name="decoder_step", shape=(1, dec_step.shape[1], 1), dtype=np.float32),
524
+ ]
525
+ jd_single_outputs = [
526
+ ct.TensorType(name="token_id", dtype=np.int32),
527
+ ct.TensorType(name="token_prob", dtype=np.float32),
528
+ ct.TensorType(name="duration", dtype=np.int32),
529
+ ct.TensorType(name="top_k_ids", dtype=np.int32),
530
+ ct.TensorType(name="top_k_logits", dtype=np.float32),
531
+ ]
532
+ # Single-step JointDecision: CPU only
533
+ jd_single_model = _coreml_convert(
534
+ traced_jd_single,
535
+ jd_single_inputs,
536
+ jd_single_outputs,
537
+ export_settings,
538
+ compute_units_override=ct.ComputeUnit.CPU_ONLY,
539
+ )
540
+ jd_single_path = output_dir / "parakeet_joint_decision_single_step.mlpackage"
541
+ _save_mlpackage(
542
+ jd_single_model,
543
+ jd_single_path,
544
+ "Parakeet 110M single-step joint decision (current frame)",
545
+ )
546
+
547
+ # Export vocabulary
548
+ typer.echo("Exporting vocabulary…")
549
+ vocab_path = output_dir / "vocab.json"
550
+ vocab_dict = {
551
+ "vocab_size": vocab_size,
552
+ "blank_id": int(asr_model.decoder.blank_idx),
553
+ "tokens": asr_model.tokenizer.vocab,
554
+ }
555
+ vocab_path.write_text(json.dumps(vocab_dict, indent=2, ensure_ascii=False))
556
+
557
+ metadata: Dict[str, object] = {
558
+ "model_id": model_id,
559
+ "model_type": "hybrid_rnnt_ctc",
560
+ "sample_rate": sample_rate,
561
+ "max_audio_seconds": export_settings.max_audio_seconds,
562
+ "max_audio_samples": max_samples,
563
+ "max_symbol_steps": export_settings.max_symbol_steps,
564
+ "vocab_size": vocab_size,
565
+ "joint_extra_outputs": num_extra,
566
+ "encoder_dim": int(encoder_ref.shape[1]), # 512 for 110M
567
+ "decoder_dim": int(decoder_ref.shape[1]), # 640
568
+ "decoder_hidden": decoder_hidden,
569
+ "decoder_layers": decoder_layers,
570
+ "blank_id": int(asr_model.decoder.blank_idx),
571
+ "checkpoint": checkpoint_meta,
572
+ "coreml": {
573
+ "compute_units": export_settings.compute_units.name,
574
+ "compute_precision": (
575
+ export_settings.compute_precision.name
576
+ if export_settings.compute_precision is not None
577
+ else "FLOAT32"
578
+ ),
579
+ },
580
+ "components": {
581
+ "preprocessor": {
582
+ "inputs": {
583
+ "audio_signal": list(_tensor_shape(audio_tensor)),
584
+ "audio_length": [1],
585
+ },
586
+ "outputs": {
587
+ "mel": list(_tensor_shape(mel_ref)),
588
+ "mel_length": [1],
589
+ },
590
+ "path": preprocessor_path.name,
591
+ },
592
+ "encoder": {
593
+ "inputs": {
594
+ "mel": list(_tensor_shape(mel_ref)),
595
+ "mel_length": [1],
596
+ },
597
+ "outputs": {
598
+ "encoder": list(_tensor_shape(encoder_ref)),
599
+ "encoder_length": [1],
600
+ },
601
+ "path": encoder_path.name,
602
+ },
603
+ "ctc_head": {
604
+ "inputs": {
605
+ "encoder": list(_tensor_shape(encoder_ref)),
606
+ },
607
+ "outputs": {
608
+ "log_probs": list(_tensor_shape(ctc_log_probs_ref)),
609
+ },
610
+ "path": ctc_head_path.name,
611
+ },
612
+ "mel_encoder": {
613
+ "inputs": {
614
+ "audio_signal": [1, max_samples],
615
+ "audio_length": [1],
616
+ },
617
+ "outputs": {
618
+ "encoder": list(_tensor_shape(encoder_ref)),
619
+ "encoder_length": [1],
620
+ },
621
+ "path": mel_encoder_path.name,
622
+ },
623
+ "decoder": {
624
+ "inputs": {
625
+ "targets": list(_tensor_shape(targets)),
626
+ "target_length": [1],
627
+ "h_in": list(_tensor_shape(zero_state)),
628
+ "c_in": list(_tensor_shape(zero_state)),
629
+ },
630
+ "outputs": {
631
+ "decoder": list(_tensor_shape(decoder_ref)),
632
+ "h_out": list(_tensor_shape(h_ref)),
633
+ "c_out": list(_tensor_shape(c_ref)),
634
+ },
635
+ "path": decoder_path.name,
636
+ },
637
+ "joint": {
638
+ "inputs": {
639
+ "encoder": list(_tensor_shape(encoder_ref)),
640
+ "decoder": list(_tensor_shape(decoder_ref)),
641
+ },
642
+ "outputs": {
643
+ "logits": list(_tensor_shape(joint_ref)),
644
+ },
645
+ "path": joint_path.name,
646
+ },
647
+ "joint_decision": {
648
+ "inputs": {
649
+ "encoder": list(_tensor_shape(encoder_ref)),
650
+ "decoder": list(_tensor_shape(decoder_ref)),
651
+ },
652
+ "outputs": {
653
+ "token_id": [
654
+ _tensor_shape(encoder_ref)[0],
655
+ _tensor_shape(encoder_ref)[2],
656
+ _tensor_shape(decoder_ref)[2],
657
+ ],
658
+ "token_prob": [
659
+ _tensor_shape(encoder_ref)[0],
660
+ _tensor_shape(encoder_ref)[2],
661
+ _tensor_shape(decoder_ref)[2],
662
+ ],
663
+ "duration": [
664
+ _tensor_shape(encoder_ref)[0],
665
+ _tensor_shape(encoder_ref)[2],
666
+ _tensor_shape(decoder_ref)[2],
667
+ ],
668
+ },
669
+ "path": joint_decision_path.name,
670
+ },
671
+ "joint_decision_single_step": {
672
+ "inputs": {
673
+ "encoder_step": [1, int(encoder_ref.shape[1]), 1],
674
+ "decoder_step": [1, int(decoder_ref.shape[1]), 1],
675
+ },
676
+ "outputs": {
677
+ "token_id": [1, 1, 1],
678
+ "token_prob": [1, 1, 1],
679
+ "duration": [1, 1, 1],
680
+ "top_k_ids": [1, 1, 1, 64],
681
+ "top_k_logits": [1, 1, 1, 64],
682
+ },
683
+ "path": jd_single_path.name,
684
+ },
685
+ },
686
+ }
687
+
688
+ metadata_path = output_dir / "metadata.json"
689
+ metadata_path.write_text(json.dumps(metadata, indent=2))
690
+ typer.echo(f"Export complete. Metadata written to {metadata_path}")
691
+
692
+ finally:
693
+ asr_model.decoder._rnnt_export = decoder_export_flag
694
+
695
+
696
+ if __name__ == "__main__":
697
+ app()
convert/parakeet-tdt-ctc-110m/coreml/hybrid_earnings_benchmark.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "approach" : "single-encoder",
3
+ "model" : "parakeet-tdt-ctc-110m-hybrid",
4
+ "results" : [
5
+ {
6
+ "audioLength" : 15,
7
+ "ctcDetections" : [
8
+ {
9
+ "endTime" : 6.0800000000000001,
10
+ "inReference" : true,
11
+ "score" : -8.3699999999999992,
12
+ "source" : "ctc",
13
+ "startTime" : 4.96,
14
+ "word" : "LATAM Airlines"
15
+ }
16
+ ],
17
+ "dictFound" : 1,
18
+ "dictTotal" : 1,
19
+ "fileId" : "4329526_chunk0",
20
+ "hypothesis" : "goodday everyone and welcome to latam airlines group earnings release confonference call just as a reminder this conference is being recorded lat tam airlines group eararnings releaseed for the",
21
+ "processingTime" : 0.070000000000000007,
22
+ "reference" : "good day everyone and welcome to latam airlines group earnings release conference call just as a reminder this conference is being recorded latam airlines group earnings released for the",
23
+ "wer" : 24.140000000000001
24
+ }
25
+ ],
26
+ "summary" : {
27
+ "avgWer" : 24.140000000000001,
28
+ "dictPass" : 1,
29
+ "dictRate" : 100,
30
+ "dictTotal" : 1,
31
+ "totalAudioDuration" : 15,
32
+ "totalProcessingTime" : 0.070000000000000007,
33
+ "totalTests" : 1
34
+ }
35
+ }
convert/parakeet-tdt-ctc-110m/coreml/individual_components.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Export Parakeet TDT-CTC 110M Hybrid RNNT components into CoreML and validate outputs."""
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Optional, Tuple
8
+
9
+ import coremltools as ct
10
+ import torch
11
+
12
+
13
+ @dataclass
14
+ class ExportSettings:
15
+ output_dir: Path
16
+ compute_units: ct.ComputeUnit
17
+ deployment_target: Optional[ct.target.iOS17]
18
+ compute_precision: Optional[ct.precision]
19
+ max_audio_seconds: float
20
+ max_symbol_steps: int
21
+
22
+
23
+ @dataclass
24
+ class ValidationSettings:
25
+ audio_path: Optional[Path]
26
+ seconds: float
27
+ seed: Optional[int]
28
+ rtol: float
29
+ atol: float
30
+ skip: bool
31
+
32
+
33
+ @dataclass
34
+ class ValidationDiff:
35
+ name: str
36
+ max_abs_diff: float
37
+ max_rel_diff: float
38
+
39
+
40
+ @dataclass
41
+ class ValidationResult:
42
+ source: str
43
+ audio_num_samples: int
44
+ audio_seconds: float
45
+ token_length: int
46
+ atol: float
47
+ rtol: float
48
+ diffs: Tuple[ValidationDiff, ...]
49
+
50
+
51
+ class PreprocessorWrapper(torch.nn.Module):
52
+ def __init__(self, module: torch.nn.Module) -> None:
53
+ super().__init__()
54
+ self.module = module
55
+
56
+ def forward(self, audio_signal: torch.Tensor, length: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
57
+ mel, mel_length = self.module(input_signal=audio_signal, length=length.to(dtype=torch.long))
58
+ return mel, mel_length
59
+
60
+
61
+ class EncoderWrapper(torch.nn.Module):
62
+ def __init__(self, module: torch.nn.Module) -> None:
63
+ super().__init__()
64
+ self.module = module
65
+
66
+ def forward(self, features: torch.Tensor, length: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
67
+ encoded, encoded_lengths = self.module(audio_signal=features, length=length.to(dtype=torch.long))
68
+ return encoded, encoded_lengths
69
+
70
+
71
+ class DecoderWrapper(torch.nn.Module):
72
+ def __init__(self, module: torch.nn.Module) -> None:
73
+ super().__init__()
74
+ self.module = module
75
+
76
+ def forward(
77
+ self,
78
+ targets: torch.Tensor,
79
+ target_lengths: torch.Tensor,
80
+ h_in: torch.Tensor,
81
+ c_in: torch.Tensor,
82
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
83
+ state = [h_in, c_in]
84
+ decoder_output, _, new_state = self.module(
85
+ targets=targets.to(dtype=torch.long),
86
+ target_length=target_lengths.to(dtype=torch.long),
87
+ states=state,
88
+ )
89
+ return decoder_output, new_state[0], new_state[1]
90
+
91
+
92
+ class JointWrapper(torch.nn.Module):
93
+ """Joint network for 110M hybrid model.
94
+
95
+ Note: The 110M model has encoder_dim=512 and decoder_dim=640.
96
+ The joint network projects both to 640, then combines them.
97
+ """
98
+ def __init__(self, module: torch.nn.Module) -> None:
99
+ super().__init__()
100
+ self.module = module
101
+
102
+ def forward(self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor) -> torch.Tensor:
103
+ # Input: encoder_outputs [B, D_enc, T], decoder_outputs [B, D_dec, U]
104
+ # For 110M: D_enc=512, D_dec=640
105
+ # Transpose to match what projection layers expect
106
+ encoder_outputs = encoder_outputs.transpose(1, 2) # [B, T, D_enc]
107
+ decoder_outputs = decoder_outputs.transpose(1, 2) # [B, U, D_dec]
108
+
109
+ # Apply projections
110
+ enc_proj = self.module.enc(encoder_outputs) # [B, T, 640]
111
+ dec_proj = self.module.pred(decoder_outputs) # [B, U, 640]
112
+
113
+ # Explicit broadcasting along T and U to avoid converter ambiguity
114
+ x = enc_proj.unsqueeze(2) + dec_proj.unsqueeze(1) # [B, T, U, 640]
115
+ x = self.module.joint_net[0](x) # ReLU
116
+ x = self.module.joint_net[1](x) # Dropout (no-op in eval)
117
+ out = self.module.joint_net[2](x) # Linear -> logits [B, T, U, vocab+1+durations]
118
+ return out
119
+
120
+
121
+ class CTCHeadWrapper(torch.nn.Module):
122
+ """CTC decoder head for 110M hybrid model.
123
+
124
+ Takes encoder output and produces log probabilities over vocabulary.
125
+ The NeMo CTC decoder (ConvASRDecoder) uses Conv1d so it expects [B, D, T] format.
126
+ """
127
+ def __init__(self, module: torch.nn.Module) -> None:
128
+ super().__init__()
129
+ self.module = module
130
+
131
+ def forward(self, encoder_outputs: torch.Tensor) -> torch.Tensor:
132
+ # Input: encoder_outputs [B, D_enc, T] - already in the format CTC decoder expects
133
+ # The NeMo CTC decoder uses Conv1d internally, so it expects [B, D, T]
134
+ # Output: log probabilities [B, T, vocab+1]
135
+ log_probs = self.module(encoder_output=encoder_outputs)
136
+ return log_probs
137
+
138
+
139
+ class MelEncoderWrapper(torch.nn.Module):
140
+ """Fused wrapper: waveform -> mel -> encoder.
141
+
142
+ Inputs:
143
+ - audio_signal: [B, S]
144
+ - audio_length: [B]
145
+
146
+ Outputs:
147
+ - encoder: [B, D, T_enc]
148
+ - encoder_length: [B]
149
+ """
150
+ def __init__(self, preprocessor: PreprocessorWrapper, encoder: EncoderWrapper) -> None:
151
+ super().__init__()
152
+ self.preprocessor = preprocessor
153
+ self.encoder = encoder
154
+
155
+ def forward(self, audio_signal: torch.Tensor, audio_length: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
156
+ mel, mel_length = self.preprocessor(audio_signal, audio_length)
157
+ encoded, enc_len = self.encoder(mel, mel_length.to(dtype=torch.int32))
158
+ return encoded, enc_len
159
+
160
+
161
+ class JointDecisionWrapper(torch.nn.Module):
162
+ """Joint + decision head: outputs label id, label prob, duration frames.
163
+
164
+ Splits joint logits into token logits and duration logits, applies softmax
165
+ over tokens, argmax for both heads, and gathers probability of the chosen token.
166
+
167
+ Inputs:
168
+ - encoder_outputs: [B, D, T]
169
+ - decoder_outputs: [B, D, U]
170
+
171
+ Returns:
172
+ - token_id: [B, T, U] int32
173
+ - token_prob: [B, T, U] float32
174
+ - duration: [B, T, U] int32 (frames; for v3 bins=[0,1,2,3,4])
175
+ """
176
+ def __init__(self, joint: JointWrapper, vocab_size: int, num_extra: int) -> None:
177
+ super().__init__()
178
+ self.joint = joint
179
+ self.vocab_with_blank = int(vocab_size) + 1
180
+ self.num_extra = int(num_extra)
181
+
182
+ def forward(self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor):
183
+ logits = self.joint(encoder_outputs, decoder_outputs)
184
+ token_logits = logits[..., : self.vocab_with_blank]
185
+ duration_logits = logits[..., -self.num_extra :]
186
+
187
+ # Token selection
188
+ token_ids = torch.argmax(token_logits, dim=-1).to(dtype=torch.int32)
189
+ token_probs_all = torch.softmax(token_logits, dim=-1)
190
+ # gather expects int64 (long) indices; cast only for gather
191
+ token_prob = torch.gather(
192
+ token_probs_all, dim=-1, index=token_ids.long().unsqueeze(-1)
193
+ ).squeeze(-1)
194
+
195
+ # Duration prediction (bins are identity mapping to frames for v3)
196
+ duration = torch.argmax(duration_logits, dim=-1).to(dtype=torch.int32)
197
+ return token_ids, token_prob, duration
198
+
199
+
200
+ class JointDecisionSingleStep(torch.nn.Module):
201
+ """Single-step variant for streaming: encoder_step [1, 512, 1] -> [1,1,1].
202
+
203
+ Note: For 110M model, encoder_dim is 512 (not 1024 like 0.6B).
204
+
205
+ Inputs:
206
+ - encoder_step: [B=1, D=512, T=1]
207
+ - decoder_step: [B=1, D=640, U=1]
208
+
209
+ Returns:
210
+ - token_id: [1, 1, 1] int32
211
+ - token_prob: [1, 1, 1] float32
212
+ - duration: [1, 1, 1] int32
213
+ - top_k_ids: [1, 1, 1, K] int32
214
+ - top_k_logits: [1, 1, 1, K] float32
215
+ """
216
+ def __init__(self, joint: JointWrapper, vocab_size: int, num_extra: int, top_k: int = 64) -> None:
217
+ super().__init__()
218
+ self.joint = joint
219
+ self.vocab_with_blank = int(vocab_size) + 1
220
+ self.num_extra = int(num_extra)
221
+ # Emit top-K candidates to enable host-side re-ranking with contextual biasing
222
+ self.top_k = int(top_k)
223
+
224
+ def forward(self, encoder_step: torch.Tensor, decoder_step: torch.Tensor):
225
+ # Reuse JointWrapper which expects [B, D, T] and [B, D, U]
226
+ logits = self.joint(encoder_step, decoder_step) # [1, 1, 1, V+extra]
227
+ token_logits = logits[..., : self.vocab_with_blank]
228
+ duration_logits = logits[..., -self.num_extra :]
229
+
230
+ token_ids = torch.argmax(token_logits, dim=-1, keepdim=False).to(dtype=torch.int32)
231
+ token_probs_all = torch.softmax(token_logits, dim=-1)
232
+ token_prob = torch.gather(
233
+ token_probs_all, dim=-1, index=token_ids.long().unsqueeze(-1)
234
+ ).squeeze(-1)
235
+ duration = torch.argmax(duration_logits, dim=-1, keepdim=False).to(dtype=torch.int32)
236
+
237
+ # Also expose top-K candidates for host-side re-ranking.
238
+ # Shapes preserved as [1, 1, 1, K] to match CoreML broadcasting expectations.
239
+ # Note: topk expects last dimension; original shape is [1, 1, 1, V].
240
+ topk_logits, topk_ids_long = torch.topk(token_logits, k=min(self.top_k, token_logits.shape[-1]), dim=-1)
241
+ topk_ids = topk_ids_long.to(dtype=torch.int32)
242
+ return token_ids, token_prob, duration, topk_ids, topk_logits
243
+
244
+
245
+ def _coreml_convert(
246
+ traced: torch.jit.ScriptModule,
247
+ inputs,
248
+ outputs,
249
+ settings: ExportSettings,
250
+ compute_units_override: Optional[ct.ComputeUnit] = None,
251
+ ) -> ct.models.MLModel:
252
+ cu = compute_units_override if compute_units_override is not None else settings.compute_units
253
+ kwargs = {
254
+ "convert_to": "mlprogram",
255
+ "inputs": inputs,
256
+ "outputs": outputs,
257
+ "compute_units": cu,
258
+ }
259
+ print("Converting:", traced.__class__.__name__)
260
+ print("Conversion kwargs:", kwargs)
261
+ if settings.deployment_target is not None:
262
+ kwargs["minimum_deployment_target"] = settings.deployment_target
263
+ if settings.compute_precision is not None:
264
+ kwargs["compute_precision"] = settings.compute_precision
265
+ return ct.convert(traced, **kwargs)
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/metadata.json ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "nvidia/parakeet-tdt_ctc-110m",
3
+ "model_type": "hybrid_rnnt_ctc",
4
+ "sample_rate": 16000,
5
+ "max_audio_seconds": 15.0,
6
+ "max_audio_samples": 240000,
7
+ "max_symbol_steps": 1,
8
+ "vocab_size": 1024,
9
+ "joint_extra_outputs": 5,
10
+ "encoder_dim": 512,
11
+ "decoder_dim": 640,
12
+ "decoder_hidden": 640,
13
+ "decoder_layers": 1,
14
+ "blank_id": 1024,
15
+ "checkpoint": {
16
+ "type": "pretrained",
17
+ "model_id": "nvidia/parakeet-tdt_ctc-110m"
18
+ },
19
+ "coreml": {
20
+ "compute_units": "CPU_ONLY",
21
+ "compute_precision": "FLOAT32"
22
+ },
23
+ "components": {
24
+ "preprocessor": {
25
+ "inputs": {
26
+ "audio_signal": [
27
+ 1,
28
+ 240000
29
+ ],
30
+ "audio_length": [
31
+ 1
32
+ ]
33
+ },
34
+ "outputs": {
35
+ "mel": [
36
+ 1,
37
+ 80,
38
+ 1501
39
+ ],
40
+ "mel_length": [
41
+ 1
42
+ ]
43
+ },
44
+ "path": "parakeet_preprocessor.mlpackage"
45
+ },
46
+ "encoder": {
47
+ "inputs": {
48
+ "mel": [
49
+ 1,
50
+ 80,
51
+ 1501
52
+ ],
53
+ "mel_length": [
54
+ 1
55
+ ]
56
+ },
57
+ "outputs": {
58
+ "encoder": [
59
+ 1,
60
+ 512,
61
+ 188
62
+ ],
63
+ "encoder_length": [
64
+ 1
65
+ ]
66
+ },
67
+ "path": "parakeet_encoder.mlpackage"
68
+ },
69
+ "ctc_head": {
70
+ "inputs": {
71
+ "encoder": [
72
+ 1,
73
+ 512,
74
+ 188
75
+ ]
76
+ },
77
+ "outputs": {
78
+ "log_probs": [
79
+ 1,
80
+ 188,
81
+ 1025
82
+ ]
83
+ },
84
+ "path": "parakeet_ctc_head.mlpackage"
85
+ },
86
+ "mel_encoder": {
87
+ "inputs": {
88
+ "audio_signal": [
89
+ 1,
90
+ 240000
91
+ ],
92
+ "audio_length": [
93
+ 1
94
+ ]
95
+ },
96
+ "outputs": {
97
+ "encoder": [
98
+ 1,
99
+ 512,
100
+ 188
101
+ ],
102
+ "encoder_length": [
103
+ 1
104
+ ]
105
+ },
106
+ "path": "parakeet_mel_encoder.mlpackage"
107
+ },
108
+ "decoder": {
109
+ "inputs": {
110
+ "targets": [
111
+ 1,
112
+ 1
113
+ ],
114
+ "target_length": [
115
+ 1
116
+ ],
117
+ "h_in": [
118
+ 1,
119
+ 1,
120
+ 640
121
+ ],
122
+ "c_in": [
123
+ 1,
124
+ 1,
125
+ 640
126
+ ]
127
+ },
128
+ "outputs": {
129
+ "decoder": [
130
+ 1,
131
+ 640,
132
+ 1
133
+ ],
134
+ "h_out": [
135
+ 1,
136
+ 1,
137
+ 640
138
+ ],
139
+ "c_out": [
140
+ 1,
141
+ 1,
142
+ 640
143
+ ]
144
+ },
145
+ "path": "parakeet_decoder.mlpackage"
146
+ },
147
+ "joint": {
148
+ "inputs": {
149
+ "encoder": [
150
+ 1,
151
+ 512,
152
+ 188
153
+ ],
154
+ "decoder": [
155
+ 1,
156
+ 640,
157
+ 1
158
+ ]
159
+ },
160
+ "outputs": {
161
+ "logits": [
162
+ 1,
163
+ 188,
164
+ 1,
165
+ 1030
166
+ ]
167
+ },
168
+ "path": "parakeet_joint.mlpackage"
169
+ },
170
+ "joint_decision": {
171
+ "inputs": {
172
+ "encoder": [
173
+ 1,
174
+ 512,
175
+ 188
176
+ ],
177
+ "decoder": [
178
+ 1,
179
+ 640,
180
+ 1
181
+ ]
182
+ },
183
+ "outputs": {
184
+ "token_id": [
185
+ 1,
186
+ 188,
187
+ 1
188
+ ],
189
+ "token_prob": [
190
+ 1,
191
+ 188,
192
+ 1
193
+ ],
194
+ "duration": [
195
+ 1,
196
+ 188,
197
+ 1
198
+ ]
199
+ },
200
+ "path": "parakeet_joint_decision.mlpackage"
201
+ },
202
+ "joint_decision_single_step": {
203
+ "inputs": {
204
+ "encoder_step": [
205
+ 1,
206
+ 512,
207
+ 1
208
+ ],
209
+ "decoder_step": [
210
+ 1,
211
+ 640,
212
+ 1
213
+ ]
214
+ },
215
+ "outputs": {
216
+ "token_id": [
217
+ 1,
218
+ 1,
219
+ 1
220
+ ],
221
+ "token_prob": [
222
+ 1,
223
+ 1,
224
+ 1
225
+ ],
226
+ "duration": [
227
+ 1,
228
+ 1,
229
+ 1
230
+ ],
231
+ "top_k_ids": [
232
+ 1,
233
+ 1,
234
+ 1,
235
+ 64
236
+ ],
237
+ "top_k_logits": [
238
+ 1,
239
+ 1,
240
+ 1,
241
+ 64
242
+ ]
243
+ },
244
+ "path": "parakeet_joint_decision_single_step.mlpackage"
245
+ }
246
+ }
247
+ }
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_ctc_head.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6459b9564e0630f2eec300eb732fceccbc1d2d16f12cb0694ce310d84bfbecf2
3
+ size 3366
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_ctc_head.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb9bead064427ffcb7529c0e3f378e421b4dde8e6d81447b6d1ca3352ca850e1
3
+ size 1051842
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_ctc_head.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "6651E3CE-C3ED-4267-AAC3-5A772FC3515A": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Specification",
7
+ "name": "model.mlmodel",
8
+ "path": "com.apple.CoreML/model.mlmodel"
9
+ },
10
+ "A3F7798B-67CA-418C-B8BB-58731D3A413F": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Weights",
13
+ "name": "weights",
14
+ "path": "com.apple.CoreML/weights"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "6651E3CE-C3ED-4267-AAC3-5A772FC3515A"
18
+ }
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a05548eb455c5cd564782b125a5f9279a789be1f4141e5f044453ea79cd68b47
3
+ size 6729
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd90b58597ee2c172c672dffe13b1110898ba07394c1a15efc96cc8c6b18411b
3
+ size 7871040
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_decoder.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "83E7B87A-4EBE-48BF-BF3C-EE74DEA4C7AF": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Specification",
7
+ "name": "model.mlmodel",
8
+ "path": "com.apple.CoreML/model.mlmodel"
9
+ },
10
+ "98BF03AC-26AF-410B-95AC-C9B99B3B240C": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Weights",
13
+ "name": "weights",
14
+ "path": "com.apple.CoreML/weights"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "83E7B87A-4EBE-48BF-BF3C-EE74DEA4C7AF"
18
+ }
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_encoder.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70d7747b57beba0248fabb6cbfa5d276e3604d0d7e234f4ccb578ea0a4d25110
3
+ size 508107
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_encoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cecf7994b2758397d992802a4f6e5d656e3a1aeb7bbedc2aa430b1316d62474c
3
+ size 215143424