alexwengg commited on
Commit
26cfae7
·
verified ·
1 Parent(s): 6e7e587

Upload 401 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +20 -0
  2. cli/CtcEarningsBenchmark.swift +1048 -0
  3. cli/HybridEarningsBenchmark.swift +554 -0
  4. convert/.DS_Store +0 -0
  5. convert/parakeet-tdt-ctc-110m/convert_tdt_decoder.py +323 -0
  6. convert/parakeet-tdt-ctc-110m/coreml/audio/yc_first_minute_16k_15s.wav +3 -0
  7. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/analytics/coremldata.bin +3 -0
  8. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/coremldata.bin +3 -0
  9. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/metadata.json +66 -0
  10. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/model.mil +24 -0
  11. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/weights/weight.bin +3 -0
  12. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/analytics/coremldata.bin +3 -0
  13. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/coremldata.bin +3 -0
  14. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/metadata.json +118 -0
  15. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/model.mil +45 -0
  16. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/weights/weight.bin +3 -0
  17. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/analytics/coremldata.bin +3 -0
  18. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/coremldata.bin +3 -0
  19. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/metadata.json +105 -0
  20. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/model.mil +0 -0
  21. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/weights/weight.bin +3 -0
  22. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/analytics/coremldata.bin +3 -0
  23. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/coremldata.bin +3 -0
  24. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/metadata.json +102 -0
  25. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/model.mil +58 -0
  26. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/weights/weight.bin +3 -0
  27. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/analytics/coremldata.bin +3 -0
  28. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/coremldata.bin +3 -0
  29. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/metadata.json +123 -0
  30. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/model.mil +69 -0
  31. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/weights/weight.bin +3 -0
  32. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/analytics/coremldata.bin +3 -0
  33. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/coremldata.bin +3 -0
  34. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/metadata.json +112 -0
  35. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/model.mil +191 -0
  36. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/weights/weight.bin +3 -0
  37. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/metadata.json +247 -0
  38. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/vocab.json +1 -0
  39. convert/parakeet-tdt-ctc-110m/coreml/convert-parakeet.py +697 -0
  40. convert/parakeet-tdt-ctc-110m/coreml/hybrid_earnings_benchmark.json +35 -0
  41. convert/parakeet-tdt-ctc-110m/coreml/individual_components.py +265 -0
  42. convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/metadata.json +247 -0
  43. convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_ctc_head.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  44. convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_ctc_head.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  45. convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_ctc_head.mlpackage/Manifest.json +18 -0
  46. convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  47. convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
  48. convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_decoder.mlpackage/Manifest.json +18 -0
  49. convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_encoder.mlpackage/Data/com.apple.CoreML/model.mlmodel +3 -0
  50. convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_encoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin +3 -0
.gitattributes CHANGED
@@ -33,3 +33,23 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ convert/parakeet-tdt-ctc-110m/coreml/audio/yc_first_minute_16k_15s.wav filter=lfs diff=lfs merge=lfs -text
37
+ convert/parakeet-tdt-v2-0.6b/coreml/audio/yc_first_minute_16k_15s.wav filter=lfs diff=lfs merge=lfs -text
38
+ convert/parakeet-tdt-v2-0.6b/coreml/audio/yc_first_minute_16k.wav filter=lfs diff=lfs merge=lfs -text
39
+ convert/parakeet-tdt-v2-0.6b/coreml/audio/yc_first_minute.wav filter=lfs diff=lfs merge=lfs -text
40
+ convert/parakeet-tdt-v2-0.6b/coreml/context/Efficient[[:space:]]Sequence[[:space:]]Transduction[[:space:]]by[[:space:]]Jointly[[:space:]]Predicting[[:space:]]Tokens[[:space:]]and[[:space:]]Durations.pdf filter=lfs diff=lfs merge=lfs -text
41
+ convert/parakeet-tdt-v2-0.6b/coreml/context/FAST[[:space:]]CONFORMER[[:space:]]WITH[[:space:]]LINEARLY[[:space:]]SCALABLE[[:space:]]ATTENTION.pdf filter=lfs diff=lfs merge=lfs -text
42
+ convert/parakeet-tdt-v2-0.6b/coreml/plots/compare-components/mel_composite.png filter=lfs diff=lfs merge=lfs -text
43
+ convert/parakeet-tdt-v2-0.6b/coreml/plots/quantize/cpu_and_ne/all_components_compile.png filter=lfs diff=lfs merge=lfs -text
44
+ convert/parakeet-tdt-v2-0.6b/coreml/plots/quantize/cpu_and_ne/all_components_compression.png filter=lfs diff=lfs merge=lfs -text
45
+ convert/parakeet-tdt-v2-0.6b/coreml/plots/quantize/cpu_and_ne/all_components_quality.png filter=lfs diff=lfs merge=lfs -text
46
+ parakeet-tdt-ctc-110m/coreml/audio/yc_first_minute_16k_15s.wav filter=lfs diff=lfs merge=lfs -text
47
+ parakeet-tdt-v2-0.6b/coreml/audio/yc_first_minute_16k_15s.wav filter=lfs diff=lfs merge=lfs -text
48
+ parakeet-tdt-v2-0.6b/coreml/audio/yc_first_minute_16k.wav filter=lfs diff=lfs merge=lfs -text
49
+ parakeet-tdt-v2-0.6b/coreml/audio/yc_first_minute.wav filter=lfs diff=lfs merge=lfs -text
50
+ parakeet-tdt-v2-0.6b/coreml/context/Efficient[[:space:]]Sequence[[:space:]]Transduction[[:space:]]by[[:space:]]Jointly[[:space:]]Predicting[[:space:]]Tokens[[:space:]]and[[:space:]]Durations.pdf filter=lfs diff=lfs merge=lfs -text
51
+ parakeet-tdt-v2-0.6b/coreml/context/FAST[[:space:]]CONFORMER[[:space:]]WITH[[:space:]]LINEARLY[[:space:]]SCALABLE[[:space:]]ATTENTION.pdf filter=lfs diff=lfs merge=lfs -text
52
+ parakeet-tdt-v2-0.6b/coreml/plots/compare-components/mel_composite.png filter=lfs diff=lfs merge=lfs -text
53
+ parakeet-tdt-v2-0.6b/coreml/plots/quantize/cpu_and_ne/all_components_compile.png filter=lfs diff=lfs merge=lfs -text
54
+ parakeet-tdt-v2-0.6b/coreml/plots/quantize/cpu_and_ne/all_components_compression.png filter=lfs diff=lfs merge=lfs -text
55
+ parakeet-tdt-v2-0.6b/coreml/plots/quantize/cpu_and_ne/all_components_quality.png filter=lfs diff=lfs merge=lfs -text
cli/CtcEarningsBenchmark.swift ADDED
@@ -0,0 +1,1048 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #if os(macOS)
2
+ import AVFoundation
3
+ import CoreML
4
+ import FluidAudio
5
+ import Foundation
6
+
7
+ /// Earnings22 benchmark using TDT for transcription + CTC for keyword spotting.
8
+ /// TDT provides low WER transcription, CTC provides high recall dictionary detection.
9
+ public enum CtcEarningsBenchmark {
10
+
11
+ private enum KeywordMode: String {
12
+ case chunk
13
+ case file
14
+ }
15
+
16
+ /// Default CTC model directory
17
+ private static func defaultCtcModelPath() -> String? {
18
+ let appSupport = FileManager.default.urls(
19
+ for: .applicationSupportDirectory, in: .userDomainMask
20
+ ).first!
21
+ let modelPath = appSupport.appendingPathComponent("FluidAudio/Models/parakeet-ctc-110m-coreml")
22
+ if FileManager.default.fileExists(atPath: modelPath.path) {
23
+ return modelPath.path
24
+ }
25
+ return nil
26
+ }
27
+
28
+ /// Default data directory (from download command)
29
+ private static func defaultDataDir() -> String? {
30
+ let dataDir = DatasetDownloader.getEarnings22Directory().appendingPathComponent("test-dataset")
31
+ if FileManager.default.fileExists(atPath: dataDir.path) {
32
+ return dataDir.path
33
+ }
34
+ return nil
35
+ }
36
+
37
+ public static func runCLI(arguments: [String]) async {
38
+ // Check for help
39
+ if arguments.contains("--help") || arguments.contains("-h") {
40
+ printUsage()
41
+ return
42
+ }
43
+
44
+ // Parse arguments
45
+ var dataDir: String? = nil
46
+ var outputFile = "ctc_earnings_benchmark.json"
47
+ var maxFiles: Int? = nil
48
+ var ctcModelPath: String? = nil
49
+ // Note: Using v2 by default because v3 has issues with certain audio files
50
+ // (returns empty transcription for ~7 files in Earnings22 dataset)
51
+ var tdtVersion: AsrModelVersion = .v2
52
+ var autoDownload = false
53
+ var keywordMode: KeywordMode = .chunk
54
+
55
+ var i = 0
56
+ while i < arguments.count {
57
+ switch arguments[i] {
58
+ case "--data-dir":
59
+ if i + 1 < arguments.count {
60
+ dataDir = arguments[i + 1]
61
+ i += 1
62
+ }
63
+ case "--output", "-o":
64
+ if i + 1 < arguments.count {
65
+ outputFile = arguments[i + 1]
66
+ i += 1
67
+ }
68
+ case "--max-files":
69
+ if i + 1 < arguments.count {
70
+ maxFiles = Int(arguments[i + 1])
71
+ i += 1
72
+ }
73
+ case "--ctc-model":
74
+ if i + 1 < arguments.count {
75
+ ctcModelPath = arguments[i + 1]
76
+ i += 1
77
+ }
78
+ case "--tdt-version":
79
+ if i + 1 < arguments.count {
80
+ if arguments[i + 1] == "v2" || arguments[i + 1] == "2" {
81
+ tdtVersion = .v2
82
+ }
83
+ i += 1
84
+ }
85
+ case "--auto-download":
86
+ autoDownload = true
87
+ case "--keyword-mode":
88
+ if i + 1 < arguments.count, let mode = parseKeywordMode(arguments[i + 1]) {
89
+ keywordMode = mode
90
+ i += 1
91
+ }
92
+ default:
93
+ break
94
+ }
95
+ i += 1
96
+ }
97
+
98
+ // Use defaults if not specified
99
+ if dataDir == nil {
100
+ dataDir = defaultDataDir()
101
+ }
102
+ if ctcModelPath == nil {
103
+ ctcModelPath = defaultCtcModelPath()
104
+ }
105
+
106
+ // Handle auto-download for dataset
107
+ if autoDownload && dataDir == nil {
108
+ print("📥 Downloading earnings22-kws dataset...")
109
+ await DatasetDownloader.downloadEarnings22KWS(force: false)
110
+ dataDir = defaultDataDir()
111
+ }
112
+
113
+ // Handle auto-download for CTC models
114
+ if autoDownload && ctcModelPath == nil {
115
+ print("📥 Downloading CTC models...")
116
+ do {
117
+ _ = try await CtcModels.download()
118
+ ctcModelPath = defaultCtcModelPath()
119
+ } catch {
120
+ print("ERROR: Failed to download CTC models: \(error)")
121
+ }
122
+ }
123
+
124
+ print("Earnings Benchmark (TDT transcription + CTC keyword spotting)")
125
+ print(" Data directory: \(dataDir ?? "not found")")
126
+ print(" Output file: \(outputFile)")
127
+ print(" TDT version: \(tdtVersion == .v2 ? "v2" : "v3")")
128
+ print(" CTC model: \(ctcModelPath ?? "not found")")
129
+ print(" Keyword mode: \(keywordMode.rawValue)")
130
+
131
+ guard let finalDataDir = dataDir else {
132
+ print("ERROR: Data directory not found")
133
+ print("💡 Download with: fluidaudio download --dataset earnings22-kws")
134
+ print(" Or specify: --data-dir <path>")
135
+ printUsage()
136
+ return
137
+ }
138
+
139
+ guard let modelPath = ctcModelPath else {
140
+ print("ERROR: CTC model not found")
141
+ print("💡 Download parakeet-ctc-110m-coreml model to:")
142
+ print(" ~/Library/Application Support/FluidAudio/Models/parakeet-ctc-110m-coreml/")
143
+ print(" Or specify: --ctc-model <path>")
144
+ printUsage()
145
+ return
146
+ }
147
+
148
+ let dataDirResolved = finalDataDir
149
+
150
+ do {
151
+ // Load TDT models for transcription
152
+ print("Loading TDT models (\(tdtVersion == .v2 ? "v2" : "v3")) for transcription...")
153
+ let tdtModels = try await AsrModels.downloadAndLoad(version: tdtVersion)
154
+ let asrManager = AsrManager(config: .default)
155
+ try await asrManager.initialize(models: tdtModels)
156
+ print("TDT models loaded successfully")
157
+
158
+ // Load CTC models for keyword spotting
159
+ print("Loading CTC models from: \(modelPath)")
160
+ let modelDir = URL(fileURLWithPath: modelPath)
161
+ let ctcModels = try await CtcModels.loadDirect(from: modelDir)
162
+ print("Loaded CTC vocabulary with \(ctcModels.vocabulary.count) tokens")
163
+
164
+ // Create keyword spotter
165
+ let vocabSize = ctcModels.vocabulary.count
166
+ let blankId = vocabSize // Blank is at index = vocab_size
167
+ let spotter = CtcKeywordSpotter(models: ctcModels, blankId: blankId)
168
+ print("Created CTC spotter with blankId=\(blankId)")
169
+
170
+ // Collect test files
171
+ let dataDirURL = URL(fileURLWithPath: dataDirResolved)
172
+ let fileIds = try collectFileIds(from: dataDirURL, maxFiles: maxFiles)
173
+ let keywordIndex = try buildKeywordIndex(dataDir: dataDirURL, keywordMode: keywordMode)
174
+
175
+ if fileIds.isEmpty {
176
+ print("ERROR: No test files found in \(dataDirResolved)")
177
+ return
178
+ }
179
+
180
+ print("Processing \(fileIds.count) test files...")
181
+
182
+ var results: [[String: Any]] = []
183
+ var totalWer = 0.0
184
+ var totalKeywordReference = 0
185
+ var totalKeywordPredicted = 0
186
+ var totalKeywordTruePositives = 0
187
+ var totalKeywordFalsePositives = 0
188
+ var totalKeywordFalseNegatives = 0
189
+ var totalAudioDuration = 0.0
190
+ var totalProcessingTime = 0.0
191
+
192
+ for (index, fileId) in fileIds.enumerated() {
193
+ print("[\(index + 1)/\(fileIds.count)] \(fileId)")
194
+
195
+ if let result = try await processFile(
196
+ fileId: fileId,
197
+ dataDir: dataDirURL,
198
+ asrManager: asrManager,
199
+ ctcModels: ctcModels,
200
+ spotter: spotter,
201
+ keywordMode: keywordMode,
202
+ keywordIndex: keywordIndex
203
+ ) {
204
+ results.append(result)
205
+ totalWer += result["wer"] as? Double ?? 0
206
+ totalKeywordReference += result["keywordReference"] as? Int ?? 0
207
+ totalKeywordPredicted += result["keywordPredicted"] as? Int ?? 0
208
+ totalKeywordTruePositives += result["keywordTruePositives"] as? Int ?? 0
209
+ totalKeywordFalsePositives += result["keywordFalsePositives"] as? Int ?? 0
210
+ totalKeywordFalseNegatives += result["keywordFalseNegatives"] as? Int ?? 0
211
+ totalAudioDuration += result["audioLength"] as? Double ?? 0
212
+ totalProcessingTime += result["processingTime"] as? Double ?? 0
213
+
214
+ let wer = result["wer"] as? Double ?? 0
215
+ let precision = result["keywordPrecision"] as? Double ?? 0
216
+ let recall = result["keywordRecall"] as? Double ?? 0
217
+ let fscore = result["keywordFscore"] as? Double ?? 0
218
+ print(
219
+ " WER: \(String(format: "%.1f", wer))%, " +
220
+ "KW P/R/F: \(String(format: "%.2f", precision))/" +
221
+ "\(String(format: "%.2f", recall))/" +
222
+ "\(String(format: "%.2f", fscore))"
223
+ )
224
+ }
225
+ }
226
+
227
+ // Calculate summary
228
+ let avgWer = results.isEmpty ? 0.0 : totalWer / Double(results.count)
229
+ let keywordPrecision =
230
+ totalKeywordPredicted > 0
231
+ ? Double(totalKeywordTruePositives) / Double(totalKeywordPredicted)
232
+ : 0
233
+ let keywordRecall =
234
+ totalKeywordReference > 0
235
+ ? Double(totalKeywordTruePositives) / Double(totalKeywordReference)
236
+ : 0
237
+ let keywordFscore =
238
+ (keywordPrecision + keywordRecall) > 0
239
+ ? 2 * keywordPrecision * keywordRecall / (keywordPrecision + keywordRecall)
240
+ : 0
241
+
242
+ // Print summary
243
+ print("\n" + String(repeating: "=", count: 60))
244
+ print("EARNINGS22 BENCHMARK (TDT + CTC)")
245
+ print(String(repeating: "=", count: 60))
246
+ print("Model: \(modelPath)")
247
+ print("Total tests: \(results.count)")
248
+ print("Average WER: \(String(format: "%.2f", avgWer))%")
249
+ print(
250
+ "Keyword Precision/Recall/F1: " +
251
+ "\(String(format: "%.2f", keywordPrecision))/" +
252
+ "\(String(format: "%.2f", keywordRecall))/" +
253
+ "\(String(format: "%.2f", keywordFscore))"
254
+ )
255
+ print("Total audio: \(String(format: "%.1f", totalAudioDuration))s")
256
+ print("Total processing: \(String(format: "%.1f", totalProcessingTime))s")
257
+ if totalProcessingTime > 0 {
258
+ print("RTFx: \(String(format: "%.2f", totalAudioDuration / totalProcessingTime))x")
259
+ }
260
+ print(String(repeating: "=", count: 60))
261
+
262
+ // Sort results by WER descending (worst first)
263
+ let sortedResults = results.sorted { r1, r2 in
264
+ let wer1 = r1["wer"] as? Double ?? 0
265
+ let wer2 = r2["wer"] as? Double ?? 0
266
+ return wer1 > wer2
267
+ }
268
+
269
+ // Save to JSON
270
+ let summaryDict: [String: Any] = [
271
+ "totalTests": results.count,
272
+ "avgWer": round(avgWer * 100) / 100,
273
+ "keywordTruePositives": totalKeywordTruePositives,
274
+ "keywordFalsePositives": totalKeywordFalsePositives,
275
+ "keywordFalseNegatives": totalKeywordFalseNegatives,
276
+ "keywordPredicted": totalKeywordPredicted,
277
+ "keywordReference": totalKeywordReference,
278
+ "keywordPrecision": round(keywordPrecision * 1000) / 1000,
279
+ "keywordRecall": round(keywordRecall * 1000) / 1000,
280
+ "keywordFscore": round(keywordFscore * 1000) / 1000,
281
+ "totalAudioDuration": round(totalAudioDuration * 100) / 100,
282
+ "totalProcessingTime": round(totalProcessingTime * 100) / 100,
283
+ ]
284
+
285
+ let output: [String: Any] = [
286
+ "model": modelPath,
287
+ "keywordMode": keywordMode.rawValue,
288
+ "summary": summaryDict,
289
+ "results": sortedResults,
290
+ ]
291
+
292
+ let jsonData = try JSONSerialization.data(withJSONObject: output, options: [.prettyPrinted, .sortedKeys])
293
+ try jsonData.write(to: URL(fileURLWithPath: outputFile))
294
+ print("\nResults written to: \(outputFile)")
295
+
296
+ } catch {
297
+ print("ERROR: Benchmark failed: \(error)")
298
+ }
299
+ }
300
+
301
+ private static func collectFileIds(from dataDir: URL, maxFiles: Int?) throws -> [String] {
302
+ var fileIds: [String] = []
303
+ let suffix = ".dictionary.txt"
304
+
305
+ let fileManager = FileManager.default
306
+ let contents = try fileManager.contentsOfDirectory(at: dataDir, includingPropertiesForKeys: nil)
307
+
308
+ for url in contents.sorted(by: { $0.path < $1.path }) {
309
+ let name = url.lastPathComponent
310
+ if name.hasSuffix(suffix) {
311
+ let data = try? Data(contentsOf: url)
312
+ if let data = data, !data.isEmpty {
313
+ let fileId = String(name.dropLast(suffix.count))
314
+ fileIds.append(fileId)
315
+ }
316
+ }
317
+ }
318
+
319
+ if let maxFiles = maxFiles {
320
+ return Array(fileIds.prefix(maxFiles))
321
+ }
322
+ return fileIds
323
+ }
324
+
325
+ private static func processFile(
326
+ fileId: String,
327
+ dataDir: URL,
328
+ asrManager: AsrManager,
329
+ ctcModels: CtcModels,
330
+ spotter: CtcKeywordSpotter,
331
+ keywordMode: KeywordMode,
332
+ keywordIndex: [String: [String]]
333
+ ) async throws -> [String: Any]? {
334
+ let wavFile = dataDir.appendingPathComponent("\(fileId).wav")
335
+ let dictionaryFile = dataDir.appendingPathComponent("\(fileId).dictionary.txt")
336
+ let textFile = dataDir.appendingPathComponent("\(fileId).text.txt")
337
+
338
+ let fm = FileManager.default
339
+ guard fm.fileExists(atPath: wavFile.path),
340
+ fm.fileExists(atPath: dictionaryFile.path)
341
+ else {
342
+ return nil
343
+ }
344
+
345
+ // Load dictionary words (chunk or file keywords)
346
+ let dictionaryWords = try loadDictionaryWords(
347
+ fileId: fileId,
348
+ dictionaryFile: dictionaryFile,
349
+ keywordMode: keywordMode,
350
+ keywordIndex: keywordIndex
351
+ )
352
+
353
+ // Load reference text
354
+ let referenceRaw =
355
+ (try? String(contentsOf: textFile, encoding: .utf8))?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
356
+
357
+ // Get audio samples
358
+ let audioFile = try AVAudioFile(forReading: wavFile)
359
+ let audioLength = Double(audioFile.length) / audioFile.processingFormat.sampleRate
360
+ let format = audioFile.processingFormat
361
+ let frameCount = AVAudioFrameCount(audioFile.length)
362
+
363
+ guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount) else {
364
+ throw NSError(
365
+ domain: "CtcEarningsBenchmark", code: 1,
366
+ userInfo: [NSLocalizedDescriptionKey: "Failed to create audio buffer"])
367
+ }
368
+ try audioFile.read(into: buffer)
369
+
370
+ // Resample to 16kHz
371
+ let converter = AudioConverter()
372
+ let samples = try converter.resampleBuffer(buffer)
373
+
374
+ let startTime = Date()
375
+
376
+ // 1. TDT transcription for low WER
377
+ let tdtResult = try await asrManager.transcribe(wavFile)
378
+
379
+ // Skip files where TDT returns empty (some audio files cause model issues)
380
+ if tdtResult.text.isEmpty {
381
+ print(" SKIPPED: TDT returned empty transcription")
382
+ return nil
383
+ }
384
+
385
+ // 2. Build custom vocabulary for CTC keyword spotting
386
+ var vocabTerms: [CustomVocabularyTerm] = []
387
+ for word in dictionaryWords {
388
+ let tokenIds = tokenize(word, vocabulary: ctcModels.vocabulary)
389
+ if !tokenIds.isEmpty {
390
+ let term = CustomVocabularyTerm(
391
+ text: word,
392
+ weight: nil,
393
+ aliases: nil,
394
+ tokenIds: nil,
395
+ ctcTokenIds: tokenIds
396
+ )
397
+ vocabTerms.append(term)
398
+ }
399
+ }
400
+ let customVocab = CustomVocabularyContext(terms: vocabTerms)
401
+
402
+ // 3. CTC keyword spotting for high recall dictionary detection
403
+ let spotResult = try await spotter.spotKeywordsWithLogProbs(
404
+ audioSamples: samples,
405
+ customVocabulary: customVocab,
406
+ minScore: nil
407
+ )
408
+
409
+ // 4. Post-process: Use VocabularyRescorer with Argmax-style parameters
410
+ // Argmax uses cbw=3.0 (context-biasing weight) for boosting vocab terms
411
+ let useRescorer = ProcessInfo.processInfo.environment["NO_CTC_RESCORING"] != "1"
412
+ let hypothesis: String
413
+ if useRescorer {
414
+ let rescorerConfig = VocabularyRescorer.Config(
415
+ minScoreAdvantage: 1.0, // Lower threshold - rely more on CTC scoring
416
+ minVocabScore: -15.0, // Permissive to include more detections
417
+ maxOriginalScoreForReplacement: -2.0, // Don't replace very confident words
418
+ vocabBoostWeight: 3.0 // Argmax cbw=3.0
419
+ )
420
+ let rescorer = VocabularyRescorer(
421
+ spotter: spotter,
422
+ vocabulary: customVocab,
423
+ config: rescorerConfig
424
+ )
425
+ let rescoreResult = rescorer.rescore(transcript: tdtResult.text, spotResult: spotResult)
426
+ hypothesis = rescoreResult.text
427
+ } else {
428
+ hypothesis = tdtResult.text // Baseline: no CTC corrections
429
+ }
430
+
431
+ let processingTime = Date().timeIntervalSince(startTime)
432
+
433
+ // Normalize texts
434
+ let referenceNormalized = TextNormalizer.normalize(referenceRaw)
435
+ let hypothesisNormalized = TextNormalizer.normalize(hypothesis)
436
+
437
+ // Keyword sets for precision/recall
438
+ let referenceKeywords = keywordsInText(referenceNormalized, dictionaryWords: dictionaryWords)
439
+ let predictedKeywords = keywordsInText(hypothesisNormalized, dictionaryWords: dictionaryWords)
440
+ let truePositives = referenceKeywords.intersection(predictedKeywords)
441
+ let falsePositives = predictedKeywords.subtracting(referenceKeywords)
442
+ let falseNegatives = referenceKeywords.subtracting(predictedKeywords)
443
+ let keywordPrecision = predictedKeywords.isEmpty ? 0 : Double(truePositives.count) / Double(predictedKeywords.count)
444
+ let keywordRecall = referenceKeywords.isEmpty ? 0 : Double(truePositives.count) / Double(referenceKeywords.count)
445
+ let keywordFscore =
446
+ (keywordPrecision + keywordRecall) > 0
447
+ ? 2 * keywordPrecision * keywordRecall / (keywordPrecision + keywordRecall)
448
+ : 0
449
+
450
+ let referenceWords = referenceNormalized.components(separatedBy: CharacterSet.whitespacesAndNewlines).filter {
451
+ !$0.isEmpty
452
+ }
453
+ let hypothesisWords = hypothesisNormalized.components(separatedBy: CharacterSet.whitespacesAndNewlines).filter {
454
+ !$0.isEmpty
455
+ }
456
+
457
+ // Calculate WER
458
+ let wer: Double
459
+ if referenceWords.isEmpty {
460
+ wer = hypothesisWords.isEmpty ? 0.0 : 1.0
461
+ } else {
462
+ wer = calculateWER(reference: referenceWords, hypothesis: hypothesisWords)
463
+ }
464
+
465
+ // Count dictionary detections (debug only)
466
+ let minCtcScore: Float = -15.0 // Permissive threshold for detection
467
+ var detectionDetails: [[String: Any]] = []
468
+ var ctcFoundWords: Set<String> = []
469
+
470
+ // 1. CTC detections
471
+ for detection in spotResult.detections {
472
+ let inRef = referenceKeywords.contains(detection.term.text.lowercased())
473
+ let detail: [String: Any] = [
474
+ "word": detection.term.text,
475
+ "score": round(Double(detection.score) * 100) / 100,
476
+ "startTime": round(detection.startTime * 100) / 100,
477
+ "endTime": round(detection.endTime * 100) / 100,
478
+ "source": "ctc",
479
+ "inReference": inRef,
480
+ ]
481
+ detectionDetails.append(detail)
482
+
483
+ if detection.score >= minCtcScore { // Use >= to include edge cases
484
+ ctcFoundWords.insert(detection.term.text.lowercased())
485
+ }
486
+ }
487
+
488
+ // 2. Fallback: check hypothesis for dictionary words not found by CTC
489
+ let hypothesisLower = hypothesis.lowercased()
490
+ for word in dictionaryWords {
491
+ let wordLower = word.lowercased()
492
+ if !ctcFoundWords.contains(wordLower) {
493
+ // Check if word appears as whole word in hypothesis (avoid substring false positives)
494
+ let pattern = "\\b\(NSRegularExpression.escapedPattern(for: wordLower))\\b"
495
+ if let regex = try? NSRegularExpression(pattern: pattern, options: []),
496
+ regex.firstMatch(
497
+ in: hypothesisLower, options: [],
498
+ range: NSRange(hypothesisLower.startIndex..., in: hypothesisLower)) != nil
499
+ {
500
+ ctcFoundWords.insert(wordLower)
501
+ let inRef = referenceKeywords.contains(wordLower)
502
+ let detail: [String: Any] = [
503
+ "word": word,
504
+ "score": 0.0,
505
+ "startTime": 0.0,
506
+ "endTime": 0.0,
507
+ "source": "hypothesis",
508
+ "inReference": inRef,
509
+ ]
510
+ detectionDetails.append(detail)
511
+ }
512
+ }
513
+ }
514
+
515
+ let result: [String: Any] = [
516
+ "fileId": fileId,
517
+ "reference": referenceNormalized,
518
+ "hypothesis": hypothesisNormalized,
519
+ "wer": round(wer * 10000) / 100,
520
+ "dictFound": predictedKeywords.count,
521
+ "dictTotal": referenceKeywords.count,
522
+ "keywordPredicted": predictedKeywords.count,
523
+ "keywordReference": referenceKeywords.count,
524
+ "keywordTruePositives": truePositives.count,
525
+ "keywordFalsePositives": falsePositives.count,
526
+ "keywordFalseNegatives": falseNegatives.count,
527
+ "keywordPrecision": round(keywordPrecision * 1000) / 1000,
528
+ "keywordRecall": round(keywordRecall * 1000) / 1000,
529
+ "keywordFscore": round(keywordFscore * 1000) / 1000,
530
+ "audioLength": round(audioLength * 100) / 100,
531
+ "processingTime": round(processingTime * 1000) / 1000,
532
+ "ctcDetections": detectionDetails,
533
+ ]
534
+ return result
535
+ }
536
+
537
+ /// Simple tokenization using vocabulary lookup
538
+ private static func tokenize(_ text: String, vocabulary: [Int: String]) -> [Int] {
539
+ // Build reverse vocabulary (token -> id)
540
+ var tokenToId: [String: Int] = [:]
541
+ for (id, token) in vocabulary {
542
+ tokenToId[token] = id
543
+ }
544
+
545
+ let normalizedText = text.lowercased()
546
+ var result: [Int] = []
547
+ var position = normalizedText.startIndex
548
+ var isWordStart = true
549
+
550
+ while position < normalizedText.endIndex {
551
+ var matched = false
552
+ let remaining = normalizedText.distance(from: position, to: normalizedText.endIndex)
553
+ var matchLength = min(20, remaining)
554
+
555
+ while matchLength > 0 {
556
+ let endPos = normalizedText.index(position, offsetBy: matchLength)
557
+ let substring = String(normalizedText[position..<endPos])
558
+
559
+ // Try with SentencePiece prefix for word start
560
+ let withPrefix = isWordStart ? "▁" + substring : substring
561
+
562
+ if let tokenId = tokenToId[withPrefix] {
563
+ result.append(tokenId)
564
+ position = endPos
565
+ isWordStart = false
566
+ matched = true
567
+ break
568
+ } else if let tokenId = tokenToId[substring] {
569
+ result.append(tokenId)
570
+ position = endPos
571
+ isWordStart = false
572
+ matched = true
573
+ break
574
+ }
575
+
576
+ matchLength -= 1
577
+ }
578
+
579
+ if !matched {
580
+ let char = normalizedText[position]
581
+ if char == " " {
582
+ isWordStart = true
583
+ position = normalizedText.index(after: position)
584
+ } else {
585
+ // Unknown character - skip
586
+ position = normalizedText.index(after: position)
587
+ isWordStart = false
588
+ }
589
+ }
590
+ }
591
+
592
+ return result
593
+ }
594
+
595
+ /// Apply CTC keyword corrections to TDT transcription using multiple strategies:
596
+ /// 1. Fuzzy matching (for words that are phonetically similar)
597
+ /// 2. Context pattern matching (for "this is X" type patterns)
598
+ /// 3. Proper noun replacement (for names after common patterns)
599
+ private static func applyKeywordCorrections(
600
+ tdtResult: ASRResult,
601
+ detections: [CtcKeywordSpotter.KeywordDetection],
602
+ minScore: Float
603
+ ) -> String {
604
+ // Filter detections by score
605
+ let validDetections = detections.filter { $0.score >= minScore }
606
+ guard !validDetections.isEmpty else {
607
+ return tdtResult.text
608
+ }
609
+
610
+ var text = tdtResult.text
611
+ var usedDetections: Set<String> = []
612
+
613
+ // PASS 1: Fuzzy matching for phonetically similar words
614
+ for detection in validDetections {
615
+ let keyword = detection.term.text
616
+ let keywordLower = keyword.lowercased()
617
+ let keywordParts = keywordLower.components(separatedBy: " ").filter { !$0.isEmpty }
618
+
619
+ let words = text.components(separatedBy: .whitespacesAndNewlines).filter { !$0.isEmpty }
620
+
621
+ // Handle multi-word keywords
622
+ if keywordParts.count > 1 {
623
+ for i in 0..<(words.count - keywordParts.count + 1) {
624
+ var allMatch = true
625
+ var matchedWords: [String] = []
626
+
627
+ for j in 0..<keywordParts.count {
628
+ let wordClean = words[i + j].trimmingCharacters(in: .punctuationCharacters).lowercased()
629
+ if isSimilar(wordClean, keywordParts[j]) {
630
+ matchedWords.append(words[i + j])
631
+ } else {
632
+ allMatch = false
633
+ break
634
+ }
635
+ }
636
+
637
+ if allMatch && !matchedWords.isEmpty {
638
+ let originalPhrase = matchedWords.joined(separator: " ")
639
+ let replacement = matchCase(keyword, to: matchedWords[0])
640
+ text = text.replacingOccurrences(of: originalPhrase, with: replacement)
641
+ usedDetections.insert(keyword)
642
+ break
643
+ }
644
+ }
645
+ } else {
646
+ // Single word keyword
647
+ for word in words {
648
+ let wordClean = word.trimmingCharacters(in: .punctuationCharacters).lowercased()
649
+ guard !wordClean.isEmpty else { continue }
650
+
651
+ if isSimilar(wordClean, keywordLower) && wordClean != keywordLower {
652
+ let replacement = matchCase(keyword, to: word)
653
+ text = text.replacingOccurrences(of: word, with: replacement)
654
+ usedDetections.insert(keyword)
655
+ break
656
+ }
657
+ }
658
+ }
659
+ }
660
+
661
+ // PASS 2: Context pattern matching - specifically for "this is X" pattern
662
+ // Only replace if keyword is NOT already in the text
663
+ for detection in validDetections {
664
+ let keyword = detection.term.text
665
+ guard !usedDetections.contains(keyword) else { continue }
666
+
667
+ let keywordLower = keyword.lowercased()
668
+
669
+ // Skip if keyword already exists in text (case-insensitive)
670
+ if text.lowercased().contains(keywordLower) {
671
+ usedDetections.insert(keyword) // Mark as handled
672
+ continue
673
+ }
674
+
675
+ // Check if keyword looks like a proper noun (starts with uppercase)
676
+ let isProperNoun =
677
+ keyword.first?.isUppercase == true
678
+ && keyword.count >= 3
679
+ && !stopWords.contains(keywordLower)
680
+
681
+ guard isProperNoun else { continue }
682
+
683
+ // Look for "this is X" pattern specifically for names
684
+ let thisIsPattern = try? NSRegularExpression(pattern: "this is ([A-Z][a-z]+)", options: [])
685
+ if let regex = thisIsPattern {
686
+ let textRange = NSRange(text.startIndex..., in: text)
687
+ if let match = regex.firstMatch(in: text, options: [], range: textRange),
688
+ match.numberOfRanges > 1,
689
+ let captureRange = Range(match.range(at: 1), in: text)
690
+ {
691
+ let capturedWord = String(text[captureRange])
692
+ let capturedLower = capturedWord.lowercased()
693
+
694
+ // Skip if captured word is already a detected keyword
695
+ let isOtherKeyword = validDetections.contains { det in
696
+ det.term.text.lowercased() == capturedLower
697
+ }
698
+
699
+ if !isOtherKeyword && !stopWords.contains(capturedLower) {
700
+ // Similar length check
701
+ if abs(capturedWord.count - keyword.count) <= 3 {
702
+ text = text.replacingOccurrences(of: capturedWord, with: keyword)
703
+ usedDetections.insert(keyword)
704
+ }
705
+ }
706
+ }
707
+ }
708
+ }
709
+
710
+ return text
711
+ }
712
+
713
+ /// Build word timings by merging subword tokens (tokens starting with "▁" begin new words)
714
+ private static func buildWordTimings(
715
+ from tokenTimings: [TokenTiming]
716
+ ) -> [(word: String, startTime: Double, endTime: Double)] {
717
+ var wordTimings: [(word: String, startTime: Double, endTime: Double)] = []
718
+ var currentWord = ""
719
+ var wordStart: Double = 0
720
+ var wordEnd: Double = 0
721
+
722
+ for timing in tokenTimings {
723
+ let token = timing.token
724
+
725
+ // Skip special tokens
726
+ if token.isEmpty || token == "<blank>" || token == "<pad>" {
727
+ continue
728
+ }
729
+
730
+ // Check if this starts a new word (has ▁ prefix or is first token)
731
+ let startsNewWord = token.hasPrefix("▁") || currentWord.isEmpty
732
+
733
+ if startsNewWord && !currentWord.isEmpty {
734
+ // Save previous word
735
+ wordTimings.append((word: currentWord, startTime: wordStart, endTime: wordEnd))
736
+ currentWord = ""
737
+ }
738
+
739
+ if startsNewWord {
740
+ currentWord = token.hasPrefix("▁") ? String(token.dropFirst()) : token
741
+ wordStart = timing.startTime
742
+ } else {
743
+ currentWord += token
744
+ }
745
+ wordEnd = timing.endTime
746
+ }
747
+
748
+ // Save final word
749
+ if !currentWord.isEmpty {
750
+ wordTimings.append((word: currentWord, startTime: wordStart, endTime: wordEnd))
751
+ }
752
+
753
+ return wordTimings
754
+ }
755
+
756
+ /// Common English words that should never be replaced by keyword matching
757
+ private static let stopWords: Set<String> = [
758
+ // Pronouns
759
+ "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them",
760
+ "my", "your", "his", "its", "our", "their", "mine", "yours", "hers", "ours", "theirs",
761
+ "this", "that", "these", "those", "who", "whom", "what", "which", "whose",
762
+ // Common verbs
763
+ "is", "are", "was", "were", "be", "been", "being", "am",
764
+ "have", "has", "had", "having", "do", "does", "did", "doing", "done",
765
+ "will", "would", "shall", "should", "may", "might", "must", "can", "could",
766
+ "get", "got", "getting", "go", "goes", "went", "going", "gone",
767
+ "come", "came", "coming", "see", "saw", "seen", "know", "knew", "known",
768
+ "think", "thought", "make", "made", "take", "took", "taken", "give", "gave", "given",
769
+ "say", "said", "tell", "told", "ask", "asked", "use", "used", "want", "wanted",
770
+ "need", "needed", "try", "tried", "let", "put", "keep", "kept", "look", "looked",
771
+ // Articles and determiners
772
+ "a", "an", "the", "some", "any", "no", "every", "each", "all", "both", "few", "many",
773
+ "much", "more", "most", "other", "another", "such",
774
+ // Prepositions
775
+ "in", "on", "at", "to", "for", "of", "with", "by", "from", "up", "down", "out",
776
+ "about", "into", "over", "after", "before", "between", "under", "through", "during",
777
+ // Conjunctions
778
+ "and", "or", "but", "so", "yet", "nor", "if", "then", "than", "because", "while",
779
+ "although", "unless", "since", "when", "where", "as",
780
+ // Adverbs
781
+ "not", "very", "just", "also", "only", "even", "still", "already", "always", "never",
782
+ "often", "sometimes", "usually", "really", "well", "now", "here", "there", "how", "why",
783
+ // Common words
784
+ "yes", "no", "okay", "ok", "thank", "thanks", "please", "sorry", "hello", "hi", "bye",
785
+ "good", "great", "bad", "new", "old", "first", "last", "long", "short", "big", "small",
786
+ "high", "low", "right", "left", "next", "back", "same", "different", "own", "able",
787
+ "way", "thing", "things", "time", "times", "year", "years", "day", "days", "week", "weeks",
788
+ "part", "place", "case", "point", "fact", "end", "kind", "lot", "set",
789
+ // Numbers
790
+ "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
791
+ "hundred", "thousand", "million", "billion",
792
+ ]
793
+
794
+ /// Check if two words are similar (edit distance / length ratio)
795
+ private static func isSimilar(_ a: String, _ b: String) -> Bool {
796
+ // Never match stop words - they're too common to be proper nouns
797
+ if stopWords.contains(a) || stopWords.contains(b) {
798
+ return false
799
+ }
800
+
801
+ let maxLen = max(a.count, b.count)
802
+ let minLen = min(a.count, b.count)
803
+ guard maxLen > 0, minLen >= 3 else { return false }
804
+
805
+ // Allow more length difference for longer words
806
+ let lenDiff = abs(a.count - b.count)
807
+ if lenDiff > max(3, maxLen / 2) { return false }
808
+
809
+ // Calculate edit distance
810
+ let distance = editDistance(a, b)
811
+
812
+ // More aggressive threshold: allow up to 40% of max length as edits
813
+ let threshold = max(2, Int(Double(maxLen) * 0.4))
814
+
815
+ // Also check if one is substring of other (handles "Erik" in "Ririek")
816
+ if a.contains(b) || b.contains(a) {
817
+ return true
818
+ }
819
+
820
+ // Check common prefix/suffix (handles "Heri" vs "Harry")
821
+ let commonPrefix = commonPrefixLength(a, b)
822
+ let commonSuffix = commonSuffixLength(a, b)
823
+ if commonPrefix >= 2 || commonSuffix >= 2 {
824
+ return distance <= threshold + 1
825
+ }
826
+
827
+ return distance <= threshold
828
+ }
829
+
830
+ /// Get length of common prefix
831
+ private static func commonPrefixLength(_ a: String, _ b: String) -> Int {
832
+ let aChars = Array(a)
833
+ let bChars = Array(b)
834
+ var count = 0
835
+ for i in 0..<min(aChars.count, bChars.count) {
836
+ if aChars[i] == bChars[i] {
837
+ count += 1
838
+ } else {
839
+ break
840
+ }
841
+ }
842
+ return count
843
+ }
844
+
845
+ /// Get length of common suffix
846
+ private static func commonSuffixLength(_ a: String, _ b: String) -> Int {
847
+ let aChars = Array(a.reversed())
848
+ let bChars = Array(b.reversed())
849
+ var count = 0
850
+ for i in 0..<min(aChars.count, bChars.count) {
851
+ if aChars[i] == bChars[i] {
852
+ count += 1
853
+ } else {
854
+ break
855
+ }
856
+ }
857
+ return count
858
+ }
859
+
860
+ /// Simple edit distance calculation
861
+ private static func editDistance(_ a: String, _ b: String) -> Int {
862
+ let a = Array(a)
863
+ let b = Array(b)
864
+ let m = a.count
865
+ let n = b.count
866
+
867
+ if m == 0 { return n }
868
+ if n == 0 { return m }
869
+
870
+ var dp = Array(repeating: Array(repeating: 0, count: n + 1), count: m + 1)
871
+
872
+ for i in 0...m { dp[i][0] = i }
873
+ for j in 0...n { dp[0][j] = j }
874
+
875
+ for i in 1...m {
876
+ for j in 1...n {
877
+ if a[i - 1] == b[j - 1] {
878
+ dp[i][j] = dp[i - 1][j - 1]
879
+ } else {
880
+ dp[i][j] = 1 + min(dp[i - 1][j - 1], min(dp[i - 1][j], dp[i][j - 1]))
881
+ }
882
+ }
883
+ }
884
+
885
+ return dp[m][n]
886
+ }
887
+
888
+ /// Match the case pattern of the original word
889
+ private static func matchCase(_ keyword: String, to original: String) -> String {
890
+ let origClean = original.trimmingCharacters(in: .punctuationCharacters)
891
+
892
+ // Check case pattern
893
+ if origClean.first?.isUppercase == true {
894
+ // Capitalize first letter
895
+ return keyword.prefix(1).uppercased() + keyword.dropFirst()
896
+ }
897
+ return keyword
898
+ }
899
+
900
+ private static func calculateWER(reference: [String], hypothesis: [String]) -> Double {
901
+ if reference.isEmpty {
902
+ return hypothesis.isEmpty ? 0.0 : 1.0
903
+ }
904
+
905
+ let m = reference.count
906
+ let n = hypothesis.count
907
+ var dp = Array(repeating: Array(repeating: 0, count: n + 1), count: m + 1)
908
+
909
+ for i in 0...m { dp[i][0] = i }
910
+ for j in 0...n { dp[0][j] = j }
911
+
912
+ for i in 1...m {
913
+ for j in 1...n {
914
+ if reference[i - 1] == hypothesis[j - 1] {
915
+ dp[i][j] = dp[i - 1][j - 1]
916
+ } else {
917
+ dp[i][j] = min(dp[i - 1][j - 1], min(dp[i - 1][j], dp[i][j - 1])) + 1
918
+ }
919
+ }
920
+ }
921
+
922
+ return Double(dp[m][n]) / Double(m)
923
+ }
924
+
925
+ private static func printUsage() {
926
+ print(
927
+ """
928
+ CTC Earnings Benchmark (TDT + CTC keyword spotting)
929
+
930
+ Usage: fluidaudio ctc-earnings-benchmark [options]
931
+
932
+ Options:
933
+ --data-dir <path> Path to earnings test dataset (auto-detected if downloaded)
934
+ --ctc-model <path> Path to CTC model directory (auto-detected if in standard location)
935
+ --max-files <n> Maximum number of files to process
936
+ --output, -o <path> Output JSON file (default: ctc_earnings_benchmark.json)
937
+ --auto-download Download earnings22-kws dataset if not found
938
+ --keyword-mode <mode> Keyword mode: chunk or file (default: chunk)
939
+
940
+ Default locations:
941
+ Dataset: ~/Library/Application Support/FluidAudio/earnings22-kws/test-dataset/
942
+ CTC Model: ~/Library/Application Support/FluidAudio/Models/parakeet-ctc-110m-coreml/
943
+
944
+ Setup:
945
+ 1. Download dataset: fluidaudio download --dataset earnings22-kws
946
+ 2. Place CTC model in standard location
947
+ 3. Run: fluidaudio ctc-earnings-benchmark
948
+
949
+ Examples:
950
+ # Run with auto-detected paths
951
+ fluidaudio ctc-earnings-benchmark
952
+
953
+ # Run with auto-download
954
+ fluidaudio ctc-earnings-benchmark --auto-download
955
+
956
+ # Run with explicit paths
957
+ fluidaudio ctc-earnings-benchmark \\
958
+ --data-dir /path/to/test-dataset \\
959
+ --ctc-model /path/to/parakeet-ctc-110m-coreml \\
960
+ --max-files 100
961
+ """)
962
+ }
963
+
964
+ private static func parseKeywordMode(_ value: String) -> KeywordMode? {
965
+ switch value.lowercased() {
966
+ case "chunk", "chunk-keywords":
967
+ return .chunk
968
+ case "file", "file-keywords":
969
+ return .file
970
+ default:
971
+ return nil
972
+ }
973
+ }
974
+
975
+ private static func parentId(from fileId: String) -> String {
976
+ guard let range = fileId.range(of: "_chunk") else {
977
+ return fileId
978
+ }
979
+ return String(fileId[..<range.lowerBound])
980
+ }
981
+
982
+ private static func buildKeywordIndex(dataDir: URL, keywordMode: KeywordMode) throws -> [String: [String]] {
983
+ guard keywordMode == .file else {
984
+ return [:]
985
+ }
986
+
987
+ var index: [String: Set<String>] = [:]
988
+ let suffix = ".dictionary.txt"
989
+ let fileManager = FileManager.default
990
+ let contents = try fileManager.contentsOfDirectory(at: dataDir, includingPropertiesForKeys: nil)
991
+
992
+ for url in contents {
993
+ let name = url.lastPathComponent
994
+ guard name.hasSuffix(suffix) else { continue }
995
+ let fileId = String(name.dropLast(suffix.count))
996
+ let parent = parentId(from: fileId)
997
+ let words = try loadDictionaryWords(from: url)
998
+ var set = index[parent] ?? Set<String>()
999
+ set.formUnion(words)
1000
+ index[parent] = set
1001
+ }
1002
+
1003
+ return index.mapValues { Array($0).sorted() }
1004
+ }
1005
+
1006
+ private static func loadDictionaryWords(
1007
+ fileId: String,
1008
+ dictionaryFile: URL,
1009
+ keywordMode: KeywordMode,
1010
+ keywordIndex: [String: [String]]
1011
+ ) throws -> [String] {
1012
+ switch keywordMode {
1013
+ case .chunk:
1014
+ return try loadDictionaryWords(from: dictionaryFile)
1015
+ case .file:
1016
+ let parent = parentId(from: fileId)
1017
+ if let words = keywordIndex[parent] {
1018
+ return words
1019
+ }
1020
+ return try loadDictionaryWords(from: dictionaryFile)
1021
+ }
1022
+ }
1023
+
1024
+ private static func loadDictionaryWords(from url: URL) throws -> [String] {
1025
+ let dictionaryContent = try String(contentsOf: url, encoding: .utf8)
1026
+ return dictionaryContent
1027
+ .components(separatedBy: .newlines)
1028
+ .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
1029
+ .filter { !$0.isEmpty }
1030
+ }
1031
+
1032
+ private static func keywordsInText(_ text: String, dictionaryWords: [String]) -> Set<String> {
1033
+ let textLower = text.lowercased()
1034
+ var result: Set<String> = []
1035
+
1036
+ for word in dictionaryWords {
1037
+ let wordLower = word.lowercased()
1038
+ let pattern = "\\b\(NSRegularExpression.escapedPattern(for: wordLower))\\b"
1039
+ guard let regex = try? NSRegularExpression(pattern: pattern, options: []) else { continue }
1040
+ let range = NSRange(textLower.startIndex..., in: textLower)
1041
+ if regex.firstMatch(in: textLower, options: [], range: range) != nil {
1042
+ result.insert(wordLower)
1043
+ }
1044
+ }
1045
+ return result
1046
+ }
1047
+ }
1048
+ #endif
cli/HybridEarningsBenchmark.swift ADDED
@@ -0,0 +1,554 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #if os(macOS)
2
+ import AVFoundation
3
+ import FluidAudio
4
+ import Foundation
5
+
6
+ /// Earnings22 benchmark using ONLY the Hybrid 110M model (single encoder).
7
+ /// CTC head provides both transcription AND keyword spotting from the same encoder.
8
+ public enum HybridEarningsBenchmark {
9
+
10
+ private enum KeywordMode: String {
11
+ case chunk
12
+ case file
13
+ }
14
+
15
+ public static func runCLI(arguments: [String]) async {
16
+ if arguments.contains("--help") || arguments.contains("-h") {
17
+ printUsage()
18
+ return
19
+ }
20
+
21
+ // Parse arguments
22
+ var outputFile = "hybrid_earnings_benchmark.json"
23
+ var maxFiles: Int? = nil
24
+ var decodingMode: HybridDecodingMode = .tdt
25
+ var useRescoring = false
26
+ var keywordMode: KeywordMode = .chunk
27
+
28
+ var i = 0
29
+ while i < arguments.count {
30
+ switch arguments[i] {
31
+ case "--output", "-o":
32
+ if i + 1 < arguments.count {
33
+ outputFile = arguments[i + 1]
34
+ i += 1
35
+ }
36
+ case "--max-files":
37
+ if i + 1 < arguments.count {
38
+ maxFiles = Int(arguments[i + 1])
39
+ i += 1
40
+ }
41
+ case "--ctc":
42
+ decodingMode = .ctc
43
+ case "--tdt":
44
+ decodingMode = .tdt
45
+ case "--rescore":
46
+ useRescoring = true
47
+ case "--keyword-mode":
48
+ if i + 1 < arguments.count, let mode = parseKeywordMode(arguments[i + 1]) {
49
+ keywordMode = mode
50
+ i += 1
51
+ }
52
+ default:
53
+ break
54
+ }
55
+ i += 1
56
+ }
57
+
58
+ let dataDir = DatasetDownloader.getEarnings22Directory().appendingPathComponent("test-dataset")
59
+ guard FileManager.default.fileExists(atPath: dataDir.path) else {
60
+ print("ERROR: Earnings dataset not found at \(dataDir.path)")
61
+ print("Download with: fluidaudio download --dataset earnings22-kws")
62
+ return
63
+ }
64
+
65
+ let modeStr = decodingMode == .ctc ? "CTC" : "TDT"
66
+ let rescoringStr = useRescoring ? " + Rescoring" : ""
67
+ print("Hybrid 110M Earnings Benchmark (Decoding: \(modeStr)\(rescoringStr))")
68
+ print(" Output file: \(outputFile)")
69
+ print(" Decoding mode: \(modeStr)")
70
+ print(" Rescoring: \(useRescoring ? "enabled" : "disabled")")
71
+ print(" Keyword mode: \(keywordMode.rawValue)")
72
+
73
+ do {
74
+ // Load Hybrid 110M model (single encoder with CTC head)
75
+ print("Loading Hybrid 110M model...")
76
+ let hybridModels = try await HybridAsrModels.downloadAndLoad()
77
+ let hybridManager = HybridAsrManager(models: hybridModels, decodingMode: decodingMode)
78
+ let spotter = HybridKeywordSpotter(vocabulary: hybridModels.vocabulary, blankId: hybridModels.blankId)
79
+ print(" Vocab size: \(hybridModels.vocabSize)")
80
+
81
+ // Collect test files
82
+ let fileIds = try collectFileIds(from: dataDir, maxFiles: maxFiles)
83
+ let keywordIndex = try buildKeywordIndex(dataDir: dataDir, keywordMode: keywordMode)
84
+
85
+ if fileIds.isEmpty {
86
+ print("ERROR: No test files found")
87
+ return
88
+ }
89
+
90
+ print("Processing \(fileIds.count) test files...")
91
+
92
+ var results: [[String: Any]] = []
93
+ var totalWer = 0.0
94
+ var totalKeywordReference = 0
95
+ var totalKeywordPredicted = 0
96
+ var totalKeywordTruePositives = 0
97
+ var totalKeywordFalsePositives = 0
98
+ var totalKeywordFalseNegatives = 0
99
+ var totalAudioDuration = 0.0
100
+ var totalProcessingTime = 0.0
101
+
102
+ for (index, fileId) in fileIds.enumerated() {
103
+ print("[\(index + 1)/\(fileIds.count)] \(fileId)")
104
+
105
+ if let result = try await processFile(
106
+ fileId: fileId,
107
+ dataDir: dataDir,
108
+ hybridManager: hybridManager,
109
+ spotter: spotter,
110
+ useRescoring: useRescoring,
111
+ keywordMode: keywordMode,
112
+ keywordIndex: keywordIndex
113
+ ) {
114
+ results.append(result)
115
+ totalWer += result["wer"] as? Double ?? 0
116
+ totalKeywordReference += result["keywordReference"] as? Int ?? 0
117
+ totalKeywordPredicted += result["keywordPredicted"] as? Int ?? 0
118
+ totalKeywordTruePositives += result["keywordTruePositives"] as? Int ?? 0
119
+ totalKeywordFalsePositives += result["keywordFalsePositives"] as? Int ?? 0
120
+ totalKeywordFalseNegatives += result["keywordFalseNegatives"] as? Int ?? 0
121
+ totalAudioDuration += result["audioLength"] as? Double ?? 0
122
+ totalProcessingTime += result["processingTime"] as? Double ?? 0
123
+
124
+ let wer = result["wer"] as? Double ?? 0
125
+ let precision = result["keywordPrecision"] as? Double ?? 0
126
+ let recall = result["keywordRecall"] as? Double ?? 0
127
+ let fscore = result["keywordFscore"] as? Double ?? 0
128
+ print(
129
+ " WER: \(String(format: "%.1f", wer))%, " +
130
+ "KW P/R/F: \(String(format: "%.2f", precision))/" +
131
+ "\(String(format: "%.2f", recall))/" +
132
+ "\(String(format: "%.2f", fscore))"
133
+ )
134
+ }
135
+ }
136
+
137
+ // Calculate summary
138
+ let avgWer = results.isEmpty ? 0.0 : totalWer / Double(results.count)
139
+ let keywordPrecision =
140
+ totalKeywordPredicted > 0
141
+ ? Double(totalKeywordTruePositives) / Double(totalKeywordPredicted)
142
+ : 0
143
+ let keywordRecall =
144
+ totalKeywordReference > 0
145
+ ? Double(totalKeywordTruePositives) / Double(totalKeywordReference)
146
+ : 0
147
+ let keywordFscore =
148
+ (keywordPrecision + keywordRecall) > 0
149
+ ? 2 * keywordPrecision * keywordRecall / (keywordPrecision + keywordRecall)
150
+ : 0
151
+
152
+ // Print summary
153
+ print("\n" + String(repeating: "=", count: 60))
154
+ print("HYBRID 110M BENCHMARK (\(modeStr)\(rescoringStr))")
155
+ print(String(repeating: "=", count: 60))
156
+ print("Model: parakeet-tdt-ctc-110m-hybrid")
157
+ print("Decoding: \(modeStr), Rescoring: \(useRescoring ? "yes" : "no")")
158
+ print("Total tests: \(results.count)")
159
+ print("Average WER: \(String(format: "%.2f", avgWer))%")
160
+ print(
161
+ "Keyword Precision/Recall/F1: " +
162
+ "\(String(format: "%.2f", keywordPrecision))/" +
163
+ "\(String(format: "%.2f", keywordRecall))/" +
164
+ "\(String(format: "%.2f", keywordFscore))"
165
+ )
166
+ print("Total audio: \(String(format: "%.1f", totalAudioDuration))s")
167
+ print("Total processing: \(String(format: "%.1f", totalProcessingTime))s")
168
+ if totalProcessingTime > 0 {
169
+ print("RTFx: \(String(format: "%.2f", totalAudioDuration / totalProcessingTime))x")
170
+ }
171
+ print(String(repeating: "=", count: 60))
172
+
173
+ // Sort results by WER descending (worst first)
174
+ let sortedResults = results.sorted { r1, r2 in
175
+ let wer1 = r1["wer"] as? Double ?? 0
176
+ let wer2 = r2["wer"] as? Double ?? 0
177
+ return wer1 > wer2
178
+ }
179
+
180
+ // Save to JSON
181
+ let summaryDict: [String: Any] = [
182
+ "totalTests": results.count,
183
+ "avgWer": round(avgWer * 100) / 100,
184
+ "keywordTruePositives": totalKeywordTruePositives,
185
+ "keywordFalsePositives": totalKeywordFalsePositives,
186
+ "keywordFalseNegatives": totalKeywordFalseNegatives,
187
+ "keywordPredicted": totalKeywordPredicted,
188
+ "keywordReference": totalKeywordReference,
189
+ "keywordPrecision": round(keywordPrecision * 1000) / 1000,
190
+ "keywordRecall": round(keywordRecall * 1000) / 1000,
191
+ "keywordFscore": round(keywordFscore * 1000) / 1000,
192
+ "totalAudioDuration": round(totalAudioDuration * 100) / 100,
193
+ "totalProcessingTime": round(totalProcessingTime * 100) / 100,
194
+ ]
195
+
196
+ let output: [String: Any] = [
197
+ "model": "parakeet-tdt-ctc-110m-hybrid",
198
+ "approach": "single-encoder",
199
+ "decodingMode": modeStr,
200
+ "rescoring": useRescoring,
201
+ "keywordMode": keywordMode.rawValue,
202
+ "summary": summaryDict,
203
+ "results": sortedResults,
204
+ ]
205
+
206
+ let jsonData = try JSONSerialization.data(withJSONObject: output, options: [.prettyPrinted, .sortedKeys])
207
+ try jsonData.write(to: URL(fileURLWithPath: outputFile))
208
+ print("\nResults written to: \(outputFile)")
209
+
210
+ } catch {
211
+ print("ERROR: \(error)")
212
+ }
213
+ }
214
+
215
+ private static func collectFileIds(from dataDir: URL, maxFiles: Int?) throws -> [String] {
216
+ var fileIds: [String] = []
217
+ let suffix = ".dictionary.txt"
218
+
219
+ let fileManager = FileManager.default
220
+ let contents = try fileManager.contentsOfDirectory(at: dataDir, includingPropertiesForKeys: nil)
221
+
222
+ for url in contents.sorted(by: { $0.path < $1.path }) {
223
+ let name = url.lastPathComponent
224
+ if name.hasSuffix(suffix) {
225
+ let data = try? Data(contentsOf: url)
226
+ if let data = data, !data.isEmpty {
227
+ let fileId = String(name.dropLast(suffix.count))
228
+ fileIds.append(fileId)
229
+ }
230
+ }
231
+ }
232
+
233
+ if let maxFiles = maxFiles {
234
+ return Array(fileIds.prefix(maxFiles))
235
+ }
236
+ return fileIds
237
+ }
238
+
239
+ private static func processFile(
240
+ fileId: String,
241
+ dataDir: URL,
242
+ hybridManager: HybridAsrManager,
243
+ spotter: HybridKeywordSpotter,
244
+ useRescoring: Bool,
245
+ keywordMode: KeywordMode,
246
+ keywordIndex: [String: [String]]
247
+ ) async throws -> [String: Any]? {
248
+ let wavFile = dataDir.appendingPathComponent("\(fileId).wav")
249
+ let dictionaryFile = dataDir.appendingPathComponent("\(fileId).dictionary.txt")
250
+ let textFile = dataDir.appendingPathComponent("\(fileId).text.txt")
251
+
252
+ let fm = FileManager.default
253
+ guard fm.fileExists(atPath: wavFile.path),
254
+ fm.fileExists(atPath: dictionaryFile.path)
255
+ else {
256
+ return nil
257
+ }
258
+
259
+ // Load dictionary words (chunk or file keywords)
260
+ let dictionaryWords = try loadDictionaryWords(
261
+ fileId: fileId,
262
+ dictionaryFile: dictionaryFile,
263
+ keywordMode: keywordMode,
264
+ keywordIndex: keywordIndex
265
+ )
266
+
267
+ // Load reference text
268
+ let referenceRaw =
269
+ (try? String(contentsOf: textFile, encoding: .utf8))?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
270
+
271
+ // Get audio samples
272
+ let audioFile = try AVAudioFile(forReading: wavFile)
273
+ let audioLength = Double(audioFile.length) / audioFile.processingFormat.sampleRate
274
+ let format = audioFile.processingFormat
275
+ let frameCount = AVAudioFrameCount(audioFile.length)
276
+
277
+ guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount) else {
278
+ return nil
279
+ }
280
+ try audioFile.read(into: buffer)
281
+
282
+ // Resample to 16kHz
283
+ let converter = AudioConverter()
284
+ let samples = try converter.resampleBuffer(buffer)
285
+
286
+ // Build custom vocabulary for keyword spotting
287
+ var vocabTerms: [CustomVocabularyTerm] = []
288
+ for word in dictionaryWords {
289
+ let term = CustomVocabularyTerm(
290
+ text: word,
291
+ weight: nil,
292
+ aliases: nil,
293
+ tokenIds: nil,
294
+ ctcTokenIds: nil
295
+ )
296
+ vocabTerms.append(term)
297
+ }
298
+ let customVocab = CustomVocabularyContext(terms: vocabTerms)
299
+
300
+ // Run Hybrid 110M using new API (TDT transcription + CTC keyword detection)
301
+ let rescorerConfig: HybridTextRescorer.Config? = useRescoring ? .default : nil
302
+ let hybridResult = try await hybridManager.transcribeHybrid(
303
+ audioSamples: samples,
304
+ customVocabulary: customVocab,
305
+ rescorerConfig: rescorerConfig
306
+ )
307
+
308
+ // Skip if empty transcription
309
+ if hybridResult.text.isEmpty {
310
+ print(" SKIPPED: Empty transcription")
311
+ return nil
312
+ }
313
+
314
+ let detections = hybridResult.keywordDetections
315
+ let processingTime = hybridResult.processingTime
316
+
317
+ // Use hybrid transcription as hypothesis (may be rescored if enabled)
318
+ let hypothesis = hybridResult.text
319
+
320
+ // Normalize texts
321
+ let referenceNormalized = TextNormalizer.normalize(referenceRaw)
322
+ let hypothesisNormalized = TextNormalizer.normalize(hypothesis)
323
+
324
+ // Keyword sets for precision/recall
325
+ let referenceKeywords = keywordsInText(referenceNormalized, dictionaryWords: dictionaryWords)
326
+ let predictedKeywords = keywordsInText(hypothesisNormalized, dictionaryWords: dictionaryWords)
327
+ let truePositives = referenceKeywords.intersection(predictedKeywords)
328
+ let falsePositives = predictedKeywords.subtracting(referenceKeywords)
329
+ let falseNegatives = referenceKeywords.subtracting(predictedKeywords)
330
+ let keywordPrecision = predictedKeywords.isEmpty ? 0 : Double(truePositives.count) / Double(predictedKeywords.count)
331
+ let keywordRecall = referenceKeywords.isEmpty ? 0 : Double(truePositives.count) / Double(referenceKeywords.count)
332
+ let keywordFscore =
333
+ (keywordPrecision + keywordRecall) > 0
334
+ ? 2 * keywordPrecision * keywordRecall / (keywordPrecision + keywordRecall)
335
+ : 0
336
+
337
+ let referenceWords = referenceNormalized.components(separatedBy: CharacterSet.whitespacesAndNewlines).filter {
338
+ !$0.isEmpty
339
+ }
340
+ let hypothesisWords = hypothesisNormalized.components(separatedBy: CharacterSet.whitespacesAndNewlines).filter {
341
+ !$0.isEmpty
342
+ }
343
+
344
+ // Calculate WER
345
+ let wer: Double
346
+ if referenceWords.isEmpty {
347
+ wer = hypothesisWords.isEmpty ? 0.0 : 1.0
348
+ } else {
349
+ wer = calculateWER(reference: referenceWords, hypothesis: hypothesisWords)
350
+ }
351
+
352
+ // Count dictionary detections for debugging
353
+ let minCtcScore: Float = -15.0
354
+ var detectionDetails: [[String: Any]] = []
355
+ var foundWords: Set<String> = []
356
+
357
+ // CTC detections
358
+ for detection in detections {
359
+ let inRef = referenceKeywords.contains(detection.term.text.lowercased())
360
+ let detail: [String: Any] = [
361
+ "word": detection.term.text,
362
+ "score": round(Double(detection.score) * 100) / 100,
363
+ "startTime": round(detection.startTime * 100) / 100,
364
+ "endTime": round(detection.endTime * 100) / 100,
365
+ "source": "ctc",
366
+ "inReference": inRef,
367
+ ]
368
+ detectionDetails.append(detail)
369
+
370
+ if detection.score >= minCtcScore {
371
+ foundWords.insert(detection.term.text.lowercased())
372
+ }
373
+ }
374
+
375
+ // Fallback: check hypothesis for dictionary words not found by CTC
376
+ let hypothesisLower = hypothesis.lowercased()
377
+ for word in dictionaryWords {
378
+ let wordLower = word.lowercased()
379
+ if !foundWords.contains(wordLower) {
380
+ let pattern = "\\b\(NSRegularExpression.escapedPattern(for: wordLower))\\b"
381
+ if let regex = try? NSRegularExpression(pattern: pattern, options: []),
382
+ regex.firstMatch(
383
+ in: hypothesisLower, options: [],
384
+ range: NSRange(hypothesisLower.startIndex..., in: hypothesisLower)) != nil
385
+ {
386
+ foundWords.insert(wordLower)
387
+ let inRef = referenceKeywords.contains(wordLower)
388
+ let detail: [String: Any] = [
389
+ "word": word,
390
+ "score": 0.0,
391
+ "startTime": 0.0,
392
+ "endTime": 0.0,
393
+ "source": "hypothesis",
394
+ "inReference": inRef,
395
+ ]
396
+ detectionDetails.append(detail)
397
+ }
398
+ }
399
+ }
400
+
401
+ let result: [String: Any] = [
402
+ "fileId": fileId,
403
+ "reference": referenceNormalized,
404
+ "hypothesis": hypothesisNormalized,
405
+ "wer": round(wer * 10000) / 100,
406
+ "dictFound": predictedKeywords.count,
407
+ "dictTotal": referenceKeywords.count,
408
+ "keywordPredicted": predictedKeywords.count,
409
+ "keywordReference": referenceKeywords.count,
410
+ "keywordTruePositives": truePositives.count,
411
+ "keywordFalsePositives": falsePositives.count,
412
+ "keywordFalseNegatives": falseNegatives.count,
413
+ "keywordPrecision": round(keywordPrecision * 1000) / 1000,
414
+ "keywordRecall": round(keywordRecall * 1000) / 1000,
415
+ "keywordFscore": round(keywordFscore * 1000) / 1000,
416
+ "audioLength": round(audioLength * 100) / 100,
417
+ "processingTime": round(processingTime * 1000) / 1000,
418
+ "ctcDetections": detectionDetails,
419
+ ]
420
+ return result
421
+ }
422
+
423
+ private static func calculateWER(reference: [String], hypothesis: [String]) -> Double {
424
+ if reference.isEmpty {
425
+ return hypothesis.isEmpty ? 0.0 : 1.0
426
+ }
427
+
428
+ let m = reference.count
429
+ let n = hypothesis.count
430
+ var dp = Array(repeating: Array(repeating: 0, count: n + 1), count: m + 1)
431
+
432
+ for i in 0...m { dp[i][0] = i }
433
+ for j in 0...n { dp[0][j] = j }
434
+
435
+ for i in 1...m {
436
+ for j in 1...n {
437
+ if reference[i - 1] == hypothesis[j - 1] {
438
+ dp[i][j] = dp[i - 1][j - 1]
439
+ } else {
440
+ dp[i][j] = min(dp[i - 1][j - 1], min(dp[i - 1][j], dp[i][j - 1])) + 1
441
+ }
442
+ }
443
+ }
444
+
445
+ return Double(dp[m][n]) / Double(m)
446
+ }
447
+
448
+ private static func printUsage() {
449
+ print(
450
+ """
451
+ Hybrid 110M Earnings Benchmark (Single Encoder)
452
+
453
+ Usage: fluidaudio hybrid-earnings-benchmark [options]
454
+
455
+ This benchmark uses ONLY the Hybrid 110M model:
456
+ - Single encoder provides CTC log-probs
457
+ - CTC greedy decode for transcription
458
+ - CTC keyword spotting from same encoder output
459
+
460
+ Options:
461
+ --max-files <n> Maximum number of files to process
462
+ --output, -o <path> Output JSON file (default: hybrid_earnings_benchmark.json)
463
+ --keyword-mode <mode> Keyword mode: chunk or file (default: chunk)
464
+
465
+ Compare with:
466
+ fluidaudio ctc-earnings-benchmark (Canary-CTC + TDT 0.6B, two encoders)
467
+ """)
468
+ }
469
+
470
+ private static func parseKeywordMode(_ value: String) -> KeywordMode? {
471
+ switch value.lowercased() {
472
+ case "chunk", "chunk-keywords":
473
+ return .chunk
474
+ case "file", "file-keywords":
475
+ return .file
476
+ default:
477
+ return nil
478
+ }
479
+ }
480
+
481
+ private static func parentId(from fileId: String) -> String {
482
+ guard let range = fileId.range(of: "_chunk") else {
483
+ return fileId
484
+ }
485
+ return String(fileId[..<range.lowerBound])
486
+ }
487
+
488
+ private static func buildKeywordIndex(dataDir: URL, keywordMode: KeywordMode) throws -> [String: [String]] {
489
+ guard keywordMode == .file else {
490
+ return [:]
491
+ }
492
+
493
+ var index: [String: Set<String>] = [:]
494
+ let suffix = ".dictionary.txt"
495
+ let fileManager = FileManager.default
496
+ let contents = try fileManager.contentsOfDirectory(at: dataDir, includingPropertiesForKeys: nil)
497
+
498
+ for url in contents {
499
+ let name = url.lastPathComponent
500
+ guard name.hasSuffix(suffix) else { continue }
501
+ let fileId = String(name.dropLast(suffix.count))
502
+ let parent = parentId(from: fileId)
503
+ let words = try loadDictionaryWords(from: url)
504
+ var set = index[parent] ?? Set<String>()
505
+ set.formUnion(words)
506
+ index[parent] = set
507
+ }
508
+
509
+ return index.mapValues { Array($0).sorted() }
510
+ }
511
+
512
+ private static func loadDictionaryWords(
513
+ fileId: String,
514
+ dictionaryFile: URL,
515
+ keywordMode: KeywordMode,
516
+ keywordIndex: [String: [String]]
517
+ ) throws -> [String] {
518
+ switch keywordMode {
519
+ case .chunk:
520
+ return try loadDictionaryWords(from: dictionaryFile)
521
+ case .file:
522
+ let parent = parentId(from: fileId)
523
+ if let words = keywordIndex[parent] {
524
+ return words
525
+ }
526
+ return try loadDictionaryWords(from: dictionaryFile)
527
+ }
528
+ }
529
+
530
+ private static func loadDictionaryWords(from url: URL) throws -> [String] {
531
+ let dictionaryContent = try String(contentsOf: url, encoding: .utf8)
532
+ return dictionaryContent
533
+ .components(separatedBy: .newlines)
534
+ .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
535
+ .filter { !$0.isEmpty }
536
+ }
537
+
538
+ private static func keywordsInText(_ text: String, dictionaryWords: [String]) -> Set<String> {
539
+ let textLower = text.lowercased()
540
+ var result: Set<String> = []
541
+
542
+ for word in dictionaryWords {
543
+ let wordLower = word.lowercased()
544
+ let pattern = "\\b\(NSRegularExpression.escapedPattern(for: wordLower))\\b"
545
+ guard let regex = try? NSRegularExpression(pattern: pattern, options: []) else { continue }
546
+ let range = NSRange(textLower.startIndex..., in: textLower)
547
+ if regex.firstMatch(in: textLower, options: [], range: range) != nil {
548
+ result.insert(wordLower)
549
+ }
550
+ }
551
+ return result
552
+ }
553
+ }
554
+ #endif
convert/.DS_Store ADDED
Binary file (10.2 kB). View file
 
convert/parakeet-tdt-ctc-110m/convert_tdt_decoder.py ADDED
@@ -0,0 +1,323 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Convert Parakeet TDT-CTC 110M decoder components to CoreML.
4
+
5
+ This script exports the TDT decoder (prediction network) and joint network
6
+ with the SAME format as the working 0.6B model:
7
+ - JointDecision outputs token_id, token_prob, duration (argmax done inside)
8
+ - Uses shape [1, dim, 1] for encoder/decoder steps
9
+ - Matches the interface expected by TdtDecoderV3
10
+ """
11
+
12
+ import argparse
13
+ import os
14
+ import torch
15
+ import torch.nn.functional as F
16
+ import coremltools as ct
17
+ import numpy as np
18
+ from pathlib import Path
19
+
20
+ # NeMo imports
21
+ import nemo.collections.asr as nemo_asr
22
+
23
+
24
+ def get_model_config(model):
25
+ """Extract model configuration."""
26
+ encoder_dim = None
27
+ pred_hidden = 640 # Default for parakeet models
28
+ num_layers = 1
29
+ vocab_size = 1024
30
+ num_durations = 5
31
+
32
+ # Get encoder dimension
33
+ if hasattr(model, 'encoder'):
34
+ encoder = model.encoder
35
+ if hasattr(encoder, 'd_model'):
36
+ encoder_dim = encoder.d_model
37
+ elif hasattr(encoder, '_feat_out'):
38
+ encoder_dim = encoder._feat_out
39
+
40
+ # Get decoder config
41
+ if hasattr(model, 'decoder'):
42
+ decoder = model.decoder
43
+ if hasattr(decoder, 'pred_hidden'):
44
+ pred_hidden = decoder.pred_hidden
45
+ if hasattr(decoder, 'pred_rnn_layers'):
46
+ num_layers = decoder.pred_rnn_layers
47
+
48
+ # Get joint config
49
+ if hasattr(model, 'joint'):
50
+ joint = model.joint
51
+ if hasattr(joint, 'num_extra_outputs'):
52
+ num_durations = joint.num_extra_outputs
53
+ if hasattr(joint, 'num_classes'):
54
+ vocab_size = joint.num_classes - num_durations
55
+
56
+ return {
57
+ 'encoder_dim': encoder_dim,
58
+ 'pred_hidden': pred_hidden,
59
+ 'num_layers': num_layers,
60
+ 'vocab_size': vocab_size,
61
+ 'num_durations': num_durations,
62
+ }
63
+
64
+
65
+ class DecoderWrapper(torch.nn.Module):
66
+ """
67
+ Wrapper for the RNNT/TDT decoder (prediction network).
68
+
69
+ Matches 0.6B format:
70
+ - Input: targets[1,1], target_lengths[1], h_in[num_layers,1,pred_hidden], c_in[...]
71
+ - Output: decoder_output[1,pred_hidden,2], h_out[...], c_out[...]
72
+ """
73
+
74
+ def __init__(self, decoder, pred_hidden):
75
+ super().__init__()
76
+ self.decoder = decoder
77
+ self.pred_hidden = pred_hidden
78
+
79
+ def forward(self, targets, target_lengths, h_in, c_in):
80
+ """
81
+ Args:
82
+ targets: [1, 1] - previous token ID
83
+ target_lengths: [1] - always 1
84
+ h_in: [num_layers, 1, pred_hidden]
85
+ c_in: [num_layers, 1, pred_hidden]
86
+ Returns:
87
+ decoder_output: [1, pred_hidden, 2] - prediction network output (transposed)
88
+ h_out: [num_layers, 1, pred_hidden]
89
+ c_out: [num_layers, 1, pred_hidden]
90
+ """
91
+ state = (h_in, c_in)
92
+ # pred_output shape: [batch, time, pred_hidden] = [1, 1, pred_hidden]
93
+ pred_output, new_state = self.decoder.predict(targets, state=state, add_sos=False)
94
+ h_out, c_out = new_state
95
+
96
+ # Transpose to [batch, pred_hidden, time] and concat two time steps
97
+ # (0.6B outputs [1, 640, 2] - we match this by duplicating)
98
+ pred_transposed = pred_output.transpose(1, 2) # [1, pred_hidden, 1]
99
+ decoder_output = torch.cat([pred_transposed, pred_transposed], dim=2) # [1, pred_hidden, 2]
100
+
101
+ return decoder_output, h_out, c_out
102
+
103
+
104
+ class JointWrapper(torch.nn.Module):
105
+ """
106
+ Wrapper for the TDT joint network with internal argmax.
107
+
108
+ Matches 0.6B format:
109
+ - Input: encoder_step[1,encoder_dim,1], decoder_step[1,pred_hidden,1]
110
+ - Output: token_id[1,1,1], token_prob[1,1,1], duration[1,1,1]
111
+ """
112
+
113
+ def __init__(self, joint, vocab_size, num_durations=5):
114
+ super().__init__()
115
+ self.joint = joint
116
+ self.vocab_size = vocab_size
117
+ self.num_durations = num_durations
118
+
119
+ def forward(self, encoder_step, decoder_step):
120
+ """
121
+ Args:
122
+ encoder_step: [1, encoder_dim, 1]
123
+ decoder_step: [1, pred_hidden, 1]
124
+ Returns:
125
+ token_id: [1, 1, 1] - argmax token ID
126
+ token_prob: [1, 1, 1] - probability of selected token
127
+ duration: [1, 1, 1] - argmax duration bin
128
+ """
129
+ # Transpose to [batch, 1, dim] for joint network
130
+ enc = encoder_step.transpose(1, 2) # [1, 1, encoder_dim]
131
+ dec = decoder_step.transpose(1, 2) # [1, 1, pred_hidden]
132
+
133
+ # Run joint network
134
+ # Joint output: [1, 1, 1, vocab_size + 1 (blank) + num_durations]
135
+ joint_out = self.joint.joint(enc, dec)
136
+
137
+ # Debug: print shape on first call
138
+ if not hasattr(self, '_debug_printed'):
139
+ self._debug_printed = True
140
+ print(f" Joint output shape: {joint_out.shape}")
141
+ print(f" Expected: vocab={self.vocab_size} + blank=1 + durations={self.num_durations} = {self.vocab_size + 1 + self.num_durations}")
142
+
143
+ # Split: token logits include vocab + blank, durations are separate
144
+ # vocab_size = 1024 tokens (0-1023), blank = index 1024, durations = indices 1025+
145
+ num_tokens = self.vocab_size + 1 # Include blank at vocab_size
146
+ logits = joint_out[..., :num_tokens] # [1, 1, 1, vocab_size + 1]
147
+ duration_logits = joint_out[..., num_tokens:] # [1, 1, 1, num_durations]
148
+
149
+ # Apply softmax and get probabilities
150
+ probs = F.softmax(logits, dim=-1)
151
+
152
+ # Argmax for token
153
+ token_id = torch.argmax(logits, dim=-1, keepdim=True) # [1, 1, 1, 1]
154
+ token_id = token_id.squeeze(-1) # [1, 1, 1]
155
+
156
+ # Get probability of selected token
157
+ token_prob = torch.gather(probs, -1, token_id.unsqueeze(-1)) # [1, 1, 1, 1]
158
+ token_prob = token_prob.squeeze(-1) # [1, 1, 1]
159
+
160
+ # Argmax for duration
161
+ duration = torch.argmax(duration_logits, dim=-1, keepdim=False) # [1, 1, 1]
162
+
163
+ return token_id.int(), token_prob, duration.int()
164
+
165
+
166
+ def convert_decoder(model, config, output_dir: Path):
167
+ """Convert decoder to CoreML."""
168
+ print(f"Converting Decoder...")
169
+ print(f" pred_hidden={config['pred_hidden']}, num_layers={config['num_layers']}")
170
+
171
+ wrapper = DecoderWrapper(model.decoder, config['pred_hidden'])
172
+ wrapper.eval()
173
+
174
+ # Create example inputs
175
+ targets = torch.zeros(1, 1, dtype=torch.long)
176
+ target_lengths = torch.ones(1, dtype=torch.long)
177
+ h_in = torch.zeros(config['num_layers'], 1, config['pred_hidden'])
178
+ c_in = torch.zeros(config['num_layers'], 1, config['pred_hidden'])
179
+
180
+ # Trace the model
181
+ with torch.no_grad():
182
+ traced = torch.jit.trace(wrapper, (targets, target_lengths, h_in, c_in))
183
+
184
+ # Convert to CoreML
185
+ mlmodel = ct.convert(
186
+ traced,
187
+ inputs=[
188
+ ct.TensorType(name="targets", shape=(1, 1), dtype=np.int32),
189
+ ct.TensorType(name="target_lengths", shape=(1,), dtype=np.int32),
190
+ ct.TensorType(name="h_in", shape=(config['num_layers'], 1, config['pred_hidden']), dtype=np.float32),
191
+ ct.TensorType(name="c_in", shape=(config['num_layers'], 1, config['pred_hidden']), dtype=np.float32),
192
+ ],
193
+ outputs=[
194
+ ct.TensorType(name="decoder_output"),
195
+ ct.TensorType(name="h_out"),
196
+ ct.TensorType(name="c_out"),
197
+ ],
198
+ minimum_deployment_target=ct.target.iOS17,
199
+ compute_precision=ct.precision.FLOAT16,
200
+ )
201
+
202
+ # Add metadata
203
+ mlmodel.author = "Fluid Inference"
204
+ mlmodel.short_description = "Hybrid TDT Decoder (110M)"
205
+
206
+ # Save
207
+ output_path = output_dir / "Decoder.mlpackage"
208
+ mlmodel.save(str(output_path))
209
+ print(f" Saved to {output_path}")
210
+
211
+ return mlmodel
212
+
213
+
214
+ def convert_joint(model, config, output_dir: Path):
215
+ """Convert joint network to CoreML."""
216
+ print(f"Converting JointDecision...")
217
+ print(f" encoder_dim={config['encoder_dim']}, pred_hidden={config['pred_hidden']}")
218
+ print(f" vocab_size={config['vocab_size']}, num_durations={config['num_durations']}")
219
+
220
+ wrapper = JointWrapper(
221
+ model.joint,
222
+ vocab_size=config['vocab_size'],
223
+ num_durations=config['num_durations']
224
+ )
225
+ wrapper.eval()
226
+
227
+ # Create example inputs - shape [1, dim, 1]
228
+ encoder_step = torch.randn(1, config['encoder_dim'], 1)
229
+ decoder_step = torch.randn(1, config['pred_hidden'], 1)
230
+
231
+ # Trace the model
232
+ with torch.no_grad():
233
+ traced = torch.jit.trace(wrapper, (encoder_step, decoder_step))
234
+
235
+ # Convert to CoreML
236
+ mlmodel = ct.convert(
237
+ traced,
238
+ inputs=[
239
+ ct.TensorType(name="encoder_step", shape=(1, config['encoder_dim'], 1), dtype=np.float32),
240
+ ct.TensorType(name="decoder_step", shape=(1, config['pred_hidden'], 1), dtype=np.float32),
241
+ ],
242
+ outputs=[
243
+ ct.TensorType(name="token_id"),
244
+ ct.TensorType(name="token_prob"),
245
+ ct.TensorType(name="duration"),
246
+ ],
247
+ minimum_deployment_target=ct.target.iOS17,
248
+ compute_precision=ct.precision.FLOAT16,
249
+ )
250
+
251
+ # Add metadata
252
+ mlmodel.author = "Fluid Inference"
253
+ mlmodel.short_description = "Hybrid Joint Decision (110M)"
254
+
255
+ # Save
256
+ output_path = output_dir / "JointDecision.mlpackage"
257
+ mlmodel.save(str(output_path))
258
+ print(f" Saved to {output_path}")
259
+
260
+ return mlmodel
261
+
262
+
263
+ def main():
264
+ parser = argparse.ArgumentParser(description="Convert TDT decoder to CoreML (0.6B format)")
265
+ parser.add_argument(
266
+ "--model-name",
267
+ default="nvidia/parakeet-tdt_ctc-110m",
268
+ help="NeMo model name or path"
269
+ )
270
+ parser.add_argument(
271
+ "--output-dir",
272
+ type=Path,
273
+ default=Path("./output"),
274
+ help="Output directory for CoreML models"
275
+ )
276
+ args = parser.parse_args()
277
+
278
+ # Create output directory
279
+ args.output_dir.mkdir(parents=True, exist_ok=True)
280
+
281
+ # Load model
282
+ print(f"Loading model: {args.model_name}")
283
+ model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(args.model_name)
284
+ model.eval()
285
+
286
+ # Get model configuration
287
+ config = get_model_config(model)
288
+
289
+ # Auto-detect encoder dim if not found
290
+ if config['encoder_dim'] is None:
291
+ print("Auto-detecting encoder dimension...")
292
+ dummy_audio = torch.randn(1, 16000)
293
+ dummy_length = torch.tensor([16000])
294
+ with torch.no_grad():
295
+ enc_out, enc_len = model.encoder(
296
+ audio_signal=dummy_audio,
297
+ length=dummy_length
298
+ )
299
+ config['encoder_dim'] = enc_out.shape[-1]
300
+
301
+ print(f"\nModel config:")
302
+ for k, v in config.items():
303
+ print(f" {k}: {v}")
304
+
305
+ # Convert components
306
+ print()
307
+ convert_decoder(model, config, args.output_dir)
308
+ convert_joint(model, config, args.output_dir)
309
+
310
+ print("\nConversion complete!")
311
+ print(f"Models saved to: {args.output_dir}")
312
+ print("\nNext steps:")
313
+ print("1. Compile to .mlmodelc:")
314
+ print(f" cd {args.output_dir}")
315
+ print(" xcrun coremlcompiler compile Decoder.mlpackage .")
316
+ print(" xcrun coremlcompiler compile JointDecision.mlpackage .")
317
+ print("2. Copy to model cache:")
318
+ print(" cp -r Decoder.mlmodelc JointDecision.mlmodelc ~/Library/Application\\ Support/FluidAudio/Models/parakeet-ctc-110m-coreml/")
319
+ print("3. Test with: swift run fluidaudio hybrid-earnings-benchmark --max-files 1")
320
+
321
+
322
+ if __name__ == "__main__":
323
+ main()
convert/parakeet-tdt-ctc-110m/coreml/audio/yc_first_minute_16k_15s.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c79c8bc763b4efccb3e12f199ec0a59aa2edc5e9e4d21ca70fde8f36762d4147
3
+ size 480078
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc681823d92eca3dbece3a30c975afa7251eedae0e718b07ffbf1a8b4313b87e
3
+ size 243
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ebec8fc38c063de4b2159e21b1f981309fa5947c24d7e4883aca20f7c15fbb9
3
+ size 377
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/metadata.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "metadataOutputVersion" : "3.0",
4
+ "shortDescription" : "Parakeet 110M CTC decoder head",
5
+ "outputSchema" : [
6
+ {
7
+ "hasShapeFlexibility" : "0",
8
+ "isOptional" : "0",
9
+ "dataType" : "Float32",
10
+ "formattedType" : "MultiArray (Float32 1 × 188 × 1025)",
11
+ "shortDescription" : "",
12
+ "shape" : "[1, 188, 1025]",
13
+ "name" : "ctc_logits",
14
+ "type" : "MultiArray"
15
+ }
16
+ ],
17
+ "storagePrecision" : "Float16",
18
+ "modelParameters" : [
19
+
20
+ ],
21
+ "author" : "Fluid Inference",
22
+ "specificationVersion" : 8,
23
+ "mlProgramOperationTypeHistogram" : {
24
+ "Ios17.cast" : 2,
25
+ "Ios17.conv" : 1,
26
+ "Ios17.transpose" : 1,
27
+ "Ios16.softmax" : 1,
28
+ "Ios17.log" : 1
29
+ },
30
+ "computePrecision" : "Mixed (Float16, Float32, Int32)",
31
+ "isUpdatable" : "0",
32
+ "stateSchema" : [
33
+
34
+ ],
35
+ "availability" : {
36
+ "macOS" : "14.0",
37
+ "tvOS" : "17.0",
38
+ "visionOS" : "1.0",
39
+ "watchOS" : "10.0",
40
+ "iOS" : "17.0",
41
+ "macCatalyst" : "17.0"
42
+ },
43
+ "modelType" : {
44
+ "name" : "MLModelType_mlProgram"
45
+ },
46
+ "inputSchema" : [
47
+ {
48
+ "hasShapeFlexibility" : "0",
49
+ "isOptional" : "0",
50
+ "dataType" : "Float32",
51
+ "formattedType" : "MultiArray (Float32 1 × 512 × 188)",
52
+ "shortDescription" : "",
53
+ "shape" : "[1, 512, 188]",
54
+ "name" : "encoder_output",
55
+ "type" : "MultiArray"
56
+ }
57
+ ],
58
+ "userDefinedMetadata" : {
59
+ "com.github.apple.coremltools.source_dialect" : "TorchScript",
60
+ "com.github.apple.coremltools.source" : "torch==2.9.0",
61
+ "com.github.apple.coremltools.version" : "8.3.0"
62
+ },
63
+ "generatedClassName" : "parakeet_ctc_head",
64
+ "method" : "predict"
65
+ }
66
+ ]
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/model.mil ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
3
+ {
4
+ func main<ios17>(tensor<fp32, [1, 512, 188]> encoder_output) {
5
+ tensor<int32, []> var_4 = const()[name = tensor<string, []>("op_4"), val = tensor<int32, []>(-1)];
6
+ tensor<string, []> var_18_pad_type_0 = const()[name = tensor<string, []>("op_18_pad_type_0"), val = tensor<string, []>("valid")];
7
+ tensor<int32, [1]> var_18_strides_0 = const()[name = tensor<string, []>("op_18_strides_0"), val = tensor<int32, [1]>([1])];
8
+ tensor<int32, [2]> var_18_pad_0 = const()[name = tensor<string, []>("op_18_pad_0"), val = tensor<int32, [2]>([0, 0])];
9
+ tensor<int32, [1]> var_18_dilations_0 = const()[name = tensor<string, []>("op_18_dilations_0"), val = tensor<int32, [1]>([1])];
10
+ tensor<int32, []> var_18_groups_0 = const()[name = tensor<string, []>("op_18_groups_0"), val = tensor<int32, []>(1)];
11
+ tensor<string, []> encoder_output_to_fp16_dtype_0 = const()[name = tensor<string, []>("encoder_output_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
12
+ tensor<fp16, [1025, 512, 1]> module_decoder_layers_0_weight_to_fp16 = const()[name = tensor<string, []>("module_decoder_layers_0_weight_to_fp16"), val = tensor<fp16, [1025, 512, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
13
+ tensor<fp16, [1025]> module_decoder_layers_0_bias_to_fp16 = const()[name = tensor<string, []>("module_decoder_layers_0_bias_to_fp16"), val = tensor<fp16, [1025]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1049728)))];
14
+ tensor<fp16, [1, 512, 188]> encoder_output_to_fp16 = cast(dtype = encoder_output_to_fp16_dtype_0, x = encoder_output)[name = tensor<string, []>("cast_1")];
15
+ tensor<fp16, [1, 1025, 188]> var_18_cast_fp16 = conv(bias = module_decoder_layers_0_bias_to_fp16, dilations = var_18_dilations_0, groups = var_18_groups_0, pad = var_18_pad_0, pad_type = var_18_pad_type_0, strides = var_18_strides_0, weight = module_decoder_layers_0_weight_to_fp16, x = encoder_output_to_fp16)[name = tensor<string, []>("op_18_cast_fp16")];
16
+ tensor<int32, [3]> input_perm_0 = const()[name = tensor<string, []>("input_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
17
+ tensor<fp16, [1, 188, 1025]> input_cast_fp16 = transpose(perm = input_perm_0, x = var_18_cast_fp16)[name = tensor<string, []>("transpose_0")];
18
+ tensor<fp16, [1, 188, 1025]> out_objects_softmax_cast_fp16 = softmax(axis = var_4, x = input_cast_fp16)[name = tensor<string, []>("out_objects_softmax_cast_fp16")];
19
+ tensor<fp32, []> out_objects_epsilon_0 = const()[name = tensor<string, []>("out_objects_epsilon_0"), val = tensor<fp32, []>(0x1p-149)];
20
+ tensor<fp16, [1, 188, 1025]> out_objects_cast_fp16 = log(epsilon = out_objects_epsilon_0, x = out_objects_softmax_cast_fp16)[name = tensor<string, []>("out_objects_cast_fp16")];
21
+ tensor<string, []> out_objects_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("out_objects_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
22
+ tensor<fp32, [1, 188, 1025]> ctc_logits = cast(dtype = out_objects_cast_fp16_to_fp32_dtype_0, x = out_objects_cast_fp16)[name = tensor<string, []>("cast_0")];
23
+ } -> (ctc_logits);
24
+ }
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb9bead064427ffcb7529c0e3f378e421b4dde8e6d81447b6d1ca3352ca850e1
3
+ size 1051842
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:990455f6431342750254f66edf27bfb41be62a7ba17a18e1dd6afd4f5f56e9eb
3
+ size 243
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:29009727821ad8551ab5fe9271e93c597d92a9714f64b94aa533a9ceb6e22b93
3
+ size 498
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/metadata.json ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "metadataOutputVersion" : "3.0",
4
+ "shortDescription" : "Parakeet 110M decoder (RNNT prediction network)",
5
+ "outputSchema" : [
6
+ {
7
+ "hasShapeFlexibility" : "0",
8
+ "isOptional" : "0",
9
+ "dataType" : "Float32",
10
+ "formattedType" : "MultiArray (Float32 1 × 640 × 1)",
11
+ "shortDescription" : "",
12
+ "shape" : "[1, 640, 1]",
13
+ "name" : "decoder",
14
+ "type" : "MultiArray"
15
+ },
16
+ {
17
+ "hasShapeFlexibility" : "0",
18
+ "isOptional" : "0",
19
+ "dataType" : "Float32",
20
+ "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
21
+ "shortDescription" : "",
22
+ "shape" : "[1, 1, 640]",
23
+ "name" : "h_out",
24
+ "type" : "MultiArray"
25
+ },
26
+ {
27
+ "hasShapeFlexibility" : "0",
28
+ "isOptional" : "0",
29
+ "dataType" : "Float32",
30
+ "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
31
+ "shortDescription" : "",
32
+ "shape" : "[1, 1, 640]",
33
+ "name" : "c_out",
34
+ "type" : "MultiArray"
35
+ }
36
+ ],
37
+ "storagePrecision" : "Float16",
38
+ "modelParameters" : [
39
+
40
+ ],
41
+ "author" : "Fluid Inference",
42
+ "specificationVersion" : 8,
43
+ "mlProgramOperationTypeHistogram" : {
44
+ "Ios17.squeeze" : 2,
45
+ "Ios17.gather" : 1,
46
+ "Ios17.cast" : 6,
47
+ "Ios17.lstm" : 1,
48
+ "Ios17.transpose" : 2,
49
+ "Identity" : 1,
50
+ "Ios17.expandDims" : 2
51
+ },
52
+ "computePrecision" : "Mixed (Float16, Float32, Int16, Int32)",
53
+ "isUpdatable" : "0",
54
+ "stateSchema" : [
55
+
56
+ ],
57
+ "availability" : {
58
+ "macOS" : "14.0",
59
+ "tvOS" : "17.0",
60
+ "visionOS" : "1.0",
61
+ "watchOS" : "10.0",
62
+ "iOS" : "17.0",
63
+ "macCatalyst" : "17.0"
64
+ },
65
+ "modelType" : {
66
+ "name" : "MLModelType_mlProgram"
67
+ },
68
+ "inputSchema" : [
69
+ {
70
+ "hasShapeFlexibility" : "0",
71
+ "isOptional" : "0",
72
+ "dataType" : "Int32",
73
+ "formattedType" : "MultiArray (Int32 1 × 1)",
74
+ "shortDescription" : "",
75
+ "shape" : "[1, 1]",
76
+ "name" : "targets",
77
+ "type" : "MultiArray"
78
+ },
79
+ {
80
+ "hasShapeFlexibility" : "0",
81
+ "isOptional" : "0",
82
+ "dataType" : "Int32",
83
+ "formattedType" : "MultiArray (Int32 1)",
84
+ "shortDescription" : "",
85
+ "shape" : "[1]",
86
+ "name" : "target_length",
87
+ "type" : "MultiArray"
88
+ },
89
+ {
90
+ "hasShapeFlexibility" : "0",
91
+ "isOptional" : "0",
92
+ "dataType" : "Float32",
93
+ "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
94
+ "shortDescription" : "",
95
+ "shape" : "[1, 1, 640]",
96
+ "name" : "h_in",
97
+ "type" : "MultiArray"
98
+ },
99
+ {
100
+ "hasShapeFlexibility" : "0",
101
+ "isOptional" : "0",
102
+ "dataType" : "Float32",
103
+ "formattedType" : "MultiArray (Float32 1 × 1 × 640)",
104
+ "shortDescription" : "",
105
+ "shape" : "[1, 1, 640]",
106
+ "name" : "c_in",
107
+ "type" : "MultiArray"
108
+ }
109
+ ],
110
+ "userDefinedMetadata" : {
111
+ "com.github.apple.coremltools.version" : "8.3.0",
112
+ "com.github.apple.coremltools.source_dialect" : "TorchScript",
113
+ "com.github.apple.coremltools.source" : "torch==2.9.0"
114
+ },
115
+ "generatedClassName" : "parakeet_decoder",
116
+ "method" : "predict"
117
+ }
118
+ ]
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/model.mil ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
3
+ {
4
+ func main<ios17>(tensor<fp32, [1, 1, 640]> c_in, tensor<fp32, [1, 1, 640]> h_in, tensor<int32, [1]> target_length, tensor<int32, [1, 1]> targets) {
5
+ tensor<int32, []> y_axis_0 = const()[name = tensor<string, []>("y_axis_0"), val = tensor<int32, []>(0)];
6
+ tensor<int32, []> y_batch_dims_0 = const()[name = tensor<string, []>("y_batch_dims_0"), val = tensor<int32, []>(0)];
7
+ tensor<bool, []> y_validate_indices_0 = const()[name = tensor<string, []>("y_validate_indices_0"), val = tensor<bool, []>(false)];
8
+ tensor<fp16, [1025, 640]> module_prediction_embed_weight_to_fp16 = const()[name = tensor<string, []>("module_prediction_embed_weight_to_fp16"), val = tensor<fp16, [1025, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
9
+ tensor<string, []> targets_to_int16_dtype_0 = const()[name = tensor<string, []>("targets_to_int16_dtype_0"), val = tensor<string, []>("int16")];
10
+ tensor<int16, [1, 1]> targets_to_int16 = cast(dtype = targets_to_int16_dtype_0, x = targets)[name = tensor<string, []>("cast_8")];
11
+ tensor<fp16, [1, 1, 640]> y_cast_fp16_cast_uint16 = gather(axis = y_axis_0, batch_dims = y_batch_dims_0, indices = targets_to_int16, validate_indices = y_validate_indices_0, x = module_prediction_embed_weight_to_fp16)[name = tensor<string, []>("y_cast_fp16_cast_uint16")];
12
+ tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
13
+ tensor<int32, [1]> input_lstm_h0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_h0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
14
+ tensor<string, []> h_in_to_fp16_dtype_0 = const()[name = tensor<string, []>("h_in_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
15
+ tensor<fp16, [1, 1, 640]> h_in_to_fp16 = cast(dtype = h_in_to_fp16_dtype_0, x = h_in)[name = tensor<string, []>("cast_7")];
16
+ tensor<fp16, [1, 640]> input_lstm_h0_squeeze_cast_fp16 = squeeze(axes = input_lstm_h0_squeeze_axes_0, x = h_in_to_fp16)[name = tensor<string, []>("input_lstm_h0_squeeze_cast_fp16")];
17
+ tensor<int32, [1]> input_lstm_c0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_c0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
18
+ tensor<string, []> c_in_to_fp16_dtype_0 = const()[name = tensor<string, []>("c_in_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
19
+ tensor<fp16, [1, 1, 640]> c_in_to_fp16 = cast(dtype = c_in_to_fp16_dtype_0, x = c_in)[name = tensor<string, []>("cast_6")];
20
+ tensor<fp16, [1, 640]> input_lstm_c0_squeeze_cast_fp16 = squeeze(axes = input_lstm_c0_squeeze_axes_0, x = c_in_to_fp16)[name = tensor<string, []>("input_lstm_c0_squeeze_cast_fp16")];
21
+ tensor<string, []> input_direction_0 = const()[name = tensor<string, []>("input_direction_0"), val = tensor<string, []>("forward")];
22
+ tensor<bool, []> input_output_sequence_0 = const()[name = tensor<string, []>("input_output_sequence_0"), val = tensor<bool, []>(true)];
23
+ tensor<string, []> input_recurrent_activation_0 = const()[name = tensor<string, []>("input_recurrent_activation_0"), val = tensor<string, []>("sigmoid")];
24
+ tensor<string, []> input_cell_activation_0 = const()[name = tensor<string, []>("input_cell_activation_0"), val = tensor<string, []>("tanh")];
25
+ tensor<string, []> input_activation_0 = const()[name = tensor<string, []>("input_activation_0"), val = tensor<string, []>("tanh")];
26
+ tensor<fp16, [2560, 640]> concat_1_to_fp16 = const()[name = tensor<string, []>("concat_1_to_fp16"), val = tensor<fp16, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1312128)))];
27
+ tensor<fp16, [2560, 640]> concat_2_to_fp16 = const()[name = tensor<string, []>("concat_2_to_fp16"), val = tensor<fp16, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4588992)))];
28
+ tensor<fp16, [2560]> concat_0_to_fp16 = const()[name = tensor<string, []>("concat_0_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7865856)))];
29
+ tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = y_cast_fp16_cast_uint16)[name = tensor<string, []>("transpose_2")];
30
+ tensor<fp16, [1, 1, 640]> input_cast_fp16_0, tensor<fp16, [1, 640]> input_cast_fp16_1, tensor<fp16, [1, 640]> input_cast_fp16_2 = lstm(activation = input_activation_0, bias = concat_0_to_fp16, cell_activation = input_cell_activation_0, direction = input_direction_0, initial_c = input_lstm_c0_squeeze_cast_fp16, initial_h = input_lstm_h0_squeeze_cast_fp16, output_sequence = input_output_sequence_0, recurrent_activation = input_recurrent_activation_0, weight_hh = concat_2_to_fp16, weight_ih = concat_1_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
31
+ tensor<int32, [1]> obj_3_axes_0 = const()[name = tensor<string, []>("obj_3_axes_0"), val = tensor<int32, [1]>([0])];
32
+ tensor<fp16, [1, 1, 640]> obj_3_cast_fp16 = expand_dims(axes = obj_3_axes_0, x = input_cast_fp16_1)[name = tensor<string, []>("obj_3_cast_fp16")];
33
+ tensor<string, []> obj_3_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("obj_3_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
34
+ tensor<int32, [1]> obj_axes_0 = const()[name = tensor<string, []>("obj_axes_0"), val = tensor<int32, [1]>([0])];
35
+ tensor<fp16, [1, 1, 640]> obj_cast_fp16 = expand_dims(axes = obj_axes_0, x = input_cast_fp16_2)[name = tensor<string, []>("obj_cast_fp16")];
36
+ tensor<string, []> obj_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("obj_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
37
+ tensor<int32, [3]> transpose_0_perm_0 = const()[name = tensor<string, []>("transpose_0_perm_0"), val = tensor<int32, [3]>([1, 2, 0])];
38
+ tensor<string, []> transpose_0_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("transpose_0_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
39
+ tensor<fp16, [1, 640, 1]> transpose_0_cast_fp16 = transpose(perm = transpose_0_perm_0, x = input_cast_fp16_0)[name = tensor<string, []>("transpose_1")];
40
+ tensor<fp32, [1, 640, 1]> decoder = cast(dtype = transpose_0_cast_fp16_to_fp32_dtype_0, x = transpose_0_cast_fp16)[name = tensor<string, []>("cast_3")];
41
+ tensor<fp32, [1, 1, 640]> c_out = cast(dtype = obj_cast_fp16_to_fp32_dtype_0, x = obj_cast_fp16)[name = tensor<string, []>("cast_4")];
42
+ tensor<fp32, [1, 1, 640]> h_out = cast(dtype = obj_3_cast_fp16_to_fp32_dtype_0, x = obj_3_cast_fp16)[name = tensor<string, []>("cast_5")];
43
+ tensor<int32, [1]> target_length_tmp = identity(x = target_length)[name = tensor<string, []>("target_length_tmp")];
44
+ } -> (decoder, h_out, c_out);
45
+ }
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd90b58597ee2c172c672dffe13b1110898ba07394c1a15efc96cc8c6b18411b
3
+ size 7871040
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7ae65e2af616df46066b7efca2d7c19941666ac0685f4ed005666890a052b0d
3
+ size 243
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0713c2d6ac5f8f6fb9582be250351ebd8efc925f71f4261191165f1406f2ee5d
3
+ size 437
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/metadata.json ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "metadataOutputVersion" : "3.0",
4
+ "shortDescription" : "Parakeet 110M encoder (15 s window)",
5
+ "outputSchema" : [
6
+ {
7
+ "hasShapeFlexibility" : "0",
8
+ "isOptional" : "0",
9
+ "dataType" : "Float32",
10
+ "formattedType" : "MultiArray (Float32 1 × 512 × 188)",
11
+ "shortDescription" : "",
12
+ "shape" : "[1, 512, 188]",
13
+ "name" : "encoder_output",
14
+ "type" : "MultiArray"
15
+ },
16
+ {
17
+ "hasShapeFlexibility" : "0",
18
+ "isOptional" : "0",
19
+ "dataType" : "Int32",
20
+ "formattedType" : "MultiArray (Int32 1)",
21
+ "shortDescription" : "",
22
+ "shape" : "[1]",
23
+ "name" : "encoder_length",
24
+ "type" : "MultiArray"
25
+ }
26
+ ],
27
+ "storagePrecision" : "Float16",
28
+ "modelParameters" : [
29
+
30
+ ],
31
+ "author" : "Fluid Inference",
32
+ "specificationVersion" : 8,
33
+ "mlProgramOperationTypeHistogram" : {
34
+ "Ios17.logicalAnd" : 2,
35
+ "Ios17.reshape" : 103,
36
+ "Ios16.softmax" : 17,
37
+ "Ios17.matmul" : 51,
38
+ "Ios17.transpose" : 123,
39
+ "Split" : 17,
40
+ "Ios17.expandDims" : 17,
41
+ "Select" : 51,
42
+ "Ios17.add" : 128,
43
+ "Tile" : 8,
44
+ "Ios17.sliceByIndex" : 34,
45
+ "Ios16.sigmoid" : 17,
46
+ "Pad" : 34,
47
+ "Ios17.logicalNot" : 2,
48
+ "Ios17.layerNorm" : 85,
49
+ "Ios16.silu" : 51,
50
+ "Ios17.less" : 5,
51
+ "Ios17.sub" : 3,
52
+ "Ios17.conv" : 56,
53
+ "Ios16.relu" : 3,
54
+ "Ios17.linear" : 137,
55
+ "Ios17.cast" : 11,
56
+ "Ios17.floorDiv" : 3,
57
+ "Ios17.mul" : 77
58
+ },
59
+ "computePrecision" : "Mixed (Float16, Float32, Int32)",
60
+ "isUpdatable" : "0",
61
+ "stateSchema" : [
62
+
63
+ ],
64
+ "availability" : {
65
+ "macOS" : "14.0",
66
+ "tvOS" : "17.0",
67
+ "visionOS" : "1.0",
68
+ "watchOS" : "10.0",
69
+ "iOS" : "17.0",
70
+ "macCatalyst" : "17.0"
71
+ },
72
+ "modelType" : {
73
+ "name" : "MLModelType_mlProgram"
74
+ },
75
+ "inputSchema" : [
76
+ {
77
+ "hasShapeFlexibility" : "0",
78
+ "isOptional" : "0",
79
+ "dataType" : "Float32",
80
+ "formattedType" : "MultiArray (Float32 1 × 80 × 1501)",
81
+ "shortDescription" : "",
82
+ "shape" : "[1, 80, 1501]",
83
+ "name" : "mel_features",
84
+ "type" : "MultiArray"
85
+ },
86
+ {
87
+ "hasShapeFlexibility" : "0",
88
+ "isOptional" : "0",
89
+ "dataType" : "Int32",
90
+ "formattedType" : "MultiArray (Int32 1)",
91
+ "shortDescription" : "",
92
+ "shape" : "[1]",
93
+ "name" : "mel_length",
94
+ "type" : "MultiArray"
95
+ }
96
+ ],
97
+ "userDefinedMetadata" : {
98
+ "com.github.apple.coremltools.source_dialect" : "TorchScript",
99
+ "com.github.apple.coremltools.source" : "torch==2.9.0",
100
+ "com.github.apple.coremltools.version" : "8.3.0"
101
+ },
102
+ "generatedClassName" : "parakeet_encoder",
103
+ "method" : "predict"
104
+ }
105
+ ]
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/model.mil ADDED
The diff for this file is too large to render. See raw diff
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cecf7994b2758397d992802a4f6e5d656e3a1aeb7bbedc2aa430b1316d62474c
3
+ size 215143424
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:983ba26dd9276b8d2d4f75f3475aefb1817c542df87dbd0fdac95bd63647494f
3
+ size 243
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0800e3bdf4ecb1bd46fd27e1826d33125cd574f9ae1e15dd9ff70ea42944ca2d
3
+ size 476
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/metadata.json ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "metadataOutputVersion" : "3.0",
4
+ "shortDescription" : "Parakeet 110M joint + decision head (split, softmax, argmax)",
5
+ "outputSchema" : [
6
+ {
7
+ "hasShapeFlexibility" : "0",
8
+ "isOptional" : "0",
9
+ "dataType" : "Int32",
10
+ "formattedType" : "MultiArray (Int32 1 × 188 × 1)",
11
+ "shortDescription" : "",
12
+ "shape" : "[1, 188, 1]",
13
+ "name" : "token_id",
14
+ "type" : "MultiArray"
15
+ },
16
+ {
17
+ "hasShapeFlexibility" : "0",
18
+ "isOptional" : "0",
19
+ "dataType" : "Float32",
20
+ "formattedType" : "MultiArray (Float32 1 × 188 × 1)",
21
+ "shortDescription" : "",
22
+ "shape" : "[1, 188, 1]",
23
+ "name" : "token_prob",
24
+ "type" : "MultiArray"
25
+ },
26
+ {
27
+ "hasShapeFlexibility" : "0",
28
+ "isOptional" : "0",
29
+ "dataType" : "Int32",
30
+ "formattedType" : "MultiArray (Int32 1 × 188 × 1)",
31
+ "shortDescription" : "",
32
+ "shape" : "[1, 188, 1]",
33
+ "name" : "duration",
34
+ "type" : "MultiArray"
35
+ }
36
+ ],
37
+ "storagePrecision" : "Float16",
38
+ "modelParameters" : [
39
+
40
+ ],
41
+ "author" : "Fluid Inference",
42
+ "specificationVersion" : 8,
43
+ "mlProgramOperationTypeHistogram" : {
44
+ "Ios17.reduceArgmax" : 2,
45
+ "Ios17.squeeze" : 1,
46
+ "Ios17.cast" : 4,
47
+ "Ios17.linear" : 3,
48
+ "Ios17.transpose" : 2,
49
+ "Ios17.sliceByIndex" : 2,
50
+ "Ios17.add" : 1,
51
+ "Ios16.relu" : 1,
52
+ "Ios16.softmax" : 1,
53
+ "Ios17.gatherAlongAxis" : 1,
54
+ "Ios17.expandDims" : 3
55
+ },
56
+ "computePrecision" : "Mixed (Float16, Float32, Int16, Int32)",
57
+ "isUpdatable" : "0",
58
+ "stateSchema" : [
59
+
60
+ ],
61
+ "availability" : {
62
+ "macOS" : "14.0",
63
+ "tvOS" : "17.0",
64
+ "visionOS" : "1.0",
65
+ "watchOS" : "10.0",
66
+ "iOS" : "17.0",
67
+ "macCatalyst" : "17.0"
68
+ },
69
+ "modelType" : {
70
+ "name" : "MLModelType_mlProgram"
71
+ },
72
+ "inputSchema" : [
73
+ {
74
+ "hasShapeFlexibility" : "0",
75
+ "isOptional" : "0",
76
+ "dataType" : "Float32",
77
+ "formattedType" : "MultiArray (Float32 1 × 512 × 188)",
78
+ "shortDescription" : "",
79
+ "shape" : "[1, 512, 188]",
80
+ "name" : "encoder",
81
+ "type" : "MultiArray"
82
+ },
83
+ {
84
+ "hasShapeFlexibility" : "0",
85
+ "isOptional" : "0",
86
+ "dataType" : "Float32",
87
+ "formattedType" : "MultiArray (Float32 1 × 640 × 1)",
88
+ "shortDescription" : "",
89
+ "shape" : "[1, 640, 1]",
90
+ "name" : "decoder",
91
+ "type" : "MultiArray"
92
+ }
93
+ ],
94
+ "userDefinedMetadata" : {
95
+ "com.github.apple.coremltools.version" : "8.3.0",
96
+ "com.github.apple.coremltools.source_dialect" : "TorchScript",
97
+ "com.github.apple.coremltools.source" : "torch==2.9.0"
98
+ },
99
+ "generatedClassName" : "parakeet_joint_decision",
100
+ "method" : "predict"
101
+ }
102
+ ]
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/model.mil ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
3
+ {
4
+ func main<ios17>(tensor<fp32, [1, 640, 1]> decoder, tensor<fp32, [1, 512, 188]> encoder) {
5
+ tensor<int32, [3]> input_1_perm_0 = const()[name = tensor<string, []>("input_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
6
+ tensor<string, []> encoder_to_fp16_dtype_0 = const()[name = tensor<string, []>("encoder_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
7
+ tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
8
+ tensor<string, []> decoder_to_fp16_dtype_0 = const()[name = tensor<string, []>("decoder_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
9
+ tensor<fp16, [640, 512]> joint_module_enc_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_weight_to_fp16"), val = tensor<fp16, [640, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
10
+ tensor<fp16, [640]> joint_module_enc_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(655488)))];
11
+ tensor<fp16, [1, 512, 188]> encoder_to_fp16 = cast(dtype = encoder_to_fp16_dtype_0, x = encoder)[name = tensor<string, []>("cast_6")];
12
+ tensor<fp16, [1, 188, 512]> input_1_cast_fp16 = transpose(perm = input_1_perm_0, x = encoder_to_fp16)[name = tensor<string, []>("transpose_1")];
13
+ tensor<fp16, [1, 188, 640]> linear_0_cast_fp16 = linear(bias = joint_module_enc_bias_to_fp16, weight = joint_module_enc_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("linear_0_cast_fp16")];
14
+ tensor<fp16, [640, 640]> joint_module_pred_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_weight_to_fp16"), val = tensor<fp16, [640, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(656832)))];
15
+ tensor<fp16, [640]> joint_module_pred_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1476096)))];
16
+ tensor<fp16, [1, 640, 1]> decoder_to_fp16 = cast(dtype = decoder_to_fp16_dtype_0, x = decoder)[name = tensor<string, []>("cast_5")];
17
+ tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = decoder_to_fp16)[name = tensor<string, []>("transpose_0")];
18
+ tensor<fp16, [1, 1, 640]> linear_1_cast_fp16 = linear(bias = joint_module_pred_bias_to_fp16, weight = joint_module_pred_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("linear_1_cast_fp16")];
19
+ tensor<int32, [1]> var_23_axes_0 = const()[name = tensor<string, []>("op_23_axes_0"), val = tensor<int32, [1]>([2])];
20
+ tensor<fp16, [1, 188, 1, 640]> var_23_cast_fp16 = expand_dims(axes = var_23_axes_0, x = linear_0_cast_fp16)[name = tensor<string, []>("op_23_cast_fp16")];
21
+ tensor<int32, [1]> var_24_axes_0 = const()[name = tensor<string, []>("op_24_axes_0"), val = tensor<int32, [1]>([1])];
22
+ tensor<fp16, [1, 1, 1, 640]> var_24_cast_fp16 = expand_dims(axes = var_24_axes_0, x = linear_1_cast_fp16)[name = tensor<string, []>("op_24_cast_fp16")];
23
+ tensor<fp16, [1, 188, 1, 640]> input_5_cast_fp16 = add(x = var_23_cast_fp16, y = var_24_cast_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
24
+ tensor<fp16, [1, 188, 1, 640]> input_7_cast_fp16 = relu(x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
25
+ tensor<fp16, [1030, 640]> joint_module_joint_net_2_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_weight_to_fp16"), val = tensor<fp16, [1030, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1477440)))];
26
+ tensor<fp16, [1030]> joint_module_joint_net_2_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_bias_to_fp16"), val = tensor<fp16, [1030]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2795904)))];
27
+ tensor<fp16, [1, 188, 1, 1030]> linear_2_cast_fp16 = linear(bias = joint_module_joint_net_2_bias_to_fp16, weight = joint_module_joint_net_2_weight_to_fp16, x = input_7_cast_fp16)[name = tensor<string, []>("linear_2_cast_fp16")];
28
+ tensor<int32, [4]> token_logits_begin_0 = const()[name = tensor<string, []>("token_logits_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
29
+ tensor<int32, [4]> token_logits_end_0 = const()[name = tensor<string, []>("token_logits_end_0"), val = tensor<int32, [4]>([1, 188, 1, 1025])];
30
+ tensor<bool, [4]> token_logits_end_mask_0 = const()[name = tensor<string, []>("token_logits_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
31
+ tensor<fp16, [1, 188, 1, 1025]> token_logits_cast_fp16 = slice_by_index(begin = token_logits_begin_0, end = token_logits_end_0, end_mask = token_logits_end_mask_0, x = linear_2_cast_fp16)[name = tensor<string, []>("token_logits_cast_fp16")];
32
+ tensor<int32, [4]> duration_logits_begin_0 = const()[name = tensor<string, []>("duration_logits_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1025])];
33
+ tensor<int32, [4]> duration_logits_end_0 = const()[name = tensor<string, []>("duration_logits_end_0"), val = tensor<int32, [4]>([1, 188, 1, 1030])];
34
+ tensor<bool, [4]> duration_logits_end_mask_0 = const()[name = tensor<string, []>("duration_logits_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
35
+ tensor<fp16, [1, 188, 1, 5]> duration_logits_cast_fp16 = slice_by_index(begin = duration_logits_begin_0, end = duration_logits_end_0, end_mask = duration_logits_end_mask_0, x = linear_2_cast_fp16)[name = tensor<string, []>("duration_logits_cast_fp16")];
36
+ tensor<int32, []> var_43_axis_0 = const()[name = tensor<string, []>("op_43_axis_0"), val = tensor<int32, []>(-1)];
37
+ tensor<bool, []> var_43_keep_dims_0 = const()[name = tensor<string, []>("op_43_keep_dims_0"), val = tensor<bool, []>(false)];
38
+ tensor<string, []> var_43_output_dtype_0 = const()[name = tensor<string, []>("op_43_output_dtype_0"), val = tensor<string, []>("int32")];
39
+ tensor<int32, [1, 188, 1]> token_id = reduce_argmax(axis = var_43_axis_0, keep_dims = var_43_keep_dims_0, output_dtype = var_43_output_dtype_0, x = token_logits_cast_fp16)[name = tensor<string, []>("op_43_cast_fp16")];
40
+ tensor<int32, []> var_49 = const()[name = tensor<string, []>("op_49"), val = tensor<int32, []>(-1)];
41
+ tensor<fp16, [1, 188, 1, 1025]> token_probs_all_cast_fp16 = softmax(axis = var_49, x = token_logits_cast_fp16)[name = tensor<string, []>("token_probs_all_cast_fp16")];
42
+ tensor<int32, [1]> var_58_axes_0 = const()[name = tensor<string, []>("op_58_axes_0"), val = tensor<int32, [1]>([-1])];
43
+ tensor<int32, [1, 188, 1, 1]> var_58 = expand_dims(axes = var_58_axes_0, x = token_id)[name = tensor<string, []>("op_58")];
44
+ tensor<int32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<int32, []>(-1)];
45
+ tensor<bool, []> var_61_validate_indices_0 = const()[name = tensor<string, []>("op_61_validate_indices_0"), val = tensor<bool, []>(false)];
46
+ tensor<string, []> var_58_to_int16_dtype_0 = const()[name = tensor<string, []>("op_58_to_int16_dtype_0"), val = tensor<string, []>("int16")];
47
+ tensor<int16, [1, 188, 1, 1]> var_58_to_int16 = cast(dtype = var_58_to_int16_dtype_0, x = var_58)[name = tensor<string, []>("cast_4")];
48
+ tensor<fp16, [1, 188, 1, 1]> var_61_cast_fp16_cast_int16 = gather_along_axis(axis = var_59, indices = var_58_to_int16, validate_indices = var_61_validate_indices_0, x = token_probs_all_cast_fp16)[name = tensor<string, []>("op_61_cast_fp16_cast_int16")];
49
+ tensor<int32, [1]> var_63_axes_0 = const()[name = tensor<string, []>("op_63_axes_0"), val = tensor<int32, [1]>([-1])];
50
+ tensor<fp16, [1, 188, 1]> var_63_cast_fp16 = squeeze(axes = var_63_axes_0, x = var_61_cast_fp16_cast_int16)[name = tensor<string, []>("op_63_cast_fp16")];
51
+ tensor<string, []> var_63_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
52
+ tensor<int32, []> var_66_axis_0 = const()[name = tensor<string, []>("op_66_axis_0"), val = tensor<int32, []>(-1)];
53
+ tensor<bool, []> var_66_keep_dims_0 = const()[name = tensor<string, []>("op_66_keep_dims_0"), val = tensor<bool, []>(false)];
54
+ tensor<string, []> var_66_output_dtype_0 = const()[name = tensor<string, []>("op_66_output_dtype_0"), val = tensor<string, []>("int32")];
55
+ tensor<int32, [1, 188, 1]> duration = reduce_argmax(axis = var_66_axis_0, keep_dims = var_66_keep_dims_0, output_dtype = var_66_output_dtype_0, x = duration_logits_cast_fp16)[name = tensor<string, []>("op_66_cast_fp16")];
56
+ tensor<fp32, [1, 188, 1]> token_prob = cast(dtype = var_63_cast_fp16_to_fp32_dtype_0, x = var_63_cast_fp16)[name = tensor<string, []>("cast_3")];
57
+ } -> (token_id, token_prob, duration);
58
+ }
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3f771cb65b190f1873e39629676ed79b65a8361522f451b37bdba8b1106e6ff
3
+ size 2798028
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7c11c6bb985fab7f835ba687a575f1eb04f4c93b0783155d634adbc49f0e797
3
+ size 243
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1af2cb9bcc13eec83ce006e4f1c2cf158393745cd9187428333fbcb6917da244
3
+ size 535
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/metadata.json ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "metadataOutputVersion" : "3.0",
4
+ "shortDescription" : "Parakeet 110M single-step joint decision (current frame)",
5
+ "outputSchema" : [
6
+ {
7
+ "hasShapeFlexibility" : "0",
8
+ "isOptional" : "0",
9
+ "dataType" : "Int32",
10
+ "formattedType" : "MultiArray (Int32 1 × 1 × 1)",
11
+ "shortDescription" : "",
12
+ "shape" : "[1, 1, 1]",
13
+ "name" : "token_id",
14
+ "type" : "MultiArray"
15
+ },
16
+ {
17
+ "hasShapeFlexibility" : "0",
18
+ "isOptional" : "0",
19
+ "dataType" : "Float32",
20
+ "formattedType" : "MultiArray (Float32 1 × 1 × 1)",
21
+ "shortDescription" : "",
22
+ "shape" : "[1, 1, 1]",
23
+ "name" : "token_prob",
24
+ "type" : "MultiArray"
25
+ },
26
+ {
27
+ "hasShapeFlexibility" : "0",
28
+ "isOptional" : "0",
29
+ "dataType" : "Int32",
30
+ "formattedType" : "MultiArray (Int32 1 × 1 × 1)",
31
+ "shortDescription" : "",
32
+ "shape" : "[1, 1, 1]",
33
+ "name" : "duration",
34
+ "type" : "MultiArray"
35
+ },
36
+ {
37
+ "hasShapeFlexibility" : "0",
38
+ "isOptional" : "0",
39
+ "dataType" : "Int32",
40
+ "formattedType" : "MultiArray (Int32 1 × 1 × 1 × 64)",
41
+ "shortDescription" : "",
42
+ "shape" : "[1, 1, 1, 64]",
43
+ "name" : "top_k_ids",
44
+ "type" : "MultiArray"
45
+ },
46
+ {
47
+ "hasShapeFlexibility" : "0",
48
+ "isOptional" : "0",
49
+ "dataType" : "Float32",
50
+ "formattedType" : "MultiArray (Float32 1 × 1 × 1 × 64)",
51
+ "shortDescription" : "",
52
+ "shape" : "[1, 1, 1, 64]",
53
+ "name" : "top_k_logits",
54
+ "type" : "MultiArray"
55
+ }
56
+ ],
57
+ "storagePrecision" : "Float16",
58
+ "modelParameters" : [
59
+
60
+ ],
61
+ "author" : "Fluid Inference",
62
+ "specificationVersion" : 8,
63
+ "mlProgramOperationTypeHistogram" : {
64
+ "Ios17.reduceArgmax" : 2,
65
+ "Ios17.linear" : 3,
66
+ "Ios17.transpose" : 2,
67
+ "Ios17.sliceByIndex" : 2,
68
+ "Ios17.add" : 1,
69
+ "Ios17.topk" : 1,
70
+ "Ios16.relu" : 1,
71
+ "Ios16.softmax" : 1,
72
+ "Ios17.expandDims" : 3,
73
+ "Ios17.squeeze" : 1,
74
+ "Ios17.cast" : 6,
75
+ "Ios17.gatherAlongAxis" : 1
76
+ },
77
+ "computePrecision" : "Mixed (Float16, Float32, Int16, Int32, UInt16)",
78
+ "isUpdatable" : "0",
79
+ "stateSchema" : [
80
+
81
+ ],
82
+ "availability" : {
83
+ "macOS" : "14.0",
84
+ "tvOS" : "17.0",
85
+ "visionOS" : "1.0",
86
+ "watchOS" : "10.0",
87
+ "iOS" : "17.0",
88
+ "macCatalyst" : "17.0"
89
+ },
90
+ "modelType" : {
91
+ "name" : "MLModelType_mlProgram"
92
+ },
93
+ "inputSchema" : [
94
+ {
95
+ "hasShapeFlexibility" : "0",
96
+ "isOptional" : "0",
97
+ "dataType" : "Float32",
98
+ "formattedType" : "MultiArray (Float32 1 × 512 × 1)",
99
+ "shortDescription" : "",
100
+ "shape" : "[1, 512, 1]",
101
+ "name" : "encoder_step",
102
+ "type" : "MultiArray"
103
+ },
104
+ {
105
+ "hasShapeFlexibility" : "0",
106
+ "isOptional" : "0",
107
+ "dataType" : "Float32",
108
+ "formattedType" : "MultiArray (Float32 1 × 640 × 1)",
109
+ "shortDescription" : "",
110
+ "shape" : "[1, 640, 1]",
111
+ "name" : "decoder_step",
112
+ "type" : "MultiArray"
113
+ }
114
+ ],
115
+ "userDefinedMetadata" : {
116
+ "com.github.apple.coremltools.source_dialect" : "TorchScript",
117
+ "com.github.apple.coremltools.source" : "torch==2.9.0",
118
+ "com.github.apple.coremltools.version" : "8.3.0"
119
+ },
120
+ "generatedClassName" : "parakeet_joint_decision_single_step",
121
+ "method" : "predict"
122
+ }
123
+ ]
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/model.mil ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
3
+ {
4
+ func main<ios17>(tensor<fp32, [1, 640, 1]> decoder_step, tensor<fp32, [1, 512, 1]> encoder_step) {
5
+ tensor<int32, [3]> input_1_perm_0 = const()[name = tensor<string, []>("input_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
6
+ tensor<string, []> encoder_step_to_fp16_dtype_0 = const()[name = tensor<string, []>("encoder_step_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
7
+ tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
8
+ tensor<string, []> decoder_step_to_fp16_dtype_0 = const()[name = tensor<string, []>("decoder_step_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
9
+ tensor<fp16, [640, 512]> joint_module_enc_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_weight_to_fp16"), val = tensor<fp16, [640, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
10
+ tensor<fp16, [640]> joint_module_enc_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(655488)))];
11
+ tensor<fp16, [1, 512, 1]> encoder_step_to_fp16 = cast(dtype = encoder_step_to_fp16_dtype_0, x = encoder_step)[name = tensor<string, []>("cast_9")];
12
+ tensor<fp16, [1, 1, 512]> input_1_cast_fp16 = transpose(perm = input_1_perm_0, x = encoder_step_to_fp16)[name = tensor<string, []>("transpose_1")];
13
+ tensor<fp16, [1, 1, 640]> linear_0_cast_fp16 = linear(bias = joint_module_enc_bias_to_fp16, weight = joint_module_enc_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("linear_0_cast_fp16")];
14
+ tensor<fp16, [640, 640]> joint_module_pred_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_weight_to_fp16"), val = tensor<fp16, [640, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(656832)))];
15
+ tensor<fp16, [640]> joint_module_pred_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1476096)))];
16
+ tensor<fp16, [1, 640, 1]> decoder_step_to_fp16 = cast(dtype = decoder_step_to_fp16_dtype_0, x = decoder_step)[name = tensor<string, []>("cast_8")];
17
+ tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = decoder_step_to_fp16)[name = tensor<string, []>("transpose_0")];
18
+ tensor<fp16, [1, 1, 640]> linear_1_cast_fp16 = linear(bias = joint_module_pred_bias_to_fp16, weight = joint_module_pred_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("linear_1_cast_fp16")];
19
+ tensor<int32, [1]> var_23_axes_0 = const()[name = tensor<string, []>("op_23_axes_0"), val = tensor<int32, [1]>([2])];
20
+ tensor<fp16, [1, 1, 1, 640]> var_23_cast_fp16 = expand_dims(axes = var_23_axes_0, x = linear_0_cast_fp16)[name = tensor<string, []>("op_23_cast_fp16")];
21
+ tensor<int32, [1]> var_24_axes_0 = const()[name = tensor<string, []>("op_24_axes_0"), val = tensor<int32, [1]>([1])];
22
+ tensor<fp16, [1, 1, 1, 640]> var_24_cast_fp16 = expand_dims(axes = var_24_axes_0, x = linear_1_cast_fp16)[name = tensor<string, []>("op_24_cast_fp16")];
23
+ tensor<fp16, [1, 1, 1, 640]> input_5_cast_fp16 = add(x = var_23_cast_fp16, y = var_24_cast_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
24
+ tensor<fp16, [1, 1, 1, 640]> input_7_cast_fp16 = relu(x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
25
+ tensor<fp16, [1030, 640]> joint_module_joint_net_2_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_weight_to_fp16"), val = tensor<fp16, [1030, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1477440)))];
26
+ tensor<fp16, [1030]> joint_module_joint_net_2_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_bias_to_fp16"), val = tensor<fp16, [1030]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2795904)))];
27
+ tensor<fp16, [1, 1, 1, 1030]> linear_2_cast_fp16 = linear(bias = joint_module_joint_net_2_bias_to_fp16, weight = joint_module_joint_net_2_weight_to_fp16, x = input_7_cast_fp16)[name = tensor<string, []>("linear_2_cast_fp16")];
28
+ tensor<int32, [4]> token_logits_begin_0 = const()[name = tensor<string, []>("token_logits_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
29
+ tensor<int32, [4]> token_logits_end_0 = const()[name = tensor<string, []>("token_logits_end_0"), val = tensor<int32, [4]>([1, 1, 1, 1025])];
30
+ tensor<bool, [4]> token_logits_end_mask_0 = const()[name = tensor<string, []>("token_logits_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
31
+ tensor<fp16, [1, 1, 1, 1025]> token_logits_cast_fp16 = slice_by_index(begin = token_logits_begin_0, end = token_logits_end_0, end_mask = token_logits_end_mask_0, x = linear_2_cast_fp16)[name = tensor<string, []>("token_logits_cast_fp16")];
32
+ tensor<int32, [4]> duration_logits_begin_0 = const()[name = tensor<string, []>("duration_logits_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1025])];
33
+ tensor<int32, [4]> duration_logits_end_0 = const()[name = tensor<string, []>("duration_logits_end_0"), val = tensor<int32, [4]>([1, 1, 1, 1030])];
34
+ tensor<bool, [4]> duration_logits_end_mask_0 = const()[name = tensor<string, []>("duration_logits_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
35
+ tensor<fp16, [1, 1, 1, 5]> duration_logits_cast_fp16 = slice_by_index(begin = duration_logits_begin_0, end = duration_logits_end_0, end_mask = duration_logits_end_mask_0, x = linear_2_cast_fp16)[name = tensor<string, []>("duration_logits_cast_fp16")];
36
+ tensor<int32, []> var_43_axis_0 = const()[name = tensor<string, []>("op_43_axis_0"), val = tensor<int32, []>(-1)];
37
+ tensor<bool, []> var_43_keep_dims_0 = const()[name = tensor<string, []>("op_43_keep_dims_0"), val = tensor<bool, []>(false)];
38
+ tensor<string, []> var_43_output_dtype_0 = const()[name = tensor<string, []>("op_43_output_dtype_0"), val = tensor<string, []>("int32")];
39
+ tensor<int32, [1, 1, 1]> token_id = reduce_argmax(axis = var_43_axis_0, keep_dims = var_43_keep_dims_0, output_dtype = var_43_output_dtype_0, x = token_logits_cast_fp16)[name = tensor<string, []>("op_43_cast_fp16")];
40
+ tensor<int32, []> var_49 = const()[name = tensor<string, []>("op_49"), val = tensor<int32, []>(-1)];
41
+ tensor<fp16, [1, 1, 1, 1025]> token_probs_all_cast_fp16 = softmax(axis = var_49, x = token_logits_cast_fp16)[name = tensor<string, []>("token_probs_all_cast_fp16")];
42
+ tensor<int32, [1]> var_58_axes_0 = const()[name = tensor<string, []>("op_58_axes_0"), val = tensor<int32, [1]>([-1])];
43
+ tensor<int32, [1, 1, 1, 1]> var_58 = expand_dims(axes = var_58_axes_0, x = token_id)[name = tensor<string, []>("op_58")];
44
+ tensor<int32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<int32, []>(-1)];
45
+ tensor<bool, []> var_61_validate_indices_0 = const()[name = tensor<string, []>("op_61_validate_indices_0"), val = tensor<bool, []>(false)];
46
+ tensor<string, []> var_58_to_int16_dtype_0 = const()[name = tensor<string, []>("op_58_to_int16_dtype_0"), val = tensor<string, []>("int16")];
47
+ tensor<int16, [1, 1, 1, 1]> var_58_to_int16 = cast(dtype = var_58_to_int16_dtype_0, x = var_58)[name = tensor<string, []>("cast_7")];
48
+ tensor<fp16, [1, 1, 1, 1]> var_61_cast_fp16_cast_int16 = gather_along_axis(axis = var_59, indices = var_58_to_int16, validate_indices = var_61_validate_indices_0, x = token_probs_all_cast_fp16)[name = tensor<string, []>("op_61_cast_fp16_cast_int16")];
49
+ tensor<int32, [1]> var_63_axes_0 = const()[name = tensor<string, []>("op_63_axes_0"), val = tensor<int32, [1]>([-1])];
50
+ tensor<fp16, [1, 1, 1]> var_63_cast_fp16 = squeeze(axes = var_63_axes_0, x = var_61_cast_fp16_cast_int16)[name = tensor<string, []>("op_63_cast_fp16")];
51
+ tensor<string, []> var_63_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
52
+ tensor<int32, []> var_66_axis_0 = const()[name = tensor<string, []>("op_66_axis_0"), val = tensor<int32, []>(-1)];
53
+ tensor<bool, []> var_66_keep_dims_0 = const()[name = tensor<string, []>("op_66_keep_dims_0"), val = tensor<bool, []>(false)];
54
+ tensor<string, []> var_66_output_dtype_0 = const()[name = tensor<string, []>("op_66_output_dtype_0"), val = tensor<string, []>("int32")];
55
+ tensor<int32, [1, 1, 1]> duration = reduce_argmax(axis = var_66_axis_0, keep_dims = var_66_keep_dims_0, output_dtype = var_66_output_dtype_0, x = duration_logits_cast_fp16)[name = tensor<string, []>("op_66_cast_fp16")];
56
+ tensor<int32, []> var_72 = const()[name = tensor<string, []>("op_72"), val = tensor<int32, []>(64)];
57
+ tensor<int32, []> var_76_axis_0 = const()[name = tensor<string, []>("op_76_axis_0"), val = tensor<int32, []>(-1)];
58
+ tensor<bool, []> var_76_ascending_0 = const()[name = tensor<string, []>("op_76_ascending_0"), val = tensor<bool, []>(false)];
59
+ tensor<bool, []> var_76_sort_0 = const()[name = tensor<string, []>("op_76_sort_0"), val = tensor<bool, []>(true)];
60
+ tensor<bool, []> var_76_return_indices_0 = const()[name = tensor<string, []>("op_76_return_indices_0"), val = tensor<bool, []>(true)];
61
+ tensor<string, []> var_76_cast_fp16_cast_int16_output_indices_dtype_0 = const()[name = tensor<string, []>("op_76_cast_fp16_cast_int16_output_indices_dtype_0"), val = tensor<string, []>("uint16")];
62
+ tensor<fp16, [1, 1, 1, 64]> var_76_cast_fp16_cast_int16_0, tensor<uint16, [1, 1, 1, 64]> var_76_cast_fp16_cast_int16_1 = topk(ascending = var_76_ascending_0, axis = var_76_axis_0, k = var_72, output_indices_dtype = var_76_cast_fp16_cast_int16_output_indices_dtype_0, return_indices = var_76_return_indices_0, sort = var_76_sort_0, x = token_logits_cast_fp16)[name = tensor<string, []>("op_76_cast_fp16_cast_int16")];
63
+ tensor<string, []> var_76_cast_fp16_cast_int16_1_to_int32_dtype_0 = const()[name = tensor<string, []>("op_76_cast_fp16_cast_int16_1_to_int32_dtype_0"), val = tensor<string, []>("int32")];
64
+ tensor<string, []> var_76_cast_fp16_0_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_76_cast_fp16_0_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
65
+ tensor<fp32, [1, 1, 1, 64]> top_k_logits = cast(dtype = var_76_cast_fp16_0_to_fp32_dtype_0, x = var_76_cast_fp16_cast_int16_0)[name = tensor<string, []>("cast_4")];
66
+ tensor<int32, [1, 1, 1, 64]> top_k_ids = cast(dtype = var_76_cast_fp16_cast_int16_1_to_int32_dtype_0, x = var_76_cast_fp16_cast_int16_1)[name = tensor<string, []>("cast_5")];
67
+ tensor<fp32, [1, 1, 1]> token_prob = cast(dtype = var_63_cast_fp16_to_fp32_dtype_0, x = var_63_cast_fp16)[name = tensor<string, []>("cast_6")];
68
+ } -> (token_id, token_prob, duration, top_k_ids, top_k_logits);
69
+ }
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3f771cb65b190f1873e39629676ed79b65a8361522f451b37bdba8b1106e6ff
3
+ size 2798028
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/analytics/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1ac15543fbb9301fba5f018b147e44d767479dec352aaa91dfe7bcf65949693
3
+ size 243
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/coremldata.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4940877938cc1b6d8830bbdd68ac8a49377cc57d75b61308883da5235b6a1914
3
+ size 439
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/metadata.json ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "metadataOutputVersion" : "3.0",
4
+ "shortDescription" : "Parakeet 110M preprocessor (15 s window)",
5
+ "outputSchema" : [
6
+ {
7
+ "hasShapeFlexibility" : "0",
8
+ "isOptional" : "0",
9
+ "dataType" : "Float32",
10
+ "formattedType" : "MultiArray (Float32)",
11
+ "shortDescription" : "",
12
+ "shape" : "[]",
13
+ "name" : "mel_features",
14
+ "type" : "MultiArray"
15
+ },
16
+ {
17
+ "hasShapeFlexibility" : "0",
18
+ "isOptional" : "0",
19
+ "dataType" : "Int32",
20
+ "formattedType" : "MultiArray (Int32 1)",
21
+ "shortDescription" : "",
22
+ "shape" : "[1]",
23
+ "name" : "mel_length",
24
+ "type" : "MultiArray"
25
+ }
26
+ ],
27
+ "storagePrecision" : "Float16",
28
+ "modelParameters" : [
29
+
30
+ ],
31
+ "author" : "Fluid Inference",
32
+ "specificationVersion" : 8,
33
+ "mlProgramOperationTypeHistogram" : {
34
+ "Range1d" : 3,
35
+ "Ios17.equal" : 1,
36
+ "Ios17.notEqual" : 1,
37
+ "Ios17.reshape" : 2,
38
+ "Identity" : 1,
39
+ "Ios17.matmul" : 1,
40
+ "Select" : 6,
41
+ "Ios17.expandDims" : 12,
42
+ "Ios17.add" : 3,
43
+ "Tile" : 2,
44
+ "Ios17.sliceByIndex" : 3,
45
+ "Ios16.reduceSum" : 4,
46
+ "Shape" : 4,
47
+ "Ios17.gather" : 4,
48
+ "Ios17.logicalNot" : 1,
49
+ "Pad" : 1,
50
+ "Ios17.log" : 1,
51
+ "Ios17.less" : 2,
52
+ "Ios17.sub" : 4,
53
+ "Ios17.conv" : 2,
54
+ "Ios17.pow" : 2,
55
+ "Ios17.cast" : 10,
56
+ "Ios17.concat" : 3,
57
+ "Stack" : 1,
58
+ "Ios17.floorDiv" : 1,
59
+ "Ios17.realDiv" : 4,
60
+ "Ios17.sqrt" : 1,
61
+ "Ios17.greaterEqual" : 1,
62
+ "Ios17.mul" : 1
63
+ },
64
+ "computePrecision" : "Mixed (Float16, Float32, Int16, Int32, UInt16)",
65
+ "isUpdatable" : "0",
66
+ "stateSchema" : [
67
+
68
+ ],
69
+ "availability" : {
70
+ "macOS" : "14.0",
71
+ "tvOS" : "17.0",
72
+ "visionOS" : "1.0",
73
+ "watchOS" : "10.0",
74
+ "iOS" : "17.0",
75
+ "macCatalyst" : "17.0"
76
+ },
77
+ "modelType" : {
78
+ "name" : "MLModelType_mlProgram"
79
+ },
80
+ "inputSchema" : [
81
+ {
82
+ "dataType" : "Float32",
83
+ "hasShapeFlexibility" : "1",
84
+ "isOptional" : "0",
85
+ "shapeFlexibility" : "1 × 1...240000",
86
+ "shapeRange" : "[[1, 1], [1, 240000]]",
87
+ "formattedType" : "MultiArray (Float32 1 × 1)",
88
+ "type" : "MultiArray",
89
+ "shape" : "[1, 1]",
90
+ "name" : "audio",
91
+ "shortDescription" : ""
92
+ },
93
+ {
94
+ "hasShapeFlexibility" : "0",
95
+ "isOptional" : "0",
96
+ "dataType" : "Int32",
97
+ "formattedType" : "MultiArray (Int32 1)",
98
+ "shortDescription" : "",
99
+ "shape" : "[1]",
100
+ "name" : "audio_length",
101
+ "type" : "MultiArray"
102
+ }
103
+ ],
104
+ "userDefinedMetadata" : {
105
+ "com.github.apple.coremltools.source_dialect" : "TorchScript",
106
+ "com.github.apple.coremltools.source" : "torch==2.9.0",
107
+ "com.github.apple.coremltools.version" : "8.3.0"
108
+ },
109
+ "generatedClassName" : "parakeet_preprocessor",
110
+ "method" : "predict"
111
+ }
112
+ ]
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/model.mil ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ program(1.0)
2
+ [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
3
+ {
4
+ func main<ios17>(tensor<fp32, [1, ?]> audio, tensor<int32, [1]> audio_length) [FlexibleShapeInformation = tuple<tuple<tensor<string, []>, dict<tensor<string, []>, tensor<int32, [?]>>>, tuple<tensor<string, []>, dict<tensor<string, []>, list<tensor<int32, [2]>, ?>>>>((("DefaultShapes", {{"audio", [1, 1]}}), ("RangeDims", {{"audio", [[1, 1], [1, 240000]]}})))] {
5
+ tensor<int32, []> var_9 = const()[name = tensor<string, []>("op_9"), val = tensor<int32, []>(1)];
6
+ tensor<int32, []> var_10 = const()[name = tensor<string, []>("op_10"), val = tensor<int32, []>(160)];
7
+ tensor<int32, []> var_12 = const()[name = tensor<string, []>("op_12"), val = tensor<int32, []>(0)];
8
+ tensor<int32, []> var_34 = const()[name = tensor<string, []>("op_34"), val = tensor<int32, []>(512)];
9
+ tensor<int32, [1]> var_35 = add(x = audio_length, y = var_34)[name = tensor<string, []>("op_35")];
10
+ tensor<int32, []> var_36 = const()[name = tensor<string, []>("op_36"), val = tensor<int32, []>(512)];
11
+ tensor<int32, [1]> var_37 = sub(x = var_35, y = var_36)[name = tensor<string, []>("op_37")];
12
+ tensor<int32, [1]> floor_div_0 = floor_div(x = var_37, y = var_10)[name = tensor<string, []>("floor_div_0")];
13
+ tensor<bool, [1]> var_40 = equal(x = audio_length, y = var_12)[name = tensor<string, []>("op_40")];
14
+ tensor<int32, [1]> var_41 = const()[name = tensor<string, []>("op_41"), val = tensor<int32, [1]>([0])];
15
+ tensor<int32, [1]> mel_length = select(a = var_41, b = floor_div_0, cond = var_40)[name = tensor<string, []>("seq_len")];
16
+ tensor<string, []> audio_to_fp16_dtype_0 = const()[name = tensor<string, []>("audio_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
17
+ tensor<fp16, [1, ?]> audio_to_fp16 = cast(dtype = audio_to_fp16_dtype_0, x = audio)[name = tensor<string, []>("cast_27")];
18
+ tensor<int32, [2]> var_43_shape_cast_fp16 = shape(x = audio_to_fp16)[name = tensor<string, []>("op_43_shape_cast_fp16")];
19
+ tensor<int32, []> gather_0_axis_0 = const()[name = tensor<string, []>("gather_0_axis_0"), val = tensor<int32, []>(0)];
20
+ tensor<int32, []> gather_0_batch_dims_0 = const()[name = tensor<string, []>("gather_0_batch_dims_0"), val = tensor<int32, []>(0)];
21
+ tensor<bool, []> gather_0_validate_indices_0 = const()[name = tensor<string, []>("gather_0_validate_indices_0"), val = tensor<bool, []>(false)];
22
+ tensor<string, []> var_43_shape_cast_fp16_to_int16_dtype_0 = const()[name = tensor<string, []>("op_43_shape_cast_fp16_to_int16_dtype_0"), val = tensor<string, []>("int16")];
23
+ tensor<uint16, []> select_0_to_uint16 = const()[name = tensor<string, []>("select_0_to_uint16"), val = tensor<uint16, []>(1)];
24
+ tensor<int16, [2]> var_43_shape_cast_fp16_to_int16 = cast(dtype = var_43_shape_cast_fp16_to_int16_dtype_0, x = var_43_shape_cast_fp16)[name = tensor<string, []>("cast_26")];
25
+ tensor<int16, []> gather_0_cast_uint16 = gather(axis = gather_0_axis_0, batch_dims = gather_0_batch_dims_0, indices = select_0_to_uint16, validate_indices = gather_0_validate_indices_0, x = var_43_shape_cast_fp16_to_int16)[name = tensor<string, []>("gather_0_cast_uint16")];
26
+ tensor<string, []> gather_0_cast_uint16_to_int32_dtype_0 = const()[name = tensor<string, []>("gather_0_cast_uint16_to_int32_dtype_0"), val = tensor<string, []>("int32")];
27
+ tensor<int32, []> const_0 = const()[name = tensor<string, []>("const_0"), val = tensor<int32, []>(0)];
28
+ tensor<int32, []> const_1 = const()[name = tensor<string, []>("const_1"), val = tensor<int32, []>(1)];
29
+ tensor<int32, []> gather_0_cast_uint16_to_int32 = cast(dtype = gather_0_cast_uint16_to_int32_dtype_0, x = gather_0_cast_uint16)[name = tensor<string, []>("cast_25")];
30
+ tensor<int32, [?]> var_44 = range_1d(end = gather_0_cast_uint16_to_int32, start = const_0, step = const_1)[name = tensor<string, []>("op_44")];
31
+ tensor<int32, [1]> var_45_axes_0 = const()[name = tensor<string, []>("op_45_axes_0"), val = tensor<int32, [1]>([0])];
32
+ tensor<int32, [1, ?]> var_45 = expand_dims(axes = var_45_axes_0, x = var_44)[name = tensor<string, []>("op_45")];
33
+ tensor<int32, [1]> var_46_axes_0 = const()[name = tensor<string, []>("op_46_axes_0"), val = tensor<int32, [1]>([1])];
34
+ tensor<int32, [1, 1]> var_46 = expand_dims(axes = var_46_axes_0, x = audio_length)[name = tensor<string, []>("op_46")];
35
+ tensor<bool, [1, ?]> timemask = less(x = var_45, y = var_46)[name = tensor<string, []>("timemask")];
36
+ tensor<int32, [2]> var_49_begin_0 = const()[name = tensor<string, []>("op_49_begin_0"), val = tensor<int32, [2]>([0, 0])];
37
+ tensor<int32, [2]> var_49_end_0 = const()[name = tensor<string, []>("op_49_end_0"), val = tensor<int32, [2]>([1, 1])];
38
+ tensor<bool, [2]> var_49_end_mask_0 = const()[name = tensor<string, []>("op_49_end_mask_0"), val = tensor<bool, [2]>([true, false])];
39
+ tensor<bool, [2]> var_49_squeeze_mask_0 = const()[name = tensor<string, []>("op_49_squeeze_mask_0"), val = tensor<bool, [2]>([false, true])];
40
+ tensor<fp16, [1]> var_49_cast_fp16 = slice_by_index(begin = var_49_begin_0, end = var_49_end_0, end_mask = var_49_end_mask_0, squeeze_mask = var_49_squeeze_mask_0, x = audio_to_fp16)[name = tensor<string, []>("op_49_cast_fp16")];
41
+ tensor<int32, [1]> var_50_axes_0 = const()[name = tensor<string, []>("op_50_axes_0"), val = tensor<int32, [1]>([1])];
42
+ tensor<fp16, [1, 1]> var_50_cast_fp16 = expand_dims(axes = var_50_axes_0, x = var_49_cast_fp16)[name = tensor<string, []>("op_50_cast_fp16")];
43
+ tensor<int32, [2]> var_52_begin_0 = const()[name = tensor<string, []>("op_52_begin_0"), val = tensor<int32, [2]>([0, 1])];
44
+ tensor<int32, [2]> var_52_end_0 = const()[name = tensor<string, []>("op_52_end_0"), val = tensor<int32, [2]>([1, 0])];
45
+ tensor<bool, [2]> var_52_end_mask_0 = const()[name = tensor<string, []>("op_52_end_mask_0"), val = tensor<bool, [2]>([true, true])];
46
+ tensor<fp16, [1, ?]> var_52_cast_fp16 = slice_by_index(begin = var_52_begin_0, end = var_52_end_0, end_mask = var_52_end_mask_0, x = audio_to_fp16)[name = tensor<string, []>("op_52_cast_fp16")];
47
+ tensor<int32, [2]> var_54_begin_0 = const()[name = tensor<string, []>("op_54_begin_0"), val = tensor<int32, [2]>([0, 0])];
48
+ tensor<int32, [2]> var_54_end_0 = const()[name = tensor<string, []>("op_54_end_0"), val = tensor<int32, [2]>([1, -1])];
49
+ tensor<bool, [2]> var_54_end_mask_0 = const()[name = tensor<string, []>("op_54_end_mask_0"), val = tensor<bool, [2]>([true, false])];
50
+ tensor<fp16, [1, ?]> var_54_cast_fp16 = slice_by_index(begin = var_54_begin_0, end = var_54_end_0, end_mask = var_54_end_mask_0, x = audio_to_fp16)[name = tensor<string, []>("op_54_cast_fp16")];
51
+ tensor<fp16, []> var_55_to_fp16 = const()[name = tensor<string, []>("op_55_to_fp16"), val = tensor<fp16, []>(0x1.f0cp-1)];
52
+ tensor<fp16, [1, ?]> var_56_cast_fp16 = mul(x = var_54_cast_fp16, y = var_55_to_fp16)[name = tensor<string, []>("op_56_cast_fp16")];
53
+ tensor<fp16, [1, ?]> var_57_cast_fp16 = sub(x = var_52_cast_fp16, y = var_56_cast_fp16)[name = tensor<string, []>("op_57_cast_fp16")];
54
+ tensor<bool, []> x_3_interleave_0 = const()[name = tensor<string, []>("x_3_interleave_0"), val = tensor<bool, []>(false)];
55
+ tensor<fp16, [1, ?]> x_3_cast_fp16 = concat(axis = var_9, interleave = x_3_interleave_0, values = (var_50_cast_fp16, var_57_cast_fp16))[name = tensor<string, []>("x_3_cast_fp16")];
56
+ tensor<bool, [1, ?]> var_60 = logical_not(x = timemask)[name = tensor<string, []>("op_60")];
57
+ tensor<fp16, []> var_16_to_fp16 = const()[name = tensor<string, []>("op_16_to_fp16"), val = tensor<fp16, []>(0x0p+0)];
58
+ tensor<fp16, [1, ?]> input_1_cast_fp16 = select(a = var_16_to_fp16, b = x_3_cast_fp16, cond = var_60)[name = tensor<string, []>("input_1_cast_fp16")];
59
+ tensor<int32, [3]> concat_1x = const()[name = tensor<string, []>("concat_1x"), val = tensor<int32, [3]>([1, 1, -1])];
60
+ tensor<fp16, [1, 1, ?]> input_3_cast_fp16 = reshape(shape = concat_1x, x = input_1_cast_fp16)[name = tensor<string, []>("input_3_cast_fp16")];
61
+ tensor<int32, [6]> input_5_pad_0 = const()[name = tensor<string, []>("input_5_pad_0"), val = tensor<int32, [6]>([0, 0, 0, 0, 256, 256])];
62
+ tensor<string, []> input_5_mode_0 = const()[name = tensor<string, []>("input_5_mode_0"), val = tensor<string, []>("constant")];
63
+ tensor<fp16, []> const_3_to_fp16 = const()[name = tensor<string, []>("const_3_to_fp16"), val = tensor<fp16, []>(0x0p+0)];
64
+ tensor<fp16, [1, 1, ?]> input_5_cast_fp16 = pad(constant_val = const_3_to_fp16, mode = input_5_mode_0, pad = input_5_pad_0, x = input_3_cast_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
65
+ tensor<int32, [2]> concat_2x = const()[name = tensor<string, []>("concat_2x"), val = tensor<int32, [2]>([1, -1])];
66
+ tensor<fp16, [1, ?]> input_cast_fp16 = reshape(shape = concat_2x, x = input_5_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
67
+ tensor<int32, [1]> expand_dims_3 = const()[name = tensor<string, []>("expand_dims_3"), val = tensor<int32, [1]>([160])];
68
+ tensor<int32, [1]> expand_dims_4_axes_0 = const()[name = tensor<string, []>("expand_dims_4_axes_0"), val = tensor<int32, [1]>([1])];
69
+ tensor<fp16, [1, 1, ?]> expand_dims_4_cast_fp16 = expand_dims(axes = expand_dims_4_axes_0, x = input_cast_fp16)[name = tensor<string, []>("expand_dims_4_cast_fp16")];
70
+ tensor<string, []> conv_0_pad_type_0 = const()[name = tensor<string, []>("conv_0_pad_type_0"), val = tensor<string, []>("valid")];
71
+ tensor<int32, [2]> conv_0_pad_0 = const()[name = tensor<string, []>("conv_0_pad_0"), val = tensor<int32, [2]>([0, 0])];
72
+ tensor<int32, [1]> conv_0_dilations_0 = const()[name = tensor<string, []>("conv_0_dilations_0"), val = tensor<int32, [1]>([1])];
73
+ tensor<int32, []> conv_0_groups_0 = const()[name = tensor<string, []>("conv_0_groups_0"), val = tensor<int32, []>(1)];
74
+ tensor<fp16, [257, 1, 512]> expand_dims_1_to_fp16 = const()[name = tensor<string, []>("expand_dims_1_to_fp16"), val = tensor<fp16, [257, 1, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
75
+ tensor<fp16, [1, 257, ?]> conv_0_cast_fp16 = conv(dilations = conv_0_dilations_0, groups = conv_0_groups_0, pad = conv_0_pad_0, pad_type = conv_0_pad_type_0, strides = expand_dims_3, weight = expand_dims_1_to_fp16, x = expand_dims_4_cast_fp16)[name = tensor<string, []>("conv_0_cast_fp16")];
76
+ tensor<string, []> conv_1_pad_type_0 = const()[name = tensor<string, []>("conv_1_pad_type_0"), val = tensor<string, []>("valid")];
77
+ tensor<int32, [2]> conv_1_pad_0 = const()[name = tensor<string, []>("conv_1_pad_0"), val = tensor<int32, [2]>([0, 0])];
78
+ tensor<int32, [1]> conv_1_dilations_0 = const()[name = tensor<string, []>("conv_1_dilations_0"), val = tensor<int32, [1]>([1])];
79
+ tensor<int32, []> conv_1_groups_0 = const()[name = tensor<string, []>("conv_1_groups_0"), val = tensor<int32, []>(1)];
80
+ tensor<fp16, [257, 1, 512]> expand_dims_2_to_fp16 = const()[name = tensor<string, []>("expand_dims_2_to_fp16"), val = tensor<fp16, [257, 1, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(263296)))];
81
+ tensor<fp16, [1, 257, ?]> conv_1_cast_fp16 = conv(dilations = conv_1_dilations_0, groups = conv_1_groups_0, pad = conv_1_pad_0, pad_type = conv_1_pad_type_0, strides = expand_dims_3, weight = expand_dims_2_to_fp16, x = expand_dims_4_cast_fp16)[name = tensor<string, []>("conv_1_cast_fp16")];
82
+ tensor<int32, []> stack_0_axis_0 = const()[name = tensor<string, []>("stack_0_axis_0"), val = tensor<int32, []>(-1)];
83
+ tensor<fp16, [1, 257, ?, 2]> stack_0_cast_fp16 = stack(axis = stack_0_axis_0, values = (conv_0_cast_fp16, conv_1_cast_fp16))[name = tensor<string, []>("stack_0_cast_fp16")];
84
+ tensor<fp16, []> var_19_promoted_to_fp16 = const()[name = tensor<string, []>("op_19_promoted_to_fp16"), val = tensor<fp16, []>(0x1p+1)];
85
+ tensor<fp16, [1, 257, ?, 2]> var_75_cast_fp16 = pow(x = stack_0_cast_fp16, y = var_19_promoted_to_fp16)[name = tensor<string, []>("op_75_cast_fp16")];
86
+ tensor<int32, [1]> var_77_axes_0 = const()[name = tensor<string, []>("op_77_axes_0"), val = tensor<int32, [1]>([-1])];
87
+ tensor<bool, []> var_77_keep_dims_0 = const()[name = tensor<string, []>("op_77_keep_dims_0"), val = tensor<bool, []>(false)];
88
+ tensor<fp16, [1, 257, ?]> var_77_cast_fp16 = reduce_sum(axes = var_77_axes_0, keep_dims = var_77_keep_dims_0, x = var_75_cast_fp16)[name = tensor<string, []>("op_77_cast_fp16")];
89
+ tensor<fp16, [1, 257, ?]> x_11_cast_fp16 = identity(x = var_77_cast_fp16)[name = tensor<string, []>("x_11_cast_fp16")];
90
+ tensor<bool, []> x_13_transpose_x_0 = const()[name = tensor<string, []>("x_13_transpose_x_0"), val = tensor<bool, []>(false)];
91
+ tensor<bool, []> x_13_transpose_y_0 = const()[name = tensor<string, []>("x_13_transpose_y_0"), val = tensor<bool, []>(false)];
92
+ tensor<fp16, [1, 80, 257]> const_4_to_fp16 = const()[name = tensor<string, []>("const_4_to_fp16"), val = tensor<fp16, [1, 80, 257]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(526528)))];
93
+ tensor<fp16, [1, 80, ?]> x_13_cast_fp16 = matmul(transpose_x = x_13_transpose_x_0, transpose_y = x_13_transpose_y_0, x = const_4_to_fp16, y = x_11_cast_fp16)[name = tensor<string, []>("x_13_cast_fp16")];
94
+ tensor<fp16, []> var_84_to_fp16 = const()[name = tensor<string, []>("op_84_to_fp16"), val = tensor<fp16, []>(0x1p-24)];
95
+ tensor<fp16, [1, 80, ?]> var_85_cast_fp16 = add(x = x_13_cast_fp16, y = var_84_to_fp16)[name = tensor<string, []>("op_85_cast_fp16")];
96
+ tensor<fp32, []> x_15_epsilon_0 = const()[name = tensor<string, []>("x_15_epsilon_0"), val = tensor<fp32, []>(0x1p-149)];
97
+ tensor<fp16, [1, 80, ?]> x_15_cast_fp16 = log(epsilon = x_15_epsilon_0, x = var_85_cast_fp16)[name = tensor<string, []>("x_15_cast_fp16")];
98
+ tensor<int32, [3]> var_87_shape_cast_fp16 = shape(x = x_15_cast_fp16)[name = tensor<string, []>("op_87_shape_cast_fp16")];
99
+ tensor<int32, []> gather_5 = const()[name = tensor<string, []>("gather_5"), val = tensor<int32, []>(1)];
100
+ tensor<int32, []> gather_6_axis_0 = const()[name = tensor<string, []>("gather_6_axis_0"), val = tensor<int32, []>(0)];
101
+ tensor<int32, []> gather_6_batch_dims_0 = const()[name = tensor<string, []>("gather_6_batch_dims_0"), val = tensor<int32, []>(0)];
102
+ tensor<bool, []> gather_6_validate_indices_0 = const()[name = tensor<string, []>("gather_6_validate_indices_0"), val = tensor<bool, []>(false)];
103
+ tensor<string, []> var_87_shape_cast_fp16_to_uint16_dtype_0 = const()[name = tensor<string, []>("op_87_shape_cast_fp16_to_uint16_dtype_0"), val = tensor<string, []>("uint16")];
104
+ tensor<uint16, []> select_6_to_uint16 = const()[name = tensor<string, []>("select_6_to_uint16"), val = tensor<uint16, []>(2)];
105
+ tensor<uint16, [3]> var_87_shape_cast_fp16_to_uint16 = cast(dtype = var_87_shape_cast_fp16_to_uint16_dtype_0, x = var_87_shape_cast_fp16)[name = tensor<string, []>("cast_24")];
106
+ tensor<uint16, []> gather_6_cast_uint16 = gather(axis = gather_6_axis_0, batch_dims = gather_6_batch_dims_0, indices = select_6_to_uint16, validate_indices = gather_6_validate_indices_0, x = var_87_shape_cast_fp16_to_uint16)[name = tensor<string, []>("gather_6_cast_uint16")];
107
+ tensor<string, []> gather_6_cast_uint16_to_int32_dtype_0 = const()[name = tensor<string, []>("gather_6_cast_uint16_to_int32_dtype_0"), val = tensor<string, []>("int32")];
108
+ tensor<int32, []> const_5 = const()[name = tensor<string, []>("const_5"), val = tensor<int32, []>(0)];
109
+ tensor<int32, []> const_6 = const()[name = tensor<string, []>("const_6"), val = tensor<int32, []>(1)];
110
+ tensor<int32, []> gather_6_cast_uint16_to_int32 = cast(dtype = gather_6_cast_uint16_to_int32_dtype_0, x = gather_6_cast_uint16)[name = tensor<string, []>("cast_23")];
111
+ tensor<int32, [?]> var_89 = range_1d(end = gather_6_cast_uint16_to_int32, start = const_5, step = const_6)[name = tensor<string, []>("op_89")];
112
+ tensor<int32, [1]> var_90_axes_0 = const()[name = tensor<string, []>("op_90_axes_0"), val = tensor<int32, [1]>([0])];
113
+ tensor<int32, [1, ?]> var_90 = expand_dims(axes = var_90_axes_0, x = var_89)[name = tensor<string, []>("op_90")];
114
+ tensor<int32, []> concat_3_axis_0 = const()[name = tensor<string, []>("concat_3_axis_0"), val = tensor<int32, []>(0)];
115
+ tensor<bool, []> concat_3_interleave_0 = const()[name = tensor<string, []>("concat_3_interleave_0"), val = tensor<bool, []>(false)];
116
+ tensor<int32, [2]> concat_3 = concat(axis = concat_3_axis_0, interleave = concat_3_interleave_0, values = (gather_5, gather_6_cast_uint16_to_int32))[name = tensor<string, []>("concat_3")];
117
+ tensor<int32, [2]> shape_8 = shape(x = var_90)[name = tensor<string, []>("shape_8")];
118
+ tensor<int32, [2]> real_div_0 = real_div(x = concat_3, y = shape_8)[name = tensor<string, []>("real_div_0")];
119
+ tensor<int32, [?, ?]> time_steps = tile(reps = real_div_0, x = var_90)[name = tensor<string, []>("time_steps")];
120
+ tensor<int32, [1]> var_93_axes_0 = const()[name = tensor<string, []>("op_93_axes_0"), val = tensor<int32, [1]>([1])];
121
+ tensor<int32, [1, 1]> var_93 = expand_dims(axes = var_93_axes_0, x = mel_length)[name = tensor<string, []>("op_93")];
122
+ tensor<bool, [?, ?]> valid_mask = less(x = time_steps, y = var_93)[name = tensor<string, []>("valid_mask")];
123
+ tensor<int32, [1]> var_95_axes_0 = const()[name = tensor<string, []>("op_95_axes_0"), val = tensor<int32, [1]>([1])];
124
+ tensor<bool, [?, 1, ?]> var_95 = expand_dims(axes = var_95_axes_0, x = valid_mask)[name = tensor<string, []>("op_95")];
125
+ tensor<fp16, [1, 80, ?]> var_96_cast_fp16 = select(a = x_15_cast_fp16, b = var_16_to_fp16, cond = var_95)[name = tensor<string, []>("op_96_cast_fp16")];
126
+ tensor<int32, [1]> x_mean_numerator_axes_0 = const()[name = tensor<string, []>("x_mean_numerator_axes_0"), val = tensor<int32, [1]>([2])];
127
+ tensor<bool, []> x_mean_numerator_keep_dims_0 = const()[name = tensor<string, []>("x_mean_numerator_keep_dims_0"), val = tensor<bool, []>(false)];
128
+ tensor<fp16, [1, 80]> x_mean_numerator_cast_fp16 = reduce_sum(axes = x_mean_numerator_axes_0, keep_dims = x_mean_numerator_keep_dims_0, x = var_96_cast_fp16)[name = tensor<string, []>("x_mean_numerator_cast_fp16")];
129
+ tensor<int32, [1]> x_mean_denominator_axes_0 = const()[name = tensor<string, []>("x_mean_denominator_axes_0"), val = tensor<int32, [1]>([1])];
130
+ tensor<bool, []> x_mean_denominator_keep_dims_0 = const()[name = tensor<string, []>("x_mean_denominator_keep_dims_0"), val = tensor<bool, []>(false)];
131
+ tensor<string, []> cast_6_to_fp16_dtype_0 = const()[name = tensor<string, []>("cast_6_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
132
+ tensor<fp16, [?, ?]> valid_mask_to_fp16 = cast(dtype = cast_6_to_fp16_dtype_0, x = valid_mask)[name = tensor<string, []>("cast_22")];
133
+ tensor<fp16, [?]> x_mean_denominator_cast_fp16 = reduce_sum(axes = x_mean_denominator_axes_0, keep_dims = x_mean_denominator_keep_dims_0, x = valid_mask_to_fp16)[name = tensor<string, []>("x_mean_denominator_cast_fp16")];
134
+ tensor<int32, [1]> var_101_axes_0 = const()[name = tensor<string, []>("op_101_axes_0"), val = tensor<int32, [1]>([1])];
135
+ tensor<fp16, [?, 1]> var_101_cast_fp16 = expand_dims(axes = var_101_axes_0, x = x_mean_denominator_cast_fp16)[name = tensor<string, []>("op_101_cast_fp16")];
136
+ tensor<fp16, [?, 80]> x_mean_cast_fp16 = real_div(x = x_mean_numerator_cast_fp16, y = var_101_cast_fp16)[name = tensor<string, []>("x_mean_cast_fp16")];
137
+ tensor<int32, [1]> var_104_axes_0 = const()[name = tensor<string, []>("op_104_axes_0"), val = tensor<int32, [1]>([2])];
138
+ tensor<fp16, [?, 80, 1]> var_104_cast_fp16 = expand_dims(axes = var_104_axes_0, x = x_mean_cast_fp16)[name = tensor<string, []>("op_104_cast_fp16")];
139
+ tensor<fp16, [?, 80, ?]> var_105_cast_fp16 = sub(x = x_15_cast_fp16, y = var_104_cast_fp16)[name = tensor<string, []>("op_105_cast_fp16")];
140
+ tensor<fp16, [?, 80, ?]> var_106_cast_fp16 = select(a = var_105_cast_fp16, b = var_16_to_fp16, cond = var_95)[name = tensor<string, []>("op_106_cast_fp16")];
141
+ tensor<fp16, []> var_19_promoted_1_to_fp16 = const()[name = tensor<string, []>("op_19_promoted_1_to_fp16"), val = tensor<fp16, []>(0x1p+1)];
142
+ tensor<fp16, [?, 80, ?]> var_107_cast_fp16 = pow(x = var_106_cast_fp16, y = var_19_promoted_1_to_fp16)[name = tensor<string, []>("op_107_cast_fp16")];
143
+ tensor<int32, [1]> var_109_axes_0 = const()[name = tensor<string, []>("op_109_axes_0"), val = tensor<int32, [1]>([2])];
144
+ tensor<bool, []> var_109_keep_dims_0 = const()[name = tensor<string, []>("op_109_keep_dims_0"), val = tensor<bool, []>(false)];
145
+ tensor<fp16, [?, 80]> var_109_cast_fp16 = reduce_sum(axes = var_109_axes_0, keep_dims = var_109_keep_dims_0, x = var_107_cast_fp16)[name = tensor<string, []>("op_109_cast_fp16")];
146
+ tensor<fp16, []> var_111_to_fp16 = const()[name = tensor<string, []>("op_111_to_fp16"), val = tensor<fp16, []>(0x1p+0)];
147
+ tensor<fp16, [?, 1]> var_112_cast_fp16 = sub(x = var_101_cast_fp16, y = var_111_to_fp16)[name = tensor<string, []>("op_112_cast_fp16")];
148
+ tensor<fp16, [?, 80]> var_113_cast_fp16 = real_div(x = var_109_cast_fp16, y = var_112_cast_fp16)[name = tensor<string, []>("op_113_cast_fp16")];
149
+ tensor<fp16, [?, 80]> x_std_1_cast_fp16 = sqrt(x = var_113_cast_fp16)[name = tensor<string, []>("x_std_1_cast_fp16")];
150
+ tensor<bool, [?, 80]> var_115_cast_fp16 = not_equal(x = x_std_1_cast_fp16, y = x_std_1_cast_fp16)[name = tensor<string, []>("op_115_cast_fp16")];
151
+ tensor<fp16, [?, 80]> x_std_3_cast_fp16 = select(a = var_16_to_fp16, b = x_std_1_cast_fp16, cond = var_115_cast_fp16)[name = tensor<string, []>("x_std_3_cast_fp16")];
152
+ tensor<fp16, []> var_25_to_fp16 = const()[name = tensor<string, []>("op_25_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
153
+ tensor<fp16, [?, 80]> x_std_cast_fp16 = add(x = x_std_3_cast_fp16, y = var_25_to_fp16)[name = tensor<string, []>("x_std_cast_fp16")];
154
+ tensor<int32, [1]> var_120_axes_0 = const()[name = tensor<string, []>("op_120_axes_0"), val = tensor<int32, [1]>([2])];
155
+ tensor<fp16, [?, 80, 1]> var_120_cast_fp16 = expand_dims(axes = var_120_axes_0, x = x_std_cast_fp16)[name = tensor<string, []>("op_120_cast_fp16")];
156
+ tensor<fp16, [?, 80, ?]> x_cast_fp16 = real_div(x = var_105_cast_fp16, y = var_120_cast_fp16)[name = tensor<string, []>("x_cast_fp16")];
157
+ tensor<int32, [3]> var_122_shape_cast_fp16 = shape(x = x_cast_fp16)[name = tensor<string, []>("op_122_shape_cast_fp16")];
158
+ tensor<int32, []> gather_7_axis_0 = const()[name = tensor<string, []>("gather_7_axis_0"), val = tensor<int32, []>(0)];
159
+ tensor<int32, []> gather_7_batch_dims_0 = const()[name = tensor<string, []>("gather_7_batch_dims_0"), val = tensor<int32, []>(0)];
160
+ tensor<bool, []> gather_7_validate_indices_0 = const()[name = tensor<string, []>("gather_7_validate_indices_0"), val = tensor<bool, []>(false)];
161
+ tensor<string, []> var_122_shape_cast_fp16_to_uint16_dtype_0 = const()[name = tensor<string, []>("op_122_shape_cast_fp16_to_uint16_dtype_0"), val = tensor<string, []>("uint16")];
162
+ tensor<uint16, []> select_7_to_uint16 = const()[name = tensor<string, []>("select_7_to_uint16"), val = tensor<uint16, []>(2)];
163
+ tensor<uint16, [3]> var_122_shape_cast_fp16_to_uint16 = cast(dtype = var_122_shape_cast_fp16_to_uint16_dtype_0, x = var_122_shape_cast_fp16)[name = tensor<string, []>("cast_21")];
164
+ tensor<uint16, []> gather_7_cast_uint16 = gather(axis = gather_7_axis_0, batch_dims = gather_7_batch_dims_0, indices = select_7_to_uint16, validate_indices = gather_7_validate_indices_0, x = var_122_shape_cast_fp16_to_uint16)[name = tensor<string, []>("gather_7_cast_uint16")];
165
+ tensor<string, []> gather_7_cast_uint16_to_int32_dtype_0 = const()[name = tensor<string, []>("gather_7_cast_uint16_to_int32_dtype_0"), val = tensor<string, []>("int32")];
166
+ tensor<int32, []> const_7 = const()[name = tensor<string, []>("const_7"), val = tensor<int32, []>(0)];
167
+ tensor<int32, []> const_8 = const()[name = tensor<string, []>("const_8"), val = tensor<int32, []>(1)];
168
+ tensor<int32, []> gather_7_cast_uint16_to_int32 = cast(dtype = gather_7_cast_uint16_to_int32_dtype_0, x = gather_7_cast_uint16)[name = tensor<string, []>("cast_20")];
169
+ tensor<int32, [?]> mask_1 = range_1d(end = gather_7_cast_uint16_to_int32, start = const_7, step = const_8)[name = tensor<string, []>("mask_1")];
170
+ tensor<int32, []> gather_8_axis_0 = const()[name = tensor<string, []>("gather_8_axis_0"), val = tensor<int32, []>(0)];
171
+ tensor<int32, []> gather_8_batch_dims_0 = const()[name = tensor<string, []>("gather_8_batch_dims_0"), val = tensor<int32, []>(0)];
172
+ tensor<bool, []> gather_8_validate_indices_0 = const()[name = tensor<string, []>("gather_8_validate_indices_0"), val = tensor<bool, []>(false)];
173
+ tensor<uint16, []> select_8_to_uint16 = const()[name = tensor<string, []>("select_8_to_uint16"), val = tensor<uint16, []>(0)];
174
+ tensor<uint16, []> gather_8_cast_uint16 = gather(axis = gather_8_axis_0, batch_dims = gather_8_batch_dims_0, indices = select_8_to_uint16, validate_indices = gather_8_validate_indices_0, x = var_122_shape_cast_fp16_to_uint16)[name = tensor<string, []>("gather_8_cast_uint16")];
175
+ tensor<string, []> gather_8_cast_uint16_to_int32_dtype_0 = const()[name = tensor<string, []>("gather_8_cast_uint16_to_int32_dtype_0"), val = tensor<string, []>("int32")];
176
+ tensor<int32, []> concat_4_axis_0 = const()[name = tensor<string, []>("concat_4_axis_0"), val = tensor<int32, []>(0)];
177
+ tensor<bool, []> concat_4_interleave_0 = const()[name = tensor<string, []>("concat_4_interleave_0"), val = tensor<bool, []>(false)];
178
+ tensor<int32, []> gather_8_cast_uint16_to_int32 = cast(dtype = gather_8_cast_uint16_to_int32_dtype_0, x = gather_8_cast_uint16)[name = tensor<string, []>("cast_19")];
179
+ tensor<int32, [2]> concat_4 = concat(axis = concat_4_axis_0, interleave = concat_4_interleave_0, values = (gather_8_cast_uint16_to_int32, var_9))[name = tensor<string, []>("concat_4")];
180
+ tensor<int32, [1]> expand_dims_0_axes_0 = const()[name = tensor<string, []>("expand_dims_0_axes_0"), val = tensor<int32, [1]>([0])];
181
+ tensor<int32, [1, ?]> expand_dims_0 = expand_dims(axes = expand_dims_0_axes_0, x = mask_1)[name = tensor<string, []>("expand_dims_0")];
182
+ tensor<int32, [?, ?]> var_126 = tile(reps = concat_4, x = expand_dims_0)[name = tensor<string, []>("op_126")];
183
+ tensor<bool, [?, ?]> mask = greater_equal(x = var_126, y = var_93)[name = tensor<string, []>("mask")];
184
+ tensor<int32, [1]> var_129_axes_0 = const()[name = tensor<string, []>("op_129_axes_0"), val = tensor<int32, [1]>([1])];
185
+ tensor<bool, [?, 1, ?]> var_129 = expand_dims(axes = var_129_axes_0, x = mask)[name = tensor<string, []>("op_129")];
186
+ tensor<fp16, []> cast_15_to_fp16 = const()[name = tensor<string, []>("cast_15_to_fp16"), val = tensor<fp16, []>(0x0p+0)];
187
+ tensor<fp16, [?, 80, ?]> processed_signal_cast_fp16 = select(a = cast_15_to_fp16, b = x_cast_fp16, cond = var_129)[name = tensor<string, []>("processed_signal_cast_fp16")];
188
+ tensor<string, []> processed_signal_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("processed_signal_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
189
+ tensor<fp32, [?, 80, ?]> mel_features = cast(dtype = processed_signal_cast_fp16_to_fp32_dtype_0, x = processed_signal_cast_fp16)[name = tensor<string, []>("cast_18")];
190
+ } -> (mel_features, mel_length);
191
+ }
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c062338de852a26607ce4101f74e6895de3a4134a57b07232bd72bfc6f1d7f1a
3
+ size 567712
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/metadata.json ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "nvidia/parakeet-tdt_ctc-110m",
3
+ "model_type": "hybrid_rnnt_ctc",
4
+ "sample_rate": 16000,
5
+ "max_audio_seconds": 15.0,
6
+ "max_audio_samples": 240000,
7
+ "max_symbol_steps": 1,
8
+ "vocab_size": 1024,
9
+ "joint_extra_outputs": 5,
10
+ "encoder_dim": 512,
11
+ "decoder_dim": 640,
12
+ "decoder_hidden": 640,
13
+ "decoder_layers": 1,
14
+ "blank_id": 1024,
15
+ "checkpoint": {
16
+ "type": "pretrained",
17
+ "model_id": "nvidia/parakeet-tdt_ctc-110m"
18
+ },
19
+ "coreml": {
20
+ "compute_units": "CPU_ONLY",
21
+ "compute_precision": "FLOAT32"
22
+ },
23
+ "components": {
24
+ "preprocessor": {
25
+ "inputs": {
26
+ "audio_signal": [
27
+ 1,
28
+ 240000
29
+ ],
30
+ "audio_length": [
31
+ 1
32
+ ]
33
+ },
34
+ "outputs": {
35
+ "mel": [
36
+ 1,
37
+ 80,
38
+ 1501
39
+ ],
40
+ "mel_length": [
41
+ 1
42
+ ]
43
+ },
44
+ "path": "parakeet_preprocessor.mlpackage"
45
+ },
46
+ "encoder": {
47
+ "inputs": {
48
+ "mel": [
49
+ 1,
50
+ 80,
51
+ 1501
52
+ ],
53
+ "mel_length": [
54
+ 1
55
+ ]
56
+ },
57
+ "outputs": {
58
+ "encoder": [
59
+ 1,
60
+ 512,
61
+ 188
62
+ ],
63
+ "encoder_length": [
64
+ 1
65
+ ]
66
+ },
67
+ "path": "parakeet_encoder.mlpackage"
68
+ },
69
+ "ctc_head": {
70
+ "inputs": {
71
+ "encoder": [
72
+ 1,
73
+ 512,
74
+ 188
75
+ ]
76
+ },
77
+ "outputs": {
78
+ "log_probs": [
79
+ 1,
80
+ 188,
81
+ 1025
82
+ ]
83
+ },
84
+ "path": "parakeet_ctc_head.mlpackage"
85
+ },
86
+ "mel_encoder": {
87
+ "inputs": {
88
+ "audio_signal": [
89
+ 1,
90
+ 240000
91
+ ],
92
+ "audio_length": [
93
+ 1
94
+ ]
95
+ },
96
+ "outputs": {
97
+ "encoder": [
98
+ 1,
99
+ 512,
100
+ 188
101
+ ],
102
+ "encoder_length": [
103
+ 1
104
+ ]
105
+ },
106
+ "path": "parakeet_mel_encoder.mlpackage"
107
+ },
108
+ "decoder": {
109
+ "inputs": {
110
+ "targets": [
111
+ 1,
112
+ 1
113
+ ],
114
+ "target_length": [
115
+ 1
116
+ ],
117
+ "h_in": [
118
+ 1,
119
+ 1,
120
+ 640
121
+ ],
122
+ "c_in": [
123
+ 1,
124
+ 1,
125
+ 640
126
+ ]
127
+ },
128
+ "outputs": {
129
+ "decoder": [
130
+ 1,
131
+ 640,
132
+ 1
133
+ ],
134
+ "h_out": [
135
+ 1,
136
+ 1,
137
+ 640
138
+ ],
139
+ "c_out": [
140
+ 1,
141
+ 1,
142
+ 640
143
+ ]
144
+ },
145
+ "path": "parakeet_decoder.mlpackage"
146
+ },
147
+ "joint": {
148
+ "inputs": {
149
+ "encoder": [
150
+ 1,
151
+ 512,
152
+ 188
153
+ ],
154
+ "decoder": [
155
+ 1,
156
+ 640,
157
+ 1
158
+ ]
159
+ },
160
+ "outputs": {
161
+ "logits": [
162
+ 1,
163
+ 188,
164
+ 1,
165
+ 1030
166
+ ]
167
+ },
168
+ "path": "parakeet_joint.mlpackage"
169
+ },
170
+ "joint_decision": {
171
+ "inputs": {
172
+ "encoder": [
173
+ 1,
174
+ 512,
175
+ 188
176
+ ],
177
+ "decoder": [
178
+ 1,
179
+ 640,
180
+ 1
181
+ ]
182
+ },
183
+ "outputs": {
184
+ "token_id": [
185
+ 1,
186
+ 188,
187
+ 1
188
+ ],
189
+ "token_prob": [
190
+ 1,
191
+ 188,
192
+ 1
193
+ ],
194
+ "duration": [
195
+ 1,
196
+ 188,
197
+ 1
198
+ ]
199
+ },
200
+ "path": "parakeet_joint_decision.mlpackage"
201
+ },
202
+ "joint_decision_single_step": {
203
+ "inputs": {
204
+ "encoder_step": [
205
+ 1,
206
+ 512,
207
+ 1
208
+ ],
209
+ "decoder_step": [
210
+ 1,
211
+ 640,
212
+ 1
213
+ ]
214
+ },
215
+ "outputs": {
216
+ "token_id": [
217
+ 1,
218
+ 1,
219
+ 1
220
+ ],
221
+ "token_prob": [
222
+ 1,
223
+ 1,
224
+ 1
225
+ ],
226
+ "duration": [
227
+ 1,
228
+ 1,
229
+ 1
230
+ ],
231
+ "top_k_ids": [
232
+ 1,
233
+ 1,
234
+ 1,
235
+ 64
236
+ ],
237
+ "top_k_logits": [
238
+ 1,
239
+ 1,
240
+ 1,
241
+ 64
242
+ ]
243
+ },
244
+ "path": "parakeet_joint_decision_single_step.mlpackage"
245
+ }
246
+ }
247
+ }
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/vocab.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"0": "<unk>", "1": "▁t", "2": "▁th", "3": "▁a", "4": "in", "5": "re", "6": "▁the", "7": "▁w", "8": "▁s", "9": "▁o", "10": "er", "11": "ou", "12": "at", "13": "nd", "14": "it", "15": "▁h", "16": "▁c", "17": "▁b", "18": "is", "19": "en", "20": "on", "21": "ing", "22": "▁f", "23": "▁to", "24": "▁m", "25": "es", "26": "▁p", "27": "or", "28": "an", "29": "▁d", "30": "ll", "31": "▁I", "32": "ed", "33": "▁and", "34": "▁l", "35": "▁of", "36": "▁in", "37": "▁y", "38": "ar", "39": "▁g", "40": "▁you", "41": "as", "42": "om", "43": "▁n", "44": "ve", "45": "▁that", "46": "le", "47": "ic", "48": "us", "49": "ow", "50": "et", "51": "al", "52": "▁e", "53": "ut", "54": "▁it", "55": "ot", "56": "▁be", "57": "▁T", "58": "ion", "59": "▁is", "60": "▁wh", "61": "▁re", "62": "▁on", "63": "▁we", "64": "ent", "65": "▁A", "66": "ay", "67": "▁ha", "68": "▁Th", "69": "id", "70": "▁S", "71": "ac", "72": "gh", "73": "ver", "74": "ke", "75": "▁for", "76": "im", "77": "ly", "78": "ur", "79": "ld", "80": "▁he", "81": "▁st", "82": "all", "83": "ro", "84": "st", "85": "se", "86": "ct", "87": "ith", "88": "ir", "89": "am", "90": "▁this", "91": "if", "92": "▁W", "93": "oo", "94": "ri", "95": "▁was", "96": "ght", "97": "▁u", "98": "▁with", "99": "ad", "100": "ch", "101": "▁se", "102": "▁k", "103": "▁an", "104": "▁The", "105": "▁li", "106": "▁do", "107": "▁B", "108": "▁have", "109": "▁as", "110": "th", "111": "▁are", "112": "▁sh", "113": "ust", "114": "ce", "115": "ally", "116": "ill", "117": "▁H", "118": "▁j", "119": "ter", "120": "▁go", "121": "▁And", "122": "ation", "123": "▁C", "124": "▁so", "125": "ome", "126": "▁not", "127": "op", "128": "il", "129": "ore", "130": "▁ne", "131": "▁can", "132": "▁me", "133": "▁at", "134": "ould", "135": "ant", "136": "▁M", "137": "▁like", "138": "ere", "139": "▁they", "140": "ra", "141": "ers", "142": "▁ab", "143": "▁de", "144": "▁kn", "145": "ge", "146": "▁Y", "147": "▁ch", "148": "ul", "149": "pp", "150": "▁or", "151": "▁al", "152": "▁con", "153": "▁com", 
"154": "ess", "155": "▁su", "156": "out", "157": "▁your", "158": "▁So", "159": "ate", "160": "▁one", "161": "▁all", "162": "▁ex", "163": "est", "164": "▁fr", "165": "▁just", "166": "▁pro", "167": "▁know", "168": "▁O", "169": "ain", "170": "▁but", "171": "ol", "172": "ive", "173": "▁v", "174": "use", "175": "very", "176": "art", "177": "qu", "178": "▁my", "179": "el", "180": "▁N", "181": "nt", "182": "▁It", "183": "▁what", "184": "ab", "185": "▁P", "186": "▁wor", "187": "▁out", "188": "▁there", "189": "▁up", "190": "um", "191": "▁from", "192": "pe", "193": "▁tw", "194": "▁r", "195": "and", "196": "ight", "197": "ort", "198": "un", "199": "▁L", "200": "ist", "201": "▁about", "202": "ide", "203": "ig", "204": "ake", "205": "▁D", "206": "em", "207": "os", "208": "king", "209": "rou", "210": "ind", "211": "our", "212": "res", "213": "▁We", "214": "▁get", "215": "▁E", "216": "▁G", "217": "ack", "218": "▁le", "219": "ity", "220": "od", "221": "▁F", "222": "ard", "223": "▁pl", "224": "▁our", "225": "▁int", "226": "ment", "227": "▁will", "228": "ies", "229": "▁by", "230": "ink", "231": "ca", "232": "▁if", "233": "red", "234": "her", "235": "ie", "236": "▁us", "237": "▁some", "238": "▁don", "239": "ven", "240": "ood", "241": "ast", "242": "▁R", "243": "▁his", "244": "▁tim", "245": "▁tr", "246": "▁more", "247": "ich", "248": "ous", "249": "ame", "250": "▁going", "251": "▁had", "252": "▁them", "253": "ook", "254": "▁pe", "255": "▁Wh", "256": "▁You", "257": "▁But", "258": "ine", "259": "▁here", "260": "▁would", "261": "cause", "262": "right", "263": "so", "264": "ost", "265": "ure", "266": "▁has", "267": "ect", "268": "▁think", "269": "▁fe", "270": "ong", "271": "▁see", "272": "▁when", "273": "▁who", "274": "▁were", "275": "▁really", "276": "▁their", "277": "▁want", "278": "one", "279": "ople", "280": "▁then", "281": "▁time", "282": "▁sa", "283": "ap", "284": "▁te", "285": "▁He", "286": "▁ye", "287": "ck", "288": "▁her", "289": "▁thing", "290": "▁right", "291": "▁which", "292": 
"itt", "293": "ice", "294": "act", "295": "▁people", "296": "ty", "297": "▁two", "298": "▁J", "299": "▁im", "300": "ther", "301": "ci", "302": "ose", "303": "▁cl", "304": "▁qu", "305": "▁man", "306": "▁also", "307": "ree", "308": "▁en", "309": "ud", "310": "▁how", "311": "reat", "312": "ak", "313": "hing", "314": "ag", "315": "▁any", "316": "ff", "317": "ace", "318": "per", "319": "▁because", "320": "▁very", "321": "own", "322": "▁ad", "323": "▁act", "324": "▁been", "325": "▁now", "326": "▁ag", "327": "▁into", "328": "▁comp", "329": "ars", "330": "ions", "331": "are", "332": "ite", "333": "iv", "334": "▁these", "335": "ays", "336": "ep", "337": "▁This", "338": "▁she", "339": "ans", "340": "ah", "341": "een", "342": "▁over", "343": "ry", "344": "▁lo", "345": "age", "346": "▁pr", "347": "▁sp", "348": "ue", "349": "▁co", "350": "ick", "351": "ber", "352": "▁did", "353": "ip", "354": "ach", "355": "▁back", "356": "▁no", "357": "▁cont", "358": "▁other", "359": "▁every", "360": "pt", "361": "▁need", "362": "▁him", "363": "▁U", "364": "▁In", "365": "▁work", "366": "irst", "367": "▁part", "368": "▁look", "369": "ittle", "370": "ble", "371": "iz", "372": "▁un", "373": "▁make", "374": "omet", "375": "nder", "376": "ish", "377": "na", "378": "▁little", "379": "▁off", "380": "▁than", "381": "▁got", "382": "ually", "383": "▁per", "384": "▁good", "385": "▁way", "386": "▁could", "387": "▁ac", "388": "▁imp", "389": "able", "390": "▁where", "391": "iff", "392": "▁That", "393": "▁res", "394": "ount", "395": "pl", "396": "ance", "397": "▁first", "398": "▁ro", "399": "▁pre", "400": "ass", "401": "▁say", "402": "int", "403": "ated", "404": "ire", "405": "uch", "406": "ase", "407": "▁somet", "408": "ound", "409": "▁down", "410": "▁diff", "411": "sel", "412": "▁gu", "413": "▁am", "414": "ress", "415": "▁lot", "416": "ence", "417": "▁dis", "418": "orm", "419": "ix", "420": "▁po", "421": "ving", "422": "enty", "423": "▁K", "424": "▁spe", "425": "und", "426": "he", "427": "▁much", "428": 
"▁ar", "429": "round", "430": "▁app", "431": "co", "432": "ark", "433": "▁new", "434": "ater", "435": "ult", "436": "end", "437": "▁even", "438": "▁start", "439": "ations", "440": "rough", "441": "ile", "442": "fter", "443": "▁well", "444": "be", "445": "▁They", "446": "▁three", "447": "ign", "448": "ild", "449": "▁said", "450": "ough", "451": "ang", "452": "▁too", "453": "ade", "454": "▁bl", "455": "ens", "456": "▁inc", "457": "ia", "458": "▁those", "459": "▁mo", "460": "▁take", "461": "▁through", "462": "▁fl", "463": "▁kind", "464": "▁things", "465": "▁bet", "466": "▁only", "467": "▁St", "468": "▁let", "469": "cess", "470": "▁Ch", "471": "ary", "472": "vel", "473": "▁If", "474": "xt", "475": "other", "476": "av", "477": "ical", "478": "ord", "479": "▁again", "480": "▁something", "481": "onna", "482": "fore", "483": "▁may", "484": "ting", "485": "▁bu", "486": "▁differe", "487": "urn", "488": "▁gonna", "489": "▁does", "490": "uct", "491": "og", "492": "▁twenty", "493": "▁gr", "494": "▁Ye", "495": "wn", "496": "▁should", "497": "▁comm", "498": "ition", "499": "▁under", "500": "▁hel", "501": "ory", "502": "▁fo", "503": "▁use", "504": "igh", "505": "ife", "506": "▁actually", "507": "▁tal", "508": "▁call", "509": "ents", "510": "ious", "511": "ull", "512": "▁There", "513": "▁Yeah", "514": "▁most", "515": "▁ke", "516": "ors", "517": "ved", "518": "ys", "519": "▁sc", "520": "▁happ", "521": "ope", "522": "▁help", "523": "atch", "524": "▁What", "525": "▁rem", "526": "ple", "527": "▁Now", "528": "▁br", "529": "ool", "530": "oth", "531": "▁four", "532": "self", "533": "▁str", "534": "ne", "535": "thing", "536": "▁put", "537": "ial", "538": "▁great", "539": "ail", "540": "ub", "541": "ning", "542": "▁sm", "543": "▁feel", "544": "▁five", "545": "ody", "546": "undred", "547": "iss", "548": "ank", "549": "get", "550": "aking", "551": "▁many", "552": "▁hundred", "553": "▁years", "554": "▁being", "555": "▁come", "556": "▁mean", "557": "ily", "558": "▁different", "559": "▁after", 
"560": "▁ser", "561": "▁show", "562": "form", "563": "ful", "564": "oy", "565": "▁six", "566": "▁vide", "567": "▁V", "568": "▁its", "569": "▁point", "570": "▁day", "571": "▁des", "572": "ons", "573": "▁bit", "574": "▁bel", "575": "▁before", "576": "▁aw", "577": "▁end", "578": "▁Oh", "579": "▁still", "580": "ath", "581": "▁long", "582": "▁'", "583": "ise", "584": "ob", "585": "day", "586": "▁add", "587": "ft", "588": "ves", "589": "ces", "590": "ady", "591": "▁cr", "592": "▁around", "593": "▁try", "594": "les", "595": "vers", "596": "kay", "597": "ian", "598": "ates", "599": "▁find", "600": "ward", "601": "▁As", "602": "▁eight", "603": "lic", "604": "▁same", "605": "▁pos", "606": "▁em", "607": "▁made", "608": "▁supp", "609": "▁life", "610": "▁Be", "611": "pect", "612": "▁dec", "613": "▁play", "614": "ange", "615": "▁att", "616": "▁pers", "617": "ways", "618": "▁high", "619": "▁hand", "620": "▁next", "621": "▁cons", "622": "▁own", "623": "▁inv", "624": "ower", "625": "▁ind", "626": "ert", "627": "ng", "628": "ave", "629": "▁year", "630": "▁big", "631": "ating", "632": "▁world", "633": "▁rel", "634": "▁sure", "635": "▁tra", "636": "ew", "637": "ered", "638": "▁fin", "639": "▁Well", "640": "▁sl", "641": "▁doing", "642": "bs", "643": "▁set", "644": "▁rec", "645": "ual", "646": "cial", "647": "▁ph", "648": "erm", "649": "▁love", "650": "ph", "651": "▁real", "652": "▁last", "653": "ict", "654": "▁bo", "655": "▁ra", "656": "ible", "657": "▁wr", "658": "mer", "659": "▁count", "660": "ities", "661": "▁always", "662": "inet", "663": "ments", "664": "uc", "665": "▁might", "666": "▁inter", "667": "▁video", "668": "gin", "669": "▁tell", "670": "▁never", "671": "vent", "672": "▁import", "673": "ied", "674": "▁sy", "675": "▁How", "676": "ically", "677": "ought", "678": "▁thir", "679": "▁rep", "680": "ks", "681": "ib", "682": "▁fam", "683": "ject", "684": "▁bas", "685": "▁She", "686": "▁give", "687": "akes", "688": "▁ninet", "689": "▁reg", "690": "▁min", "691": "▁op", "692": 
"▁def", "693": "▁didn", "694": "te", "695": "▁cour", "696": "▁why", "697": "▁ent", "698": "▁place", "699": "▁ins", "700": "▁car", "701": "ather", "702": "▁person", "703": "ular", "704": "▁inst", "705": "▁prod", "706": "lect", "707": "▁Al", "708": "▁today", "709": "▁bec", "710": "▁sur", "711": "▁All", "712": "▁another", "713": "▁bus", "714": "▁keep", "715": "ell", "716": "ese", "717": "riend", "718": "▁quest", "719": "▁talk", "720": "als", "721": "ings", "722": "▁mon", "723": "cond", "724": "old", "725": "▁acc", "726": "▁la", "727": "▁num", "728": "ident", "729": "▁che", "730": "iness", "731": "▁turn", "732": "▁ear", "733": "▁No", "734": "ousand", "735": "▁better", "736": "ific", "737": "▁loo", "738": "▁gl", "739": "oc", "740": "▁important", "741": "ited", "742": "▁An", "743": "▁thousand", "744": "ility", "745": "llow", "746": "▁used", "747": "▁gen", "748": "▁sim", "749": "li", "750": "▁happen", "751": "▁Un", "752": "▁Let", "753": "air", "754": "ock", "755": "ably", "756": "gg", "757": "▁watch", "758": "▁For", "759": "▁sw", "760": "ren", "761": "ute", "762": "ever", "763": "▁pol", "764": "▁sch", "765": "▁When", "766": "▁such", "767": "▁fif", "768": "▁home", "769": "▁cle", "770": "▁contin", "771": "ouse", "772": "▁friend", "773": "uring", "774": "▁Okay", "775": "gr", "776": "▁able", "777": "▁stud", "778": "▁eff", "779": "hip", "780": "body", "781": "▁top", "782": "ness", "783": "▁exper", "784": "▁pret", "785": "▁both", "786": "▁done", "787": "cri", "788": "▁mark", "789": "▁while", "790": "▁old", "791": "ros", "792": "ont", "793": "▁second", "794": "ative", "795": "▁thought", "796": "▁best", "797": "▁found", "798": "iew", "799": "▁belie", "800": "▁each", "801": "erest", "802": "▁tri", "803": "▁eas", "804": "▁ca", "805": "▁fact", "806": "▁care", "807": "▁fun", "808": "atter", "809": "ures", "810": "▁head", "811": "▁lear", "812": "▁water", "813": "▁hard", "814": "▁few", "815": "▁side", "816": "ween", "817": "▁exp", "818": "▁away", "819": "its", "820": "▁ext", "821": 
"lud", "822": "▁run", "823": "▁trans", "824": "ince", "825": "▁sk", "826": "▁open", "827": "cus", "828": "▁between", "829": "▁called", "830": "▁wee", "831": "▁pretty", "832": "ason", "833": "▁far", "834": "ember", "835": "omm", "836": "▁interest", "837": "any", "838": "ner", "839": "uff", "840": "▁pres", "841": "▁cur", "842": "▁child", "843": "ee", "844": "▁toget", "845": "▁together", "846": "olog", "847": "▁God", "848": "ond", "849": "▁char", "850": "▁looking", "851": "stem", "852": "az", "853": "cent", "854": "▁ob", "855": "▁ass", "856": "land", "857": "▁doesn", "858": "▁business", "859": "▁course", "860": "▁ten", "861": "ps", "862": "arch", "863": "ced", "864": "ms", "865": "ize", "866": "nce", "867": "▁ref", "868": "▁name", "869": "ross", "870": "▁grow", "871": "oney", "872": "▁went", "873": "ics", "874": "teen", "875": "▁cou", "876": "▁prob", "877": "▁ret", "878": "▁guys", "879": "▁came", "880": "ash", "881": "led", "882": "▁Eur", "883": "ues", "884": "▁ide", "885": "gan", "886": "▁everything", "887": "▁getting", "888": "▁ask", "889": "▁cor", "890": "▁build", "891": "▁sign", "892": "▁small", "893": "uck", "894": "▁el", "895": "▁col", "896": "▁Is", "897": "ational", "898": "stand", "899": "cy", "900": "▁conf", "901": "der", "902": "▁bre", "903": "▁cap", "904": "▁mod", "905": "ets", "906": "ike", "907": "▁number", "908": "▁comple", "909": "ertain", "910": "▁ever", "911": "▁coll", "912": "▁hum", "913": "▁Europe", "914": "▁cre", "915": "▁met", "916": "▁exam", "917": "▁move", "918": "▁pass", "919": "▁left", "920": "▁system", "921": "▁includ", "922": "▁Thank", "923": "cept", "924": "▁wom", "925": "▁product", "926": "ten", "927": "▁rest", "928": "▁probably", "929": "▁dri", "930": "▁Do", "931": "▁gener", "932": "▁anything", "933": "▁lar", "934": "▁My", "935": "▁school", "936": "▁lead", "937": "▁sub", "938": "▁ty", "939": "▁plan", "940": "▁seem", "941": "▁whole", "942": "irect", "943": "▁light", "944": "▁must", "945": "▁mom", "946": "▁opp", "947": "▁support", "948": 
"▁family", "949": "ices", "950": "amp", "951": "▁proble", "952": "▁dr", "953": "ready", "954": "▁using", "955": "ense", "956": "▁prov", "957": "ush", "958": "ax", "959": "▁power", "960": "▁Re", "961": "alth", "962": "▁ev", "963": "▁stand", "964": "��war", "965": "ts", "966": "▁", "967": "e", "968": "t", "969": "o", "970": "a", "971": "n", "972": "i", "973": "s", "974": "r", "975": "h", "976": "l", "977": "d", "978": "u", "979": "c", "980": "m", "981": "y", "982": "g", "983": "w", "984": "f", "985": "p", "986": ".", "987": "b", "988": ",", "989": "v", "990": "k", "991": "'", "992": "I", "993": "T", "994": "A", "995": "S", "996": "x", "997": "W", "998": "j", "999": "B", "1000": "C", "1001": "H", "1002": "?", "1003": "M", "1004": "O", "1005": "Y", "1006": "N", "1007": "P", "1008": "E", "1009": "q", "1010": "L", "1011": "D", "1012": "z", "1013": "G", "1014": "F", "1015": "R", "1016": "!", "1017": "J", "1018": "U", "1019": "K", "1020": "V", "1021": "Q", "1022": "Z", "1023": "X"}
convert/parakeet-tdt-ctc-110m/coreml/convert-parakeet.py ADDED
@@ -0,0 +1,697 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """CLI for exporting Parakeet TDT-CTC 110M Hybrid components to CoreML."""
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from dataclasses import asdict
7
+ from pathlib import Path
8
+ from typing import Dict, Optional, Tuple
9
+
10
+ import coremltools as ct
11
+ import numpy as np
12
+ import soundfile as sf
13
+ import torch
14
+ import typer
15
+
16
+ import nemo.collections.asr as nemo_asr
17
+
18
+ from individual_components import (
19
+ CTCHeadWrapper,
20
+ DecoderWrapper,
21
+ EncoderWrapper,
22
+ ExportSettings,
23
+ JointWrapper,
24
+ JointDecisionWrapper,
25
+ JointDecisionSingleStep,
26
+ PreprocessorWrapper,
27
+ MelEncoderWrapper,
28
+ _coreml_convert,
29
+ )
30
+
31
+ DEFAULT_MODEL_ID = "nvidia/parakeet-tdt_ctc-110m"
32
+ AUTHOR = "Fluid Inference"
33
+
34
+
35
+ def _compute_length(seconds: float, sample_rate: int) -> int:
36
+ return int(round(seconds * sample_rate))
37
+
38
+
39
+ def _prepare_audio(
40
+ validation_audio: Optional[Path],
41
+ sample_rate: int,
42
+ max_samples: int,
43
+ seed: Optional[int],
44
+ ) -> torch.Tensor:
45
+ if validation_audio is None:
46
+ if seed is not None:
47
+ torch.manual_seed(seed)
48
+ audio = torch.randn(1, max_samples, dtype=torch.float32)
49
+ return audio
50
+
51
+ data, sr = sf.read(str(validation_audio), dtype="float32")
52
+ if sr != sample_rate:
53
+ raise typer.BadParameter(
54
+ f"Validation audio sample rate {sr} does not match model rate {sample_rate}"
55
+ )
56
+
57
+ if data.ndim > 1:
58
+ data = data[:, 0]
59
+
60
+ if data.size == 0:
61
+ raise typer.BadParameter("Validation audio is empty")
62
+
63
+ if data.size < max_samples:
64
+ pad_width = max_samples - data.size
65
+ data = np.pad(data, (0, pad_width))
66
+ elif data.size > max_samples:
67
+ data = data[:max_samples]
68
+
69
+ audio = torch.from_numpy(data).unsqueeze(0).to(dtype=torch.float32)
70
+ return audio
71
+
72
+
73
def _save_mlpackage(model: ct.models.MLModel, path: Path, description: str) -> None:
    """Stamp metadata on a converted CoreML model and write it to ``path``.

    Sets the short description and the module-level ``AUTHOR``, requests an
    iOS 17+ deployment target, creates parent directories as needed, and saves
    the .mlpackage.
    """
    # Best-effort iOS 17+ target for MLProgram ops and ANE readiness; some
    # coremltools versions may not accept the assignment, so failures are ignored.
    try:
        model.minimum_deployment_target = ct.target.iOS17
    except Exception:
        pass
    model.short_description = description
    model.author = AUTHOR
    destination = str(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    model.save(destination)
83
+
84
+
85
+ def _tensor_shape(tensor: torch.Tensor) -> Tuple[int, ...]:
86
+ return tuple(int(dim) for dim in tensor.shape)
87
+
88
+
89
def _parse_compute_units(name: str) -> ct.ComputeUnit:
    """Map a human-friendly string onto a ``ct.ComputeUnit`` member.

    Accepted (case-insensitive): ALL, CPU_ONLY, CPU_AND_GPU, CPU_AND_NE,
    with CPU_AND_NEURALENGINE accepted as an alias for CPU_AND_NE.

    Raises:
        typer.BadParameter: if ``name`` matches none of the accepted spellings.
    """
    table = {
        "ALL": ct.ComputeUnit.ALL,
        "CPU_ONLY": ct.ComputeUnit.CPU_ONLY,
        "CPU_AND_GPU": ct.ComputeUnit.CPU_AND_GPU,
        "CPU_AND_NE": ct.ComputeUnit.CPU_AND_NE,
        "CPU_AND_NEURALENGINE": ct.ComputeUnit.CPU_AND_NE,
    }
    key = str(name).strip().upper()
    unit = table.get(key)
    if unit is None:
        raise typer.BadParameter(
            f"Unknown compute units '{name}'. Choose from: " + ", ".join(table.keys())
        )
    return unit
107
+
108
+
109
def _parse_compute_precision(name: Optional[str]) -> Optional[ct.precision]:
    """Map a precision string onto ``ct.precision``, or None for the tool default.

    Accepted (case-insensitive): FLOAT32, FLOAT16. ``None`` or a blank/empty
    string selects the converter's default precision (returns None).

    Raises:
        typer.BadParameter: for any other value.
    """
    if name is None:
        return None
    normalized = str(name).strip().upper()
    if not normalized:
        # Empty after trimming: fall back to the tool default.
        return None
    table = {
        "FLOAT32": ct.precision.FLOAT32,
        "FLOAT16": ct.precision.FLOAT16,
    }
    precision = table.get(normalized)
    if precision is None:
        raise typer.BadParameter(
            f"Unknown compute precision '{name}'. Choose from: " + ", ".join(table.keys())
        )
    return precision
128
+
129
+
130
+ # Fixed export choices: CPU_ONLY + FP32, min target iOS17
131
+
132
+
133
+ app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False)
134
+
135
+
136
+ @app.command()
137
+ def convert(
138
+ nemo_path: Optional[Path] = typer.Option(
139
+ None,
140
+ "--nemo-path",
141
+ exists=True,
142
+ resolve_path=True,
143
+ help="Path to parakeet-tdt_ctc-110m .nemo checkpoint (skip to auto-download)",
144
+ ),
145
+ model_id: str = typer.Option(
146
+ DEFAULT_MODEL_ID,
147
+ "--model-id",
148
+ help="Model identifier to download when --nemo-path is omitted",
149
+ ),
150
+ output_dir: Path = typer.Option(Path("parakeet_110m_coreml"), help="Directory where mlpackages and metadata will be written"),
151
+ preprocessor_cu: str = typer.Option(
152
+ "CPU_ONLY",
153
+ "--preprocessor-cu",
154
+ help="Compute units for preprocessor (default CPU_ONLY)",
155
+ ),
156
+ mel_encoder_cu: str = typer.Option(
157
+ "CPU_ONLY",
158
+ "--mel-encoder-cu",
159
+ help="Compute units for fused mel+encoder (default CPU_ONLY)",
160
+ ),
161
+ compute_precision: Optional[str] = typer.Option(
162
+ None,
163
+ "--compute-precision",
164
+ help="Export precision: FLOAT32 (default) or FLOAT16 to shrink non-quantized weights.",
165
+ ),
166
+ ) -> None:
167
+ """Export all Parakeet TDT-CTC 110M Hybrid sub-modules to CoreML with a fixed 15-second window.
168
+
169
+ This exports both CTC and TDT components from the hybrid model.
170
+ """
171
+ # Runtime CoreML contract keeps U=1 so the prediction net matches the streaming decoder.
172
+ export_settings = ExportSettings(
173
+ output_dir=output_dir,
174
+ compute_units=ct.ComputeUnit.CPU_ONLY, # Default: CPU-only for all components
175
+ deployment_target=ct.target.iOS17, # iOS 17+ features and kernels
176
+ compute_precision=_parse_compute_precision(compute_precision),
177
+ max_audio_seconds=15.0,
178
+ max_symbol_steps=1,
179
+ )
180
+
181
+ typer.echo("Export configuration:")
182
+ typer.echo(asdict(export_settings))
183
+
184
+ output_dir.mkdir(parents=True, exist_ok=True)
185
+ pre_cu = _parse_compute_units(preprocessor_cu)
186
+ melenc_cu = _parse_compute_units(mel_encoder_cu)
187
+
188
+ if nemo_path is not None:
189
+ typer.echo(f"Loading NeMo model from {nemo_path}…")
190
+ # 110M is a hybrid model: EncDecHybridRNNTCTCBPEModel
191
+ asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(
192
+ str(nemo_path), map_location="cpu"
193
+ )
194
+ checkpoint_meta = {
195
+ "type": "file",
196
+ "path": str(nemo_path),
197
+ }
198
+ else:
199
+ typer.echo(f"Downloading NeMo model via {model_id}…")
200
+ # 110M is a hybrid model: EncDecHybridRNNTCTCBPEModel
201
+ asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
202
+ model_id, map_location="cpu"
203
+ )
204
+ checkpoint_meta = {
205
+ "type": "pretrained",
206
+ "model_id": model_id,
207
+ }
208
+ asr_model.eval()
209
+
210
+ sample_rate = int(asr_model.cfg.preprocessor.sample_rate)
211
+ max_samples = _compute_length(export_settings.max_audio_seconds, sample_rate)
212
+
213
+ # Look for a bundled 15s 16kHz audio file
214
+ default_audio = (Path(__file__).parent / "audio" / "yc_first_minute_16k_15s.wav").resolve()
215
+ if default_audio.exists():
216
+ typer.echo(f"Using trace audio: {default_audio}")
217
+ audio_tensor = _prepare_audio(default_audio, sample_rate, max_samples, seed=None)
218
+ else:
219
+ typer.echo("No trace audio found, using random noise for tracing")
220
+ audio_tensor = _prepare_audio(None, sample_rate, max_samples, seed=42)
221
+ audio_length = torch.tensor([max_samples], dtype=torch.int32)
222
+
223
+ preprocessor = PreprocessorWrapper(asr_model.preprocessor.eval())
224
+ encoder = EncoderWrapper(asr_model.encoder.eval())
225
+ decoder = DecoderWrapper(asr_model.decoder.eval())
226
+ joint = JointWrapper(asr_model.joint.eval())
227
+ # CTC head for hybrid model
228
+ ctc_head = CTCHeadWrapper(asr_model.ctc_decoder.eval())
229
+
230
+ decoder_export_flag = getattr(asr_model.decoder, "_rnnt_export", False)
231
+ asr_model.decoder._rnnt_export = True
232
+
233
+ try:
234
+ with torch.inference_mode():
235
+ mel_ref, mel_length_ref = preprocessor(audio_tensor, audio_length)
236
+ mel_length_ref = mel_length_ref.to(dtype=torch.int32)
237
+ encoder_ref, encoder_length_ref = encoder(mel_ref, mel_length_ref)
238
+ encoder_length_ref = encoder_length_ref.to(dtype=torch.int32)
239
+ # CTC log probs
240
+ ctc_log_probs_ref = ctc_head(encoder_ref)
241
+
242
+ # Clone Tensors to drop the inference tensor flag before tracing
243
+ mel_ref = mel_ref.clone()
244
+ mel_length_ref = mel_length_ref.clone()
245
+ encoder_ref = encoder_ref.clone()
246
+ encoder_length_ref = encoder_length_ref.clone()
247
+ ctc_log_probs_ref = ctc_log_probs_ref.clone()
248
+
249
+ vocab_size = int(asr_model.tokenizer.vocab_size)
250
+ num_extra = int(asr_model.joint.num_extra_outputs)
251
+ decoder_hidden = int(asr_model.decoder.pred_hidden)
252
+ decoder_layers = int(asr_model.decoder.pred_rnn_layers)
253
+
254
+ typer.echo(f"Model info:")
255
+ typer.echo(f" Vocab size: {vocab_size}")
256
+ typer.echo(f" Num extra (duration bins): {num_extra}")
257
+ typer.echo(f" Decoder hidden: {decoder_hidden}")
258
+ typer.echo(f" Decoder layers: {decoder_layers}")
259
+ typer.echo(f" Encoder output shape: {_tensor_shape(encoder_ref)}")
260
+
261
+ targets = torch.full(
262
+ (1, export_settings.max_symbol_steps),
263
+ fill_value=asr_model.decoder.blank_idx,
264
+ dtype=torch.int32,
265
+ )
266
+ target_lengths = torch.tensor(
267
+ [export_settings.max_symbol_steps], dtype=torch.int32
268
+ )
269
+ zero_state = torch.zeros(
270
+ decoder_layers,
271
+ 1,
272
+ decoder_hidden,
273
+ dtype=torch.float32,
274
+ )
275
+
276
+ with torch.inference_mode():
277
+ decoder_ref, h_ref, c_ref = decoder(targets, target_lengths, zero_state, zero_state)
278
+ joint_ref = joint(encoder_ref, decoder_ref)
279
+
280
+ decoder_ref = decoder_ref.clone()
281
+ h_ref = h_ref.clone()
282
+ c_ref = c_ref.clone()
283
+ joint_ref = joint_ref.clone()
284
+
285
+ typer.echo(f" Decoder output shape: {_tensor_shape(decoder_ref)}")
286
+ typer.echo(f" Joint output shape: {_tensor_shape(joint_ref)}")
287
+ typer.echo(f" CTC log probs shape: {_tensor_shape(ctc_log_probs_ref)}")
288
+
289
+ typer.echo("Tracing and converting preprocessor…")
290
+ # Ensure tracing happens on CPU explicitly
291
+ preprocessor = preprocessor.cpu()
292
+ audio_tensor = audio_tensor.cpu()
293
+ audio_length = audio_length.cpu()
294
+ traced_preprocessor = torch.jit.trace(
295
+ preprocessor, (audio_tensor, audio_length), strict=False
296
+ )
297
+ traced_preprocessor.eval()
298
+ preprocessor_inputs = [
299
+ # Allow variable-length audio up to the fixed 15s window using RangeDim
300
+ ct.TensorType(
301
+ name="audio",
302
+ shape=(1, ct.RangeDim(1, max_samples)),
303
+ dtype=np.float32,
304
+ ),
305
+ ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
306
+ ]
307
+ preprocessor_outputs = [
308
+ ct.TensorType(name="mel_features", dtype=np.float32),
309
+ ct.TensorType(name="mel_length", dtype=np.int32),
310
+ ]
311
+ # Preprocessor compute units (parametrized; default CPU_ONLY)
312
+ preprocessor_model = _coreml_convert(
313
+ traced_preprocessor,
314
+ preprocessor_inputs,
315
+ preprocessor_outputs,
316
+ export_settings,
317
+ compute_units_override=pre_cu,
318
+ )
319
+ preprocessor_path = output_dir / "parakeet_preprocessor.mlpackage"
320
+ _save_mlpackage(
321
+ preprocessor_model,
322
+ preprocessor_path,
323
+ "Parakeet 110M preprocessor (15 s window)",
324
+ )
325
+
326
+ typer.echo("Tracing and converting encoder…")
327
+ traced_encoder = torch.jit.trace(
328
+ encoder, (mel_ref, mel_length_ref), strict=False
329
+ )
330
+ traced_encoder.eval()
331
+ encoder_inputs = [
332
+ ct.TensorType(name="mel_features", shape=_tensor_shape(mel_ref), dtype=np.float32),
333
+ ct.TensorType(name="mel_length", shape=(1,), dtype=np.int32),
334
+ ]
335
+ encoder_outputs = [
336
+ ct.TensorType(name="encoder_output", dtype=np.float32),
337
+ ct.TensorType(name="encoder_length", dtype=np.int32),
338
+ ]
339
+ # Encoder: CPU only
340
+ encoder_model = _coreml_convert(
341
+ traced_encoder,
342
+ encoder_inputs,
343
+ encoder_outputs,
344
+ export_settings,
345
+ compute_units_override=ct.ComputeUnit.CPU_ONLY,
346
+ )
347
+ encoder_path = output_dir / "parakeet_encoder.mlpackage"
348
+ _save_mlpackage(
349
+ encoder_model,
350
+ encoder_path,
351
+ "Parakeet 110M encoder (15 s window)",
352
+ )
353
+
354
+ # CTC Head for hybrid model
355
+ typer.echo("Tracing and converting CTC head…")
356
+ traced_ctc_head = torch.jit.trace(
357
+ ctc_head, (encoder_ref,), strict=False
358
+ )
359
+ traced_ctc_head.eval()
360
+ ctc_head_inputs = [
361
+ ct.TensorType(name="encoder_output", shape=_tensor_shape(encoder_ref), dtype=np.float32),
362
+ ]
363
+ ctc_head_outputs = [
364
+ ct.TensorType(name="ctc_logits", dtype=np.float32),
365
+ ]
366
+ ctc_head_model = _coreml_convert(
367
+ traced_ctc_head,
368
+ ctc_head_inputs,
369
+ ctc_head_outputs,
370
+ export_settings,
371
+ compute_units_override=ct.ComputeUnit.CPU_ONLY,
372
+ )
373
+ ctc_head_path = output_dir / "parakeet_ctc_head.mlpackage"
374
+ _save_mlpackage(
375
+ ctc_head_model,
376
+ ctc_head_path,
377
+ "Parakeet 110M CTC decoder head",
378
+ )
379
+
380
+ # Optional fused export: Preprocessor + Encoder
381
+ typer.echo("Tracing and converting fused mel+encoder…")
382
+ mel_encoder = MelEncoderWrapper(preprocessor, encoder)
383
+ traced_mel_encoder = torch.jit.trace(
384
+ mel_encoder, (audio_tensor, audio_length), strict=False
385
+ )
386
+ traced_mel_encoder.eval()
387
+ mel_encoder_inputs = [
388
+ # Keep fixed 15s window for fused Mel+Encoder
389
+ ct.TensorType(name="audio", shape=(1, max_samples), dtype=np.float32),
390
+ ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
391
+ ]
392
+ mel_encoder_outputs = [
393
+ ct.TensorType(name="encoder_output", dtype=np.float32),
394
+ ct.TensorType(name="encoder_length", dtype=np.int32),
395
+ ]
396
+ # Fused mel+encoder compute units (parametrized; default CPU_ONLY)
397
+ mel_encoder_model = _coreml_convert(
398
+ traced_mel_encoder,
399
+ mel_encoder_inputs,
400
+ mel_encoder_outputs,
401
+ export_settings,
402
+ compute_units_override=melenc_cu,
403
+ )
404
+ mel_encoder_path = output_dir / "parakeet_mel_encoder.mlpackage"
405
+ _save_mlpackage(
406
+ mel_encoder_model,
407
+ mel_encoder_path,
408
+ "Parakeet 110M fused Mel+Encoder (15 s window)",
409
+ )
410
+
411
+ typer.echo("Tracing and converting decoder…")
412
+ traced_decoder = torch.jit.trace(
413
+ decoder,
414
+ (targets, target_lengths, zero_state, zero_state),
415
+ strict=False,
416
+ )
417
+ traced_decoder.eval()
418
+ decoder_inputs = [
419
+ ct.TensorType(name="targets", shape=_tensor_shape(targets), dtype=np.int32),
420
+ ct.TensorType(name="target_length", shape=(1,), dtype=np.int32),
421
+ ct.TensorType(name="h_in", shape=_tensor_shape(zero_state), dtype=np.float32),
422
+ ct.TensorType(name="c_in", shape=_tensor_shape(zero_state), dtype=np.float32),
423
+ ]
424
+ decoder_outputs = [
425
+ ct.TensorType(name="decoder", dtype=np.float32),
426
+ ct.TensorType(name="h_out", dtype=np.float32),
427
+ ct.TensorType(name="c_out", dtype=np.float32),
428
+ ]
429
+ # Decoder: CPU only
430
+ decoder_model = _coreml_convert(
431
+ traced_decoder,
432
+ decoder_inputs,
433
+ decoder_outputs,
434
+ export_settings,
435
+ compute_units_override=ct.ComputeUnit.CPU_ONLY,
436
+ )
437
+ decoder_path = output_dir / "parakeet_decoder.mlpackage"
438
+ _save_mlpackage(
439
+ decoder_model,
440
+ decoder_path,
441
+ "Parakeet 110M decoder (RNNT prediction network)",
442
+ )
443
+
444
+ typer.echo("Tracing and converting joint…")
445
+ traced_joint = torch.jit.trace(
446
+ joint,
447
+ (encoder_ref, decoder_ref),
448
+ strict=False,
449
+ )
450
+ traced_joint.eval()
451
+ joint_inputs = [
452
+ ct.TensorType(name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32),
453
+ ct.TensorType(name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32),
454
+ ]
455
+ joint_outputs = [
456
+ ct.TensorType(name="logits", dtype=np.float32),
457
+ ]
458
+ # Joint: CPU only
459
+ joint_model = _coreml_convert(
460
+ traced_joint,
461
+ joint_inputs,
462
+ joint_outputs,
463
+ export_settings,
464
+ compute_units_override=ct.ComputeUnit.CPU_ONLY,
465
+ )
466
+ joint_path = output_dir / "parakeet_joint.mlpackage"
467
+ _save_mlpackage(
468
+ joint_model,
469
+ joint_path,
470
+ "Parakeet 110M joint network (RNNT)",
471
+ )
472
+
473
+ # Joint + decision head (split logits, softmax, argmax)
474
+ typer.echo("Tracing and converting joint decision head…")
475
+ vocab_size = int(asr_model.tokenizer.vocab_size)
476
+ num_extra = int(asr_model.joint.num_extra_outputs)
477
+ joint_decision = JointDecisionWrapper(joint, vocab_size=vocab_size, num_extra=num_extra)
478
+ traced_joint_decision = torch.jit.trace(
479
+ joint_decision,
480
+ (encoder_ref, decoder_ref),
481
+ strict=False,
482
+ )
483
+ traced_joint_decision.eval()
484
+ joint_decision_inputs = [
485
+ ct.TensorType(name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32),
486
+ ct.TensorType(name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32),
487
+ ]
488
+ joint_decision_outputs = [
489
+ ct.TensorType(name="token_id", dtype=np.int32),
490
+ ct.TensorType(name="token_prob", dtype=np.float32),
491
+ ct.TensorType(name="duration", dtype=np.int32),
492
+ ]
493
+ # JointDecision: CPU only
494
+ joint_decision_model = _coreml_convert(
495
+ traced_joint_decision,
496
+ joint_decision_inputs,
497
+ joint_decision_outputs,
498
+ export_settings,
499
+ compute_units_override=ct.ComputeUnit.CPU_ONLY,
500
+ )
501
+ joint_decision_path = output_dir / "parakeet_joint_decision.mlpackage"
502
+ _save_mlpackage(
503
+ joint_decision_model,
504
+ joint_decision_path,
505
+ "Parakeet 110M joint + decision head (split, softmax, argmax)",
506
+ )
507
+
508
+ # Single-step JointDecision for [1,512,1] x [1,640,1] -> [1,1,1]
509
+ # Note: 110M encoder dim is 512 (not 1024 like 0.6B)
510
+ typer.echo("Tracing and converting single-step joint decision…")
511
+ jd_single = JointDecisionSingleStep(joint, vocab_size=vocab_size, num_extra=num_extra)
512
+ # Create single-step slices from refs
513
+ enc_step = encoder_ref[:, :, :1].contiguous()
514
+ dec_step = decoder_ref[:, :, :1].contiguous()
515
+ traced_jd_single = torch.jit.trace(
516
+ jd_single,
517
+ (enc_step, dec_step),
518
+ strict=False,
519
+ )
520
+ traced_jd_single.eval()
521
+ jd_single_inputs = [
522
+ ct.TensorType(name="encoder_step", shape=(1, enc_step.shape[1], 1), dtype=np.float32),
523
+ ct.TensorType(name="decoder_step", shape=(1, dec_step.shape[1], 1), dtype=np.float32),
524
+ ]
525
+ jd_single_outputs = [
526
+ ct.TensorType(name="token_id", dtype=np.int32),
527
+ ct.TensorType(name="token_prob", dtype=np.float32),
528
+ ct.TensorType(name="duration", dtype=np.int32),
529
+ ct.TensorType(name="top_k_ids", dtype=np.int32),
530
+ ct.TensorType(name="top_k_logits", dtype=np.float32),
531
+ ]
532
+ # Single-step JointDecision: CPU only
533
+ jd_single_model = _coreml_convert(
534
+ traced_jd_single,
535
+ jd_single_inputs,
536
+ jd_single_outputs,
537
+ export_settings,
538
+ compute_units_override=ct.ComputeUnit.CPU_ONLY,
539
+ )
540
+ jd_single_path = output_dir / "parakeet_joint_decision_single_step.mlpackage"
541
+ _save_mlpackage(
542
+ jd_single_model,
543
+ jd_single_path,
544
+ "Parakeet 110M single-step joint decision (current frame)",
545
+ )
546
+
547
+ # Export vocabulary
548
+ typer.echo("Exporting vocabulary…")
549
+ vocab_path = output_dir / "vocab.json"
550
+ vocab_dict = {
551
+ "vocab_size": vocab_size,
552
+ "blank_id": int(asr_model.decoder.blank_idx),
553
+ "tokens": asr_model.tokenizer.vocab,
554
+ }
555
+ vocab_path.write_text(json.dumps(vocab_dict, indent=2, ensure_ascii=False))
556
+
557
+ metadata: Dict[str, object] = {
558
+ "model_id": model_id,
559
+ "model_type": "hybrid_rnnt_ctc",
560
+ "sample_rate": sample_rate,
561
+ "max_audio_seconds": export_settings.max_audio_seconds,
562
+ "max_audio_samples": max_samples,
563
+ "max_symbol_steps": export_settings.max_symbol_steps,
564
+ "vocab_size": vocab_size,
565
+ "joint_extra_outputs": num_extra,
566
+ "encoder_dim": int(encoder_ref.shape[1]), # 512 for 110M
567
+ "decoder_dim": int(decoder_ref.shape[1]), # 640
568
+ "decoder_hidden": decoder_hidden,
569
+ "decoder_layers": decoder_layers,
570
+ "blank_id": int(asr_model.decoder.blank_idx),
571
+ "checkpoint": checkpoint_meta,
572
+ "coreml": {
573
+ "compute_units": export_settings.compute_units.name,
574
+ "compute_precision": (
575
+ export_settings.compute_precision.name
576
+ if export_settings.compute_precision is not None
577
+ else "FLOAT32"
578
+ ),
579
+ },
580
+ "components": {
581
+ "preprocessor": {
582
+ "inputs": {
583
+ "audio_signal": list(_tensor_shape(audio_tensor)),
584
+ "audio_length": [1],
585
+ },
586
+ "outputs": {
587
+ "mel": list(_tensor_shape(mel_ref)),
588
+ "mel_length": [1],
589
+ },
590
+ "path": preprocessor_path.name,
591
+ },
592
+ "encoder": {
593
+ "inputs": {
594
+ "mel": list(_tensor_shape(mel_ref)),
595
+ "mel_length": [1],
596
+ },
597
+ "outputs": {
598
+ "encoder": list(_tensor_shape(encoder_ref)),
599
+ "encoder_length": [1],
600
+ },
601
+ "path": encoder_path.name,
602
+ },
603
+ "ctc_head": {
604
+ "inputs": {
605
+ "encoder": list(_tensor_shape(encoder_ref)),
606
+ },
607
+ "outputs": {
608
+ "log_probs": list(_tensor_shape(ctc_log_probs_ref)),
609
+ },
610
+ "path": ctc_head_path.name,
611
+ },
612
+ "mel_encoder": {
613
+ "inputs": {
614
+ "audio_signal": [1, max_samples],
615
+ "audio_length": [1],
616
+ },
617
+ "outputs": {
618
+ "encoder": list(_tensor_shape(encoder_ref)),
619
+ "encoder_length": [1],
620
+ },
621
+ "path": mel_encoder_path.name,
622
+ },
623
+ "decoder": {
624
+ "inputs": {
625
+ "targets": list(_tensor_shape(targets)),
626
+ "target_length": [1],
627
+ "h_in": list(_tensor_shape(zero_state)),
628
+ "c_in": list(_tensor_shape(zero_state)),
629
+ },
630
+ "outputs": {
631
+ "decoder": list(_tensor_shape(decoder_ref)),
632
+ "h_out": list(_tensor_shape(h_ref)),
633
+ "c_out": list(_tensor_shape(c_ref)),
634
+ },
635
+ "path": decoder_path.name,
636
+ },
637
+ "joint": {
638
+ "inputs": {
639
+ "encoder": list(_tensor_shape(encoder_ref)),
640
+ "decoder": list(_tensor_shape(decoder_ref)),
641
+ },
642
+ "outputs": {
643
+ "logits": list(_tensor_shape(joint_ref)),
644
+ },
645
+ "path": joint_path.name,
646
+ },
647
+ "joint_decision": {
648
+ "inputs": {
649
+ "encoder": list(_tensor_shape(encoder_ref)),
650
+ "decoder": list(_tensor_shape(decoder_ref)),
651
+ },
652
+ "outputs": {
653
+ "token_id": [
654
+ _tensor_shape(encoder_ref)[0],
655
+ _tensor_shape(encoder_ref)[2],
656
+ _tensor_shape(decoder_ref)[2],
657
+ ],
658
+ "token_prob": [
659
+ _tensor_shape(encoder_ref)[0],
660
+ _tensor_shape(encoder_ref)[2],
661
+ _tensor_shape(decoder_ref)[2],
662
+ ],
663
+ "duration": [
664
+ _tensor_shape(encoder_ref)[0],
665
+ _tensor_shape(encoder_ref)[2],
666
+ _tensor_shape(decoder_ref)[2],
667
+ ],
668
+ },
669
+ "path": joint_decision_path.name,
670
+ },
671
+ "joint_decision_single_step": {
672
+ "inputs": {
673
+ "encoder_step": [1, int(encoder_ref.shape[1]), 1],
674
+ "decoder_step": [1, int(decoder_ref.shape[1]), 1],
675
+ },
676
+ "outputs": {
677
+ "token_id": [1, 1, 1],
678
+ "token_prob": [1, 1, 1],
679
+ "duration": [1, 1, 1],
680
+ "top_k_ids": [1, 1, 1, 64],
681
+ "top_k_logits": [1, 1, 1, 64],
682
+ },
683
+ "path": jd_single_path.name,
684
+ },
685
+ },
686
+ }
687
+
688
+ metadata_path = output_dir / "metadata.json"
689
+ metadata_path.write_text(json.dumps(metadata, indent=2))
690
+ typer.echo(f"Export complete. Metadata written to {metadata_path}")
691
+
692
+ finally:
693
+ asr_model.decoder._rnnt_export = decoder_export_flag
694
+
695
+
696
+ if __name__ == "__main__":
697
+ app()
convert/parakeet-tdt-ctc-110m/coreml/hybrid_earnings_benchmark.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "approach" : "single-encoder",
3
+ "model" : "parakeet-tdt-ctc-110m-hybrid",
4
+ "results" : [
5
+ {
6
+ "audioLength" : 15,
7
+ "ctcDetections" : [
8
+ {
9
+ "endTime" : 6.0800000000000001,
10
+ "inReference" : true,
11
+ "score" : -8.3699999999999992,
12
+ "source" : "ctc",
13
+ "startTime" : 4.96,
14
+ "word" : "LATAM Airlines"
15
+ }
16
+ ],
17
+ "dictFound" : 1,
18
+ "dictTotal" : 1,
19
+ "fileId" : "4329526_chunk0",
20
+ "hypothesis" : "goodday everyone and welcome to latam airlines group earnings release confonference call just as a reminder this conference is being recorded lat tam airlines group eararnings releaseed for the",
21
+ "processingTime" : 0.070000000000000007,
22
+ "reference" : "good day everyone and welcome to latam airlines group earnings release conference call just as a reminder this conference is being recorded latam airlines group earnings released for the",
23
+ "wer" : 24.140000000000001
24
+ }
25
+ ],
26
+ "summary" : {
27
+ "avgWer" : 24.140000000000001,
28
+ "dictPass" : 1,
29
+ "dictRate" : 100,
30
+ "dictTotal" : 1,
31
+ "totalAudioDuration" : 15,
32
+ "totalProcessingTime" : 0.070000000000000007,
33
+ "totalTests" : 1
34
+ }
35
+ }
convert/parakeet-tdt-ctc-110m/coreml/individual_components.py ADDED
@@ -0,0 +1,265 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """Export Parakeet TDT-CTC 110M Hybrid RNNT components into CoreML and validate outputs."""
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+ from pathlib import Path
7
+ from typing import Optional, Tuple
8
+
9
+ import coremltools as ct
10
+ import torch
11
+
12
+
13
+ @dataclass
14
+ class ExportSettings:
15
+ output_dir: Path
16
+ compute_units: ct.ComputeUnit
17
+ deployment_target: Optional[ct.target.iOS17]
18
+ compute_precision: Optional[ct.precision]
19
+ max_audio_seconds: float
20
+ max_symbol_steps: int
21
+
22
+
23
+ @dataclass
24
+ class ValidationSettings:
25
+ audio_path: Optional[Path]
26
+ seconds: float
27
+ seed: Optional[int]
28
+ rtol: float
29
+ atol: float
30
+ skip: bool
31
+
32
+
33
+ @dataclass
34
+ class ValidationDiff:
35
+ name: str
36
+ max_abs_diff: float
37
+ max_rel_diff: float
38
+
39
+
40
+ @dataclass
41
+ class ValidationResult:
42
+ source: str
43
+ audio_num_samples: int
44
+ audio_seconds: float
45
+ token_length: int
46
+ atol: float
47
+ rtol: float
48
+ diffs: Tuple[ValidationDiff, ...]
49
+
50
+
51
+ class PreprocessorWrapper(torch.nn.Module):
52
+ def __init__(self, module: torch.nn.Module) -> None:
53
+ super().__init__()
54
+ self.module = module
55
+
56
+ def forward(self, audio_signal: torch.Tensor, length: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
57
+ mel, mel_length = self.module(input_signal=audio_signal, length=length.to(dtype=torch.long))
58
+ return mel, mel_length
59
+
60
+
61
+ class EncoderWrapper(torch.nn.Module):
62
+ def __init__(self, module: torch.nn.Module) -> None:
63
+ super().__init__()
64
+ self.module = module
65
+
66
+ def forward(self, features: torch.Tensor, length: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
67
+ encoded, encoded_lengths = self.module(audio_signal=features, length=length.to(dtype=torch.long))
68
+ return encoded, encoded_lengths
69
+
70
+
71
+ class DecoderWrapper(torch.nn.Module):
72
+ def __init__(self, module: torch.nn.Module) -> None:
73
+ super().__init__()
74
+ self.module = module
75
+
76
+ def forward(
77
+ self,
78
+ targets: torch.Tensor,
79
+ target_lengths: torch.Tensor,
80
+ h_in: torch.Tensor,
81
+ c_in: torch.Tensor,
82
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
83
+ state = [h_in, c_in]
84
+ decoder_output, _, new_state = self.module(
85
+ targets=targets.to(dtype=torch.long),
86
+ target_length=target_lengths.to(dtype=torch.long),
87
+ states=state,
88
+ )
89
+ return decoder_output, new_state[0], new_state[1]
90
+
91
+
92
+ class JointWrapper(torch.nn.Module):
93
+ """Joint network for 110M hybrid model.
94
+
95
+ Note: The 110M model has encoder_dim=512 and decoder_dim=640.
96
+ The joint network projects both to 640, then combines them.
97
+ """
98
+ def __init__(self, module: torch.nn.Module) -> None:
99
+ super().__init__()
100
+ self.module = module
101
+
102
+ def forward(self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor) -> torch.Tensor:
103
+ # Input: encoder_outputs [B, D_enc, T], decoder_outputs [B, D_dec, U]
104
+ # For 110M: D_enc=512, D_dec=640
105
+ # Transpose to match what projection layers expect
106
+ encoder_outputs = encoder_outputs.transpose(1, 2) # [B, T, D_enc]
107
+ decoder_outputs = decoder_outputs.transpose(1, 2) # [B, U, D_dec]
108
+
109
+ # Apply projections
110
+ enc_proj = self.module.enc(encoder_outputs) # [B, T, 640]
111
+ dec_proj = self.module.pred(decoder_outputs) # [B, U, 640]
112
+
113
+ # Explicit broadcasting along T and U to avoid converter ambiguity
114
+ x = enc_proj.unsqueeze(2) + dec_proj.unsqueeze(1) # [B, T, U, 640]
115
+ x = self.module.joint_net[0](x) # ReLU
116
+ x = self.module.joint_net[1](x) # Dropout (no-op in eval)
117
+ out = self.module.joint_net[2](x) # Linear -> logits [B, T, U, vocab+1+durations]
118
+ return out
119
+
120
+
121
+ class CTCHeadWrapper(torch.nn.Module):
122
+ """CTC decoder head for 110M hybrid model.
123
+
124
+ Takes encoder output and produces log probabilities over vocabulary.
125
+ The NeMo CTC decoder (ConvASRDecoder) uses Conv1d so it expects [B, D, T] format.
126
+ """
127
+ def __init__(self, module: torch.nn.Module) -> None:
128
+ super().__init__()
129
+ self.module = module
130
+
131
+ def forward(self, encoder_outputs: torch.Tensor) -> torch.Tensor:
132
+ # Input: encoder_outputs [B, D_enc, T] - already in the format CTC decoder expects
133
+ # The NeMo CTC decoder uses Conv1d internally, so it expects [B, D, T]
134
+ # Output: log probabilities [B, T, vocab+1]
135
+ log_probs = self.module(encoder_output=encoder_outputs)
136
+ return log_probs
137
+
138
+
139
+ class MelEncoderWrapper(torch.nn.Module):
140
+ """Fused wrapper: waveform -> mel -> encoder.
141
+
142
+ Inputs:
143
+ - audio_signal: [B, S]
144
+ - audio_length: [B]
145
+
146
+ Outputs:
147
+ - encoder: [B, D, T_enc]
148
+ - encoder_length: [B]
149
+ """
150
+ def __init__(self, preprocessor: PreprocessorWrapper, encoder: EncoderWrapper) -> None:
151
+ super().__init__()
152
+ self.preprocessor = preprocessor
153
+ self.encoder = encoder
154
+
155
+ def forward(self, audio_signal: torch.Tensor, audio_length: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
156
+ mel, mel_length = self.preprocessor(audio_signal, audio_length)
157
+ encoded, enc_len = self.encoder(mel, mel_length.to(dtype=torch.int32))
158
+ return encoded, enc_len
159
+
160
+
161
+ class JointDecisionWrapper(torch.nn.Module):
162
+ """Joint + decision head: outputs label id, label prob, duration frames.
163
+
164
+ Splits joint logits into token logits and duration logits, applies softmax
165
+ over tokens, argmax for both heads, and gathers probability of the chosen token.
166
+
167
+ Inputs:
168
+ - encoder_outputs: [B, D, T]
169
+ - decoder_outputs: [B, D, U]
170
+
171
+ Returns:
172
+ - token_id: [B, T, U] int32
173
+ - token_prob: [B, T, U] float32
174
+ - duration: [B, T, U] int32 (frames; for v3 bins=[0,1,2,3,4])
175
+ """
176
+ def __init__(self, joint: JointWrapper, vocab_size: int, num_extra: int) -> None:
177
+ super().__init__()
178
+ self.joint = joint
179
+ self.vocab_with_blank = int(vocab_size) + 1
180
+ self.num_extra = int(num_extra)
181
+
182
+ def forward(self, encoder_outputs: torch.Tensor, decoder_outputs: torch.Tensor):
183
+ logits = self.joint(encoder_outputs, decoder_outputs)
184
+ token_logits = logits[..., : self.vocab_with_blank]
185
+ duration_logits = logits[..., -self.num_extra :]
186
+
187
+ # Token selection
188
+ token_ids = torch.argmax(token_logits, dim=-1).to(dtype=torch.int32)
189
+ token_probs_all = torch.softmax(token_logits, dim=-1)
190
+ # gather expects int64 (long) indices; cast only for gather
191
+ token_prob = torch.gather(
192
+ token_probs_all, dim=-1, index=token_ids.long().unsqueeze(-1)
193
+ ).squeeze(-1)
194
+
195
+ # Duration prediction (bins are identity mapping to frames for v3)
196
+ duration = torch.argmax(duration_logits, dim=-1).to(dtype=torch.int32)
197
+ return token_ids, token_prob, duration
198
+
199
+
200
+ class JointDecisionSingleStep(torch.nn.Module):
201
+ """Single-step variant for streaming: encoder_step [1, 512, 1] -> [1,1,1].
202
+
203
+ Note: For 110M model, encoder_dim is 512 (not 1024 like 0.6B).
204
+
205
+ Inputs:
206
+ - encoder_step: [B=1, D=512, T=1]
207
+ - decoder_step: [B=1, D=640, U=1]
208
+
209
+ Returns:
210
+ - token_id: [1, 1, 1] int32
211
+ - token_prob: [1, 1, 1] float32
212
+ - duration: [1, 1, 1] int32
213
+ - top_k_ids: [1, 1, 1, K] int32
214
+ - top_k_logits: [1, 1, 1, K] float32
215
+ """
216
+ def __init__(self, joint: JointWrapper, vocab_size: int, num_extra: int, top_k: int = 64) -> None:
217
+ super().__init__()
218
+ self.joint = joint
219
+ self.vocab_with_blank = int(vocab_size) + 1
220
+ self.num_extra = int(num_extra)
221
+ # Emit top-K candidates to enable host-side re-ranking with contextual biasing
222
+ self.top_k = int(top_k)
223
+
224
+ def forward(self, encoder_step: torch.Tensor, decoder_step: torch.Tensor):
225
+ # Reuse JointWrapper which expects [B, D, T] and [B, D, U]
226
+ logits = self.joint(encoder_step, decoder_step) # [1, 1, 1, V+extra]
227
+ token_logits = logits[..., : self.vocab_with_blank]
228
+ duration_logits = logits[..., -self.num_extra :]
229
+
230
+ token_ids = torch.argmax(token_logits, dim=-1, keepdim=False).to(dtype=torch.int32)
231
+ token_probs_all = torch.softmax(token_logits, dim=-1)
232
+ token_prob = torch.gather(
233
+ token_probs_all, dim=-1, index=token_ids.long().unsqueeze(-1)
234
+ ).squeeze(-1)
235
+ duration = torch.argmax(duration_logits, dim=-1, keepdim=False).to(dtype=torch.int32)
236
+
237
+ # Also expose top-K candidates for host-side re-ranking.
238
+ # Shapes preserved as [1, 1, 1, K] to match CoreML broadcasting expectations.
239
+ # Note: topk expects last dimension; original shape is [1, 1, 1, V].
240
+ topk_logits, topk_ids_long = torch.topk(token_logits, k=min(self.top_k, token_logits.shape[-1]), dim=-1)
241
+ topk_ids = topk_ids_long.to(dtype=torch.int32)
242
+ return token_ids, token_prob, duration, topk_ids, topk_logits
243
+
244
+
245
+ def _coreml_convert(
246
+ traced: torch.jit.ScriptModule,
247
+ inputs,
248
+ outputs,
249
+ settings: ExportSettings,
250
+ compute_units_override: Optional[ct.ComputeUnit] = None,
251
+ ) -> ct.models.MLModel:
252
+ cu = compute_units_override if compute_units_override is not None else settings.compute_units
253
+ kwargs = {
254
+ "convert_to": "mlprogram",
255
+ "inputs": inputs,
256
+ "outputs": outputs,
257
+ "compute_units": cu,
258
+ }
259
+ print("Converting:", traced.__class__.__name__)
260
+ print("Conversion kwargs:", kwargs)
261
+ if settings.deployment_target is not None:
262
+ kwargs["minimum_deployment_target"] = settings.deployment_target
263
+ if settings.compute_precision is not None:
264
+ kwargs["compute_precision"] = settings.compute_precision
265
+ return ct.convert(traced, **kwargs)
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/metadata.json ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "nvidia/parakeet-tdt_ctc-110m",
3
+ "model_type": "hybrid_rnnt_ctc",
4
+ "sample_rate": 16000,
5
+ "max_audio_seconds": 15.0,
6
+ "max_audio_samples": 240000,
7
+ "max_symbol_steps": 1,
8
+ "vocab_size": 1024,
9
+ "joint_extra_outputs": 5,
10
+ "encoder_dim": 512,
11
+ "decoder_dim": 640,
12
+ "decoder_hidden": 640,
13
+ "decoder_layers": 1,
14
+ "blank_id": 1024,
15
+ "checkpoint": {
16
+ "type": "pretrained",
17
+ "model_id": "nvidia/parakeet-tdt_ctc-110m"
18
+ },
19
+ "coreml": {
20
+ "compute_units": "CPU_ONLY",
21
+ "compute_precision": "FLOAT32"
22
+ },
23
+ "components": {
24
+ "preprocessor": {
25
+ "inputs": {
26
+ "audio_signal": [
27
+ 1,
28
+ 240000
29
+ ],
30
+ "audio_length": [
31
+ 1
32
+ ]
33
+ },
34
+ "outputs": {
35
+ "mel": [
36
+ 1,
37
+ 80,
38
+ 1501
39
+ ],
40
+ "mel_length": [
41
+ 1
42
+ ]
43
+ },
44
+ "path": "parakeet_preprocessor.mlpackage"
45
+ },
46
+ "encoder": {
47
+ "inputs": {
48
+ "mel": [
49
+ 1,
50
+ 80,
51
+ 1501
52
+ ],
53
+ "mel_length": [
54
+ 1
55
+ ]
56
+ },
57
+ "outputs": {
58
+ "encoder": [
59
+ 1,
60
+ 512,
61
+ 188
62
+ ],
63
+ "encoder_length": [
64
+ 1
65
+ ]
66
+ },
67
+ "path": "parakeet_encoder.mlpackage"
68
+ },
69
+ "ctc_head": {
70
+ "inputs": {
71
+ "encoder": [
72
+ 1,
73
+ 512,
74
+ 188
75
+ ]
76
+ },
77
+ "outputs": {
78
+ "log_probs": [
79
+ 1,
80
+ 188,
81
+ 1025
82
+ ]
83
+ },
84
+ "path": "parakeet_ctc_head.mlpackage"
85
+ },
86
+ "mel_encoder": {
87
+ "inputs": {
88
+ "audio_signal": [
89
+ 1,
90
+ 240000
91
+ ],
92
+ "audio_length": [
93
+ 1
94
+ ]
95
+ },
96
+ "outputs": {
97
+ "encoder": [
98
+ 1,
99
+ 512,
100
+ 188
101
+ ],
102
+ "encoder_length": [
103
+ 1
104
+ ]
105
+ },
106
+ "path": "parakeet_mel_encoder.mlpackage"
107
+ },
108
+ "decoder": {
109
+ "inputs": {
110
+ "targets": [
111
+ 1,
112
+ 1
113
+ ],
114
+ "target_length": [
115
+ 1
116
+ ],
117
+ "h_in": [
118
+ 1,
119
+ 1,
120
+ 640
121
+ ],
122
+ "c_in": [
123
+ 1,
124
+ 1,
125
+ 640
126
+ ]
127
+ },
128
+ "outputs": {
129
+ "decoder": [
130
+ 1,
131
+ 640,
132
+ 1
133
+ ],
134
+ "h_out": [
135
+ 1,
136
+ 1,
137
+ 640
138
+ ],
139
+ "c_out": [
140
+ 1,
141
+ 1,
142
+ 640
143
+ ]
144
+ },
145
+ "path": "parakeet_decoder.mlpackage"
146
+ },
147
+ "joint": {
148
+ "inputs": {
149
+ "encoder": [
150
+ 1,
151
+ 512,
152
+ 188
153
+ ],
154
+ "decoder": [
155
+ 1,
156
+ 640,
157
+ 1
158
+ ]
159
+ },
160
+ "outputs": {
161
+ "logits": [
162
+ 1,
163
+ 188,
164
+ 1,
165
+ 1030
166
+ ]
167
+ },
168
+ "path": "parakeet_joint.mlpackage"
169
+ },
170
+ "joint_decision": {
171
+ "inputs": {
172
+ "encoder": [
173
+ 1,
174
+ 512,
175
+ 188
176
+ ],
177
+ "decoder": [
178
+ 1,
179
+ 640,
180
+ 1
181
+ ]
182
+ },
183
+ "outputs": {
184
+ "token_id": [
185
+ 1,
186
+ 188,
187
+ 1
188
+ ],
189
+ "token_prob": [
190
+ 1,
191
+ 188,
192
+ 1
193
+ ],
194
+ "duration": [
195
+ 1,
196
+ 188,
197
+ 1
198
+ ]
199
+ },
200
+ "path": "parakeet_joint_decision.mlpackage"
201
+ },
202
+ "joint_decision_single_step": {
203
+ "inputs": {
204
+ "encoder_step": [
205
+ 1,
206
+ 512,
207
+ 1
208
+ ],
209
+ "decoder_step": [
210
+ 1,
211
+ 640,
212
+ 1
213
+ ]
214
+ },
215
+ "outputs": {
216
+ "token_id": [
217
+ 1,
218
+ 1,
219
+ 1
220
+ ],
221
+ "token_prob": [
222
+ 1,
223
+ 1,
224
+ 1
225
+ ],
226
+ "duration": [
227
+ 1,
228
+ 1,
229
+ 1
230
+ ],
231
+ "top_k_ids": [
232
+ 1,
233
+ 1,
234
+ 1,
235
+ 64
236
+ ],
237
+ "top_k_logits": [
238
+ 1,
239
+ 1,
240
+ 1,
241
+ 64
242
+ ]
243
+ },
244
+ "path": "parakeet_joint_decision_single_step.mlpackage"
245
+ }
246
+ }
247
+ }
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_ctc_head.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6459b9564e0630f2eec300eb732fceccbc1d2d16f12cb0694ce310d84bfbecf2
3
+ size 3366
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_ctc_head.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fb9bead064427ffcb7529c0e3f378e421b4dde8e6d81447b6d1ca3352ca850e1
3
+ size 1051842
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_ctc_head.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "6651E3CE-C3ED-4267-AAC3-5A772FC3515A": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Specification",
7
+ "name": "model.mlmodel",
8
+ "path": "com.apple.CoreML/model.mlmodel"
9
+ },
10
+ "A3F7798B-67CA-418C-B8BB-58731D3A413F": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Weights",
13
+ "name": "weights",
14
+ "path": "com.apple.CoreML/weights"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "6651E3CE-C3ED-4267-AAC3-5A772FC3515A"
18
+ }
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_decoder.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a05548eb455c5cd564782b125a5f9279a789be1f4141e5f044453ea79cd68b47
3
+ size 6729
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_decoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd90b58597ee2c172c672dffe13b1110898ba07394c1a15efc96cc8c6b18411b
3
+ size 7871040
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_decoder.mlpackage/Manifest.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "fileFormatVersion": "1.0.0",
3
+ "itemInfoEntries": {
4
+ "83E7B87A-4EBE-48BF-BF3C-EE74DEA4C7AF": {
5
+ "author": "com.apple.CoreML",
6
+ "description": "CoreML Model Specification",
7
+ "name": "model.mlmodel",
8
+ "path": "com.apple.CoreML/model.mlmodel"
9
+ },
10
+ "98BF03AC-26AF-410B-95AC-C9B99B3B240C": {
11
+ "author": "com.apple.CoreML",
12
+ "description": "CoreML Model Weights",
13
+ "name": "weights",
14
+ "path": "com.apple.CoreML/weights"
15
+ }
16
+ },
17
+ "rootModelIdentifier": "83E7B87A-4EBE-48BF-BF3C-EE74DEA4C7AF"
18
+ }
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_encoder.mlpackage/Data/com.apple.CoreML/model.mlmodel ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:70d7747b57beba0248fabb6cbfa5d276e3604d0d7e234f4ccb578ea0a4d25110
3
+ size 508107
convert/parakeet-tdt-ctc-110m/coreml/parakeet_110m_coreml/parakeet_encoder.mlpackage/Data/com.apple.CoreML/weights/weight.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cecf7994b2758397d992802a4f6e5d656e3a1aeb7bbedc2aa430b1316d62474c
3
+ size 215143424