Alex-Wengg committed on
Commit
8036e60
·
1 Parent(s): e7fd089

cleanup: flatten repo to CTC-only models at root


Move AudioEncoder.mlmodelc, MelSpectrogram.mlmodelc, and vocab.json
from parakeet-ctc-110m-coreml/ subdirectory to repo root.

Remove everything not needed by the Swift FluidAudio library:
- TDT model copies (parakeet-tdt-0.6b-v2-coreml/, parakeet-tdt-v2-0.6b/)
- Conversion scripts (convert/, parakeet-tdt-ctc-110m/, scripts/)
- CLI benchmarks (cli/) - already in FluidAudio repo
- Duplicate model copies (models/)
- Python artifacts (pyproject.toml, uv.lock)
- HuggingFace tokenizer files (tokenizer.json, tokenizer.model, etc.)
- .DS_Store files

Matches the clean flat structure of parakeet-ctc-0.6b-coreml.
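With the flat layout, the compiled Core ML bundles can be loaded straight from a checkout of the repo root. A minimal sketch, assuming a local checkout at `root`; the helper name and checkout path are illustrative, not part of this commit:

import CoreML
import Foundation

// Minimal sketch, assuming a local checkout of this repo at `root`.
// The helper name is hypothetical; FluidAudio's own loading API is not shown here.
func loadCtcBundle(root: URL) throws -> (mel: MLModel, encoder: MLModel, vocab: Any) {
    // .mlmodelc bundles are pre-compiled, so MLModel(contentsOf:) loads them directly.
    let mel = try MLModel(contentsOf: root.appendingPathComponent("MelSpectrogram.mlmodelc"))
    let encoder = try MLModel(contentsOf: root.appendingPathComponent("AudioEncoder.mlmodelc"))
    // vocab.json holds the CTC token table (exact key/value layout not shown in this commit).
    let vocab = try JSONSerialization.jsonObject(
        with: Data(contentsOf: root.appendingPathComponent("vocab.json")))
    return (mel, encoder, vocab)
}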

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. {models/parakeet-ctc-110m-coreml/AudioEncoder.mlmodelc → AudioEncoder.mlmodelc}/analytics/coremldata.bin +0 -0
  2. {models/parakeet-ctc-110m-coreml/AudioEncoder.mlmodelc → AudioEncoder.mlmodelc}/coremldata.bin +0 -0
  3. {models/parakeet-ctc-110m-coreml/AudioEncoder.mlmodelc → AudioEncoder.mlmodelc}/metadata.json +0 -0
  4. {models/parakeet-ctc-110m-coreml/AudioEncoder.mlmodelc → AudioEncoder.mlmodelc}/model.mil +0 -0
  5. {models/parakeet-ctc-110m-coreml/AudioEncoder.mlmodelc → AudioEncoder.mlmodelc}/weights/weight.bin +0 -0
  6. {models/parakeet-ctc-110m-coreml/MelSpectrogram.mlmodelc → MelSpectrogram.mlmodelc}/analytics/coremldata.bin +0 -0
  7. {models/parakeet-ctc-110m-coreml/MelSpectrogram.mlmodelc → MelSpectrogram.mlmodelc}/coremldata.bin +0 -0
  8. {models/parakeet-ctc-110m-coreml/MelSpectrogram.mlmodelc → MelSpectrogram.mlmodelc}/metadata.json +0 -0
  9. {models/parakeet-ctc-110m-coreml/MelSpectrogram.mlmodelc → MelSpectrogram.mlmodelc}/model.mil +0 -0
  10. {models/parakeet-ctc-110m-coreml/MelSpectrogram.mlmodelc → MelSpectrogram.mlmodelc}/weights/weight.bin +0 -0
  11. cli/CtcEarningsBenchmark.swift +0 -1048
  12. cli/HybridEarningsBenchmark.swift +0 -554
  13. config.json +0 -1
  14. convert/.DS_Store +0 -0
  15. convert/parakeet-tdt-ctc-110m/convert_tdt_decoder.py +0 -323
  16. convert/parakeet-tdt-ctc-110m/coreml/audio/yc_first_minute_16k_15s.wav +0 -3
  17. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/analytics/coremldata.bin +0 -3
  18. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/coremldata.bin +0 -3
  19. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/metadata.json +0 -66
  20. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/model.mil +0 -24
  21. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/weights/weight.bin +0 -3
  22. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/analytics/coremldata.bin +0 -3
  23. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/coremldata.bin +0 -3
  24. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/metadata.json +0 -118
  25. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/model.mil +0 -45
  26. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/weights/weight.bin +0 -3
  27. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/analytics/coremldata.bin +0 -3
  28. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/coremldata.bin +0 -3
  29. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/metadata.json +0 -105
  30. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/model.mil +0 -0
  31. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/weights/weight.bin +0 -3
  32. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/analytics/coremldata.bin +0 -3
  33. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/coremldata.bin +0 -3
  34. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/metadata.json +0 -102
  35. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/model.mil +0 -58
  36. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/weights/weight.bin +0 -3
  37. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/analytics/coremldata.bin +0 -3
  38. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/coremldata.bin +0 -3
  39. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/metadata.json +0 -123
  40. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/model.mil +0 -69
  41. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/weights/weight.bin +0 -3
  42. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/analytics/coremldata.bin +0 -3
  43. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/coremldata.bin +0 -3
  44. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/metadata.json +0 -112
  45. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/model.mil +0 -191
  46. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/weights/weight.bin +0 -3
  47. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/metadata.json +0 -247
  48. convert/parakeet-tdt-ctc-110m/coreml/compiled_models/vocab.json +0 -1
  49. convert/parakeet-tdt-ctc-110m/coreml/convert-parakeet.py +0 -697
  50. convert/parakeet-tdt-ctc-110m/coreml/hybrid_earnings_benchmark.json +0 -35
{models/parakeet-ctc-110m-coreml/AudioEncoder.mlmodelc → AudioEncoder.mlmodelc}/analytics/coremldata.bin RENAMED
File without changes
{models/parakeet-ctc-110m-coreml/AudioEncoder.mlmodelc → AudioEncoder.mlmodelc}/coremldata.bin RENAMED
File without changes
{models/parakeet-ctc-110m-coreml/AudioEncoder.mlmodelc → AudioEncoder.mlmodelc}/metadata.json RENAMED
File without changes
{models/parakeet-ctc-110m-coreml/AudioEncoder.mlmodelc → AudioEncoder.mlmodelc}/model.mil RENAMED
File without changes
{models/parakeet-ctc-110m-coreml/AudioEncoder.mlmodelc → AudioEncoder.mlmodelc}/weights/weight.bin RENAMED
File without changes
{models/parakeet-ctc-110m-coreml/MelSpectrogram.mlmodelc → MelSpectrogram.mlmodelc}/analytics/coremldata.bin RENAMED
File without changes
{models/parakeet-ctc-110m-coreml/MelSpectrogram.mlmodelc → MelSpectrogram.mlmodelc}/coremldata.bin RENAMED
File without changes
{models/parakeet-ctc-110m-coreml/MelSpectrogram.mlmodelc → MelSpectrogram.mlmodelc}/metadata.json RENAMED
File without changes
{models/parakeet-ctc-110m-coreml/MelSpectrogram.mlmodelc → MelSpectrogram.mlmodelc}/model.mil RENAMED
File without changes
{models/parakeet-ctc-110m-coreml/MelSpectrogram.mlmodelc → MelSpectrogram.mlmodelc}/weights/weight.bin RENAMED
File without changes
cli/CtcEarningsBenchmark.swift DELETED
@@ -1,1048 +0,0 @@
- #if os(macOS)
- import AVFoundation
- import CoreML
- import FluidAudio
- import Foundation
-
- /// Earnings22 benchmark using TDT for transcription + CTC for keyword spotting.
- /// TDT provides low WER transcription, CTC provides high recall dictionary detection.
- public enum CtcEarningsBenchmark {
-
-     private enum KeywordMode: String {
-         case chunk
-         case file
-     }
-
-     /// Default CTC model directory
-     private static func defaultCtcModelPath() -> String? {
-         let appSupport = FileManager.default.urls(
-             for: .applicationSupportDirectory, in: .userDomainMask
-         ).first!
-         let modelPath = appSupport.appendingPathComponent("FluidAudio/Models/parakeet-ctc-110m-coreml")
-         if FileManager.default.fileExists(atPath: modelPath.path) {
-             return modelPath.path
-         }
-         return nil
-     }
-
-     /// Default data directory (from download command)
-     private static func defaultDataDir() -> String? {
-         let dataDir = DatasetDownloader.getEarnings22Directory().appendingPathComponent("test-dataset")
-         if FileManager.default.fileExists(atPath: dataDir.path) {
-             return dataDir.path
-         }
-         return nil
-     }
-
-     public static func runCLI(arguments: [String]) async {
-         // Check for help
-         if arguments.contains("--help") || arguments.contains("-h") {
-             printUsage()
-             return
-         }
-
-         // Parse arguments
-         var dataDir: String? = nil
-         var outputFile = "ctc_earnings_benchmark.json"
-         var maxFiles: Int? = nil
-         var ctcModelPath: String? = nil
-         // Note: Using v2 by default because v3 has issues with certain audio files
-         // (returns empty transcription for ~7 files in Earnings22 dataset)
-         var tdtVersion: AsrModelVersion = .v2
-         var autoDownload = false
-         var keywordMode: KeywordMode = .chunk
-
-         var i = 0
-         while i < arguments.count {
-             switch arguments[i] {
-             case "--data-dir":
-                 if i + 1 < arguments.count {
-                     dataDir = arguments[i + 1]
-                     i += 1
-                 }
-             case "--output", "-o":
-                 if i + 1 < arguments.count {
-                     outputFile = arguments[i + 1]
-                     i += 1
-                 }
-             case "--max-files":
-                 if i + 1 < arguments.count {
-                     maxFiles = Int(arguments[i + 1])
-                     i += 1
-                 }
-             case "--ctc-model":
-                 if i + 1 < arguments.count {
-                     ctcModelPath = arguments[i + 1]
-                     i += 1
-                 }
-             case "--tdt-version":
-                 if i + 1 < arguments.count {
-                     if arguments[i + 1] == "v2" || arguments[i + 1] == "2" {
-                         tdtVersion = .v2
-                     }
-                     i += 1
-                 }
-             case "--auto-download":
-                 autoDownload = true
-             case "--keyword-mode":
-                 if i + 1 < arguments.count, let mode = parseKeywordMode(arguments[i + 1]) {
-                     keywordMode = mode
-                     i += 1
-                 }
-             default:
-                 break
-             }
-             i += 1
-         }
-
-         // Use defaults if not specified
-         if dataDir == nil {
-             dataDir = defaultDataDir()
-         }
-         if ctcModelPath == nil {
-             ctcModelPath = defaultCtcModelPath()
-         }
-
-         // Handle auto-download for dataset
-         if autoDownload && dataDir == nil {
-             print("📥 Downloading earnings22-kws dataset...")
-             await DatasetDownloader.downloadEarnings22KWS(force: false)
-             dataDir = defaultDataDir()
-         }
-
-         // Handle auto-download for CTC models
-         if autoDownload && ctcModelPath == nil {
-             print("📥 Downloading CTC models...")
-             do {
-                 _ = try await CtcModels.download()
-                 ctcModelPath = defaultCtcModelPath()
-             } catch {
-                 print("ERROR: Failed to download CTC models: \(error)")
-             }
-         }
-
-         print("Earnings Benchmark (TDT transcription + CTC keyword spotting)")
-         print(" Data directory: \(dataDir ?? "not found")")
-         print(" Output file: \(outputFile)")
-         print(" TDT version: \(tdtVersion == .v2 ? "v2" : "v3")")
-         print(" CTC model: \(ctcModelPath ?? "not found")")
-         print(" Keyword mode: \(keywordMode.rawValue)")
-
-         guard let finalDataDir = dataDir else {
-             print("ERROR: Data directory not found")
-             print("💡 Download with: fluidaudio download --dataset earnings22-kws")
-             print(" Or specify: --data-dir <path>")
-             printUsage()
-             return
-         }
-
-         guard let modelPath = ctcModelPath else {
-             print("ERROR: CTC model not found")
-             print("💡 Download parakeet-ctc-110m-coreml model to:")
-             print(" ~/Library/Application Support/FluidAudio/Models/parakeet-ctc-110m-coreml/")
-             print(" Or specify: --ctc-model <path>")
-             printUsage()
-             return
-         }
-
-         let dataDirResolved = finalDataDir
-
-         do {
-             // Load TDT models for transcription
-             print("Loading TDT models (\(tdtVersion == .v2 ? "v2" : "v3")) for transcription...")
-             let tdtModels = try await AsrModels.downloadAndLoad(version: tdtVersion)
-             let asrManager = AsrManager(config: .default)
-             try await asrManager.initialize(models: tdtModels)
-             print("TDT models loaded successfully")
-
-             // Load CTC models for keyword spotting
-             print("Loading CTC models from: \(modelPath)")
-             let modelDir = URL(fileURLWithPath: modelPath)
-             let ctcModels = try await CtcModels.loadDirect(from: modelDir)
-             print("Loaded CTC vocabulary with \(ctcModels.vocabulary.count) tokens")
-
-             // Create keyword spotter
-             let vocabSize = ctcModels.vocabulary.count
-             let blankId = vocabSize // Blank is at index = vocab_size
-             let spotter = CtcKeywordSpotter(models: ctcModels, blankId: blankId)
-             print("Created CTC spotter with blankId=\(blankId)")
-
-             // Collect test files
-             let dataDirURL = URL(fileURLWithPath: dataDirResolved)
-             let fileIds = try collectFileIds(from: dataDirURL, maxFiles: maxFiles)
-             let keywordIndex = try buildKeywordIndex(dataDir: dataDirURL, keywordMode: keywordMode)
-
-             if fileIds.isEmpty {
-                 print("ERROR: No test files found in \(dataDirResolved)")
-                 return
-             }
-
-             print("Processing \(fileIds.count) test files...")
-
-             var results: [[String: Any]] = []
-             var totalWer = 0.0
-             var totalKeywordReference = 0
-             var totalKeywordPredicted = 0
-             var totalKeywordTruePositives = 0
-             var totalKeywordFalsePositives = 0
-             var totalKeywordFalseNegatives = 0
-             var totalAudioDuration = 0.0
-             var totalProcessingTime = 0.0
-
-             for (index, fileId) in fileIds.enumerated() {
-                 print("[\(index + 1)/\(fileIds.count)] \(fileId)")
-
-                 if let result = try await processFile(
-                     fileId: fileId,
-                     dataDir: dataDirURL,
-                     asrManager: asrManager,
-                     ctcModels: ctcModels,
-                     spotter: spotter,
-                     keywordMode: keywordMode,
-                     keywordIndex: keywordIndex
-                 ) {
-                     results.append(result)
-                     totalWer += result["wer"] as? Double ?? 0
-                     totalKeywordReference += result["keywordReference"] as? Int ?? 0
-                     totalKeywordPredicted += result["keywordPredicted"] as? Int ?? 0
-                     totalKeywordTruePositives += result["keywordTruePositives"] as? Int ?? 0
-                     totalKeywordFalsePositives += result["keywordFalsePositives"] as? Int ?? 0
-                     totalKeywordFalseNegatives += result["keywordFalseNegatives"] as? Int ?? 0
-                     totalAudioDuration += result["audioLength"] as? Double ?? 0
-                     totalProcessingTime += result["processingTime"] as? Double ?? 0
-
-                     let wer = result["wer"] as? Double ?? 0
-                     let precision = result["keywordPrecision"] as? Double ?? 0
-                     let recall = result["keywordRecall"] as? Double ?? 0
-                     let fscore = result["keywordFscore"] as? Double ?? 0
-                     print(
-                         " WER: \(String(format: "%.1f", wer))%, " +
-                         "KW P/R/F: \(String(format: "%.2f", precision))/" +
-                         "\(String(format: "%.2f", recall))/" +
-                         "\(String(format: "%.2f", fscore))"
-                     )
-                 }
-             }
-
-             // Calculate summary
-             let avgWer = results.isEmpty ? 0.0 : totalWer / Double(results.count)
-             let keywordPrecision =
-                 totalKeywordPredicted > 0
-                 ? Double(totalKeywordTruePositives) / Double(totalKeywordPredicted)
-                 : 0
-             let keywordRecall =
-                 totalKeywordReference > 0
-                 ? Double(totalKeywordTruePositives) / Double(totalKeywordReference)
-                 : 0
-             let keywordFscore =
-                 (keywordPrecision + keywordRecall) > 0
-                 ? 2 * keywordPrecision * keywordRecall / (keywordPrecision + keywordRecall)
-                 : 0
-
-             // Print summary
-             print("\n" + String(repeating: "=", count: 60))
-             print("EARNINGS22 BENCHMARK (TDT + CTC)")
-             print(String(repeating: "=", count: 60))
-             print("Model: \(modelPath)")
-             print("Total tests: \(results.count)")
-             print("Average WER: \(String(format: "%.2f", avgWer))%")
-             print(
-                 "Keyword Precision/Recall/F1: " +
-                 "\(String(format: "%.2f", keywordPrecision))/" +
-                 "\(String(format: "%.2f", keywordRecall))/" +
-                 "\(String(format: "%.2f", keywordFscore))"
-             )
-             print("Total audio: \(String(format: "%.1f", totalAudioDuration))s")
-             print("Total processing: \(String(format: "%.1f", totalProcessingTime))s")
-             if totalProcessingTime > 0 {
-                 print("RTFx: \(String(format: "%.2f", totalAudioDuration / totalProcessingTime))x")
-             }
-             print(String(repeating: "=", count: 60))
-
-             // Sort results by WER descending (worst first)
-             let sortedResults = results.sorted { r1, r2 in
-                 let wer1 = r1["wer"] as? Double ?? 0
-                 let wer2 = r2["wer"] as? Double ?? 0
-                 return wer1 > wer2
-             }
-
-             // Save to JSON
-             let summaryDict: [String: Any] = [
-                 "totalTests": results.count,
-                 "avgWer": round(avgWer * 100) / 100,
-                 "keywordTruePositives": totalKeywordTruePositives,
-                 "keywordFalsePositives": totalKeywordFalsePositives,
-                 "keywordFalseNegatives": totalKeywordFalseNegatives,
-                 "keywordPredicted": totalKeywordPredicted,
-                 "keywordReference": totalKeywordReference,
-                 "keywordPrecision": round(keywordPrecision * 1000) / 1000,
-                 "keywordRecall": round(keywordRecall * 1000) / 1000,
-                 "keywordFscore": round(keywordFscore * 1000) / 1000,
-                 "totalAudioDuration": round(totalAudioDuration * 100) / 100,
-                 "totalProcessingTime": round(totalProcessingTime * 100) / 100,
-             ]
-
-             let output: [String: Any] = [
-                 "model": modelPath,
-                 "keywordMode": keywordMode.rawValue,
-                 "summary": summaryDict,
-                 "results": sortedResults,
-             ]
-
-             let jsonData = try JSONSerialization.data(withJSONObject: output, options: [.prettyPrinted, .sortedKeys])
-             try jsonData.write(to: URL(fileURLWithPath: outputFile))
-             print("\nResults written to: \(outputFile)")
-
-         } catch {
-             print("ERROR: Benchmark failed: \(error)")
-         }
-     }
-
-     private static func collectFileIds(from dataDir: URL, maxFiles: Int?) throws -> [String] {
-         var fileIds: [String] = []
-         let suffix = ".dictionary.txt"
-
-         let fileManager = FileManager.default
-         let contents = try fileManager.contentsOfDirectory(at: dataDir, includingPropertiesForKeys: nil)
-
-         for url in contents.sorted(by: { $0.path < $1.path }) {
-             let name = url.lastPathComponent
-             if name.hasSuffix(suffix) {
-                 let data = try? Data(contentsOf: url)
-                 if let data = data, !data.isEmpty {
-                     let fileId = String(name.dropLast(suffix.count))
-                     fileIds.append(fileId)
-                 }
-             }
-         }
-
-         if let maxFiles = maxFiles {
-             return Array(fileIds.prefix(maxFiles))
-         }
-         return fileIds
-     }
-
-     private static func processFile(
-         fileId: String,
-         dataDir: URL,
-         asrManager: AsrManager,
-         ctcModels: CtcModels,
-         spotter: CtcKeywordSpotter,
-         keywordMode: KeywordMode,
-         keywordIndex: [String: [String]]
-     ) async throws -> [String: Any]? {
-         let wavFile = dataDir.appendingPathComponent("\(fileId).wav")
-         let dictionaryFile = dataDir.appendingPathComponent("\(fileId).dictionary.txt")
-         let textFile = dataDir.appendingPathComponent("\(fileId).text.txt")
-
-         let fm = FileManager.default
-         guard fm.fileExists(atPath: wavFile.path),
-             fm.fileExists(atPath: dictionaryFile.path)
-         else {
-             return nil
-         }
-
-         // Load dictionary words (chunk or file keywords)
-         let dictionaryWords = try loadDictionaryWords(
-             fileId: fileId,
-             dictionaryFile: dictionaryFile,
-             keywordMode: keywordMode,
-             keywordIndex: keywordIndex
-         )
-
-         // Load reference text
-         let referenceRaw =
-             (try? String(contentsOf: textFile, encoding: .utf8))?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
-
-         // Get audio samples
-         let audioFile = try AVAudioFile(forReading: wavFile)
-         let audioLength = Double(audioFile.length) / audioFile.processingFormat.sampleRate
-         let format = audioFile.processingFormat
-         let frameCount = AVAudioFrameCount(audioFile.length)
-
-         guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount) else {
-             throw NSError(
-                 domain: "CtcEarningsBenchmark", code: 1,
-                 userInfo: [NSLocalizedDescriptionKey: "Failed to create audio buffer"])
-         }
-         try audioFile.read(into: buffer)
-
-         // Resample to 16kHz
-         let converter = AudioConverter()
-         let samples = try converter.resampleBuffer(buffer)
-
-         let startTime = Date()
-
-         // 1. TDT transcription for low WER
-         let tdtResult = try await asrManager.transcribe(wavFile)
-
-         // Skip files where TDT returns empty (some audio files cause model issues)
-         if tdtResult.text.isEmpty {
-             print(" SKIPPED: TDT returned empty transcription")
-             return nil
-         }
-
-         // 2. Build custom vocabulary for CTC keyword spotting
-         var vocabTerms: [CustomVocabularyTerm] = []
-         for word in dictionaryWords {
-             let tokenIds = tokenize(word, vocabulary: ctcModels.vocabulary)
-             if !tokenIds.isEmpty {
-                 let term = CustomVocabularyTerm(
-                     text: word,
-                     weight: nil,
-                     aliases: nil,
-                     tokenIds: nil,
-                     ctcTokenIds: tokenIds
-                 )
-                 vocabTerms.append(term)
-             }
-         }
-         let customVocab = CustomVocabularyContext(terms: vocabTerms)
-
-         // 3. CTC keyword spotting for high recall dictionary detection
-         let spotResult = try await spotter.spotKeywordsWithLogProbs(
-             audioSamples: samples,
-             customVocabulary: customVocab,
-             minScore: nil
-         )
-
-         // 4. Post-process: Use VocabularyRescorer with Argmax-style parameters
-         // Argmax uses cbw=3.0 (context-biasing weight) for boosting vocab terms
-         let useRescorer = ProcessInfo.processInfo.environment["NO_CTC_RESCORING"] != "1"
-         let hypothesis: String
-         if useRescorer {
-             let rescorerConfig = VocabularyRescorer.Config(
-                 minScoreAdvantage: 1.0, // Lower threshold - rely more on CTC scoring
-                 minVocabScore: -15.0, // Permissive to include more detections
-                 maxOriginalScoreForReplacement: -2.0, // Don't replace very confident words
-                 vocabBoostWeight: 3.0 // Argmax cbw=3.0
-             )
-             let rescorer = VocabularyRescorer(
-                 spotter: spotter,
-                 vocabulary: customVocab,
-                 config: rescorerConfig
-             )
-             let rescoreResult = rescorer.rescore(transcript: tdtResult.text, spotResult: spotResult)
-             hypothesis = rescoreResult.text
-         } else {
-             hypothesis = tdtResult.text // Baseline: no CTC corrections
-         }
-
-         let processingTime = Date().timeIntervalSince(startTime)
-
-         // Normalize texts
-         let referenceNormalized = TextNormalizer.normalize(referenceRaw)
-         let hypothesisNormalized = TextNormalizer.normalize(hypothesis)
-
-         // Keyword sets for precision/recall
-         let referenceKeywords = keywordsInText(referenceNormalized, dictionaryWords: dictionaryWords)
-         let predictedKeywords = keywordsInText(hypothesisNormalized, dictionaryWords: dictionaryWords)
-         let truePositives = referenceKeywords.intersection(predictedKeywords)
-         let falsePositives = predictedKeywords.subtracting(referenceKeywords)
-         let falseNegatives = referenceKeywords.subtracting(predictedKeywords)
-         let keywordPrecision = predictedKeywords.isEmpty ? 0 : Double(truePositives.count) / Double(predictedKeywords.count)
-         let keywordRecall = referenceKeywords.isEmpty ? 0 : Double(truePositives.count) / Double(referenceKeywords.count)
-         let keywordFscore =
-             (keywordPrecision + keywordRecall) > 0
-             ? 2 * keywordPrecision * keywordRecall / (keywordPrecision + keywordRecall)
-             : 0
-
-         let referenceWords = referenceNormalized.components(separatedBy: CharacterSet.whitespacesAndNewlines).filter {
-             !$0.isEmpty
-         }
-         let hypothesisWords = hypothesisNormalized.components(separatedBy: CharacterSet.whitespacesAndNewlines).filter {
-             !$0.isEmpty
-         }
-
-         // Calculate WER
-         let wer: Double
-         if referenceWords.isEmpty {
-             wer = hypothesisWords.isEmpty ? 0.0 : 1.0
-         } else {
-             wer = calculateWER(reference: referenceWords, hypothesis: hypothesisWords)
-         }
-
-         // Count dictionary detections (debug only)
-         let minCtcScore: Float = -15.0 // Permissive threshold for detection
-         var detectionDetails: [[String: Any]] = []
-         var ctcFoundWords: Set<String> = []
-
-         // 1. CTC detections
-         for detection in spotResult.detections {
-             let inRef = referenceKeywords.contains(detection.term.text.lowercased())
-             let detail: [String: Any] = [
-                 "word": detection.term.text,
-                 "score": round(Double(detection.score) * 100) / 100,
-                 "startTime": round(detection.startTime * 100) / 100,
-                 "endTime": round(detection.endTime * 100) / 100,
-                 "source": "ctc",
-                 "inReference": inRef,
-             ]
-             detectionDetails.append(detail)
-
-             if detection.score >= minCtcScore { // Use >= to include edge cases
-                 ctcFoundWords.insert(detection.term.text.lowercased())
-             }
-         }
-
-         // 2. Fallback: check hypothesis for dictionary words not found by CTC
-         let hypothesisLower = hypothesis.lowercased()
-         for word in dictionaryWords {
-             let wordLower = word.lowercased()
-             if !ctcFoundWords.contains(wordLower) {
-                 // Check if word appears as whole word in hypothesis (avoid substring false positives)
-                 let pattern = "\\b\(NSRegularExpression.escapedPattern(for: wordLower))\\b"
-                 if let regex = try? NSRegularExpression(pattern: pattern, options: []),
-                     regex.firstMatch(
-                         in: hypothesisLower, options: [],
-                         range: NSRange(hypothesisLower.startIndex..., in: hypothesisLower)) != nil
-                 {
-                     ctcFoundWords.insert(wordLower)
-                     let inRef = referenceKeywords.contains(wordLower)
-                     let detail: [String: Any] = [
-                         "word": word,
-                         "score": 0.0,
-                         "startTime": 0.0,
-                         "endTime": 0.0,
-                         "source": "hypothesis",
-                         "inReference": inRef,
-                     ]
-                     detectionDetails.append(detail)
-                 }
-             }
-         }
-
-         let result: [String: Any] = [
-             "fileId": fileId,
-             "reference": referenceNormalized,
-             "hypothesis": hypothesisNormalized,
-             "wer": round(wer * 10000) / 100,
-             "dictFound": predictedKeywords.count,
-             "dictTotal": referenceKeywords.count,
-             "keywordPredicted": predictedKeywords.count,
-             "keywordReference": referenceKeywords.count,
-             "keywordTruePositives": truePositives.count,
-             "keywordFalsePositives": falsePositives.count,
-             "keywordFalseNegatives": falseNegatives.count,
-             "keywordPrecision": round(keywordPrecision * 1000) / 1000,
-             "keywordRecall": round(keywordRecall * 1000) / 1000,
-             "keywordFscore": round(keywordFscore * 1000) / 1000,
-             "audioLength": round(audioLength * 100) / 100,
-             "processingTime": round(processingTime * 1000) / 1000,
-             "ctcDetections": detectionDetails,
-         ]
-         return result
-     }
-
-     /// Simple tokenization using vocabulary lookup
-     private static func tokenize(_ text: String, vocabulary: [Int: String]) -> [Int] {
-         // Build reverse vocabulary (token -> id)
-         var tokenToId: [String: Int] = [:]
-         for (id, token) in vocabulary {
-             tokenToId[token] = id
-         }
-
-         let normalizedText = text.lowercased()
-         var result: [Int] = []
-         var position = normalizedText.startIndex
-         var isWordStart = true
-
-         while position < normalizedText.endIndex {
-             var matched = false
-             let remaining = normalizedText.distance(from: position, to: normalizedText.endIndex)
-             var matchLength = min(20, remaining)
-
-             while matchLength > 0 {
-                 let endPos = normalizedText.index(position, offsetBy: matchLength)
-                 let substring = String(normalizedText[position..<endPos])
-
-                 // Try with SentencePiece prefix for word start
-                 let withPrefix = isWordStart ? "▁" + substring : substring
-
-                 if let tokenId = tokenToId[withPrefix] {
-                     result.append(tokenId)
-                     position = endPos
-                     isWordStart = false
-                     matched = true
-                     break
-                 } else if let tokenId = tokenToId[substring] {
-                     result.append(tokenId)
-                     position = endPos
-                     isWordStart = false
-                     matched = true
-                     break
-                 }
-
-                 matchLength -= 1
-             }
-
-             if !matched {
-                 let char = normalizedText[position]
-                 if char == " " {
-                     isWordStart = true
-                     position = normalizedText.index(after: position)
-                 } else {
-                     // Unknown character - skip
-                     position = normalizedText.index(after: position)
-                     isWordStart = false
-                 }
-             }
-         }
-
-         return result
-     }
-
-     /// Apply CTC keyword corrections to TDT transcription using multiple strategies:
-     /// 1. Fuzzy matching (for words that are phonetically similar)
-     /// 2. Context pattern matching (for "this is X" type patterns)
-     /// 3. Proper noun replacement (for names after common patterns)
-     private static func applyKeywordCorrections(
-         tdtResult: ASRResult,
-         detections: [CtcKeywordSpotter.KeywordDetection],
-         minScore: Float
-     ) -> String {
-         // Filter detections by score
-         let validDetections = detections.filter { $0.score >= minScore }
-         guard !validDetections.isEmpty else {
-             return tdtResult.text
-         }
-
-         var text = tdtResult.text
-         var usedDetections: Set<String> = []
-
-         // PASS 1: Fuzzy matching for phonetically similar words
-         for detection in validDetections {
-             let keyword = detection.term.text
-             let keywordLower = keyword.lowercased()
-             let keywordParts = keywordLower.components(separatedBy: " ").filter { !$0.isEmpty }
-
-             let words = text.components(separatedBy: .whitespacesAndNewlines).filter { !$0.isEmpty }
-
-             // Handle multi-word keywords
-             if keywordParts.count > 1 {
-                 for i in 0..<(words.count - keywordParts.count + 1) {
-                     var allMatch = true
-                     var matchedWords: [String] = []
-
-                     for j in 0..<keywordParts.count {
-                         let wordClean = words[i + j].trimmingCharacters(in: .punctuationCharacters).lowercased()
-                         if isSimilar(wordClean, keywordParts[j]) {
-                             matchedWords.append(words[i + j])
-                         } else {
-                             allMatch = false
-                             break
-                         }
-                     }
-
-                     if allMatch && !matchedWords.isEmpty {
-                         let originalPhrase = matchedWords.joined(separator: " ")
-                         let replacement = matchCase(keyword, to: matchedWords[0])
-                         text = text.replacingOccurrences(of: originalPhrase, with: replacement)
-                         usedDetections.insert(keyword)
-                         break
-                     }
-                 }
-             } else {
-                 // Single word keyword
-                 for word in words {
-                     let wordClean = word.trimmingCharacters(in: .punctuationCharacters).lowercased()
-                     guard !wordClean.isEmpty else { continue }
-
-                     if isSimilar(wordClean, keywordLower) && wordClean != keywordLower {
-                         let replacement = matchCase(keyword, to: word)
-                         text = text.replacingOccurrences(of: word, with: replacement)
-                         usedDetections.insert(keyword)
-                         break
-                     }
-                 }
-             }
-         }
-
-         // PASS 2: Context pattern matching - specifically for "this is X" pattern
-         // Only replace if keyword is NOT already in the text
-         for detection in validDetections {
-             let keyword = detection.term.text
-             guard !usedDetections.contains(keyword) else { continue }
-
-             let keywordLower = keyword.lowercased()
-
-             // Skip if keyword already exists in text (case-insensitive)
-             if text.lowercased().contains(keywordLower) {
-                 usedDetections.insert(keyword) // Mark as handled
-                 continue
-             }
-
-             // Check if keyword looks like a proper noun (starts with uppercase)
-             let isProperNoun =
-                 keyword.first?.isUppercase == true
-                 && keyword.count >= 3
-                 && !stopWords.contains(keywordLower)
-
-             guard isProperNoun else { continue }
-
-             // Look for "this is X" pattern specifically for names
-             let thisIsPattern = try? NSRegularExpression(pattern: "this is ([A-Z][a-z]+)", options: [])
-             if let regex = thisIsPattern {
-                 let textRange = NSRange(text.startIndex..., in: text)
-                 if let match = regex.firstMatch(in: text, options: [], range: textRange),
-                     match.numberOfRanges > 1,
-                     let captureRange = Range(match.range(at: 1), in: text)
-                 {
-                     let capturedWord = String(text[captureRange])
-                     let capturedLower = capturedWord.lowercased()
-
-                     // Skip if captured word is already a detected keyword
-                     let isOtherKeyword = validDetections.contains { det in
-                         det.term.text.lowercased() == capturedLower
-                     }
-
-                     if !isOtherKeyword && !stopWords.contains(capturedLower) {
-                         // Similar length check
-                         if abs(capturedWord.count - keyword.count) <= 3 {
-                             text = text.replacingOccurrences(of: capturedWord, with: keyword)
-                             usedDetections.insert(keyword)
-                         }
-                     }
-                 }
-             }
-         }
-
-         return text
-     }
-
-     /// Build word timings by merging subword tokens (tokens starting with "▁" begin new words)
-     private static func buildWordTimings(
-         from tokenTimings: [TokenTiming]
-     ) -> [(word: String, startTime: Double, endTime: Double)] {
-         var wordTimings: [(word: String, startTime: Double, endTime: Double)] = []
-         var currentWord = ""
-         var wordStart: Double = 0
-         var wordEnd: Double = 0
-
-         for timing in tokenTimings {
-             let token = timing.token
-
-             // Skip special tokens
-             if token.isEmpty || token == "<blank>" || token == "<pad>" {
-                 continue
-             }
-
-             // Check if this starts a new word (has ▁ prefix or is first token)
-             let startsNewWord = token.hasPrefix("▁") || currentWord.isEmpty
-
-             if startsNewWord && !currentWord.isEmpty {
-                 // Save previous word
-                 wordTimings.append((word: currentWord, startTime: wordStart, endTime: wordEnd))
-                 currentWord = ""
-             }
-
-             if startsNewWord {
-                 currentWord = token.hasPrefix("▁") ? String(token.dropFirst()) : token
-                 wordStart = timing.startTime
-             } else {
-                 currentWord += token
-             }
-             wordEnd = timing.endTime
-         }
-
-         // Save final word
-         if !currentWord.isEmpty {
-             wordTimings.append((word: currentWord, startTime: wordStart, endTime: wordEnd))
-         }
-
-         return wordTimings
-     }
-
-     /// Common English words that should never be replaced by keyword matching
-     private static let stopWords: Set<String> = [
-         // Pronouns
-         "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them",
-         "my", "your", "his", "its", "our", "their", "mine", "yours", "hers", "ours", "theirs",
-         "this", "that", "these", "those", "who", "whom", "what", "which", "whose",
-         // Common verbs
-         "is", "are", "was", "were", "be", "been", "being", "am",
-         "have", "has", "had", "having", "do", "does", "did", "doing", "done",
-         "will", "would", "shall", "should", "may", "might", "must", "can", "could",
-         "get", "got", "getting", "go", "goes", "went", "going", "gone",
-         "come", "came", "coming", "see", "saw", "seen", "know", "knew", "known",
-         "think", "thought", "make", "made", "take", "took", "taken", "give", "gave", "given",
-         "say", "said", "tell", "told", "ask", "asked", "use", "used", "want", "wanted",
-         "need", "needed", "try", "tried", "let", "put", "keep", "kept", "look", "looked",
-         // Articles and determiners
-         "a", "an", "the", "some", "any", "no", "every", "each", "all", "both", "few", "many",
-         "much", "more", "most", "other", "another", "such",
-         // Prepositions
-         "in", "on", "at", "to", "for", "of", "with", "by", "from", "up", "down", "out",
-         "about", "into", "over", "after", "before", "between", "under", "through", "during",
-         // Conjunctions
-         "and", "or", "but", "so", "yet", "nor", "if", "then", "than", "because", "while",
-         "although", "unless", "since", "when", "where", "as",
-         // Adverbs
-         "not", "very", "just", "also", "only", "even", "still", "already", "always", "never",
-         "often", "sometimes", "usually", "really", "well", "now", "here", "there", "how", "why",
-         // Common words
-         "yes", "no", "okay", "ok", "thank", "thanks", "please", "sorry", "hello", "hi", "bye",
-         "good", "great", "bad", "new", "old", "first", "last", "long", "short", "big", "small",
-         "high", "low", "right", "left", "next", "back", "same", "different", "own", "able",
-         "way", "thing", "things", "time", "times", "year", "years", "day", "days", "week", "weeks",
-         "part", "place", "case", "point", "fact", "end", "kind", "lot", "set",
-         // Numbers
-         "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
-         "hundred", "thousand", "million", "billion",
-     ]
-
-     /// Check if two words are similar (edit distance / length ratio)
-     private static func isSimilar(_ a: String, _ b: String) -> Bool {
-         // Never match stop words - they're too common to be proper nouns
-         if stopWords.contains(a) || stopWords.contains(b) {
-             return false
-         }
-
-         let maxLen = max(a.count, b.count)
-         let minLen = min(a.count, b.count)
-         guard maxLen > 0, minLen >= 3 else { return false }
-
-         // Allow more length difference for longer words
-         let lenDiff = abs(a.count - b.count)
-         if lenDiff > max(3, maxLen / 2) { return false }
-
-         // Calculate edit distance
-         let distance = editDistance(a, b)
-
-         // More aggressive threshold: allow up to 40% of max length as edits
-         let threshold = max(2, Int(Double(maxLen) * 0.4))
-
-         // Also check if one is substring of other (handles "Erik" in "Ririek")
-         if a.contains(b) || b.contains(a) {
-             return true
-         }
-
-         // Check common prefix/suffix (handles "Heri" vs "Harry")
-         let commonPrefix = commonPrefixLength(a, b)
-         let commonSuffix = commonSuffixLength(a, b)
-         if commonPrefix >= 2 || commonSuffix >= 2 {
-             return distance <= threshold + 1
-         }
-
-         return distance <= threshold
-     }
-
-     /// Get length of common prefix
-     private static func commonPrefixLength(_ a: String, _ b: String) -> Int {
-         let aChars = Array(a)
-         let bChars = Array(b)
-         var count = 0
-         for i in 0..<min(aChars.count, bChars.count) {
-             if aChars[i] == bChars[i] {
-                 count += 1
-             } else {
-                 break
-             }
-         }
-         return count
-     }
-
-     /// Get length of common suffix
-     private static func commonSuffixLength(_ a: String, _ b: String) -> Int {
-         let aChars = Array(a.reversed())
-         let bChars = Array(b.reversed())
-         var count = 0
-         for i in 0..<min(aChars.count, bChars.count) {
-             if aChars[i] == bChars[i] {
-                 count += 1
-             } else {
-                 break
-             }
-         }
-         return count
-     }
-
-     /// Simple edit distance calculation
-     private static func editDistance(_ a: String, _ b: String) -> Int {
-         let a = Array(a)
-         let b = Array(b)
-         let m = a.count
-         let n = b.count
-
-         if m == 0 { return n }
-         if n == 0 { return m }
-
-         var dp = Array(repeating: Array(repeating: 0, count: n + 1), count: m + 1)
-
-         for i in 0...m { dp[i][0] = i }
-         for j in 0...n { dp[0][j] = j }
-
-         for i in 1...m {
-             for j in 1...n {
-                 if a[i - 1] == b[j - 1] {
-                     dp[i][j] = dp[i - 1][j - 1]
-                 } else {
-                     dp[i][j] = 1 + min(dp[i - 1][j - 1], min(dp[i - 1][j], dp[i][j - 1]))
-                 }
-             }
-         }
-
-         return dp[m][n]
-     }
-
-     /// Match the case pattern of the original word
-     private static func matchCase(_ keyword: String, to original: String) -> String {
-         let origClean = original.trimmingCharacters(in: .punctuationCharacters)
-
-         // Check case pattern
-         if origClean.first?.isUppercase == true {
-             // Capitalize first letter
-             return keyword.prefix(1).uppercased() + keyword.dropFirst()
-         }
-         return keyword
-     }
-
-     private static func calculateWER(reference: [String], hypothesis: [String]) -> Double {
-         if reference.isEmpty {
-             return hypothesis.isEmpty ? 0.0 : 1.0
-         }
-
-         let m = reference.count
-         let n = hypothesis.count
-         var dp = Array(repeating: Array(repeating: 0, count: n + 1), count: m + 1)
-
-         for i in 0...m { dp[i][0] = i }
-         for j in 0...n { dp[0][j] = j }
-
-         for i in 1...m {
-             for j in 1...n {
-                 if reference[i - 1] == hypothesis[j - 1] {
-                     dp[i][j] = dp[i - 1][j - 1]
-                 } else {
-                     dp[i][j] = min(dp[i - 1][j - 1], min(dp[i - 1][j], dp[i][j - 1])) + 1
-                 }
-             }
-         }
-
-         return Double(dp[m][n]) / Double(m)
-     }
-
-     private static func printUsage() {
-         print(
-             """
-             CTC Earnings Benchmark (TDT + CTC keyword spotting)
-
-             Usage: fluidaudio ctc-earnings-benchmark [options]
-
-             Options:
-               --data-dir <path>      Path to earnings test dataset (auto-detected if downloaded)
-               --ctc-model <path>     Path to CTC model directory (auto-detected if in standard location)
-               --max-files <n>        Maximum number of files to process
-               --output, -o <path>    Output JSON file (default: ctc_earnings_benchmark.json)
-               --auto-download        Download earnings22-kws dataset if not found
-               --keyword-mode <mode>  Keyword mode: chunk or file (default: chunk)
-
-             Default locations:
-               Dataset: ~/Library/Application Support/FluidAudio/earnings22-kws/test-dataset/
-               CTC Model: ~/Library/Application Support/FluidAudio/Models/parakeet-ctc-110m-coreml/
-
-             Setup:
-               1. Download dataset: fluidaudio download --dataset earnings22-kws
-               2. Place CTC model in standard location
-               3. Run: fluidaudio ctc-earnings-benchmark
-
-             Examples:
-               # Run with auto-detected paths
-               fluidaudio ctc-earnings-benchmark
-
-               # Run with auto-download
-               fluidaudio ctc-earnings-benchmark --auto-download
-
-               # Run with explicit paths
-               fluidaudio ctc-earnings-benchmark \\
-                 --data-dir /path/to/test-dataset \\
-                 --ctc-model /path/to/parakeet-ctc-110m-coreml \\
-                 --max-files 100
-             """)
-     }
-
-     private static func parseKeywordMode(_ value: String) -> KeywordMode? {
-         switch value.lowercased() {
-         case "chunk", "chunk-keywords":
-             return .chunk
-         case "file", "file-keywords":
-             return .file
-         default:
-             return nil
-         }
-     }
-
-     private static func parentId(from fileId: String) -> String {
-         guard let range = fileId.range(of: "_chunk") else {
-             return fileId
-         }
-         return String(fileId[..<range.lowerBound])
-     }
-
-     private static func buildKeywordIndex(dataDir: URL, keywordMode: KeywordMode) throws -> [String: [String]] {
-         guard keywordMode == .file else {
-             return [:]
-         }
-
-         var index: [String: Set<String>] = [:]
-         let suffix = ".dictionary.txt"
-         let fileManager = FileManager.default
-         let contents = try fileManager.contentsOfDirectory(at: dataDir, includingPropertiesForKeys: nil)
-
-         for url in contents {
-             let name = url.lastPathComponent
-             guard name.hasSuffix(suffix) else { continue }
-             let fileId = String(name.dropLast(suffix.count))
-             let parent = parentId(from: fileId)
-             let words = try loadDictionaryWords(from: url)
-             var set = index[parent] ?? Set<String>()
-             set.formUnion(words)
-             index[parent] = set
-         }
-
-         return index.mapValues { Array($0).sorted() }
-     }
-
-     private static func loadDictionaryWords(
-         fileId: String,
-         dictionaryFile: URL,
-         keywordMode: KeywordMode,
-         keywordIndex: [String: [String]]
-     ) throws -> [String] {
-         switch keywordMode {
-         case .chunk:
-             return try loadDictionaryWords(from: dictionaryFile)
-         case .file:
-             let parent = parentId(from: fileId)
-             if let words = keywordIndex[parent] {
-                 return words
-             }
-             return try loadDictionaryWords(from: dictionaryFile)
-         }
-     }
-
-     private static func loadDictionaryWords(from url: URL) throws -> [String] {
-         let dictionaryContent = try String(contentsOf: url, encoding: .utf8)
-         return dictionaryContent
-             .components(separatedBy: .newlines)
-             .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
-             .filter { !$0.isEmpty }
-     }
-
-     private static func keywordsInText(_ text: String, dictionaryWords: [String]) -> Set<String> {
-         let textLower = text.lowercased()
-         var result: Set<String> = []
-
-         for word in dictionaryWords {
-             let wordLower = word.lowercased()
-             let pattern = "\\b\(NSRegularExpression.escapedPattern(for: wordLower))\\b"
-             guard let regex = try? NSRegularExpression(pattern: pattern, options: []) else { continue }
-             let range = NSRange(textLower.startIndex..., in: textLower)
-             if regex.firstMatch(in: textLower, options: [], range: range) != nil {
-                 result.insert(wordLower)
-             }
-         }
-         return result
-     }
- }
- #endif
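For reference, the keyword metrics this deleted benchmark reported reduce to set arithmetic over the dictionary terms found in the normalized reference and hypothesis texts. A standalone sketch with made-up keyword sets (the values below are illustrative, not from any benchmark run):

// Standalone sketch of the keyword P/R/F1 computed above; keyword sets are illustrative.
let referenceKeywords: Set<String> = ["ebitda", "guidance", "acme"]
let predictedKeywords: Set<String> = ["ebitda", "acme", "headwinds"]

let tp = referenceKeywords.intersection(predictedKeywords).count  // 2 true positives
let precision = predictedKeywords.isEmpty ? 0 : Double(tp) / Double(predictedKeywords.count)  // 2/3
let recall = referenceKeywords.isEmpty ? 0 : Double(tp) / Double(referenceKeywords.count)     // 2/3
let f1 = (precision + recall) > 0 ? 2 * precision * recall / (precision + recall) : 0         // 2/3
print("P=\(precision) R=\(recall) F1=\(f1)")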
cli/HybridEarningsBenchmark.swift DELETED
@@ -1,554 +0,0 @@
- #if os(macOS)
- import AVFoundation
- import FluidAudio
- import Foundation
-
- /// Earnings22 benchmark using ONLY the Hybrid 110M model (single encoder).
- /// CTC head provides both transcription AND keyword spotting from the same encoder.
- public enum HybridEarningsBenchmark {
-
-     private enum KeywordMode: String {
-         case chunk
-         case file
-     }
-
-     public static func runCLI(arguments: [String]) async {
-         if arguments.contains("--help") || arguments.contains("-h") {
-             printUsage()
-             return
-         }
-
-         // Parse arguments
-         var outputFile = "hybrid_earnings_benchmark.json"
-         var maxFiles: Int? = nil
-         var decodingMode: HybridDecodingMode = .tdt
-         var useRescoring = false
-         var keywordMode: KeywordMode = .chunk
-
-         var i = 0
-         while i < arguments.count {
-             switch arguments[i] {
-             case "--output", "-o":
-                 if i + 1 < arguments.count {
-                     outputFile = arguments[i + 1]
-                     i += 1
-                 }
-             case "--max-files":
-                 if i + 1 < arguments.count {
-                     maxFiles = Int(arguments[i + 1])
-                     i += 1
-                 }
-             case "--ctc":
-                 decodingMode = .ctc
-             case "--tdt":
-                 decodingMode = .tdt
-             case "--rescore":
-                 useRescoring = true
-             case "--keyword-mode":
-                 if i + 1 < arguments.count, let mode = parseKeywordMode(arguments[i + 1]) {
-                     keywordMode = mode
-                     i += 1
-                 }
-             default:
-                 break
-             }
-             i += 1
-         }
-
-         let dataDir = DatasetDownloader.getEarnings22Directory().appendingPathComponent("test-dataset")
-         guard FileManager.default.fileExists(atPath: dataDir.path) else {
-             print("ERROR: Earnings dataset not found at \(dataDir.path)")
-             print("Download with: fluidaudio download --dataset earnings22-kws")
-             return
-         }
-
-         let modeStr = decodingMode == .ctc ? "CTC" : "TDT"
-         let rescoringStr = useRescoring ? " + Rescoring" : ""
-         print("Hybrid 110M Earnings Benchmark (Decoding: \(modeStr)\(rescoringStr))")
-         print(" Output file: \(outputFile)")
-         print(" Decoding mode: \(modeStr)")
-         print(" Rescoring: \(useRescoring ? "enabled" : "disabled")")
-         print(" Keyword mode: \(keywordMode.rawValue)")
-
-         do {
-             // Load Hybrid 110M model (single encoder with CTC head)
-             print("Loading Hybrid 110M model...")
-             let hybridModels = try await HybridAsrModels.downloadAndLoad()
-             let hybridManager = HybridAsrManager(models: hybridModels, decodingMode: decodingMode)
-             let spotter = HybridKeywordSpotter(vocabulary: hybridModels.vocabulary, blankId: hybridModels.blankId)
-             print(" Vocab size: \(hybridModels.vocabSize)")
-
-             // Collect test files
-             let fileIds = try collectFileIds(from: dataDir, maxFiles: maxFiles)
-             let keywordIndex = try buildKeywordIndex(dataDir: dataDir, keywordMode: keywordMode)
-
-             if fileIds.isEmpty {
-                 print("ERROR: No test files found")
-                 return
-             }
-
-             print("Processing \(fileIds.count) test files...")
-
-             var results: [[String: Any]] = []
-             var totalWer = 0.0
-             var totalKeywordReference = 0
-             var totalKeywordPredicted = 0
-             var totalKeywordTruePositives = 0
-             var totalKeywordFalsePositives = 0
-             var totalKeywordFalseNegatives = 0
-             var totalAudioDuration = 0.0
-             var totalProcessingTime = 0.0
-
-             for (index, fileId) in fileIds.enumerated() {
-                 print("[\(index + 1)/\(fileIds.count)] \(fileId)")
-
-                 if let result = try await processFile(
-                     fileId: fileId,
-                     dataDir: dataDir,
-                     hybridManager: hybridManager,
-                     spotter: spotter,
-                     useRescoring: useRescoring,
-                     keywordMode: keywordMode,
-                     keywordIndex: keywordIndex
-                 ) {
-                     results.append(result)
-                     totalWer += result["wer"] as? Double ?? 0
-                     totalKeywordReference += result["keywordReference"] as? Int ?? 0
-                     totalKeywordPredicted += result["keywordPredicted"] as? Int ?? 0
-                     totalKeywordTruePositives += result["keywordTruePositives"] as? Int ?? 0
-                     totalKeywordFalsePositives += result["keywordFalsePositives"] as? Int ?? 0
-                     totalKeywordFalseNegatives += result["keywordFalseNegatives"] as? Int ?? 0
-                     totalAudioDuration += result["audioLength"] as? Double ?? 0
-                     totalProcessingTime += result["processingTime"] as? Double ?? 0
-
-                     let wer = result["wer"] as? Double ?? 0
-                     let precision = result["keywordPrecision"] as? Double ?? 0
-                     let recall = result["keywordRecall"] as? Double ?? 0
-                     let fscore = result["keywordFscore"] as? Double ?? 0
-                     print(
-                         " WER: \(String(format: "%.1f", wer))%, " +
-                         "KW P/R/F: \(String(format: "%.2f", precision))/" +
-                         "\(String(format: "%.2f", recall))/" +
-                         "\(String(format: "%.2f", fscore))"
-                     )
-                 }
-             }
-
-             // Calculate summary
-             let avgWer = results.isEmpty ? 0.0 : totalWer / Double(results.count)
-             let keywordPrecision =
-                 totalKeywordPredicted > 0
-                 ? Double(totalKeywordTruePositives) / Double(totalKeywordPredicted)
-                 : 0
-             let keywordRecall =
-                 totalKeywordReference > 0
-                 ? Double(totalKeywordTruePositives) / Double(totalKeywordReference)
-                 : 0
-             let keywordFscore =
-                 (keywordPrecision + keywordRecall) > 0
-                 ? 2 * keywordPrecision * keywordRecall / (keywordPrecision + keywordRecall)
-                 : 0
-
-             // Print summary
-             print("\n" + String(repeating: "=", count: 60))
-             print("HYBRID 110M BENCHMARK (\(modeStr)\(rescoringStr))")
-             print(String(repeating: "=", count: 60))
-             print("Model: parakeet-tdt-ctc-110m-hybrid")
-             print("Decoding: \(modeStr), Rescoring: \(useRescoring ? "yes" : "no")")
-             print("Total tests: \(results.count)")
-             print("Average WER: \(String(format: "%.2f", avgWer))%")
-             print(
-                 "Keyword Precision/Recall/F1: " +
-                 "\(String(format: "%.2f", keywordPrecision))/" +
-                 "\(String(format: "%.2f", keywordRecall))/" +
-                 "\(String(format: "%.2f", keywordFscore))"
-             )
-             print("Total audio: \(String(format: "%.1f", totalAudioDuration))s")
-             print("Total processing: \(String(format: "%.1f", totalProcessingTime))s")
-             if totalProcessingTime > 0 {
-                 print("RTFx: \(String(format: "%.2f", totalAudioDuration / totalProcessingTime))x")
-             }
-             print(String(repeating: "=", count: 60))
-
-             // Sort results by WER descending (worst first)
-             let sortedResults = results.sorted { r1, r2 in
-                 let wer1 = r1["wer"] as? Double ?? 0
-                 let wer2 = r2["wer"] as? Double ?? 0
-                 return wer1 > wer2
-             }
-
-             // Save to JSON
-             let summaryDict: [String: Any] = [
-                 "totalTests": results.count,
-                 "avgWer": round(avgWer * 100) / 100,
-                 "keywordTruePositives": totalKeywordTruePositives,
-                 "keywordFalsePositives": totalKeywordFalsePositives,
-                 "keywordFalseNegatives": totalKeywordFalseNegatives,
-                 "keywordPredicted": totalKeywordPredicted,
-                 "keywordReference": totalKeywordReference,
-                 "keywordPrecision": round(keywordPrecision * 1000) / 1000,
-                 "keywordRecall": round(keywordRecall * 1000) / 1000,
-                 "keywordFscore": round(keywordFscore * 1000) / 1000,
-                 "totalAudioDuration": round(totalAudioDuration * 100) / 100,
-                 "totalProcessingTime": round(totalProcessingTime * 100) / 100,
-             ]
-
-             let output: [String: Any] = [
-                 "model": "parakeet-tdt-ctc-110m-hybrid",
-                 "approach": "single-encoder",
-                 "decodingMode": modeStr,
-                 "rescoring": useRescoring,
-                 "keywordMode": keywordMode.rawValue,
-                 "summary": summaryDict,
-                 "results": sortedResults,
-             ]
-
-             let jsonData = try JSONSerialization.data(withJSONObject: output, options: [.prettyPrinted, .sortedKeys])
-             try jsonData.write(to: URL(fileURLWithPath: outputFile))
-             print("\nResults written to: \(outputFile)")
-
-         } catch {
-             print("ERROR: \(error)")
-         }
-     }
-
-     private static func collectFileIds(from dataDir: URL, maxFiles: Int?) throws -> [String] {
-         var fileIds: [String] = []
-         let suffix = ".dictionary.txt"
-
-         let fileManager = FileManager.default
-         let contents = try fileManager.contentsOfDirectory(at: dataDir, includingPropertiesForKeys: nil)
-
-         for url in contents.sorted(by: { $0.path < $1.path }) {
-             let name = url.lastPathComponent
-             if name.hasSuffix(suffix) {
-                 let data = try? Data(contentsOf: url)
-                 if let data = data, !data.isEmpty {
-                     let fileId = String(name.dropLast(suffix.count))
-                     fileIds.append(fileId)
-                 }
-             }
-         }
-
-         if let maxFiles = maxFiles {
-             return Array(fileIds.prefix(maxFiles))
-         }
-         return fileIds
-     }
-
-     private static func processFile(
-         fileId: String,
-         dataDir: URL,
-         hybridManager: HybridAsrManager,
-         spotter: HybridKeywordSpotter,
-         useRescoring: Bool,
-         keywordMode: KeywordMode,
-         keywordIndex: [String: [String]]
-     ) async throws -> [String: Any]? {
-         let wavFile = dataDir.appendingPathComponent("\(fileId).wav")
-         let dictionaryFile = dataDir.appendingPathComponent("\(fileId).dictionary.txt")
-         let textFile = dataDir.appendingPathComponent("\(fileId).text.txt")
-
-         let fm = FileManager.default
-         guard fm.fileExists(atPath: wavFile.path),
-             fm.fileExists(atPath: dictionaryFile.path)
-         else {
-             return nil
-         }
-
-         // Load dictionary words (chunk or file keywords)
-         let dictionaryWords = try loadDictionaryWords(
-             fileId: fileId,
-             dictionaryFile: dictionaryFile,
-             keywordMode: keywordMode,
-             keywordIndex: keywordIndex
-         )
-
-         // Load reference text
-         let referenceRaw =
-             (try? String(contentsOf: textFile, encoding: .utf8))?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
-
-         // Get audio samples
-         let audioFile = try AVAudioFile(forReading: wavFile)
-         let audioLength = Double(audioFile.length) / audioFile.processingFormat.sampleRate
-         let format = audioFile.processingFormat
-         let frameCount = AVAudioFrameCount(audioFile.length)
-
-         guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount) else {
-             return nil
-         }
-         try audioFile.read(into: buffer)
-
-         // Resample to 16kHz
-         let converter = AudioConverter()
-         let samples = try converter.resampleBuffer(buffer)
-
-         // Build custom vocabulary for keyword spotting
-         var vocabTerms: [CustomVocabularyTerm] = []
-         for word in dictionaryWords {
-             let term = CustomVocabularyTerm(
-                 text: word,
-                 weight: nil,
-                 aliases: nil,
-                 tokenIds: nil,
-                 ctcTokenIds: nil
-             )
-             vocabTerms.append(term)
-         }
-         let customVocab = CustomVocabularyContext(terms: vocabTerms)
-
-         // Run Hybrid 110M using new API (TDT transcription + CTC keyword detection)
-         let rescorerConfig: HybridTextRescorer.Config? = useRescoring ? .default : nil
-         let hybridResult = try await hybridManager.transcribeHybrid(
-             audioSamples: samples,
-             customVocabulary: customVocab,
-             rescorerConfig: rescorerConfig
-         )
-
-         // Skip if empty transcription
-         if hybridResult.text.isEmpty {
-             print(" SKIPPED: Empty transcription")
-             return nil
-         }
-
-         let detections = hybridResult.keywordDetections
-         let processingTime = hybridResult.processingTime
-
-         // Use hybrid transcription as hypothesis (may be rescored if enabled)
-         let hypothesis = hybridResult.text
-
-         // Normalize texts
-         let referenceNormalized = TextNormalizer.normalize(referenceRaw)
-         let hypothesisNormalized = TextNormalizer.normalize(hypothesis)
-
-         // Keyword sets for precision/recall
-         let referenceKeywords = keywordsInText(referenceNormalized, dictionaryWords: dictionaryWords)
-         let predictedKeywords = keywordsInText(hypothesisNormalized, dictionaryWords: dictionaryWords)
-         let truePositives = referenceKeywords.intersection(predictedKeywords)
-         let falsePositives = predictedKeywords.subtracting(referenceKeywords)
-         let falseNegatives = referenceKeywords.subtracting(predictedKeywords)
-         let keywordPrecision = predictedKeywords.isEmpty ? 0 : Double(truePositives.count) / Double(predictedKeywords.count)
-         let keywordRecall = referenceKeywords.isEmpty ? 0 : Double(truePositives.count) / Double(referenceKeywords.count)
-         let keywordFscore =
-             (keywordPrecision + keywordRecall) > 0
-             ? 2 * keywordPrecision * keywordRecall / (keywordPrecision + keywordRecall)
-             : 0
-
-         let referenceWords = referenceNormalized.components(separatedBy: CharacterSet.whitespacesAndNewlines).filter {
-             !$0.isEmpty
-         }
-         let hypothesisWords = hypothesisNormalized.components(separatedBy: CharacterSet.whitespacesAndNewlines).filter {
-             !$0.isEmpty
-         }
-
-         // Calculate WER
-         let wer: Double
-         if referenceWords.isEmpty {
-             wer = hypothesisWords.isEmpty ? 0.0 : 1.0
-         } else {
-             wer = calculateWER(reference: referenceWords, hypothesis: hypothesisWords)
-         }
-
-         // Count dictionary detections for debugging
- // Count dictionary detections for debugging
353
- let minCtcScore: Float = -15.0
354
- var detectionDetails: [[String: Any]] = []
355
- var foundWords: Set<String> = []
356
-
357
- // CTC detections
358
- for detection in detections {
359
- let inRef = referenceKeywords.contains(detection.term.text.lowercased())
360
- let detail: [String: Any] = [
361
- "word": detection.term.text,
362
- "score": round(Double(detection.score) * 100) / 100,
363
- "startTime": round(detection.startTime * 100) / 100,
364
- "endTime": round(detection.endTime * 100) / 100,
365
- "source": "ctc",
366
- "inReference": inRef,
367
- ]
368
- detectionDetails.append(detail)
369
-
370
- if detection.score >= minCtcScore {
371
- foundWords.insert(detection.term.text.lowercased())
372
- }
373
- }
374
-
375
- // Fallback: check hypothesis for dictionary words not found by CTC
376
- let hypothesisLower = hypothesis.lowercased()
377
- for word in dictionaryWords {
378
- let wordLower = word.lowercased()
379
- if !foundWords.contains(wordLower) {
380
- let pattern = "\\b\(NSRegularExpression.escapedPattern(for: wordLower))\\b"
381
- if let regex = try? NSRegularExpression(pattern: pattern, options: []),
382
- regex.firstMatch(
383
- in: hypothesisLower, options: [],
384
- range: NSRange(hypothesisLower.startIndex..., in: hypothesisLower)) != nil
385
- {
386
- foundWords.insert(wordLower)
387
- let inRef = referenceKeywords.contains(wordLower)
388
- let detail: [String: Any] = [
389
- "word": word,
390
- "score": 0.0,
391
- "startTime": 0.0,
392
- "endTime": 0.0,
393
- "source": "hypothesis",
394
- "inReference": inRef,
395
- ]
396
- detectionDetails.append(detail)
397
- }
398
- }
399
- }
400
-
401
- let result: [String: Any] = [
402
- "fileId": fileId,
403
- "reference": referenceNormalized,
404
- "hypothesis": hypothesisNormalized,
405
- "wer": round(wer * 10000) / 100,
406
- "dictFound": predictedKeywords.count,
407
- "dictTotal": referenceKeywords.count,
408
- "keywordPredicted": predictedKeywords.count,
409
- "keywordReference": referenceKeywords.count,
410
- "keywordTruePositives": truePositives.count,
411
- "keywordFalsePositives": falsePositives.count,
412
- "keywordFalseNegatives": falseNegatives.count,
413
- "keywordPrecision": round(keywordPrecision * 1000) / 1000,
414
- "keywordRecall": round(keywordRecall * 1000) / 1000,
415
- "keywordFscore": round(keywordFscore * 1000) / 1000,
416
- "audioLength": round(audioLength * 100) / 100,
417
- "processingTime": round(processingTime * 1000) / 1000,
418
- "ctcDetections": detectionDetails,
419
- ]
420
- return result
421
- }
422
-
423
- private static func calculateWER(reference: [String], hypothesis: [String]) -> Double {
424
- if reference.isEmpty {
425
- return hypothesis.isEmpty ? 0.0 : 1.0
426
- }
427
-
428
- let m = reference.count
429
- let n = hypothesis.count
430
- var dp = Array(repeating: Array(repeating: 0, count: n + 1), count: m + 1)
431
-
432
- for i in 0...m { dp[i][0] = i }
433
- for j in 0...n { dp[0][j] = j }
434
-
435
- for i in 1...m {
436
- for j in 1...n {
437
- if reference[i - 1] == hypothesis[j - 1] {
438
- dp[i][j] = dp[i - 1][j - 1]
439
- } else {
440
- dp[i][j] = min(dp[i - 1][j - 1], min(dp[i - 1][j], dp[i][j - 1])) + 1
441
- }
442
- }
443
- }
444
-
445
- return Double(dp[m][n]) / Double(m)
446
- }
447
-
448
- private static func printUsage() {
449
- print(
450
- """
451
- Hybrid 110M Earnings Benchmark (Single Encoder)
452
-
453
- Usage: fluidaudio hybrid-earnings-benchmark [options]
454
-
455
- This benchmark uses ONLY the Hybrid 110M model:
456
- - Single encoder provides CTC log-probs
457
- - CTC greedy decode for transcription
458
- - CTC keyword spotting from same encoder output
459
-
460
- Options:
461
- --max-files <n> Maximum number of files to process
462
- --output, -o <path> Output JSON file (default: hybrid_earnings_benchmark.json)
463
- --keyword-mode <mode> Keyword mode: chunk or file (default: chunk)
464
-
465
- Compare with:
466
- fluidaudio ctc-earnings-benchmark (Canary-CTC + TDT 0.6B, two encoders)
467
- """)
468
- }
469
-
470
- private static func parseKeywordMode(_ value: String) -> KeywordMode? {
471
- switch value.lowercased() {
472
- case "chunk", "chunk-keywords":
473
- return .chunk
474
- case "file", "file-keywords":
475
- return .file
476
- default:
477
- return nil
478
- }
479
- }
480
-
481
- private static func parentId(from fileId: String) -> String {
482
- guard let range = fileId.range(of: "_chunk") else {
483
- return fileId
484
- }
485
- return String(fileId[..<range.lowerBound])
486
- }
487
-
488
- private static func buildKeywordIndex(dataDir: URL, keywordMode: KeywordMode) throws -> [String: [String]] {
489
- guard keywordMode == .file else {
490
- return [:]
491
- }
492
-
493
- var index: [String: Set<String>] = [:]
494
- let suffix = ".dictionary.txt"
495
- let fileManager = FileManager.default
496
- let contents = try fileManager.contentsOfDirectory(at: dataDir, includingPropertiesForKeys: nil)
497
-
498
- for url in contents {
499
- let name = url.lastPathComponent
500
- guard name.hasSuffix(suffix) else { continue }
501
- let fileId = String(name.dropLast(suffix.count))
502
- let parent = parentId(from: fileId)
503
- let words = try loadDictionaryWords(from: url)
504
- var set = index[parent] ?? Set<String>()
505
- set.formUnion(words)
506
- index[parent] = set
507
- }
508
-
509
- return index.mapValues { Array($0).sorted() }
510
- }
511
-
512
- private static func loadDictionaryWords(
513
- fileId: String,
514
- dictionaryFile: URL,
515
- keywordMode: KeywordMode,
516
- keywordIndex: [String: [String]]
517
- ) throws -> [String] {
518
- switch keywordMode {
519
- case .chunk:
520
- return try loadDictionaryWords(from: dictionaryFile)
521
- case .file:
522
- let parent = parentId(from: fileId)
523
- if let words = keywordIndex[parent] {
524
- return words
525
- }
526
- return try loadDictionaryWords(from: dictionaryFile)
527
- }
528
- }
529
-
530
- private static func loadDictionaryWords(from url: URL) throws -> [String] {
531
- let dictionaryContent = try String(contentsOf: url, encoding: .utf8)
532
- return dictionaryContent
533
- .components(separatedBy: .newlines)
534
- .map { $0.trimmingCharacters(in: .whitespacesAndNewlines) }
535
- .filter { !$0.isEmpty }
536
- }
537
-
538
- private static func keywordsInText(_ text: String, dictionaryWords: [String]) -> Set<String> {
539
- let textLower = text.lowercased()
540
- var result: Set<String> = []
541
-
542
- for word in dictionaryWords {
543
- let wordLower = word.lowercased()
544
- let pattern = "\\b\(NSRegularExpression.escapedPattern(for: wordLower))\\b"
545
- guard let regex = try? NSRegularExpression(pattern: pattern, options: []) else { continue }
546
- let range = NSRange(textLower.startIndex..., in: textLower)
547
- if regex.firstMatch(in: textLower, options: [], range: range) != nil {
548
- result.insert(wordLower)
549
- }
550
- }
551
- return result
552
- }
553
- }
554
- #endif
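For reference outside the CLI: the removed calculateWER above is a textbook word-level Levenshtein distance normalized by reference length. A minimal standalone Swift sketch of the same computation, using a rolling DP row instead of the full table (the wordErrorRate name is ours, not a FluidAudio API):

import Foundation

/// Word error rate: Levenshtein edit distance over word arrays,
/// divided by the reference length (same recurrence as calculateWER above).
func wordErrorRate(reference: [String], hypothesis: [String]) -> Double {
    if reference.isEmpty { return hypothesis.isEmpty ? 0.0 : 1.0 }
    if hypothesis.isEmpty { return 1.0 }  // all reference words deleted
    // Keep only the previous DP row; each entry is the edit distance so far.
    var prev = Array(0...hypothesis.count)
    for i in 1...reference.count {
        var curr = [i] + Array(repeating: 0, count: hypothesis.count)
        for j in 1...hypothesis.count {
            curr[j] = reference[i - 1] == hypothesis[j - 1]
                ? prev[j - 1]
                : min(prev[j - 1], prev[j], curr[j - 1]) + 1
        }
        prev = curr
    }
    return Double(prev[hypothesis.count]) / Double(reference.count)
}

// Example: one substitution plus one deletion over four reference words -> 0.5.
let wer = wordErrorRate(reference: ["net", "revenue", "rose", "ten"],
                        hypothesis: ["net", "revenues", "rose"])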
 
config.json DELETED
@@ -1 +0,0 @@
- {}
 
 
convert/.DS_Store DELETED
Binary file (10.2 kB)
 
convert/parakeet-tdt-ctc-110m/convert_tdt_decoder.py DELETED
@@ -1,323 +0,0 @@
- #!/usr/bin/env python3
- """
- Convert Parakeet TDT-CTC 110M decoder components to CoreML.
-
- This script exports the TDT decoder (prediction network) and joint network
- with the SAME format as the working 0.6B model:
- - JointDecision outputs token_id, token_prob, duration (argmax done inside)
- - Uses shape [1, dim, 1] for encoder/decoder steps
- - Matches the interface expected by TdtDecoderV3
- """
-
- import argparse
- import os
- import torch
- import torch.nn.functional as F
- import coremltools as ct
- import numpy as np
- from pathlib import Path
-
- # NeMo imports
- import nemo.collections.asr as nemo_asr
-
-
- def get_model_config(model):
-     """Extract model configuration."""
-     encoder_dim = None
-     pred_hidden = 640  # Default for parakeet models
-     num_layers = 1
-     vocab_size = 1024
-     num_durations = 5
-
-     # Get encoder dimension
-     if hasattr(model, 'encoder'):
-         encoder = model.encoder
-         if hasattr(encoder, 'd_model'):
-             encoder_dim = encoder.d_model
-         elif hasattr(encoder, '_feat_out'):
-             encoder_dim = encoder._feat_out
-
-     # Get decoder config
-     if hasattr(model, 'decoder'):
-         decoder = model.decoder
-         if hasattr(decoder, 'pred_hidden'):
-             pred_hidden = decoder.pred_hidden
-         if hasattr(decoder, 'pred_rnn_layers'):
-             num_layers = decoder.pred_rnn_layers
-
-     # Get joint config
-     if hasattr(model, 'joint'):
-         joint = model.joint
-         if hasattr(joint, 'num_extra_outputs'):
-             num_durations = joint.num_extra_outputs
-         if hasattr(joint, 'num_classes'):
-             vocab_size = joint.num_classes - num_durations
-
-     return {
-         'encoder_dim': encoder_dim,
-         'pred_hidden': pred_hidden,
-         'num_layers': num_layers,
-         'vocab_size': vocab_size,
-         'num_durations': num_durations,
-     }
-
-
- class DecoderWrapper(torch.nn.Module):
-     """
-     Wrapper for the RNNT/TDT decoder (prediction network).
-
-     Matches 0.6B format:
-     - Input: targets[1,1], target_lengths[1], h_in[num_layers,1,pred_hidden], c_in[...]
-     - Output: decoder_output[1,pred_hidden,2], h_out[...], c_out[...]
-     """
-
-     def __init__(self, decoder, pred_hidden):
-         super().__init__()
-         self.decoder = decoder
-         self.pred_hidden = pred_hidden
-
-     def forward(self, targets, target_lengths, h_in, c_in):
-         """
-         Args:
-             targets: [1, 1] - previous token ID
-             target_lengths: [1] - always 1
-             h_in: [num_layers, 1, pred_hidden]
-             c_in: [num_layers, 1, pred_hidden]
-         Returns:
-             decoder_output: [1, pred_hidden, 2] - prediction network output (transposed)
-             h_out: [num_layers, 1, pred_hidden]
-             c_out: [num_layers, 1, pred_hidden]
-         """
-         state = (h_in, c_in)
-         # pred_output shape: [batch, time, pred_hidden] = [1, 1, pred_hidden]
-         pred_output, new_state = self.decoder.predict(targets, state=state, add_sos=False)
-         h_out, c_out = new_state
-
-         # Transpose to [batch, pred_hidden, time] and concat two time steps
-         # (0.6B outputs [1, 640, 2] - we match this by duplicating)
-         pred_transposed = pred_output.transpose(1, 2)  # [1, pred_hidden, 1]
-         decoder_output = torch.cat([pred_transposed, pred_transposed], dim=2)  # [1, pred_hidden, 2]
-
-         return decoder_output, h_out, c_out
-
-
- class JointWrapper(torch.nn.Module):
-     """
-     Wrapper for the TDT joint network with internal argmax.
-
-     Matches 0.6B format:
-     - Input: encoder_step[1,encoder_dim,1], decoder_step[1,pred_hidden,1]
-     - Output: token_id[1,1,1], token_prob[1,1,1], duration[1,1,1]
-     """
-
-     def __init__(self, joint, vocab_size, num_durations=5):
-         super().__init__()
-         self.joint = joint
-         self.vocab_size = vocab_size
-         self.num_durations = num_durations
-
-     def forward(self, encoder_step, decoder_step):
-         """
-         Args:
-             encoder_step: [1, encoder_dim, 1]
-             decoder_step: [1, pred_hidden, 1]
-         Returns:
-             token_id: [1, 1, 1] - argmax token ID
-             token_prob: [1, 1, 1] - probability of selected token
-             duration: [1, 1, 1] - argmax duration bin
-         """
-         # Transpose to [batch, 1, dim] for joint network
-         enc = encoder_step.transpose(1, 2)  # [1, 1, encoder_dim]
-         dec = decoder_step.transpose(1, 2)  # [1, 1, pred_hidden]
-
-         # Run joint network
-         # Joint output: [1, 1, 1, vocab_size + 1 (blank) + num_durations]
-         joint_out = self.joint.joint(enc, dec)
-
-         # Debug: print shape on first call
-         if not hasattr(self, '_debug_printed'):
-             self._debug_printed = True
-             print(f" Joint output shape: {joint_out.shape}")
-             print(f" Expected: vocab={self.vocab_size} + blank=1 + durations={self.num_durations} = {self.vocab_size + 1 + self.num_durations}")
-
-         # Split: token logits include vocab + blank, durations are separate
-         # vocab_size = 1024 tokens (0-1023), blank = index 1024, durations = indices 1025+
-         num_tokens = self.vocab_size + 1  # Include blank at vocab_size
-         logits = joint_out[..., :num_tokens]  # [1, 1, 1, vocab_size + 1]
-         duration_logits = joint_out[..., num_tokens:]  # [1, 1, 1, num_durations]
-
-         # Apply softmax and get probabilities
-         probs = F.softmax(logits, dim=-1)
-
-         # Argmax for token
-         token_id = torch.argmax(logits, dim=-1, keepdim=True)  # [1, 1, 1, 1]
-         token_id = token_id.squeeze(-1)  # [1, 1, 1]
-
-         # Get probability of selected token
-         token_prob = torch.gather(probs, -1, token_id.unsqueeze(-1))  # [1, 1, 1, 1]
-         token_prob = token_prob.squeeze(-1)  # [1, 1, 1]
-
-         # Argmax for duration
-         duration = torch.argmax(duration_logits, dim=-1, keepdim=False)  # [1, 1, 1]
-
-         return token_id.int(), token_prob, duration.int()
-
-
- def convert_decoder(model, config, output_dir: Path):
-     """Convert decoder to CoreML."""
-     print(f"Converting Decoder...")
-     print(f" pred_hidden={config['pred_hidden']}, num_layers={config['num_layers']}")
-
-     wrapper = DecoderWrapper(model.decoder, config['pred_hidden'])
-     wrapper.eval()
-
-     # Create example inputs
-     targets = torch.zeros(1, 1, dtype=torch.long)
-     target_lengths = torch.ones(1, dtype=torch.long)
-     h_in = torch.zeros(config['num_layers'], 1, config['pred_hidden'])
-     c_in = torch.zeros(config['num_layers'], 1, config['pred_hidden'])
-
-     # Trace the model
-     with torch.no_grad():
-         traced = torch.jit.trace(wrapper, (targets, target_lengths, h_in, c_in))
-
-     # Convert to CoreML
-     mlmodel = ct.convert(
-         traced,
-         inputs=[
-             ct.TensorType(name="targets", shape=(1, 1), dtype=np.int32),
-             ct.TensorType(name="target_lengths", shape=(1,), dtype=np.int32),
-             ct.TensorType(name="h_in", shape=(config['num_layers'], 1, config['pred_hidden']), dtype=np.float32),
-             ct.TensorType(name="c_in", shape=(config['num_layers'], 1, config['pred_hidden']), dtype=np.float32),
-         ],
-         outputs=[
-             ct.TensorType(name="decoder_output"),
-             ct.TensorType(name="h_out"),
-             ct.TensorType(name="c_out"),
-         ],
-         minimum_deployment_target=ct.target.iOS17,
-         compute_precision=ct.precision.FLOAT16,
-     )
-
-     # Add metadata
-     mlmodel.author = "Fluid Inference"
-     mlmodel.short_description = "Hybrid TDT Decoder (110M)"
-
-     # Save
-     output_path = output_dir / "Decoder.mlpackage"
-     mlmodel.save(str(output_path))
-     print(f" Saved to {output_path}")
-
-     return mlmodel
-
-
- def convert_joint(model, config, output_dir: Path):
-     """Convert joint network to CoreML."""
-     print(f"Converting JointDecision...")
-     print(f" encoder_dim={config['encoder_dim']}, pred_hidden={config['pred_hidden']}")
-     print(f" vocab_size={config['vocab_size']}, num_durations={config['num_durations']}")
-
-     wrapper = JointWrapper(
-         model.joint,
-         vocab_size=config['vocab_size'],
-         num_durations=config['num_durations']
-     )
-     wrapper.eval()
-
-     # Create example inputs - shape [1, dim, 1]
-     encoder_step = torch.randn(1, config['encoder_dim'], 1)
-     decoder_step = torch.randn(1, config['pred_hidden'], 1)
-
-     # Trace the model
-     with torch.no_grad():
-         traced = torch.jit.trace(wrapper, (encoder_step, decoder_step))
-
-     # Convert to CoreML
-     mlmodel = ct.convert(
-         traced,
-         inputs=[
-             ct.TensorType(name="encoder_step", shape=(1, config['encoder_dim'], 1), dtype=np.float32),
-             ct.TensorType(name="decoder_step", shape=(1, config['pred_hidden'], 1), dtype=np.float32),
-         ],
-         outputs=[
-             ct.TensorType(name="token_id"),
-             ct.TensorType(name="token_prob"),
-             ct.TensorType(name="duration"),
-         ],
-         minimum_deployment_target=ct.target.iOS17,
-         compute_precision=ct.precision.FLOAT16,
-     )
-
-     # Add metadata
-     mlmodel.author = "Fluid Inference"
-     mlmodel.short_description = "Hybrid Joint Decision (110M)"
-
-     # Save
-     output_path = output_dir / "JointDecision.mlpackage"
-     mlmodel.save(str(output_path))
-     print(f" Saved to {output_path}")
-
-     return mlmodel
-
-
- def main():
-     parser = argparse.ArgumentParser(description="Convert TDT decoder to CoreML (0.6B format)")
-     parser.add_argument(
-         "--model-name",
-         default="nvidia/parakeet-tdt_ctc-110m",
-         help="NeMo model name or path"
-     )
-     parser.add_argument(
-         "--output-dir",
-         type=Path,
-         default=Path("./output"),
-         help="Output directory for CoreML models"
-     )
-     args = parser.parse_args()
-
-     # Create output directory
-     args.output_dir.mkdir(parents=True, exist_ok=True)
-
-     # Load model
-     print(f"Loading model: {args.model_name}")
-     model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(args.model_name)
-     model.eval()
-
-     # Get model configuration
-     config = get_model_config(model)
-
-     # Auto-detect encoder dim if not found
-     if config['encoder_dim'] is None:
-         print("Auto-detecting encoder dimension...")
-         dummy_audio = torch.randn(1, 16000)
-         dummy_length = torch.tensor([16000])
-         with torch.no_grad():
-             enc_out, enc_len = model.encoder(
-                 audio_signal=dummy_audio,
-                 length=dummy_length
-             )
-         config['encoder_dim'] = enc_out.shape[-1]
-
-     print(f"\nModel config:")
-     for k, v in config.items():
-         print(f" {k}: {v}")
-
-     # Convert components
-     print()
-     convert_decoder(model, config, args.output_dir)
-     convert_joint(model, config, args.output_dir)
-
-     print("\nConversion complete!")
-     print(f"Models saved to: {args.output_dir}")
-     print("\nNext steps:")
-     print("1. Compile to .mlmodelc:")
-     print(f" cd {args.output_dir}")
-     print(" xcrun coremlcompiler compile Decoder.mlpackage .")
-     print(" xcrun coremlcompiler compile JointDecision.mlpackage .")
-     print("2. Copy to model cache:")
-     print(" cp -r Decoder.mlmodelc JointDecision.mlmodelc ~/Library/Application\\ Support/FluidAudio/Models/parakeet-ctc-110m-coreml/")
-     print("3. Test with: swift run fluidaudio hybrid-earnings-benchmark --max-files 1")
-
-
- if __name__ == "__main__":
-     main()
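The "Next steps" above end at copying the compiled .mlmodelc bundles into the FluidAudio model cache; on the Swift side they are then opened with plain Core ML. A minimal loading sketch under that assumption (the cache path mirrors the one printed in step 2 of the removed script; adjust for your setup):

import CoreML
import Foundation

do {
    // Cache location printed by the removed script's step 2.
    let cache = FileManager.default
        .urls(for: .applicationSupportDirectory, in: .userDomainMask)[0]
        .appendingPathComponent("FluidAudio/Models/parakeet-ctc-110m-coreml")

    let config = MLModelConfiguration()
    config.computeUnits = .all  // let Core ML schedule CPU/GPU/Neural Engine

    let decoder = try MLModel(
        contentsOf: cache.appendingPathComponent("Decoder.mlmodelc"),
        configuration: config)
    let joint = try MLModel(
        contentsOf: cache.appendingPathComponent("JointDecision.mlmodelc"),
        configuration: config)
    print("Loaded:", decoder.modelDescription, joint.modelDescription)
} catch {
    print("Model load failed: \(error)")
}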
 
convert/parakeet-tdt-ctc-110m/coreml/audio/yc_first_minute_16k_15s.wav DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c79c8bc763b4efccb3e12f199ec0a59aa2edc5e9e4d21ca70fde8f36762d4147
- size 480078
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/analytics/coremldata.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:fc681823d92eca3dbece3a30c975afa7251eedae0e718b07ffbf1a8b4313b87e
- size 243
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/coremldata.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:2ebec8fc38c063de4b2159e21b1f981309fa5947c24d7e4883aca20f7c15fbb9
- size 377
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/metadata.json DELETED
@@ -1,66 +0,0 @@
- [
-   {
-     "metadataOutputVersion" : "3.0",
-     "shortDescription" : "Parakeet 110M CTC decoder head",
-     "outputSchema" : [
-       {
-         "hasShapeFlexibility" : "0",
-         "isOptional" : "0",
-         "dataType" : "Float32",
-         "formattedType" : "MultiArray (Float32 1 Γ— 188 Γ— 1025)",
-         "shortDescription" : "",
-         "shape" : "[1, 188, 1025]",
-         "name" : "ctc_logits",
-         "type" : "MultiArray"
-       }
-     ],
-     "storagePrecision" : "Float16",
-     "modelParameters" : [
-
-     ],
-     "author" : "Fluid Inference",
-     "specificationVersion" : 8,
-     "mlProgramOperationTypeHistogram" : {
-       "Ios17.cast" : 2,
-       "Ios17.conv" : 1,
-       "Ios17.transpose" : 1,
-       "Ios16.softmax" : 1,
-       "Ios17.log" : 1
-     },
-     "computePrecision" : "Mixed (Float16, Float32, Int32)",
-     "isUpdatable" : "0",
-     "stateSchema" : [
-
-     ],
-     "availability" : {
-       "macOS" : "14.0",
-       "tvOS" : "17.0",
-       "visionOS" : "1.0",
-       "watchOS" : "10.0",
-       "iOS" : "17.0",
-       "macCatalyst" : "17.0"
-     },
-     "modelType" : {
-       "name" : "MLModelType_mlProgram"
-     },
-     "inputSchema" : [
-       {
-         "hasShapeFlexibility" : "0",
-         "isOptional" : "0",
-         "dataType" : "Float32",
-         "formattedType" : "MultiArray (Float32 1 Γ— 512 Γ— 188)",
-         "shortDescription" : "",
-         "shape" : "[1, 512, 188]",
-         "name" : "encoder_output",
-         "type" : "MultiArray"
-       }
-     ],
-     "userDefinedMetadata" : {
-       "com.github.apple.coremltools.source_dialect" : "TorchScript",
-       "com.github.apple.coremltools.source" : "torch==2.9.0",
-       "com.github.apple.coremltools.version" : "8.3.0"
-     },
-     "generatedClassName" : "parakeet_ctc_head",
-     "method" : "predict"
-   }
- ]
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/model.mil DELETED
@@ -1,24 +0,0 @@
- program(1.0)
- [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
- {
-     func main<ios17>(tensor<fp32, [1, 512, 188]> encoder_output) {
-         tensor<int32, []> var_4 = const()[name = tensor<string, []>("op_4"), val = tensor<int32, []>(-1)];
-         tensor<string, []> var_18_pad_type_0 = const()[name = tensor<string, []>("op_18_pad_type_0"), val = tensor<string, []>("valid")];
-         tensor<int32, [1]> var_18_strides_0 = const()[name = tensor<string, []>("op_18_strides_0"), val = tensor<int32, [1]>([1])];
-         tensor<int32, [2]> var_18_pad_0 = const()[name = tensor<string, []>("op_18_pad_0"), val = tensor<int32, [2]>([0, 0])];
-         tensor<int32, [1]> var_18_dilations_0 = const()[name = tensor<string, []>("op_18_dilations_0"), val = tensor<int32, [1]>([1])];
-         tensor<int32, []> var_18_groups_0 = const()[name = tensor<string, []>("op_18_groups_0"), val = tensor<int32, []>(1)];
-         tensor<string, []> encoder_output_to_fp16_dtype_0 = const()[name = tensor<string, []>("encoder_output_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
-         tensor<fp16, [1025, 512, 1]> module_decoder_layers_0_weight_to_fp16 = const()[name = tensor<string, []>("module_decoder_layers_0_weight_to_fp16"), val = tensor<fp16, [1025, 512, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
-         tensor<fp16, [1025]> module_decoder_layers_0_bias_to_fp16 = const()[name = tensor<string, []>("module_decoder_layers_0_bias_to_fp16"), val = tensor<fp16, [1025]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1049728)))];
-         tensor<fp16, [1, 512, 188]> encoder_output_to_fp16 = cast(dtype = encoder_output_to_fp16_dtype_0, x = encoder_output)[name = tensor<string, []>("cast_1")];
-         tensor<fp16, [1, 1025, 188]> var_18_cast_fp16 = conv(bias = module_decoder_layers_0_bias_to_fp16, dilations = var_18_dilations_0, groups = var_18_groups_0, pad = var_18_pad_0, pad_type = var_18_pad_type_0, strides = var_18_strides_0, weight = module_decoder_layers_0_weight_to_fp16, x = encoder_output_to_fp16)[name = tensor<string, []>("op_18_cast_fp16")];
-         tensor<int32, [3]> input_perm_0 = const()[name = tensor<string, []>("input_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
-         tensor<fp16, [1, 188, 1025]> input_cast_fp16 = transpose(perm = input_perm_0, x = var_18_cast_fp16)[name = tensor<string, []>("transpose_0")];
-         tensor<fp16, [1, 188, 1025]> out_objects_softmax_cast_fp16 = softmax(axis = var_4, x = input_cast_fp16)[name = tensor<string, []>("out_objects_softmax_cast_fp16")];
-         tensor<fp32, []> out_objects_epsilon_0 = const()[name = tensor<string, []>("out_objects_epsilon_0"), val = tensor<fp32, []>(0x1p-149)];
-         tensor<fp16, [1, 188, 1025]> out_objects_cast_fp16 = log(epsilon = out_objects_epsilon_0, x = out_objects_softmax_cast_fp16)[name = tensor<string, []>("out_objects_cast_fp16")];
-         tensor<string, []> out_objects_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("out_objects_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
-         tensor<fp32, [1, 188, 1025]> ctc_logits = cast(dtype = out_objects_cast_fp16_to_fp32_dtype_0, x = out_objects_cast_fp16)[name = tensor<string, []>("cast_0")];
-     } -> (ctc_logits);
- }
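Note from the MIL above that, despite the name, ctc_logits is a log-softmax output (softmax followed by log), so consumers can read it directly as per-frame log-probabilities over the 1025-entry vocabulary (1024 tokens plus blank). A hedged sketch of calling this head from Swift, assuming ctcHead is an MLModel opened the same way as the other compiled bundles, inside a throwing context:

import CoreML

// Input "encoder_output": Float32 [1, 512, 188]; output "ctc_logits": [1, 188, 1025].
let encoderOutput = try MLMultiArray(shape: [1, 512, 188], dataType: .float32)
// ... copy the encoder's 512x188 window into encoderOutput here ...

let provider = try MLDictionaryFeatureProvider(
    dictionary: ["encoder_output": MLFeatureValue(multiArray: encoderOutput)])
let out = try ctcHead.prediction(from: provider)

// Per-frame log-probabilities; a greedy CTC decode argmaxes over the last axis.
let logProbs = out.featureValue(for: "ctc_logits")!.multiArrayValue!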
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/weights/weight.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:fb9bead064427ffcb7529c0e3f378e421b4dde8e6d81447b6d1ca3352ca850e1
- size 1051842
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/analytics/coremldata.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:990455f6431342750254f66edf27bfb41be62a7ba17a18e1dd6afd4f5f56e9eb
- size 243
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/coremldata.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:29009727821ad8551ab5fe9271e93c597d92a9714f64b94aa533a9ceb6e22b93
- size 498
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/metadata.json DELETED
@@ -1,118 +0,0 @@
- [
-   {
-     "metadataOutputVersion" : "3.0",
-     "shortDescription" : "Parakeet 110M decoder (RNNT prediction network)",
-     "outputSchema" : [
-       {
-         "hasShapeFlexibility" : "0",
-         "isOptional" : "0",
-         "dataType" : "Float32",
-         "formattedType" : "MultiArray (Float32 1 Γ— 640 Γ— 1)",
-         "shortDescription" : "",
-         "shape" : "[1, 640, 1]",
-         "name" : "decoder",
-         "type" : "MultiArray"
-       },
-       {
-         "hasShapeFlexibility" : "0",
-         "isOptional" : "0",
-         "dataType" : "Float32",
-         "formattedType" : "MultiArray (Float32 1 Γ— 1 Γ— 640)",
-         "shortDescription" : "",
-         "shape" : "[1, 1, 640]",
-         "name" : "h_out",
-         "type" : "MultiArray"
-       },
-       {
-         "hasShapeFlexibility" : "0",
-         "isOptional" : "0",
-         "dataType" : "Float32",
-         "formattedType" : "MultiArray (Float32 1 Γ— 1 Γ— 640)",
-         "shortDescription" : "",
-         "shape" : "[1, 1, 640]",
-         "name" : "c_out",
-         "type" : "MultiArray"
-       }
-     ],
-     "storagePrecision" : "Float16",
-     "modelParameters" : [
-
-     ],
-     "author" : "Fluid Inference",
-     "specificationVersion" : 8,
-     "mlProgramOperationTypeHistogram" : {
-       "Ios17.squeeze" : 2,
-       "Ios17.gather" : 1,
-       "Ios17.cast" : 6,
-       "Ios17.lstm" : 1,
-       "Ios17.transpose" : 2,
-       "Identity" : 1,
-       "Ios17.expandDims" : 2
-     },
-     "computePrecision" : "Mixed (Float16, Float32, Int16, Int32)",
-     "isUpdatable" : "0",
-     "stateSchema" : [
-
-     ],
-     "availability" : {
-       "macOS" : "14.0",
-       "tvOS" : "17.0",
-       "visionOS" : "1.0",
-       "watchOS" : "10.0",
-       "iOS" : "17.0",
-       "macCatalyst" : "17.0"
-     },
-     "modelType" : {
-       "name" : "MLModelType_mlProgram"
-     },
-     "inputSchema" : [
-       {
-         "hasShapeFlexibility" : "0",
-         "isOptional" : "0",
-         "dataType" : "Int32",
-         "formattedType" : "MultiArray (Int32 1 Γ— 1)",
-         "shortDescription" : "",
-         "shape" : "[1, 1]",
-         "name" : "targets",
-         "type" : "MultiArray"
-       },
-       {
-         "hasShapeFlexibility" : "0",
-         "isOptional" : "0",
-         "dataType" : "Int32",
-         "formattedType" : "MultiArray (Int32 1)",
-         "shortDescription" : "",
-         "shape" : "[1]",
-         "name" : "target_length",
-         "type" : "MultiArray"
-       },
-       {
-         "hasShapeFlexibility" : "0",
-         "isOptional" : "0",
-         "dataType" : "Float32",
-         "formattedType" : "MultiArray (Float32 1 Γ— 1 Γ— 640)",
-         "shortDescription" : "",
-         "shape" : "[1, 1, 640]",
-         "name" : "h_in",
-         "type" : "MultiArray"
-       },
-       {
-         "hasShapeFlexibility" : "0",
-         "isOptional" : "0",
-         "dataType" : "Float32",
-         "formattedType" : "MultiArray (Float32 1 Γ— 1 Γ— 640)",
-         "shortDescription" : "",
-         "shape" : "[1, 1, 640]",
-         "name" : "c_in",
-         "type" : "MultiArray"
-       }
-     ],
-     "userDefinedMetadata" : {
-       "com.github.apple.coremltools.version" : "8.3.0",
-       "com.github.apple.coremltools.source_dialect" : "TorchScript",
-       "com.github.apple.coremltools.source" : "torch==2.9.0"
-     },
-     "generatedClassName" : "parakeet_decoder",
-     "method" : "predict"
-   }
- ]
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/model.mil DELETED
@@ -1,45 +0,0 @@
- program(1.0)
- [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
- {
-     func main<ios17>(tensor<fp32, [1, 1, 640]> c_in, tensor<fp32, [1, 1, 640]> h_in, tensor<int32, [1]> target_length, tensor<int32, [1, 1]> targets) {
-         tensor<int32, []> y_axis_0 = const()[name = tensor<string, []>("y_axis_0"), val = tensor<int32, []>(0)];
-         tensor<int32, []> y_batch_dims_0 = const()[name = tensor<string, []>("y_batch_dims_0"), val = tensor<int32, []>(0)];
-         tensor<bool, []> y_validate_indices_0 = const()[name = tensor<string, []>("y_validate_indices_0"), val = tensor<bool, []>(false)];
-         tensor<fp16, [1025, 640]> module_prediction_embed_weight_to_fp16 = const()[name = tensor<string, []>("module_prediction_embed_weight_to_fp16"), val = tensor<fp16, [1025, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
-         tensor<string, []> targets_to_int16_dtype_0 = const()[name = tensor<string, []>("targets_to_int16_dtype_0"), val = tensor<string, []>("int16")];
-         tensor<int16, [1, 1]> targets_to_int16 = cast(dtype = targets_to_int16_dtype_0, x = targets)[name = tensor<string, []>("cast_8")];
-         tensor<fp16, [1, 1, 640]> y_cast_fp16_cast_uint16 = gather(axis = y_axis_0, batch_dims = y_batch_dims_0, indices = targets_to_int16, validate_indices = y_validate_indices_0, x = module_prediction_embed_weight_to_fp16)[name = tensor<string, []>("y_cast_fp16_cast_uint16")];
-         tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
-         tensor<int32, [1]> input_lstm_h0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_h0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
-         tensor<string, []> h_in_to_fp16_dtype_0 = const()[name = tensor<string, []>("h_in_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
-         tensor<fp16, [1, 1, 640]> h_in_to_fp16 = cast(dtype = h_in_to_fp16_dtype_0, x = h_in)[name = tensor<string, []>("cast_7")];
-         tensor<fp16, [1, 640]> input_lstm_h0_squeeze_cast_fp16 = squeeze(axes = input_lstm_h0_squeeze_axes_0, x = h_in_to_fp16)[name = tensor<string, []>("input_lstm_h0_squeeze_cast_fp16")];
-         tensor<int32, [1]> input_lstm_c0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_c0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
-         tensor<string, []> c_in_to_fp16_dtype_0 = const()[name = tensor<string, []>("c_in_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
-         tensor<fp16, [1, 1, 640]> c_in_to_fp16 = cast(dtype = c_in_to_fp16_dtype_0, x = c_in)[name = tensor<string, []>("cast_6")];
-         tensor<fp16, [1, 640]> input_lstm_c0_squeeze_cast_fp16 = squeeze(axes = input_lstm_c0_squeeze_axes_0, x = c_in_to_fp16)[name = tensor<string, []>("input_lstm_c0_squeeze_cast_fp16")];
-         tensor<string, []> input_direction_0 = const()[name = tensor<string, []>("input_direction_0"), val = tensor<string, []>("forward")];
-         tensor<bool, []> input_output_sequence_0 = const()[name = tensor<string, []>("input_output_sequence_0"), val = tensor<bool, []>(true)];
-         tensor<string, []> input_recurrent_activation_0 = const()[name = tensor<string, []>("input_recurrent_activation_0"), val = tensor<string, []>("sigmoid")];
-         tensor<string, []> input_cell_activation_0 = const()[name = tensor<string, []>("input_cell_activation_0"), val = tensor<string, []>("tanh")];
-         tensor<string, []> input_activation_0 = const()[name = tensor<string, []>("input_activation_0"), val = tensor<string, []>("tanh")];
-         tensor<fp16, [2560, 640]> concat_1_to_fp16 = const()[name = tensor<string, []>("concat_1_to_fp16"), val = tensor<fp16, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1312128)))];
-         tensor<fp16, [2560, 640]> concat_2_to_fp16 = const()[name = tensor<string, []>("concat_2_to_fp16"), val = tensor<fp16, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4588992)))];
-         tensor<fp16, [2560]> concat_0_to_fp16 = const()[name = tensor<string, []>("concat_0_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7865856)))];
-         tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = y_cast_fp16_cast_uint16)[name = tensor<string, []>("transpose_2")];
-         tensor<fp16, [1, 1, 640]> input_cast_fp16_0, tensor<fp16, [1, 640]> input_cast_fp16_1, tensor<fp16, [1, 640]> input_cast_fp16_2 = lstm(activation = input_activation_0, bias = concat_0_to_fp16, cell_activation = input_cell_activation_0, direction = input_direction_0, initial_c = input_lstm_c0_squeeze_cast_fp16, initial_h = input_lstm_h0_squeeze_cast_fp16, output_sequence = input_output_sequence_0, recurrent_activation = input_recurrent_activation_0, weight_hh = concat_2_to_fp16, weight_ih = concat_1_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
-         tensor<int32, [1]> obj_3_axes_0 = const()[name = tensor<string, []>("obj_3_axes_0"), val = tensor<int32, [1]>([0])];
-         tensor<fp16, [1, 1, 640]> obj_3_cast_fp16 = expand_dims(axes = obj_3_axes_0, x = input_cast_fp16_1)[name = tensor<string, []>("obj_3_cast_fp16")];
-         tensor<string, []> obj_3_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("obj_3_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
-         tensor<int32, [1]> obj_axes_0 = const()[name = tensor<string, []>("obj_axes_0"), val = tensor<int32, [1]>([0])];
-         tensor<fp16, [1, 1, 640]> obj_cast_fp16 = expand_dims(axes = obj_axes_0, x = input_cast_fp16_2)[name = tensor<string, []>("obj_cast_fp16")];
-         tensor<string, []> obj_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("obj_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
-         tensor<int32, [3]> transpose_0_perm_0 = const()[name = tensor<string, []>("transpose_0_perm_0"), val = tensor<int32, [3]>([1, 2, 0])];
-         tensor<string, []> transpose_0_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("transpose_0_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
-         tensor<fp16, [1, 640, 1]> transpose_0_cast_fp16 = transpose(perm = transpose_0_perm_0, x = input_cast_fp16_0)[name = tensor<string, []>("transpose_1")];
-         tensor<fp32, [1, 640, 1]> decoder = cast(dtype = transpose_0_cast_fp16_to_fp32_dtype_0, x = transpose_0_cast_fp16)[name = tensor<string, []>("cast_3")];
-         tensor<fp32, [1, 1, 640]> c_out = cast(dtype = obj_cast_fp16_to_fp32_dtype_0, x = obj_cast_fp16)[name = tensor<string, []>("cast_4")];
-         tensor<fp32, [1, 1, 640]> h_out = cast(dtype = obj_3_cast_fp16_to_fp32_dtype_0, x = obj_3_cast_fp16)[name = tensor<string, []>("cast_5")];
-         tensor<int32, [1]> target_length_tmp = identity(x = target_length)[name = tensor<string, []>("target_length_tmp")];
-     } -> (decoder, h_out, c_out);
- }
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/weights/weight.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:dd90b58597ee2c172c672dffe13b1110898ba07394c1a15efc96cc8c6b18411b
- size 7871040
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/analytics/coremldata.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b7ae65e2af616df46066b7efca2d7c19941666ac0685f4ed005666890a052b0d
- size 243
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/coremldata.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:0713c2d6ac5f8f6fb9582be250351ebd8efc925f71f4261191165f1406f2ee5d
- size 437
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/metadata.json DELETED
@@ -1,105 +0,0 @@
- [
-   {
-     "metadataOutputVersion" : "3.0",
-     "shortDescription" : "Parakeet 110M encoder (15 s window)",
-     "outputSchema" : [
-       {
-         "hasShapeFlexibility" : "0",
-         "isOptional" : "0",
-         "dataType" : "Float32",
-         "formattedType" : "MultiArray (Float32 1 Γ— 512 Γ— 188)",
-         "shortDescription" : "",
-         "shape" : "[1, 512, 188]",
-         "name" : "encoder_output",
-         "type" : "MultiArray"
-       },
-       {
-         "hasShapeFlexibility" : "0",
-         "isOptional" : "0",
-         "dataType" : "Int32",
-         "formattedType" : "MultiArray (Int32 1)",
-         "shortDescription" : "",
-         "shape" : "[1]",
-         "name" : "encoder_length",
-         "type" : "MultiArray"
-       }
-     ],
-     "storagePrecision" : "Float16",
-     "modelParameters" : [
-
-     ],
-     "author" : "Fluid Inference",
-     "specificationVersion" : 8,
-     "mlProgramOperationTypeHistogram" : {
-       "Ios17.logicalAnd" : 2,
-       "Ios17.reshape" : 103,
-       "Ios16.softmax" : 17,
-       "Ios17.matmul" : 51,
-       "Ios17.transpose" : 123,
-       "Split" : 17,
-       "Ios17.expandDims" : 17,
-       "Select" : 51,
-       "Ios17.add" : 128,
-       "Tile" : 8,
-       "Ios17.sliceByIndex" : 34,
-       "Ios16.sigmoid" : 17,
-       "Pad" : 34,
-       "Ios17.logicalNot" : 2,
-       "Ios17.layerNorm" : 85,
-       "Ios16.silu" : 51,
-       "Ios17.less" : 5,
-       "Ios17.sub" : 3,
-       "Ios17.conv" : 56,
-       "Ios16.relu" : 3,
-       "Ios17.linear" : 137,
-       "Ios17.cast" : 11,
-       "Ios17.floorDiv" : 3,
-       "Ios17.mul" : 77
-     },
-     "computePrecision" : "Mixed (Float16, Float32, Int32)",
-     "isUpdatable" : "0",
-     "stateSchema" : [
-
-     ],
-     "availability" : {
-       "macOS" : "14.0",
-       "tvOS" : "17.0",
-       "visionOS" : "1.0",
-       "watchOS" : "10.0",
-       "iOS" : "17.0",
-       "macCatalyst" : "17.0"
-     },
-     "modelType" : {
-       "name" : "MLModelType_mlProgram"
-     },
-     "inputSchema" : [
-       {
-         "hasShapeFlexibility" : "0",
-         "isOptional" : "0",
-         "dataType" : "Float32",
-         "formattedType" : "MultiArray (Float32 1 Γ— 80 Γ— 1501)",
-         "shortDescription" : "",
-         "shape" : "[1, 80, 1501]",
-         "name" : "mel_features",
-         "type" : "MultiArray"
-       },
-       {
-         "hasShapeFlexibility" : "0",
-         "isOptional" : "0",
-         "dataType" : "Int32",
-         "formattedType" : "MultiArray (Int32 1)",
-         "shortDescription" : "",
-         "shape" : "[1]",
-         "name" : "mel_length",
-         "type" : "MultiArray"
-       }
-     ],
-     "userDefinedMetadata" : {
-       "com.github.apple.coremltools.source_dialect" : "TorchScript",
-       "com.github.apple.coremltools.source" : "torch==2.9.0",
-       "com.github.apple.coremltools.version" : "8.3.0"
-     },
-     "generatedClassName" : "parakeet_encoder",
-     "method" : "predict"
-   }
- ]
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/model.mil DELETED
The diff for this file is too large to render. See raw diff
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/weights/weight.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:cecf7994b2758397d992802a4f6e5d656e3a1aeb7bbedc2aa430b1316d62474c
- size 215143424
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/analytics/coremldata.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:983ba26dd9276b8d2d4f75f3475aefb1817c542df87dbd0fdac95bd63647494f
- size 243
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/coremldata.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:0800e3bdf4ecb1bd46fd27e1826d33125cd574f9ae1e15dd9ff70ea42944ca2d
- size 476
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/metadata.json DELETED
@@ -1,102 +0,0 @@
- [
-   {
-     "metadataOutputVersion" : "3.0",
-     "shortDescription" : "Parakeet 110M joint + decision head (split, softmax, argmax)",
-     "outputSchema" : [
-       {
-         "hasShapeFlexibility" : "0",
-         "isOptional" : "0",
-         "dataType" : "Int32",
-         "formattedType" : "MultiArray (Int32 1 Γ— 188 Γ— 1)",
-         "shortDescription" : "",
-         "shape" : "[1, 188, 1]",
-         "name" : "token_id",
-         "type" : "MultiArray"
-       },
-       {
-         "hasShapeFlexibility" : "0",
-         "isOptional" : "0",
-         "dataType" : "Float32",
-         "formattedType" : "MultiArray (Float32 1 Γ— 188 Γ— 1)",
-         "shortDescription" : "",
-         "shape" : "[1, 188, 1]",
-         "name" : "token_prob",
-         "type" : "MultiArray"
-       },
-       {
-         "hasShapeFlexibility" : "0",
-         "isOptional" : "0",
-         "dataType" : "Int32",
-         "formattedType" : "MultiArray (Int32 1 Γ— 188 Γ— 1)",
-         "shortDescription" : "",
-         "shape" : "[1, 188, 1]",
-         "name" : "duration",
-         "type" : "MultiArray"
-       }
-     ],
-     "storagePrecision" : "Float16",
-     "modelParameters" : [
-
-     ],
-     "author" : "Fluid Inference",
-     "specificationVersion" : 8,
-     "mlProgramOperationTypeHistogram" : {
-       "Ios17.reduceArgmax" : 2,
-       "Ios17.squeeze" : 1,
-       "Ios17.cast" : 4,
-       "Ios17.linear" : 3,
-       "Ios17.transpose" : 2,
-       "Ios17.sliceByIndex" : 2,
-       "Ios17.add" : 1,
-       "Ios16.relu" : 1,
-       "Ios16.softmax" : 1,
-       "Ios17.gatherAlongAxis" : 1,
-       "Ios17.expandDims" : 3
-     },
-     "computePrecision" : "Mixed (Float16, Float32, Int16, Int32)",
-     "isUpdatable" : "0",
-     "stateSchema" : [
-
-     ],
-     "availability" : {
-       "macOS" : "14.0",
-       "tvOS" : "17.0",
-       "visionOS" : "1.0",
-       "watchOS" : "10.0",
-       "iOS" : "17.0",
-       "macCatalyst" : "17.0"
-     },
-     "modelType" : {
-       "name" : "MLModelType_mlProgram"
-     },
-     "inputSchema" : [
-       {
-         "hasShapeFlexibility" : "0",
-         "isOptional" : "0",
-         "dataType" : "Float32",
-         "formattedType" : "MultiArray (Float32 1 Γ— 512 Γ— 188)",
-         "shortDescription" : "",
-         "shape" : "[1, 512, 188]",
-         "name" : "encoder",
-         "type" : "MultiArray"
-       },
-       {
-         "hasShapeFlexibility" : "0",
-         "isOptional" : "0",
-         "dataType" : "Float32",
-         "formattedType" : "MultiArray (Float32 1 Γ— 640 Γ— 1)",
-         "shortDescription" : "",
-         "shape" : "[1, 640, 1]",
-         "name" : "decoder",
-         "type" : "MultiArray"
-       }
-     ],
-     "userDefinedMetadata" : {
-       "com.github.apple.coremltools.version" : "8.3.0",
-       "com.github.apple.coremltools.source_dialect" : "TorchScript",
-       "com.github.apple.coremltools.source" : "torch==2.9.0"
-     },
-     "generatedClassName" : "parakeet_joint_decision",
-     "method" : "predict"
-   }
- ]
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/model.mil DELETED
@@ -1,58 +0,0 @@
1
- program(1.0)
2
- [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
3
- {
4
- func main<ios17>(tensor<fp32, [1, 640, 1]> decoder, tensor<fp32, [1, 512, 188]> encoder) {
5
- tensor<int32, [3]> input_1_perm_0 = const()[name = tensor<string, []>("input_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
6
- tensor<string, []> encoder_to_fp16_dtype_0 = const()[name = tensor<string, []>("encoder_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
7
- tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
8
- tensor<string, []> decoder_to_fp16_dtype_0 = const()[name = tensor<string, []>("decoder_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
9
- tensor<fp16, [640, 512]> joint_module_enc_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_weight_to_fp16"), val = tensor<fp16, [640, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
10
- tensor<fp16, [640]> joint_module_enc_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(655488)))];
11
- tensor<fp16, [1, 512, 188]> encoder_to_fp16 = cast(dtype = encoder_to_fp16_dtype_0, x = encoder)[name = tensor<string, []>("cast_6")];
12
- tensor<fp16, [1, 188, 512]> input_1_cast_fp16 = transpose(perm = input_1_perm_0, x = encoder_to_fp16)[name = tensor<string, []>("transpose_1")];
13
- tensor<fp16, [1, 188, 640]> linear_0_cast_fp16 = linear(bias = joint_module_enc_bias_to_fp16, weight = joint_module_enc_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("linear_0_cast_fp16")];
14
- tensor<fp16, [640, 640]> joint_module_pred_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_weight_to_fp16"), val = tensor<fp16, [640, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(656832)))];
15
- tensor<fp16, [640]> joint_module_pred_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1476096)))];
16
- tensor<fp16, [1, 640, 1]> decoder_to_fp16 = cast(dtype = decoder_to_fp16_dtype_0, x = decoder)[name = tensor<string, []>("cast_5")];
17
- tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = decoder_to_fp16)[name = tensor<string, []>("transpose_0")];
18
- tensor<fp16, [1, 1, 640]> linear_1_cast_fp16 = linear(bias = joint_module_pred_bias_to_fp16, weight = joint_module_pred_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("linear_1_cast_fp16")];
19
- tensor<int32, [1]> var_23_axes_0 = const()[name = tensor<string, []>("op_23_axes_0"), val = tensor<int32, [1]>([2])];
20
- tensor<fp16, [1, 188, 1, 640]> var_23_cast_fp16 = expand_dims(axes = var_23_axes_0, x = linear_0_cast_fp16)[name = tensor<string, []>("op_23_cast_fp16")];
21
- tensor<int32, [1]> var_24_axes_0 = const()[name = tensor<string, []>("op_24_axes_0"), val = tensor<int32, [1]>([1])];
22
- tensor<fp16, [1, 1, 1, 640]> var_24_cast_fp16 = expand_dims(axes = var_24_axes_0, x = linear_1_cast_fp16)[name = tensor<string, []>("op_24_cast_fp16")];
23
- tensor<fp16, [1, 188, 1, 640]> input_5_cast_fp16 = add(x = var_23_cast_fp16, y = var_24_cast_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
24
- tensor<fp16, [1, 188, 1, 640]> input_7_cast_fp16 = relu(x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
25
- tensor<fp16, [1030, 640]> joint_module_joint_net_2_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_weight_to_fp16"), val = tensor<fp16, [1030, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1477440)))];
26
- tensor<fp16, [1030]> joint_module_joint_net_2_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_bias_to_fp16"), val = tensor<fp16, [1030]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2795904)))];
27
- tensor<fp16, [1, 188, 1, 1030]> linear_2_cast_fp16 = linear(bias = joint_module_joint_net_2_bias_to_fp16, weight = joint_module_joint_net_2_weight_to_fp16, x = input_7_cast_fp16)[name = tensor<string, []>("linear_2_cast_fp16")];
28
- tensor<int32, [4]> token_logits_begin_0 = const()[name = tensor<string, []>("token_logits_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
29
- tensor<int32, [4]> token_logits_end_0 = const()[name = tensor<string, []>("token_logits_end_0"), val = tensor<int32, [4]>([1, 188, 1, 1025])];
30
- tensor<bool, [4]> token_logits_end_mask_0 = const()[name = tensor<string, []>("token_logits_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
31
- tensor<fp16, [1, 188, 1, 1025]> token_logits_cast_fp16 = slice_by_index(begin = token_logits_begin_0, end = token_logits_end_0, end_mask = token_logits_end_mask_0, x = linear_2_cast_fp16)[name = tensor<string, []>("token_logits_cast_fp16")];
32
- tensor<int32, [4]> duration_logits_begin_0 = const()[name = tensor<string, []>("duration_logits_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1025])];
33
- tensor<int32, [4]> duration_logits_end_0 = const()[name = tensor<string, []>("duration_logits_end_0"), val = tensor<int32, [4]>([1, 188, 1, 1030])];
34
- tensor<bool, [4]> duration_logits_end_mask_0 = const()[name = tensor<string, []>("duration_logits_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
35
- tensor<fp16, [1, 188, 1, 5]> duration_logits_cast_fp16 = slice_by_index(begin = duration_logits_begin_0, end = duration_logits_end_0, end_mask = duration_logits_end_mask_0, x = linear_2_cast_fp16)[name = tensor<string, []>("duration_logits_cast_fp16")];
36
- tensor<int32, []> var_43_axis_0 = const()[name = tensor<string, []>("op_43_axis_0"), val = tensor<int32, []>(-1)];
37
- tensor<bool, []> var_43_keep_dims_0 = const()[name = tensor<string, []>("op_43_keep_dims_0"), val = tensor<bool, []>(false)];
38
- tensor<string, []> var_43_output_dtype_0 = const()[name = tensor<string, []>("op_43_output_dtype_0"), val = tensor<string, []>("int32")];
39
- tensor<int32, [1, 188, 1]> token_id = reduce_argmax(axis = var_43_axis_0, keep_dims = var_43_keep_dims_0, output_dtype = var_43_output_dtype_0, x = token_logits_cast_fp16)[name = tensor<string, []>("op_43_cast_fp16")];
40
- tensor<int32, []> var_49 = const()[name = tensor<string, []>("op_49"), val = tensor<int32, []>(-1)];
41
- tensor<fp16, [1, 188, 1, 1025]> token_probs_all_cast_fp16 = softmax(axis = var_49, x = token_logits_cast_fp16)[name = tensor<string, []>("token_probs_all_cast_fp16")];
42
- tensor<int32, [1]> var_58_axes_0 = const()[name = tensor<string, []>("op_58_axes_0"), val = tensor<int32, [1]>([-1])];
43
- tensor<int32, [1, 188, 1, 1]> var_58 = expand_dims(axes = var_58_axes_0, x = token_id)[name = tensor<string, []>("op_58")];
44
- tensor<int32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<int32, []>(-1)];
45
- tensor<bool, []> var_61_validate_indices_0 = const()[name = tensor<string, []>("op_61_validate_indices_0"), val = tensor<bool, []>(false)];
46
- tensor<string, []> var_58_to_int16_dtype_0 = const()[name = tensor<string, []>("op_58_to_int16_dtype_0"), val = tensor<string, []>("int16")];
47
- tensor<int16, [1, 188, 1, 1]> var_58_to_int16 = cast(dtype = var_58_to_int16_dtype_0, x = var_58)[name = tensor<string, []>("cast_4")];
48
- tensor<fp16, [1, 188, 1, 1]> var_61_cast_fp16_cast_int16 = gather_along_axis(axis = var_59, indices = var_58_to_int16, validate_indices = var_61_validate_indices_0, x = token_probs_all_cast_fp16)[name = tensor<string, []>("op_61_cast_fp16_cast_int16")];
49
- tensor<int32, [1]> var_63_axes_0 = const()[name = tensor<string, []>("op_63_axes_0"), val = tensor<int32, [1]>([-1])];
50
- tensor<fp16, [1, 188, 1]> var_63_cast_fp16 = squeeze(axes = var_63_axes_0, x = var_61_cast_fp16_cast_int16)[name = tensor<string, []>("op_63_cast_fp16")];
51
- tensor<string, []> var_63_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
52
- tensor<int32, []> var_66_axis_0 = const()[name = tensor<string, []>("op_66_axis_0"), val = tensor<int32, []>(-1)];
53
- tensor<bool, []> var_66_keep_dims_0 = const()[name = tensor<string, []>("op_66_keep_dims_0"), val = tensor<bool, []>(false)];
54
- tensor<string, []> var_66_output_dtype_0 = const()[name = tensor<string, []>("op_66_output_dtype_0"), val = tensor<string, []>("int32")];
55
- tensor<int32, [1, 188, 1]> duration = reduce_argmax(axis = var_66_axis_0, keep_dims = var_66_keep_dims_0, output_dtype = var_66_output_dtype_0, x = duration_logits_cast_fp16)[name = tensor<string, []>("op_66_cast_fp16")];
56
- tensor<fp32, [1, 188, 1]> token_prob = cast(dtype = var_63_cast_fp16_to_fp32_dtype_0, x = var_63_cast_fp16)[name = tensor<string, []>("cast_3")];
57
- } -> (token_id, token_prob, duration);
58
- }
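The deleted program above is the tail of the batch JointDecision head: it adds the 640-dim encoder and decoder projections, applies ReLU, projects to 1030 logits, then splits those into 1025 token logits (1024 subword ids plus blank) and 5 duration logits, taking an argmax for token_id and duration and a softmax gather for token_prob. A minimal plain-Swift sketch of that per-frame decision math (a hypothetical helper, not part of FluidAudio; in practice Core ML executes the fused program):

    import Foundation

    // Per-frame joint decision, mirroring the MIL above. `jointLogits` is one
    // frame's 1030-dim joint output: 1025 token logits then 5 duration logits.
    func jointDecision(jointLogits: [Float]) -> (tokenId: Int, tokenProb: Float, duration: Int) {
        precondition(jointLogits.count == 1030)
        let tokenLogits = Array(jointLogits[0..<1025])
        let durationLogits = Array(jointLogits[1025..<1030])
        // reduce_argmax over token logits (index 1024 is blank)
        let tokenId = tokenLogits.indices.max { tokenLogits[$0] < tokenLogits[$1] }!
        // softmax + gather_along_axis: probability of the chosen token
        let maxLogit = tokenLogits.max()!
        let exps = tokenLogits.map { exp($0 - maxLogit) }
        let tokenProb = exps[tokenId] / exps.reduce(0, +)
        // reduce_argmax over the 5 duration bins
        let duration = durationLogits.indices.max { durationLogits[$0] < durationLogits[$1] }!
        return (tokenId, tokenProb, duration)
    }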
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/weights/weight.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b3f771cb65b190f1873e39629676ed79b65a8361522f451b37bdba8b1106e6ff
3
- size 2798028
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/analytics/coremldata.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c7c11c6bb985fab7f835ba687a575f1eb04f4c93b0783155d634adbc49f0e797
3
- size 243
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/coremldata.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1af2cb9bcc13eec83ce006e4f1c2cf158393745cd9187428333fbcb6917da244
3
- size 535
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/metadata.json DELETED
@@ -1,123 +0,0 @@
1
- [
2
- {
3
- "metadataOutputVersion" : "3.0",
4
- "shortDescription" : "Parakeet 110M single-step joint decision (current frame)",
5
- "outputSchema" : [
6
- {
7
- "hasShapeFlexibility" : "0",
8
- "isOptional" : "0",
9
- "dataType" : "Int32",
10
- "formattedType" : "MultiArray (Int32 1 Γ— 1 Γ— 1)",
11
- "shortDescription" : "",
12
- "shape" : "[1, 1, 1]",
13
- "name" : "token_id",
14
- "type" : "MultiArray"
15
- },
16
- {
17
- "hasShapeFlexibility" : "0",
18
- "isOptional" : "0",
19
- "dataType" : "Float32",
20
- "formattedType" : "MultiArray (Float32 1 Γ— 1 Γ— 1)",
21
- "shortDescription" : "",
22
- "shape" : "[1, 1, 1]",
23
- "name" : "token_prob",
24
- "type" : "MultiArray"
25
- },
26
- {
27
- "hasShapeFlexibility" : "0",
28
- "isOptional" : "0",
29
- "dataType" : "Int32",
30
- "formattedType" : "MultiArray (Int32 1 Γ— 1 Γ— 1)",
31
- "shortDescription" : "",
32
- "shape" : "[1, 1, 1]",
33
- "name" : "duration",
34
- "type" : "MultiArray"
35
- },
36
- {
37
- "hasShapeFlexibility" : "0",
38
- "isOptional" : "0",
39
- "dataType" : "Int32",
40
- "formattedType" : "MultiArray (Int32 1 Γ— 1 Γ— 1 Γ— 64)",
41
- "shortDescription" : "",
42
- "shape" : "[1, 1, 1, 64]",
43
- "name" : "top_k_ids",
44
- "type" : "MultiArray"
45
- },
46
- {
47
- "hasShapeFlexibility" : "0",
48
- "isOptional" : "0",
49
- "dataType" : "Float32",
50
- "formattedType" : "MultiArray (Float32 1 Γ— 1 Γ— 1 Γ— 64)",
51
- "shortDescription" : "",
52
- "shape" : "[1, 1, 1, 64]",
53
- "name" : "top_k_logits",
54
- "type" : "MultiArray"
55
- }
56
- ],
57
- "storagePrecision" : "Float16",
58
- "modelParameters" : [
59
-
60
- ],
61
- "author" : "Fluid Inference",
62
- "specificationVersion" : 8,
63
- "mlProgramOperationTypeHistogram" : {
64
- "Ios17.reduceArgmax" : 2,
65
- "Ios17.linear" : 3,
66
- "Ios17.transpose" : 2,
67
- "Ios17.sliceByIndex" : 2,
68
- "Ios17.add" : 1,
69
- "Ios17.topk" : 1,
70
- "Ios16.relu" : 1,
71
- "Ios16.softmax" : 1,
72
- "Ios17.expandDims" : 3,
73
- "Ios17.squeeze" : 1,
74
- "Ios17.cast" : 6,
75
- "Ios17.gatherAlongAxis" : 1
76
- },
77
- "computePrecision" : "Mixed (Float16, Float32, Int16, Int32, UInt16)",
78
- "isUpdatable" : "0",
79
- "stateSchema" : [
80
-
81
- ],
82
- "availability" : {
83
- "macOS" : "14.0",
84
- "tvOS" : "17.0",
85
- "visionOS" : "1.0",
86
- "watchOS" : "10.0",
87
- "iOS" : "17.0",
88
- "macCatalyst" : "17.0"
89
- },
90
- "modelType" : {
91
- "name" : "MLModelType_mlProgram"
92
- },
93
- "inputSchema" : [
94
- {
95
- "hasShapeFlexibility" : "0",
96
- "isOptional" : "0",
97
- "dataType" : "Float32",
98
- "formattedType" : "MultiArray (Float32 1 Γ— 512 Γ— 1)",
99
- "shortDescription" : "",
100
- "shape" : "[1, 512, 1]",
101
- "name" : "encoder_step",
102
- "type" : "MultiArray"
103
- },
104
- {
105
- "hasShapeFlexibility" : "0",
106
- "isOptional" : "0",
107
- "dataType" : "Float32",
108
- "formattedType" : "MultiArray (Float32 1 Γ— 640 Γ— 1)",
109
- "shortDescription" : "",
110
- "shape" : "[1, 640, 1]",
111
- "name" : "decoder_step",
112
- "type" : "MultiArray"
113
- }
114
- ],
115
- "userDefinedMetadata" : {
116
- "com.github.apple.coremltools.source_dialect" : "TorchScript",
117
- "com.github.apple.coremltools.source" : "torch==2.9.0",
118
- "com.github.apple.coremltools.version" : "8.3.0"
119
- },
120
- "generatedClassName" : "parakeet_joint_decision_single_step",
121
- "method" : "predict"
122
- }
123
- ]
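The metadata above fully pins down the single-step model's contract: two Float32 inputs, encoder_step [1, 512, 1] and decoder_step [1, 640, 1], and five outputs including the 64-wide top_k_ids / top_k_logits. A hedged sketch of driving the compiled model through Core ML in Swift (the path is a placeholder, and the input arrays would be filled from the Encoder and Decoder models):

    import CoreML

    let url = URL(fileURLWithPath: "JointDecisionSingleStep.mlmodelc")  // placeholder path
    let model = try MLModel(contentsOf: url)

    // One encoder frame and one decoder state, shaped per the input schema above.
    let encoderStep = try MLMultiArray(shape: [1, 512, 1], dataType: .float32)
    let decoderStep = try MLMultiArray(shape: [1, 640, 1], dataType: .float32)
    // ... fill encoderStep / decoderStep from the Encoder and Decoder models ...

    let inputs = try MLDictionaryFeatureProvider(dictionary: [
        "encoder_step": MLFeatureValue(multiArray: encoderStep),
        "decoder_step": MLFeatureValue(multiArray: decoderStep)
    ])
    let out = try model.prediction(from: inputs)
    let tokenId = out.featureValue(for: "token_id")!.multiArrayValue![0].int32Value
    let duration = out.featureValue(for: "duration")!.multiArrayValue![0].int32Value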
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/model.mil DELETED
@@ -1,69 +0,0 @@
1
- program(1.0)
2
- [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
3
- {
4
- func main<ios17>(tensor<fp32, [1, 640, 1]> decoder_step, tensor<fp32, [1, 512, 1]> encoder_step) {
5
- tensor<int32, [3]> input_1_perm_0 = const()[name = tensor<string, []>("input_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
6
- tensor<string, []> encoder_step_to_fp16_dtype_0 = const()[name = tensor<string, []>("encoder_step_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
7
- tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
8
- tensor<string, []> decoder_step_to_fp16_dtype_0 = const()[name = tensor<string, []>("decoder_step_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
9
- tensor<fp16, [640, 512]> joint_module_enc_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_weight_to_fp16"), val = tensor<fp16, [640, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
10
- tensor<fp16, [640]> joint_module_enc_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(655488)))];
11
- tensor<fp16, [1, 512, 1]> encoder_step_to_fp16 = cast(dtype = encoder_step_to_fp16_dtype_0, x = encoder_step)[name = tensor<string, []>("cast_9")];
12
- tensor<fp16, [1, 1, 512]> input_1_cast_fp16 = transpose(perm = input_1_perm_0, x = encoder_step_to_fp16)[name = tensor<string, []>("transpose_1")];
13
- tensor<fp16, [1, 1, 640]> linear_0_cast_fp16 = linear(bias = joint_module_enc_bias_to_fp16, weight = joint_module_enc_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("linear_0_cast_fp16")];
14
- tensor<fp16, [640, 640]> joint_module_pred_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_weight_to_fp16"), val = tensor<fp16, [640, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(656832)))];
15
- tensor<fp16, [640]> joint_module_pred_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1476096)))];
16
- tensor<fp16, [1, 640, 1]> decoder_step_to_fp16 = cast(dtype = decoder_step_to_fp16_dtype_0, x = decoder_step)[name = tensor<string, []>("cast_8")];
17
- tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = decoder_step_to_fp16)[name = tensor<string, []>("transpose_0")];
18
- tensor<fp16, [1, 1, 640]> linear_1_cast_fp16 = linear(bias = joint_module_pred_bias_to_fp16, weight = joint_module_pred_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("linear_1_cast_fp16")];
19
- tensor<int32, [1]> var_23_axes_0 = const()[name = tensor<string, []>("op_23_axes_0"), val = tensor<int32, [1]>([2])];
20
- tensor<fp16, [1, 1, 1, 640]> var_23_cast_fp16 = expand_dims(axes = var_23_axes_0, x = linear_0_cast_fp16)[name = tensor<string, []>("op_23_cast_fp16")];
21
- tensor<int32, [1]> var_24_axes_0 = const()[name = tensor<string, []>("op_24_axes_0"), val = tensor<int32, [1]>([1])];
22
- tensor<fp16, [1, 1, 1, 640]> var_24_cast_fp16 = expand_dims(axes = var_24_axes_0, x = linear_1_cast_fp16)[name = tensor<string, []>("op_24_cast_fp16")];
23
- tensor<fp16, [1, 1, 1, 640]> input_5_cast_fp16 = add(x = var_23_cast_fp16, y = var_24_cast_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
24
- tensor<fp16, [1, 1, 1, 640]> input_7_cast_fp16 = relu(x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
25
- tensor<fp16, [1030, 640]> joint_module_joint_net_2_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_weight_to_fp16"), val = tensor<fp16, [1030, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1477440)))];
26
- tensor<fp16, [1030]> joint_module_joint_net_2_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_bias_to_fp16"), val = tensor<fp16, [1030]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2795904)))];
27
- tensor<fp16, [1, 1, 1, 1030]> linear_2_cast_fp16 = linear(bias = joint_module_joint_net_2_bias_to_fp16, weight = joint_module_joint_net_2_weight_to_fp16, x = input_7_cast_fp16)[name = tensor<string, []>("linear_2_cast_fp16")];
28
- tensor<int32, [4]> token_logits_begin_0 = const()[name = tensor<string, []>("token_logits_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
29
- tensor<int32, [4]> token_logits_end_0 = const()[name = tensor<string, []>("token_logits_end_0"), val = tensor<int32, [4]>([1, 1, 1, 1025])];
30
- tensor<bool, [4]> token_logits_end_mask_0 = const()[name = tensor<string, []>("token_logits_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
31
- tensor<fp16, [1, 1, 1, 1025]> token_logits_cast_fp16 = slice_by_index(begin = token_logits_begin_0, end = token_logits_end_0, end_mask = token_logits_end_mask_0, x = linear_2_cast_fp16)[name = tensor<string, []>("token_logits_cast_fp16")];
32
- tensor<int32, [4]> duration_logits_begin_0 = const()[name = tensor<string, []>("duration_logits_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1025])];
33
- tensor<int32, [4]> duration_logits_end_0 = const()[name = tensor<string, []>("duration_logits_end_0"), val = tensor<int32, [4]>([1, 1, 1, 1030])];
34
- tensor<bool, [4]> duration_logits_end_mask_0 = const()[name = tensor<string, []>("duration_logits_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
35
- tensor<fp16, [1, 1, 1, 5]> duration_logits_cast_fp16 = slice_by_index(begin = duration_logits_begin_0, end = duration_logits_end_0, end_mask = duration_logits_end_mask_0, x = linear_2_cast_fp16)[name = tensor<string, []>("duration_logits_cast_fp16")];
36
- tensor<int32, []> var_43_axis_0 = const()[name = tensor<string, []>("op_43_axis_0"), val = tensor<int32, []>(-1)];
37
- tensor<bool, []> var_43_keep_dims_0 = const()[name = tensor<string, []>("op_43_keep_dims_0"), val = tensor<bool, []>(false)];
38
- tensor<string, []> var_43_output_dtype_0 = const()[name = tensor<string, []>("op_43_output_dtype_0"), val = tensor<string, []>("int32")];
39
- tensor<int32, [1, 1, 1]> token_id = reduce_argmax(axis = var_43_axis_0, keep_dims = var_43_keep_dims_0, output_dtype = var_43_output_dtype_0, x = token_logits_cast_fp16)[name = tensor<string, []>("op_43_cast_fp16")];
40
- tensor<int32, []> var_49 = const()[name = tensor<string, []>("op_49"), val = tensor<int32, []>(-1)];
41
- tensor<fp16, [1, 1, 1, 1025]> token_probs_all_cast_fp16 = softmax(axis = var_49, x = token_logits_cast_fp16)[name = tensor<string, []>("token_probs_all_cast_fp16")];
42
- tensor<int32, [1]> var_58_axes_0 = const()[name = tensor<string, []>("op_58_axes_0"), val = tensor<int32, [1]>([-1])];
43
- tensor<int32, [1, 1, 1, 1]> var_58 = expand_dims(axes = var_58_axes_0, x = token_id)[name = tensor<string, []>("op_58")];
44
- tensor<int32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<int32, []>(-1)];
45
- tensor<bool, []> var_61_validate_indices_0 = const()[name = tensor<string, []>("op_61_validate_indices_0"), val = tensor<bool, []>(false)];
46
- tensor<string, []> var_58_to_int16_dtype_0 = const()[name = tensor<string, []>("op_58_to_int16_dtype_0"), val = tensor<string, []>("int16")];
47
- tensor<int16, [1, 1, 1, 1]> var_58_to_int16 = cast(dtype = var_58_to_int16_dtype_0, x = var_58)[name = tensor<string, []>("cast_7")];
48
- tensor<fp16, [1, 1, 1, 1]> var_61_cast_fp16_cast_int16 = gather_along_axis(axis = var_59, indices = var_58_to_int16, validate_indices = var_61_validate_indices_0, x = token_probs_all_cast_fp16)[name = tensor<string, []>("op_61_cast_fp16_cast_int16")];
49
- tensor<int32, [1]> var_63_axes_0 = const()[name = tensor<string, []>("op_63_axes_0"), val = tensor<int32, [1]>([-1])];
50
- tensor<fp16, [1, 1, 1]> var_63_cast_fp16 = squeeze(axes = var_63_axes_0, x = var_61_cast_fp16_cast_int16)[name = tensor<string, []>("op_63_cast_fp16")];
51
- tensor<string, []> var_63_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
52
- tensor<int32, []> var_66_axis_0 = const()[name = tensor<string, []>("op_66_axis_0"), val = tensor<int32, []>(-1)];
53
- tensor<bool, []> var_66_keep_dims_0 = const()[name = tensor<string, []>("op_66_keep_dims_0"), val = tensor<bool, []>(false)];
54
- tensor<string, []> var_66_output_dtype_0 = const()[name = tensor<string, []>("op_66_output_dtype_0"), val = tensor<string, []>("int32")];
55
- tensor<int32, [1, 1, 1]> duration = reduce_argmax(axis = var_66_axis_0, keep_dims = var_66_keep_dims_0, output_dtype = var_66_output_dtype_0, x = duration_logits_cast_fp16)[name = tensor<string, []>("op_66_cast_fp16")];
56
- tensor<int32, []> var_72 = const()[name = tensor<string, []>("op_72"), val = tensor<int32, []>(64)];
57
- tensor<int32, []> var_76_axis_0 = const()[name = tensor<string, []>("op_76_axis_0"), val = tensor<int32, []>(-1)];
58
- tensor<bool, []> var_76_ascending_0 = const()[name = tensor<string, []>("op_76_ascending_0"), val = tensor<bool, []>(false)];
59
- tensor<bool, []> var_76_sort_0 = const()[name = tensor<string, []>("op_76_sort_0"), val = tensor<bool, []>(true)];
60
- tensor<bool, []> var_76_return_indices_0 = const()[name = tensor<string, []>("op_76_return_indices_0"), val = tensor<bool, []>(true)];
61
- tensor<string, []> var_76_cast_fp16_cast_int16_output_indices_dtype_0 = const()[name = tensor<string, []>("op_76_cast_fp16_cast_int16_output_indices_dtype_0"), val = tensor<string, []>("uint16")];
62
- tensor<fp16, [1, 1, 1, 64]> var_76_cast_fp16_cast_int16_0, tensor<uint16, [1, 1, 1, 64]> var_76_cast_fp16_cast_int16_1 = topk(ascending = var_76_ascending_0, axis = var_76_axis_0, k = var_72, output_indices_dtype = var_76_cast_fp16_cast_int16_output_indices_dtype_0, return_indices = var_76_return_indices_0, sort = var_76_sort_0, x = token_logits_cast_fp16)[name = tensor<string, []>("op_76_cast_fp16_cast_int16")];
63
- tensor<string, []> var_76_cast_fp16_cast_int16_1_to_int32_dtype_0 = const()[name = tensor<string, []>("op_76_cast_fp16_cast_int16_1_to_int32_dtype_0"), val = tensor<string, []>("int32")];
64
- tensor<string, []> var_76_cast_fp16_0_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_76_cast_fp16_0_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
65
- tensor<fp32, [1, 1, 1, 64]> top_k_logits = cast(dtype = var_76_cast_fp16_0_to_fp32_dtype_0, x = var_76_cast_fp16_cast_int16_0)[name = tensor<string, []>("cast_4")];
66
- tensor<int32, [1, 1, 1, 64]> top_k_ids = cast(dtype = var_76_cast_fp16_cast_int16_1_to_int32_dtype_0, x = var_76_cast_fp16_cast_int16_1)[name = tensor<string, []>("cast_5")];
67
- tensor<fp32, [1, 1, 1]> token_prob = cast(dtype = var_63_cast_fp16_to_fp32_dtype_0, x = var_63_cast_fp16)[name = tensor<string, []>("cast_6")];
68
- } -> (token_id, token_prob, duration, top_k_ids, top_k_logits);
69
- }
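Relative to the batch program, this single-step variant adds one thing: a sorted top-64 over the token logits (the topk op at op_76), exported as top_k_ids / top_k_logits so candidates can be rescored outside the model. The same computation in plain Swift, as a hypothetical helper:

    // Top-k over token logits, as the topk op above computes it: the 64 highest
    // logits with their token ids, sorted descending. Hypothetical helper.
    func topK(_ tokenLogits: [Float], k: Int = 64) -> [(id: Int, logit: Float)] {
        tokenLogits.enumerated()
            .sorted { $0.element > $1.element }
            .prefix(k)
            .map { (id: $0.offset, logit: $0.element) }
    }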
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/weights/weight.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b3f771cb65b190f1873e39629676ed79b65a8361522f451b37bdba8b1106e6ff
3
- size 2798028
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/analytics/coremldata.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a1ac15543fbb9301fba5f018b147e44d767479dec352aaa91dfe7bcf65949693
3
- size 243
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/coremldata.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:4940877938cc1b6d8830bbdd68ac8a49377cc57d75b61308883da5235b6a1914
3
- size 439
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/metadata.json DELETED
@@ -1,112 +0,0 @@
1
- [
2
- {
3
- "metadataOutputVersion" : "3.0",
4
- "shortDescription" : "Parakeet 110M preprocessor (15 s window)",
5
- "outputSchema" : [
6
- {
7
- "hasShapeFlexibility" : "0",
8
- "isOptional" : "0",
9
- "dataType" : "Float32",
10
- "formattedType" : "MultiArray (Float32)",
11
- "shortDescription" : "",
12
- "shape" : "[]",
13
- "name" : "mel_features",
14
- "type" : "MultiArray"
15
- },
16
- {
17
- "hasShapeFlexibility" : "0",
18
- "isOptional" : "0",
19
- "dataType" : "Int32",
20
- "formattedType" : "MultiArray (Int32 1)",
21
- "shortDescription" : "",
22
- "shape" : "[1]",
23
- "name" : "mel_length",
24
- "type" : "MultiArray"
25
- }
26
- ],
27
- "storagePrecision" : "Float16",
28
- "modelParameters" : [
29
-
30
- ],
31
- "author" : "Fluid Inference",
32
- "specificationVersion" : 8,
33
- "mlProgramOperationTypeHistogram" : {
34
- "Range1d" : 3,
35
- "Ios17.equal" : 1,
36
- "Ios17.notEqual" : 1,
37
- "Ios17.reshape" : 2,
38
- "Identity" : 1,
39
- "Ios17.matmul" : 1,
40
- "Select" : 6,
41
- "Ios17.expandDims" : 12,
42
- "Ios17.add" : 3,
43
- "Tile" : 2,
44
- "Ios17.sliceByIndex" : 3,
45
- "Ios16.reduceSum" : 4,
46
- "Shape" : 4,
47
- "Ios17.gather" : 4,
48
- "Ios17.logicalNot" : 1,
49
- "Pad" : 1,
50
- "Ios17.log" : 1,
51
- "Ios17.less" : 2,
52
- "Ios17.sub" : 4,
53
- "Ios17.conv" : 2,
54
- "Ios17.pow" : 2,
55
- "Ios17.cast" : 10,
56
- "Ios17.concat" : 3,
57
- "Stack" : 1,
58
- "Ios17.floorDiv" : 1,
59
- "Ios17.realDiv" : 4,
60
- "Ios17.sqrt" : 1,
61
- "Ios17.greaterEqual" : 1,
62
- "Ios17.mul" : 1
63
- },
64
- "computePrecision" : "Mixed (Float16, Float32, Int16, Int32, UInt16)",
65
- "isUpdatable" : "0",
66
- "stateSchema" : [
67
-
68
- ],
69
- "availability" : {
70
- "macOS" : "14.0",
71
- "tvOS" : "17.0",
72
- "visionOS" : "1.0",
73
- "watchOS" : "10.0",
74
- "iOS" : "17.0",
75
- "macCatalyst" : "17.0"
76
- },
77
- "modelType" : {
78
- "name" : "MLModelType_mlProgram"
79
- },
80
- "inputSchema" : [
81
- {
82
- "dataType" : "Float32",
83
- "hasShapeFlexibility" : "1",
84
- "isOptional" : "0",
85
- "shapeFlexibility" : "1 Γ— 1...240000",
86
- "shapeRange" : "[[1, 1], [1, 240000]]",
87
- "formattedType" : "MultiArray (Float32 1 Γ— 1)",
88
- "type" : "MultiArray",
89
- "shape" : "[1, 1]",
90
- "name" : "audio",
91
- "shortDescription" : ""
92
- },
93
- {
94
- "hasShapeFlexibility" : "0",
95
- "isOptional" : "0",
96
- "dataType" : "Int32",
97
- "formattedType" : "MultiArray (Int32 1)",
98
- "shortDescription" : "",
99
- "shape" : "[1]",
100
- "name" : "audio_length",
101
- "type" : "MultiArray"
102
- }
103
- ],
104
- "userDefinedMetadata" : {
105
- "com.github.apple.coremltools.source_dialect" : "TorchScript",
106
- "com.github.apple.coremltools.source" : "torch==2.9.0",
107
- "com.github.apple.coremltools.version" : "8.3.0"
108
- },
109
- "generatedClassName" : "parakeet_preprocessor",
110
- "method" : "predict"
111
- }
112
- ]
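Unlike the other components, the preprocessor's audio input is shape-flexible: 1 Γ— 1...240000 samples (up to 15 s at 16 kHz), paired with an explicit audio_length. A sketch of packing a mono Float buffer for it, using the input names from the schema above (`samples` is a placeholder buffer):

    import CoreML

    let samples: [Float] = Array(repeating: 0, count: 16_000)  // placeholder: 1 s of silence
    let n = samples.count                                      // must be <= 240000 (15 s at 16 kHz)
    let audio = try MLMultiArray(shape: [1, NSNumber(value: n)], dataType: .float32)
    for i in 0..<n { audio[i] = NSNumber(value: samples[i]) }
    let audioLength = try MLMultiArray(shape: [1], dataType: .int32)
    audioLength[0] = NSNumber(value: Int32(n))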
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/model.mil DELETED
@@ -1,191 +0,0 @@
1
- program(1.0)
2
- [buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
3
- {
4
- func main<ios17>(tensor<fp32, [1, ?]> audio, tensor<int32, [1]> audio_length) [FlexibleShapeInformation = tuple<tuple<tensor<string, []>, dict<tensor<string, []>, tensor<int32, [?]>>>, tuple<tensor<string, []>, dict<tensor<string, []>, list<tensor<int32, [2]>, ?>>>>((("DefaultShapes", {{"audio", [1, 1]}}), ("RangeDims", {{"audio", [[1, 1], [1, 240000]]}})))] {
5
- tensor<int32, []> var_9 = const()[name = tensor<string, []>("op_9"), val = tensor<int32, []>(1)];
6
- tensor<int32, []> var_10 = const()[name = tensor<string, []>("op_10"), val = tensor<int32, []>(160)];
7
- tensor<int32, []> var_12 = const()[name = tensor<string, []>("op_12"), val = tensor<int32, []>(0)];
8
- tensor<int32, []> var_34 = const()[name = tensor<string, []>("op_34"), val = tensor<int32, []>(512)];
9
- tensor<int32, [1]> var_35 = add(x = audio_length, y = var_34)[name = tensor<string, []>("op_35")];
10
- tensor<int32, []> var_36 = const()[name = tensor<string, []>("op_36"), val = tensor<int32, []>(512)];
11
- tensor<int32, [1]> var_37 = sub(x = var_35, y = var_36)[name = tensor<string, []>("op_37")];
12
- tensor<int32, [1]> floor_div_0 = floor_div(x = var_37, y = var_10)[name = tensor<string, []>("floor_div_0")];
13
- tensor<bool, [1]> var_40 = equal(x = audio_length, y = var_12)[name = tensor<string, []>("op_40")];
14
- tensor<int32, [1]> var_41 = const()[name = tensor<string, []>("op_41"), val = tensor<int32, [1]>([0])];
15
- tensor<int32, [1]> mel_length = select(a = var_41, b = floor_div_0, cond = var_40)[name = tensor<string, []>("seq_len")];
16
- tensor<string, []> audio_to_fp16_dtype_0 = const()[name = tensor<string, []>("audio_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
17
- tensor<fp16, [1, ?]> audio_to_fp16 = cast(dtype = audio_to_fp16_dtype_0, x = audio)[name = tensor<string, []>("cast_27")];
18
- tensor<int32, [2]> var_43_shape_cast_fp16 = shape(x = audio_to_fp16)[name = tensor<string, []>("op_43_shape_cast_fp16")];
19
- tensor<int32, []> gather_0_axis_0 = const()[name = tensor<string, []>("gather_0_axis_0"), val = tensor<int32, []>(0)];
20
- tensor<int32, []> gather_0_batch_dims_0 = const()[name = tensor<string, []>("gather_0_batch_dims_0"), val = tensor<int32, []>(0)];
21
- tensor<bool, []> gather_0_validate_indices_0 = const()[name = tensor<string, []>("gather_0_validate_indices_0"), val = tensor<bool, []>(false)];
22
- tensor<string, []> var_43_shape_cast_fp16_to_int16_dtype_0 = const()[name = tensor<string, []>("op_43_shape_cast_fp16_to_int16_dtype_0"), val = tensor<string, []>("int16")];
23
- tensor<uint16, []> select_0_to_uint16 = const()[name = tensor<string, []>("select_0_to_uint16"), val = tensor<uint16, []>(1)];
24
- tensor<int16, [2]> var_43_shape_cast_fp16_to_int16 = cast(dtype = var_43_shape_cast_fp16_to_int16_dtype_0, x = var_43_shape_cast_fp16)[name = tensor<string, []>("cast_26")];
25
- tensor<int16, []> gather_0_cast_uint16 = gather(axis = gather_0_axis_0, batch_dims = gather_0_batch_dims_0, indices = select_0_to_uint16, validate_indices = gather_0_validate_indices_0, x = var_43_shape_cast_fp16_to_int16)[name = tensor<string, []>("gather_0_cast_uint16")];
26
- tensor<string, []> gather_0_cast_uint16_to_int32_dtype_0 = const()[name = tensor<string, []>("gather_0_cast_uint16_to_int32_dtype_0"), val = tensor<string, []>("int32")];
27
- tensor<int32, []> const_0 = const()[name = tensor<string, []>("const_0"), val = tensor<int32, []>(0)];
28
- tensor<int32, []> const_1 = const()[name = tensor<string, []>("const_1"), val = tensor<int32, []>(1)];
29
- tensor<int32, []> gather_0_cast_uint16_to_int32 = cast(dtype = gather_0_cast_uint16_to_int32_dtype_0, x = gather_0_cast_uint16)[name = tensor<string, []>("cast_25")];
30
- tensor<int32, [?]> var_44 = range_1d(end = gather_0_cast_uint16_to_int32, start = const_0, step = const_1)[name = tensor<string, []>("op_44")];
31
- tensor<int32, [1]> var_45_axes_0 = const()[name = tensor<string, []>("op_45_axes_0"), val = tensor<int32, [1]>([0])];
32
- tensor<int32, [1, ?]> var_45 = expand_dims(axes = var_45_axes_0, x = var_44)[name = tensor<string, []>("op_45")];
33
- tensor<int32, [1]> var_46_axes_0 = const()[name = tensor<string, []>("op_46_axes_0"), val = tensor<int32, [1]>([1])];
34
- tensor<int32, [1, 1]> var_46 = expand_dims(axes = var_46_axes_0, x = audio_length)[name = tensor<string, []>("op_46")];
35
- tensor<bool, [1, ?]> timemask = less(x = var_45, y = var_46)[name = tensor<string, []>("timemask")];
36
- tensor<int32, [2]> var_49_begin_0 = const()[name = tensor<string, []>("op_49_begin_0"), val = tensor<int32, [2]>([0, 0])];
37
- tensor<int32, [2]> var_49_end_0 = const()[name = tensor<string, []>("op_49_end_0"), val = tensor<int32, [2]>([1, 1])];
38
- tensor<bool, [2]> var_49_end_mask_0 = const()[name = tensor<string, []>("op_49_end_mask_0"), val = tensor<bool, [2]>([true, false])];
39
- tensor<bool, [2]> var_49_squeeze_mask_0 = const()[name = tensor<string, []>("op_49_squeeze_mask_0"), val = tensor<bool, [2]>([false, true])];
40
- tensor<fp16, [1]> var_49_cast_fp16 = slice_by_index(begin = var_49_begin_0, end = var_49_end_0, end_mask = var_49_end_mask_0, squeeze_mask = var_49_squeeze_mask_0, x = audio_to_fp16)[name = tensor<string, []>("op_49_cast_fp16")];
41
- tensor<int32, [1]> var_50_axes_0 = const()[name = tensor<string, []>("op_50_axes_0"), val = tensor<int32, [1]>([1])];
42
- tensor<fp16, [1, 1]> var_50_cast_fp16 = expand_dims(axes = var_50_axes_0, x = var_49_cast_fp16)[name = tensor<string, []>("op_50_cast_fp16")];
43
- tensor<int32, [2]> var_52_begin_0 = const()[name = tensor<string, []>("op_52_begin_0"), val = tensor<int32, [2]>([0, 1])];
44
- tensor<int32, [2]> var_52_end_0 = const()[name = tensor<string, []>("op_52_end_0"), val = tensor<int32, [2]>([1, 0])];
45
- tensor<bool, [2]> var_52_end_mask_0 = const()[name = tensor<string, []>("op_52_end_mask_0"), val = tensor<bool, [2]>([true, true])];
46
- tensor<fp16, [1, ?]> var_52_cast_fp16 = slice_by_index(begin = var_52_begin_0, end = var_52_end_0, end_mask = var_52_end_mask_0, x = audio_to_fp16)[name = tensor<string, []>("op_52_cast_fp16")];
47
- tensor<int32, [2]> var_54_begin_0 = const()[name = tensor<string, []>("op_54_begin_0"), val = tensor<int32, [2]>([0, 0])];
48
- tensor<int32, [2]> var_54_end_0 = const()[name = tensor<string, []>("op_54_end_0"), val = tensor<int32, [2]>([1, -1])];
49
- tensor<bool, [2]> var_54_end_mask_0 = const()[name = tensor<string, []>("op_54_end_mask_0"), val = tensor<bool, [2]>([true, false])];
50
- tensor<fp16, [1, ?]> var_54_cast_fp16 = slice_by_index(begin = var_54_begin_0, end = var_54_end_0, end_mask = var_54_end_mask_0, x = audio_to_fp16)[name = tensor<string, []>("op_54_cast_fp16")];
51
- tensor<fp16, []> var_55_to_fp16 = const()[name = tensor<string, []>("op_55_to_fp16"), val = tensor<fp16, []>(0x1.f0cp-1)];
52
- tensor<fp16, [1, ?]> var_56_cast_fp16 = mul(x = var_54_cast_fp16, y = var_55_to_fp16)[name = tensor<string, []>("op_56_cast_fp16")];
53
- tensor<fp16, [1, ?]> var_57_cast_fp16 = sub(x = var_52_cast_fp16, y = var_56_cast_fp16)[name = tensor<string, []>("op_57_cast_fp16")];
54
- tensor<bool, []> x_3_interleave_0 = const()[name = tensor<string, []>("x_3_interleave_0"), val = tensor<bool, []>(false)];
55
- tensor<fp16, [1, ?]> x_3_cast_fp16 = concat(axis = var_9, interleave = x_3_interleave_0, values = (var_50_cast_fp16, var_57_cast_fp16))[name = tensor<string, []>("x_3_cast_fp16")];
56
- tensor<bool, [1, ?]> var_60 = logical_not(x = timemask)[name = tensor<string, []>("op_60")];
57
- tensor<fp16, []> var_16_to_fp16 = const()[name = tensor<string, []>("op_16_to_fp16"), val = tensor<fp16, []>(0x0p+0)];
58
- tensor<fp16, [1, ?]> input_1_cast_fp16 = select(a = var_16_to_fp16, b = x_3_cast_fp16, cond = var_60)[name = tensor<string, []>("input_1_cast_fp16")];
59
- tensor<int32, [3]> concat_1x = const()[name = tensor<string, []>("concat_1x"), val = tensor<int32, [3]>([1, 1, -1])];
60
- tensor<fp16, [1, 1, ?]> input_3_cast_fp16 = reshape(shape = concat_1x, x = input_1_cast_fp16)[name = tensor<string, []>("input_3_cast_fp16")];
61
- tensor<int32, [6]> input_5_pad_0 = const()[name = tensor<string, []>("input_5_pad_0"), val = tensor<int32, [6]>([0, 0, 0, 0, 256, 256])];
62
- tensor<string, []> input_5_mode_0 = const()[name = tensor<string, []>("input_5_mode_0"), val = tensor<string, []>("constant")];
63
- tensor<fp16, []> const_3_to_fp16 = const()[name = tensor<string, []>("const_3_to_fp16"), val = tensor<fp16, []>(0x0p+0)];
64
- tensor<fp16, [1, 1, ?]> input_5_cast_fp16 = pad(constant_val = const_3_to_fp16, mode = input_5_mode_0, pad = input_5_pad_0, x = input_3_cast_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
65
- tensor<int32, [2]> concat_2x = const()[name = tensor<string, []>("concat_2x"), val = tensor<int32, [2]>([1, -1])];
66
- tensor<fp16, [1, ?]> input_cast_fp16 = reshape(shape = concat_2x, x = input_5_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
67
- tensor<int32, [1]> expand_dims_3 = const()[name = tensor<string, []>("expand_dims_3"), val = tensor<int32, [1]>([160])];
68
- tensor<int32, [1]> expand_dims_4_axes_0 = const()[name = tensor<string, []>("expand_dims_4_axes_0"), val = tensor<int32, [1]>([1])];
69
- tensor<fp16, [1, 1, ?]> expand_dims_4_cast_fp16 = expand_dims(axes = expand_dims_4_axes_0, x = input_cast_fp16)[name = tensor<string, []>("expand_dims_4_cast_fp16")];
70
- tensor<string, []> conv_0_pad_type_0 = const()[name = tensor<string, []>("conv_0_pad_type_0"), val = tensor<string, []>("valid")];
71
- tensor<int32, [2]> conv_0_pad_0 = const()[name = tensor<string, []>("conv_0_pad_0"), val = tensor<int32, [2]>([0, 0])];
72
- tensor<int32, [1]> conv_0_dilations_0 = const()[name = tensor<string, []>("conv_0_dilations_0"), val = tensor<int32, [1]>([1])];
73
- tensor<int32, []> conv_0_groups_0 = const()[name = tensor<string, []>("conv_0_groups_0"), val = tensor<int32, []>(1)];
74
- tensor<fp16, [257, 1, 512]> expand_dims_1_to_fp16 = const()[name = tensor<string, []>("expand_dims_1_to_fp16"), val = tensor<fp16, [257, 1, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
75
- tensor<fp16, [1, 257, ?]> conv_0_cast_fp16 = conv(dilations = conv_0_dilations_0, groups = conv_0_groups_0, pad = conv_0_pad_0, pad_type = conv_0_pad_type_0, strides = expand_dims_3, weight = expand_dims_1_to_fp16, x = expand_dims_4_cast_fp16)[name = tensor<string, []>("conv_0_cast_fp16")];
76
- tensor<string, []> conv_1_pad_type_0 = const()[name = tensor<string, []>("conv_1_pad_type_0"), val = tensor<string, []>("valid")];
77
- tensor<int32, [2]> conv_1_pad_0 = const()[name = tensor<string, []>("conv_1_pad_0"), val = tensor<int32, [2]>([0, 0])];
78
- tensor<int32, [1]> conv_1_dilations_0 = const()[name = tensor<string, []>("conv_1_dilations_0"), val = tensor<int32, [1]>([1])];
79
- tensor<int32, []> conv_1_groups_0 = const()[name = tensor<string, []>("conv_1_groups_0"), val = tensor<int32, []>(1)];
80
- tensor<fp16, [257, 1, 512]> expand_dims_2_to_fp16 = const()[name = tensor<string, []>("expand_dims_2_to_fp16"), val = tensor<fp16, [257, 1, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(263296)))];
81
- tensor<fp16, [1, 257, ?]> conv_1_cast_fp16 = conv(dilations = conv_1_dilations_0, groups = conv_1_groups_0, pad = conv_1_pad_0, pad_type = conv_1_pad_type_0, strides = expand_dims_3, weight = expand_dims_2_to_fp16, x = expand_dims_4_cast_fp16)[name = tensor<string, []>("conv_1_cast_fp16")];
82
- tensor<int32, []> stack_0_axis_0 = const()[name = tensor<string, []>("stack_0_axis_0"), val = tensor<int32, []>(-1)];
83
- tensor<fp16, [1, 257, ?, 2]> stack_0_cast_fp16 = stack(axis = stack_0_axis_0, values = (conv_0_cast_fp16, conv_1_cast_fp16))[name = tensor<string, []>("stack_0_cast_fp16")];
84
- tensor<fp16, []> var_19_promoted_to_fp16 = const()[name = tensor<string, []>("op_19_promoted_to_fp16"), val = tensor<fp16, []>(0x1p+1)];
85
- tensor<fp16, [1, 257, ?, 2]> var_75_cast_fp16 = pow(x = stack_0_cast_fp16, y = var_19_promoted_to_fp16)[name = tensor<string, []>("op_75_cast_fp16")];
86
- tensor<int32, [1]> var_77_axes_0 = const()[name = tensor<string, []>("op_77_axes_0"), val = tensor<int32, [1]>([-1])];
87
- tensor<bool, []> var_77_keep_dims_0 = const()[name = tensor<string, []>("op_77_keep_dims_0"), val = tensor<bool, []>(false)];
88
- tensor<fp16, [1, 257, ?]> var_77_cast_fp16 = reduce_sum(axes = var_77_axes_0, keep_dims = var_77_keep_dims_0, x = var_75_cast_fp16)[name = tensor<string, []>("op_77_cast_fp16")];
89
- tensor<fp16, [1, 257, ?]> x_11_cast_fp16 = identity(x = var_77_cast_fp16)[name = tensor<string, []>("x_11_cast_fp16")];
90
- tensor<bool, []> x_13_transpose_x_0 = const()[name = tensor<string, []>("x_13_transpose_x_0"), val = tensor<bool, []>(false)];
91
- tensor<bool, []> x_13_transpose_y_0 = const()[name = tensor<string, []>("x_13_transpose_y_0"), val = tensor<bool, []>(false)];
92
- tensor<fp16, [1, 80, 257]> const_4_to_fp16 = const()[name = tensor<string, []>("const_4_to_fp16"), val = tensor<fp16, [1, 80, 257]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(526528)))];
93
- tensor<fp16, [1, 80, ?]> x_13_cast_fp16 = matmul(transpose_x = x_13_transpose_x_0, transpose_y = x_13_transpose_y_0, x = const_4_to_fp16, y = x_11_cast_fp16)[name = tensor<string, []>("x_13_cast_fp16")];
94
- tensor<fp16, []> var_84_to_fp16 = const()[name = tensor<string, []>("op_84_to_fp16"), val = tensor<fp16, []>(0x1p-24)];
95
- tensor<fp16, [1, 80, ?]> var_85_cast_fp16 = add(x = x_13_cast_fp16, y = var_84_to_fp16)[name = tensor<string, []>("op_85_cast_fp16")];
96
- tensor<fp32, []> x_15_epsilon_0 = const()[name = tensor<string, []>("x_15_epsilon_0"), val = tensor<fp32, []>(0x1p-149)];
97
- tensor<fp16, [1, 80, ?]> x_15_cast_fp16 = log(epsilon = x_15_epsilon_0, x = var_85_cast_fp16)[name = tensor<string, []>("x_15_cast_fp16")];
98
- tensor<int32, [3]> var_87_shape_cast_fp16 = shape(x = x_15_cast_fp16)[name = tensor<string, []>("op_87_shape_cast_fp16")];
99
- tensor<int32, []> gather_5 = const()[name = tensor<string, []>("gather_5"), val = tensor<int32, []>(1)];
100
- tensor<int32, []> gather_6_axis_0 = const()[name = tensor<string, []>("gather_6_axis_0"), val = tensor<int32, []>(0)];
101
- tensor<int32, []> gather_6_batch_dims_0 = const()[name = tensor<string, []>("gather_6_batch_dims_0"), val = tensor<int32, []>(0)];
102
- tensor<bool, []> gather_6_validate_indices_0 = const()[name = tensor<string, []>("gather_6_validate_indices_0"), val = tensor<bool, []>(false)];
103
- tensor<string, []> var_87_shape_cast_fp16_to_uint16_dtype_0 = const()[name = tensor<string, []>("op_87_shape_cast_fp16_to_uint16_dtype_0"), val = tensor<string, []>("uint16")];
104
- tensor<uint16, []> select_6_to_uint16 = const()[name = tensor<string, []>("select_6_to_uint16"), val = tensor<uint16, []>(2)];
105
- tensor<uint16, [3]> var_87_shape_cast_fp16_to_uint16 = cast(dtype = var_87_shape_cast_fp16_to_uint16_dtype_0, x = var_87_shape_cast_fp16)[name = tensor<string, []>("cast_24")];
106
- tensor<uint16, []> gather_6_cast_uint16 = gather(axis = gather_6_axis_0, batch_dims = gather_6_batch_dims_0, indices = select_6_to_uint16, validate_indices = gather_6_validate_indices_0, x = var_87_shape_cast_fp16_to_uint16)[name = tensor<string, []>("gather_6_cast_uint16")];
107
- tensor<string, []> gather_6_cast_uint16_to_int32_dtype_0 = const()[name = tensor<string, []>("gather_6_cast_uint16_to_int32_dtype_0"), val = tensor<string, []>("int32")];
108
- tensor<int32, []> const_5 = const()[name = tensor<string, []>("const_5"), val = tensor<int32, []>(0)];
109
- tensor<int32, []> const_6 = const()[name = tensor<string, []>("const_6"), val = tensor<int32, []>(1)];
110
- tensor<int32, []> gather_6_cast_uint16_to_int32 = cast(dtype = gather_6_cast_uint16_to_int32_dtype_0, x = gather_6_cast_uint16)[name = tensor<string, []>("cast_23")];
111
- tensor<int32, [?]> var_89 = range_1d(end = gather_6_cast_uint16_to_int32, start = const_5, step = const_6)[name = tensor<string, []>("op_89")];
112
- tensor<int32, [1]> var_90_axes_0 = const()[name = tensor<string, []>("op_90_axes_0"), val = tensor<int32, [1]>([0])];
113
- tensor<int32, [1, ?]> var_90 = expand_dims(axes = var_90_axes_0, x = var_89)[name = tensor<string, []>("op_90")];
114
- tensor<int32, []> concat_3_axis_0 = const()[name = tensor<string, []>("concat_3_axis_0"), val = tensor<int32, []>(0)];
115
- tensor<bool, []> concat_3_interleave_0 = const()[name = tensor<string, []>("concat_3_interleave_0"), val = tensor<bool, []>(false)];
116
- tensor<int32, [2]> concat_3 = concat(axis = concat_3_axis_0, interleave = concat_3_interleave_0, values = (gather_5, gather_6_cast_uint16_to_int32))[name = tensor<string, []>("concat_3")];
117
- tensor<int32, [2]> shape_8 = shape(x = var_90)[name = tensor<string, []>("shape_8")];
118
- tensor<int32, [2]> real_div_0 = real_div(x = concat_3, y = shape_8)[name = tensor<string, []>("real_div_0")];
119
- tensor<int32, [?, ?]> time_steps = tile(reps = real_div_0, x = var_90)[name = tensor<string, []>("time_steps")];
120
- tensor<int32, [1]> var_93_axes_0 = const()[name = tensor<string, []>("op_93_axes_0"), val = tensor<int32, [1]>([1])];
121
- tensor<int32, [1, 1]> var_93 = expand_dims(axes = var_93_axes_0, x = mel_length)[name = tensor<string, []>("op_93")];
122
- tensor<bool, [?, ?]> valid_mask = less(x = time_steps, y = var_93)[name = tensor<string, []>("valid_mask")];
123
- tensor<int32, [1]> var_95_axes_0 = const()[name = tensor<string, []>("op_95_axes_0"), val = tensor<int32, [1]>([1])];
124
- tensor<bool, [?, 1, ?]> var_95 = expand_dims(axes = var_95_axes_0, x = valid_mask)[name = tensor<string, []>("op_95")];
125
- tensor<fp16, [1, 80, ?]> var_96_cast_fp16 = select(a = x_15_cast_fp16, b = var_16_to_fp16, cond = var_95)[name = tensor<string, []>("op_96_cast_fp16")];
126
- tensor<int32, [1]> x_mean_numerator_axes_0 = const()[name = tensor<string, []>("x_mean_numerator_axes_0"), val = tensor<int32, [1]>([2])];
127
- tensor<bool, []> x_mean_numerator_keep_dims_0 = const()[name = tensor<string, []>("x_mean_numerator_keep_dims_0"), val = tensor<bool, []>(false)];
128
- tensor<fp16, [1, 80]> x_mean_numerator_cast_fp16 = reduce_sum(axes = x_mean_numerator_axes_0, keep_dims = x_mean_numerator_keep_dims_0, x = var_96_cast_fp16)[name = tensor<string, []>("x_mean_numerator_cast_fp16")];
129
- tensor<int32, [1]> x_mean_denominator_axes_0 = const()[name = tensor<string, []>("x_mean_denominator_axes_0"), val = tensor<int32, [1]>([1])];
130
- tensor<bool, []> x_mean_denominator_keep_dims_0 = const()[name = tensor<string, []>("x_mean_denominator_keep_dims_0"), val = tensor<bool, []>(false)];
131
- tensor<string, []> cast_6_to_fp16_dtype_0 = const()[name = tensor<string, []>("cast_6_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
132
- tensor<fp16, [?, ?]> valid_mask_to_fp16 = cast(dtype = cast_6_to_fp16_dtype_0, x = valid_mask)[name = tensor<string, []>("cast_22")];
133
- tensor<fp16, [?]> x_mean_denominator_cast_fp16 = reduce_sum(axes = x_mean_denominator_axes_0, keep_dims = x_mean_denominator_keep_dims_0, x = valid_mask_to_fp16)[name = tensor<string, []>("x_mean_denominator_cast_fp16")];
134
- tensor<int32, [1]> var_101_axes_0 = const()[name = tensor<string, []>("op_101_axes_0"), val = tensor<int32, [1]>([1])];
135
- tensor<fp16, [?, 1]> var_101_cast_fp16 = expand_dims(axes = var_101_axes_0, x = x_mean_denominator_cast_fp16)[name = tensor<string, []>("op_101_cast_fp16")];
136
- tensor<fp16, [?, 80]> x_mean_cast_fp16 = real_div(x = x_mean_numerator_cast_fp16, y = var_101_cast_fp16)[name = tensor<string, []>("x_mean_cast_fp16")];
137
- tensor<int32, [1]> var_104_axes_0 = const()[name = tensor<string, []>("op_104_axes_0"), val = tensor<int32, [1]>([2])];
138
- tensor<fp16, [?, 80, 1]> var_104_cast_fp16 = expand_dims(axes = var_104_axes_0, x = x_mean_cast_fp16)[name = tensor<string, []>("op_104_cast_fp16")];
139
- tensor<fp16, [?, 80, ?]> var_105_cast_fp16 = sub(x = x_15_cast_fp16, y = var_104_cast_fp16)[name = tensor<string, []>("op_105_cast_fp16")];
140
- tensor<fp16, [?, 80, ?]> var_106_cast_fp16 = select(a = var_105_cast_fp16, b = var_16_to_fp16, cond = var_95)[name = tensor<string, []>("op_106_cast_fp16")];
141
- tensor<fp16, []> var_19_promoted_1_to_fp16 = const()[name = tensor<string, []>("op_19_promoted_1_to_fp16"), val = tensor<fp16, []>(0x1p+1)];
142
- tensor<fp16, [?, 80, ?]> var_107_cast_fp16 = pow(x = var_106_cast_fp16, y = var_19_promoted_1_to_fp16)[name = tensor<string, []>("op_107_cast_fp16")];
143
- tensor<int32, [1]> var_109_axes_0 = const()[name = tensor<string, []>("op_109_axes_0"), val = tensor<int32, [1]>([2])];
144
- tensor<bool, []> var_109_keep_dims_0 = const()[name = tensor<string, []>("op_109_keep_dims_0"), val = tensor<bool, []>(false)];
145
- tensor<fp16, [?, 80]> var_109_cast_fp16 = reduce_sum(axes = var_109_axes_0, keep_dims = var_109_keep_dims_0, x = var_107_cast_fp16)[name = tensor<string, []>("op_109_cast_fp16")];
146
- tensor<fp16, []> var_111_to_fp16 = const()[name = tensor<string, []>("op_111_to_fp16"), val = tensor<fp16, []>(0x1p+0)];
147
- tensor<fp16, [?, 1]> var_112_cast_fp16 = sub(x = var_101_cast_fp16, y = var_111_to_fp16)[name = tensor<string, []>("op_112_cast_fp16")];
148
- tensor<fp16, [?, 80]> var_113_cast_fp16 = real_div(x = var_109_cast_fp16, y = var_112_cast_fp16)[name = tensor<string, []>("op_113_cast_fp16")];
149
- tensor<fp16, [?, 80]> x_std_1_cast_fp16 = sqrt(x = var_113_cast_fp16)[name = tensor<string, []>("x_std_1_cast_fp16")];
150
- tensor<bool, [?, 80]> var_115_cast_fp16 = not_equal(x = x_std_1_cast_fp16, y = x_std_1_cast_fp16)[name = tensor<string, []>("op_115_cast_fp16")];
151
- tensor<fp16, [?, 80]> x_std_3_cast_fp16 = select(a = var_16_to_fp16, b = x_std_1_cast_fp16, cond = var_115_cast_fp16)[name = tensor<string, []>("x_std_3_cast_fp16")];
152
- tensor<fp16, []> var_25_to_fp16 = const()[name = tensor<string, []>("op_25_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
153
- tensor<fp16, [?, 80]> x_std_cast_fp16 = add(x = x_std_3_cast_fp16, y = var_25_to_fp16)[name = tensor<string, []>("x_std_cast_fp16")];
154
- tensor<int32, [1]> var_120_axes_0 = const()[name = tensor<string, []>("op_120_axes_0"), val = tensor<int32, [1]>([2])];
155
- tensor<fp16, [?, 80, 1]> var_120_cast_fp16 = expand_dims(axes = var_120_axes_0, x = x_std_cast_fp16)[name = tensor<string, []>("op_120_cast_fp16")];
156
- tensor<fp16, [?, 80, ?]> x_cast_fp16 = real_div(x = var_105_cast_fp16, y = var_120_cast_fp16)[name = tensor<string, []>("x_cast_fp16")];
157
- tensor<int32, [3]> var_122_shape_cast_fp16 = shape(x = x_cast_fp16)[name = tensor<string, []>("op_122_shape_cast_fp16")];
158
- tensor<int32, []> gather_7_axis_0 = const()[name = tensor<string, []>("gather_7_axis_0"), val = tensor<int32, []>(0)];
159
- tensor<int32, []> gather_7_batch_dims_0 = const()[name = tensor<string, []>("gather_7_batch_dims_0"), val = tensor<int32, []>(0)];
160
- tensor<bool, []> gather_7_validate_indices_0 = const()[name = tensor<string, []>("gather_7_validate_indices_0"), val = tensor<bool, []>(false)];
161
- tensor<string, []> var_122_shape_cast_fp16_to_uint16_dtype_0 = const()[name = tensor<string, []>("op_122_shape_cast_fp16_to_uint16_dtype_0"), val = tensor<string, []>("uint16")];
162
- tensor<uint16, []> select_7_to_uint16 = const()[name = tensor<string, []>("select_7_to_uint16"), val = tensor<uint16, []>(2)];
163
- tensor<uint16, [3]> var_122_shape_cast_fp16_to_uint16 = cast(dtype = var_122_shape_cast_fp16_to_uint16_dtype_0, x = var_122_shape_cast_fp16)[name = tensor<string, []>("cast_21")];
164
- tensor<uint16, []> gather_7_cast_uint16 = gather(axis = gather_7_axis_0, batch_dims = gather_7_batch_dims_0, indices = select_7_to_uint16, validate_indices = gather_7_validate_indices_0, x = var_122_shape_cast_fp16_to_uint16)[name = tensor<string, []>("gather_7_cast_uint16")];
165
- tensor<string, []> gather_7_cast_uint16_to_int32_dtype_0 = const()[name = tensor<string, []>("gather_7_cast_uint16_to_int32_dtype_0"), val = tensor<string, []>("int32")];
166
- tensor<int32, []> const_7 = const()[name = tensor<string, []>("const_7"), val = tensor<int32, []>(0)];
167
- tensor<int32, []> const_8 = const()[name = tensor<string, []>("const_8"), val = tensor<int32, []>(1)];
168
- tensor<int32, []> gather_7_cast_uint16_to_int32 = cast(dtype = gather_7_cast_uint16_to_int32_dtype_0, x = gather_7_cast_uint16)[name = tensor<string, []>("cast_20")];
169
- tensor<int32, [?]> mask_1 = range_1d(end = gather_7_cast_uint16_to_int32, start = const_7, step = const_8)[name = tensor<string, []>("mask_1")];
170
- tensor<int32, []> gather_8_axis_0 = const()[name = tensor<string, []>("gather_8_axis_0"), val = tensor<int32, []>(0)];
171
- tensor<int32, []> gather_8_batch_dims_0 = const()[name = tensor<string, []>("gather_8_batch_dims_0"), val = tensor<int32, []>(0)];
172
- tensor<bool, []> gather_8_validate_indices_0 = const()[name = tensor<string, []>("gather_8_validate_indices_0"), val = tensor<bool, []>(false)];
173
- tensor<uint16, []> select_8_to_uint16 = const()[name = tensor<string, []>("select_8_to_uint16"), val = tensor<uint16, []>(0)];
174
- tensor<uint16, []> gather_8_cast_uint16 = gather(axis = gather_8_axis_0, batch_dims = gather_8_batch_dims_0, indices = select_8_to_uint16, validate_indices = gather_8_validate_indices_0, x = var_122_shape_cast_fp16_to_uint16)[name = tensor<string, []>("gather_8_cast_uint16")];
175
- tensor<string, []> gather_8_cast_uint16_to_int32_dtype_0 = const()[name = tensor<string, []>("gather_8_cast_uint16_to_int32_dtype_0"), val = tensor<string, []>("int32")];
176
- tensor<int32, []> concat_4_axis_0 = const()[name = tensor<string, []>("concat_4_axis_0"), val = tensor<int32, []>(0)];
177
- tensor<bool, []> concat_4_interleave_0 = const()[name = tensor<string, []>("concat_4_interleave_0"), val = tensor<bool, []>(false)];
178
- tensor<int32, []> gather_8_cast_uint16_to_int32 = cast(dtype = gather_8_cast_uint16_to_int32_dtype_0, x = gather_8_cast_uint16)[name = tensor<string, []>("cast_19")];
179
- tensor<int32, [2]> concat_4 = concat(axis = concat_4_axis_0, interleave = concat_4_interleave_0, values = (gather_8_cast_uint16_to_int32, var_9))[name = tensor<string, []>("concat_4")];
180
- tensor<int32, [1]> expand_dims_0_axes_0 = const()[name = tensor<string, []>("expand_dims_0_axes_0"), val = tensor<int32, [1]>([0])];
181
- tensor<int32, [1, ?]> expand_dims_0 = expand_dims(axes = expand_dims_0_axes_0, x = mask_1)[name = tensor<string, []>("expand_dims_0")];
182
- tensor<int32, [?, ?]> var_126 = tile(reps = concat_4, x = expand_dims_0)[name = tensor<string, []>("op_126")];
183
- tensor<bool, [?, ?]> mask = greater_equal(x = var_126, y = var_93)[name = tensor<string, []>("mask")];
184
- tensor<int32, [1]> var_129_axes_0 = const()[name = tensor<string, []>("op_129_axes_0"), val = tensor<int32, [1]>([1])];
185
- tensor<bool, [?, 1, ?]> var_129 = expand_dims(axes = var_129_axes_0, x = mask)[name = tensor<string, []>("op_129")];
186
- tensor<fp16, []> cast_15_to_fp16 = const()[name = tensor<string, []>("cast_15_to_fp16"), val = tensor<fp16, []>(0x0p+0)];
187
- tensor<fp16, [?, 80, ?]> processed_signal_cast_fp16 = select(a = cast_15_to_fp16, b = x_cast_fp16, cond = var_129)[name = tensor<string, []>("processed_signal_cast_fp16")];
188
- tensor<string, []> processed_signal_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("processed_signal_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
189
- tensor<fp32, [?, 80, ?]> mel_features = cast(dtype = processed_signal_cast_fp16_to_fp32_dtype_0, x = processed_signal_cast_fp16)[name = tensor<string, []>("cast_18")];
190
- } -> (mel_features, mel_length);
191
- }
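Reading through the program: the signal is pre-emphasized (coefficient 0x1.f0cp-1 β‰ˆ 0.97), zero-padded by 256 samples on each side, framed by two 257 Γ— 512 convolutions with stride 160 (the real and imaginary STFT banks), squared and summed into a power spectrum, projected through an 80 Γ— 257 mel matrix, log-compressed, then mean/variance-normalized per utterance over the valid frames only. The frame geometry reduces to a one-liner; a sketch of the arithmetic, not library code:

    // Frame geometry implied by the MIL above: 512-sample window, 160-sample hop
    // (10 ms at 16 kHz). mel_length = floor((audioLength + 512 - 512) / 160).
    func melFrameCount(audioLength: Int) -> Int {
        audioLength / 160
    }
    // melFrameCount(audioLength: 240000) == 1500 valid frames; the padded conv
    // itself emits 1501 columns, matching the [1, 80, 1501] mel shape recorded
    // in the pipeline metadata further down.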
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/weights/weight.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c062338de852a26607ce4101f74e6895de3a4134a57b07232bd72bfc6f1d7f1a
3
- size 567712
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/metadata.json DELETED
@@ -1,247 +0,0 @@
1
- {
2
- "model_id": "nvidia/parakeet-tdt_ctc-110m",
3
- "model_type": "hybrid_rnnt_ctc",
4
- "sample_rate": 16000,
5
- "max_audio_seconds": 15.0,
6
- "max_audio_samples": 240000,
7
- "max_symbol_steps": 1,
8
- "vocab_size": 1024,
9
- "joint_extra_outputs": 5,
10
- "encoder_dim": 512,
11
- "decoder_dim": 640,
12
- "decoder_hidden": 640,
13
- "decoder_layers": 1,
14
- "blank_id": 1024,
15
- "checkpoint": {
16
- "type": "pretrained",
17
- "model_id": "nvidia/parakeet-tdt_ctc-110m"
18
- },
19
- "coreml": {
20
- "compute_units": "CPU_ONLY",
21
- "compute_precision": "FLOAT32"
22
- },
23
- "components": {
24
- "preprocessor": {
25
- "inputs": {
26
- "audio_signal": [
27
- 1,
28
- 240000
29
- ],
30
- "audio_length": [
31
- 1
32
- ]
33
- },
34
- "outputs": {
35
- "mel": [
36
- 1,
37
- 80,
38
- 1501
39
- ],
40
- "mel_length": [
41
- 1
42
- ]
43
- },
44
- "path": "parakeet_preprocessor.mlpackage"
45
- },
46
- "encoder": {
47
- "inputs": {
48
- "mel": [
49
- 1,
50
- 80,
51
- 1501
52
- ],
53
- "mel_length": [
54
- 1
55
- ]
56
- },
57
- "outputs": {
58
- "encoder": [
59
- 1,
60
- 512,
61
- 188
62
- ],
63
- "encoder_length": [
64
- 1
65
- ]
66
- },
67
- "path": "parakeet_encoder.mlpackage"
68
- },
69
- "ctc_head": {
70
- "inputs": {
71
- "encoder": [
72
- 1,
73
- 512,
74
- 188
75
- ]
76
- },
77
- "outputs": {
78
- "log_probs": [
79
- 1,
80
- 188,
81
- 1025
82
- ]
83
- },
84
- "path": "parakeet_ctc_head.mlpackage"
85
- },
86
- "mel_encoder": {
87
- "inputs": {
88
- "audio_signal": [
89
- 1,
90
- 240000
91
- ],
92
- "audio_length": [
93
- 1
94
- ]
95
- },
96
- "outputs": {
97
- "encoder": [
98
- 1,
99
- 512,
100
- 188
101
- ],
102
- "encoder_length": [
103
- 1
104
- ]
105
- },
106
- "path": "parakeet_mel_encoder.mlpackage"
107
- },
108
- "decoder": {
109
- "inputs": {
110
- "targets": [
111
- 1,
112
- 1
113
- ],
114
- "target_length": [
115
- 1
116
- ],
117
- "h_in": [
118
- 1,
119
- 1,
120
- 640
121
- ],
122
- "c_in": [
123
- 1,
124
- 1,
125
- 640
126
- ]
127
- },
128
- "outputs": {
129
- "decoder": [
130
- 1,
131
- 640,
132
- 1
133
- ],
134
- "h_out": [
135
- 1,
136
- 1,
137
- 640
138
- ],
139
- "c_out": [
140
- 1,
141
- 1,
142
- 640
143
- ]
144
- },
145
- "path": "parakeet_decoder.mlpackage"
146
- },
147
- "joint": {
148
- "inputs": {
149
- "encoder": [
150
- 1,
151
- 512,
152
- 188
153
- ],
154
- "decoder": [
155
- 1,
156
- 640,
157
- 1
158
- ]
159
- },
160
- "outputs": {
161
- "logits": [
162
- 1,
163
- 188,
164
- 1,
165
- 1030
166
- ]
167
- },
168
- "path": "parakeet_joint.mlpackage"
169
- },
170
- "joint_decision": {
171
- "inputs": {
172
- "encoder": [
173
- 1,
174
- 512,
175
- 188
176
- ],
177
- "decoder": [
178
- 1,
179
- 640,
180
- 1
181
- ]
182
- },
183
- "outputs": {
184
- "token_id": [
185
- 1,
186
- 188,
187
- 1
188
- ],
189
- "token_prob": [
190
- 1,
191
- 188,
192
- 1
193
- ],
194
- "duration": [
195
- 1,
196
- 188,
197
- 1
198
- ]
199
- },
200
- "path": "parakeet_joint_decision.mlpackage"
201
- },
202
- "joint_decision_single_step": {
203
- "inputs": {
204
- "encoder_step": [
205
- 1,
206
- 512,
207
- 1
208
- ],
209
- "decoder_step": [
210
- 1,
211
- 640,
212
- 1
213
- ]
214
- },
215
- "outputs": {
216
- "token_id": [
217
- 1,
218
- 1,
219
- 1
220
- ],
221
- "token_prob": [
222
- 1,
223
- 1,
224
- 1
225
- ],
226
- "duration": [
227
- 1,
228
- 1,
229
- 1
230
- ],
231
- "top_k_ids": [
232
- 1,
233
- 1,
234
- 1,
235
- 64
236
- ],
237
- "top_k_logits": [
238
- 1,
239
- 1,
240
- 1,
241
- 64
242
- ]
243
- },
244
- "path": "parakeet_joint_decision_single_step.mlpackage"
245
- }
246
- }
247
- }
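This top-level manifest records the whole hybrid pipeline: preprocessor β†’ encoder β†’ (CTC head, or decoder + joint for the TDT path), with blank_id 1024 and max_symbol_steps 1. The greedy TDT loop those shapes imply is sketched below with closures standing in for the .mlpackage components. Two simplifications to flag: the real decoder also threads its LSTM state through h_in/c_in and h_out/c_out, and the reading that duration bin k means "advance k encoder frames" is an assumption.

    // Greedy TDT decoding over the component graph above. Model calls are
    // abstracted as closures so the sketch stays self-contained.
    func greedyDecode(
        encoderLength: Int,
        blankId: Int32 = 1024,                                       // "blank_id" above
        decoderStep: (Int32) -> [Float],                             // last token -> decoder output [640]
        jointStep: (Int, [Float]) -> (token: Int32, duration: Int)   // (frame, decoder) -> decision
    ) -> [Int32] {
        var tokens: [Int32] = []
        var last = blankId
        var t = 0
        while t < encoderLength {
            let (tok, dur) = jointStep(t, decoderStep(last))
            if tok != blankId { tokens.append(tok); last = tok }
            t += max(dur, 1)  // max_symbol_steps = 1, so always advance at least one frame
        }
        return tokens
    }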
 
 
 
 
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/vocab.json DELETED
@@ -1 +0,0 @@
1
- {"0": "<unk>", "1": "▁t", "2": "▁th", "3": "▁a", "4": "in", "5": "re", "6": "▁the", "7": "▁w", "8": "▁s", "9": "▁o", "10": "er", "11": "ou", "12": "at", "13": "nd", "14": "it", "15": "▁h", "16": "▁c", "17": "▁b", "18": "is", "19": "en", "20": "on", "21": "ing", "22": "▁f", "23": "▁to", "24": "▁m", "25": "es", "26": "▁p", "27": "or", "28": "an", "29": "▁d", "30": "ll", "31": "▁I", "32": "ed", "33": "▁and", "34": "▁l", "35": "▁of", "36": "▁in", "37": "▁y", "38": "ar", "39": "▁g", "40": "▁you", "41": "as", "42": "om", "43": "▁n", "44": "ve", "45": "▁that", "46": "le", "47": "ic", "48": "us", "49": "ow", "50": "et", "51": "al", "52": "▁e", "53": "ut", "54": "▁it", "55": "ot", "56": "▁be", "57": "▁T", "58": "ion", "59": "▁is", "60": "▁wh", "61": "▁re", "62": "▁on", "63": "▁we", "64": "ent", "65": "▁A", "66": "ay", "67": "▁ha", "68": "▁Th", "69": "id", "70": "▁S", "71": "ac", "72": "gh", "73": "ver", "74": "ke", "75": "▁for", "76": "im", "77": "ly", "78": "ur", "79": "ld", "80": "▁he", "81": "▁st", "82": "all", "83": "ro", "84": "st", "85": "se", "86": "ct", "87": "ith", "88": "ir", "89": "am", "90": "▁this", "91": "if", "92": "▁W", "93": "oo", "94": "ri", "95": "▁was", "96": "ght", "97": "▁u", "98": "▁with", "99": "ad", "100": "ch", "101": "▁se", "102": "▁k", "103": "▁an", "104": "▁The", "105": "▁li", "106": "▁do", "107": "▁B", "108": "▁have", "109": "▁as", "110": "th", "111": "▁are", "112": "▁sh", "113": "ust", "114": "ce", "115": "ally", "116": "ill", "117": "▁H", "118": "▁j", "119": "ter", "120": "▁go", "121": "▁And", "122": "ation", "123": "▁C", "124": "▁so", "125": "ome", "126": "▁not", "127": "op", "128": "il", "129": "ore", "130": "▁ne", "131": "▁can", "132": "▁me", "133": "▁at", "134": "ould", "135": "ant", "136": "▁M", "137": "▁like", "138": "ere", "139": "▁they", "140": "ra", "141": "ers", "142": "▁ab", "143": "▁de", "144": "▁kn", "145": "ge", "146": "▁Y", "147": "▁ch", "148": "ul", "149": "pp", "150": "▁or", "151": "▁al", "152": "▁con", "153": "▁com", "154": "ess", "155": "▁su", "156": "out", "157": "▁your", "158": "▁So", "159": "ate", "160": "▁one", "161": "▁all", "162": "▁ex", "163": "est", "164": "▁fr", "165": "▁just", "166": "▁pro", "167": "▁know", "168": "▁O", "169": "ain", "170": "▁but", "171": "ol", "172": "ive", "173": "▁v", "174": "use", "175": "very", "176": "art", "177": "qu", "178": "▁my", "179": "el", "180": "▁N", "181": "nt", "182": "▁It", "183": "▁what", "184": "ab", "185": "▁P", "186": "▁wor", "187": "▁out", "188": "▁there", "189": "▁up", "190": "um", "191": "▁from", "192": "pe", "193": "▁tw", "194": "▁r", "195": "and", "196": "ight", "197": "ort", "198": "un", "199": "▁L", "200": "ist", "201": "▁about", "202": "ide", "203": "ig", "204": "ake", "205": "▁D", "206": "em", "207": "os", "208": "king", "209": "rou", "210": "ind", "211": "our", "212": "res", "213": "▁We", "214": "▁get", "215": "▁E", "216": "▁G", "217": "ack", "218": "▁le", "219": "ity", "220": "od", "221": "▁F", "222": "ard", "223": "▁pl", "224": "▁our", "225": "▁int", "226": "ment", "227": "▁will", "228": "ies", "229": "▁by", "230": "ink", "231": "ca", "232": "▁if", "233": "red", "234": "her", "235": "ie", "236": "▁us", "237": "▁some", "238": "▁don", "239": "ven", "240": "ood", "241": "ast", "242": "▁R", "243": "▁his", "244": "▁tim", "245": "▁tr", "246": "▁more", "247": "ich", "248": "ous", "249": "ame", "250": "▁going", "251": "▁had", "252": "▁them", "253": "ook", "254": "▁pe", "255": "▁Wh", "256": "▁You", "257": "▁But", "258": "ine", "259": "▁here", "260": "▁would", "261": "cause", "262": "right", 
"263": "so", "264": "ost", "265": "ure", "266": "▁has", "267": "ect", "268": "▁think", "269": "▁fe", "270": "ong", "271": "▁see", "272": "▁when", "273": "▁who", "274": "▁were", "275": "▁really", "276": "▁their", "277": "▁want", "278": "one", "279": "ople", "280": "▁then", "281": "▁time", "282": "▁sa", "283": "ap", "284": "▁te", "285": "▁He", "286": "▁ye", "287": "ck", "288": "▁her", "289": "▁thing", "290": "▁right", "291": "▁which", "292": "itt", "293": "ice", "294": "act", "295": "▁people", "296": "ty", "297": "▁two", "298": "▁J", "299": "▁im", "300": "ther", "301": "ci", "302": "ose", "303": "▁cl", "304": "▁qu", "305": "▁man", "306": "▁also", "307": "ree", "308": "▁en", "309": "ud", "310": "▁how", "311": "reat", "312": "ak", "313": "hing", "314": "ag", "315": "▁any", "316": "ff", "317": "ace", "318": "per", "319": "▁because", "320": "▁very", "321": "own", "322": "▁ad", "323": "▁act", "324": "▁been", "325": "▁now", "326": "▁ag", "327": "▁into", "328": "▁comp", "329": "ars", "330": "ions", "331": "are", "332": "ite", "333": "iv", "334": "▁these", "335": "ays", "336": "ep", "337": "▁This", "338": "▁she", "339": "ans", "340": "ah", "341": "een", "342": "▁over", "343": "ry", "344": "▁lo", "345": "age", "346": "▁pr", "347": "▁sp", "348": "ue", "349": "▁co", "350": "ick", "351": "ber", "352": "▁did", "353": "ip", "354": "ach", "355": "▁back", "356": "▁no", "357": "▁cont", "358": "▁other", "359": "▁every", "360": "pt", "361": "▁need", "362": "▁him", "363": "▁U", "364": "▁In", "365": "▁work", "366": "irst", "367": "▁part", "368": "▁look", "369": "ittle", "370": "ble", "371": "iz", "372": "▁un", "373": "▁make", "374": "omet", "375": "nder", "376": "ish", "377": "na", "378": "▁little", "379": "▁off", "380": "▁than", "381": "▁got", "382": "ually", "383": "▁per", "384": "▁good", "385": "▁way", "386": "▁could", "387": "▁ac", "388": "▁imp", "389": "able", "390": "▁where", "391": "iff", "392": "▁That", "393": "▁res", "394": "ount", "395": "pl", "396": "ance", "397": "▁first", "398": "▁ro", "399": "▁pre", "400": "ass", "401": "▁say", "402": "int", "403": "ated", "404": "ire", "405": "uch", "406": "ase", "407": "▁somet", "408": "ound", "409": "▁down", "410": "▁diff", "411": "sel", "412": "▁gu", "413": "▁am", "414": "ress", "415": "▁lot", "416": "ence", "417": "▁dis", "418": "orm", "419": "ix", "420": "▁po", "421": "ving", "422": "enty", "423": "▁K", "424": "▁spe", "425": "und", "426": "he", "427": "▁much", "428": "▁ar", "429": "round", "430": "▁app", "431": "co", "432": "ark", "433": "▁new", "434": "ater", "435": "ult", "436": "end", "437": "▁even", "438": "▁start", "439": "ations", "440": "rough", "441": "ile", "442": "fter", "443": "▁well", "444": "be", "445": "▁They", "446": "▁three", "447": "ign", "448": "ild", "449": "▁said", "450": "ough", "451": "ang", "452": "▁too", "453": "ade", "454": "▁bl", "455": "ens", "456": "▁inc", "457": "ia", "458": "▁those", "459": "▁mo", "460": "▁take", "461": "▁through", "462": "▁fl", "463": "▁kind", "464": "▁things", "465": "▁bet", "466": "▁only", "467": "▁St", "468": "▁let", "469": "cess", "470": "▁Ch", "471": "ary", "472": "vel", "473": "▁If", "474": "xt", "475": "other", "476": "av", "477": "ical", "478": "ord", "479": "▁again", "480": "▁something", "481": "onna", "482": "fore", "483": "▁may", "484": "ting", "485": "▁bu", "486": "▁differe", "487": "urn", "488": "▁gonna", "489": "▁does", "490": "uct", "491": "og", "492": "▁twenty", "493": "▁gr", "494": "▁Ye", "495": "wn", "496": "▁should", "497": "▁comm", "498": "ition", "499": "▁under", "500": "▁hel", "501": "ory", 
"502": "▁fo", "503": "▁use", "504": "igh", "505": "ife", "506": "▁actually", "507": "▁tal", "508": "▁call", "509": "ents", "510": "ious", "511": "ull", "512": "▁There", "513": "▁Yeah", "514": "▁most", "515": "▁ke", "516": "ors", "517": "ved", "518": "ys", "519": "▁sc", "520": "▁happ", "521": "ope", "522": "▁help", "523": "atch", "524": "▁What", "525": "▁rem", "526": "ple", "527": "▁Now", "528": "▁br", "529": "ool", "530": "oth", "531": "▁four", "532": "self", "533": "▁str", "534": "ne", "535": "thing", "536": "▁put", "537": "ial", "538": "▁great", "539": "ail", "540": "ub", "541": "ning", "542": "▁sm", "543": "▁feel", "544": "▁five", "545": "ody", "546": "undred", "547": "iss", "548": "ank", "549": "get", "550": "aking", "551": "▁many", "552": "▁hundred", "553": "▁years", "554": "▁being", "555": "▁come", "556": "▁mean", "557": "ily", "558": "▁different", "559": "▁after", "560": "▁ser", "561": "▁show", "562": "form", "563": "ful", "564": "oy", "565": "▁six", "566": "▁vide", "567": "▁V", "568": "▁its", "569": "▁point", "570": "▁day", "571": "▁des", "572": "ons", "573": "▁bit", "574": "▁bel", "575": "▁before", "576": "▁aw", "577": "▁end", "578": "▁Oh", "579": "▁still", "580": "ath", "581": "▁long", "582": "▁'", "583": "ise", "584": "ob", "585": "day", "586": "▁add", "587": "ft", "588": "ves", "589": "ces", "590": "ady", "591": "▁cr", "592": "▁around", "593": "▁try", "594": "les", "595": "vers", "596": "kay", "597": "ian", "598": "ates", "599": "▁find", "600": "ward", "601": "▁As", "602": "▁eight", "603": "lic", "604": "▁same", "605": "▁pos", "606": "▁em", "607": "▁made", "608": "▁supp", "609": "▁life", "610": "▁Be", "611": "pect", "612": "▁dec", "613": "▁play", "614": "ange", "615": "▁att", "616": "▁pers", "617": "ways", "618": "▁high", "619": "▁hand", "620": "▁next", "621": "▁cons", "622": "▁own", "623": "▁inv", "624": "ower", "625": "▁ind", "626": "ert", "627": "ng", "628": "ave", "629": "▁year", "630": "▁big", "631": "ating", "632": "▁world", "633": "▁rel", "634": "▁sure", "635": "▁tra", "636": "ew", "637": "ered", "638": "▁fin", "639": "▁Well", "640": "▁sl", "641": "▁doing", "642": "bs", "643": "▁set", "644": "▁rec", "645": "ual", "646": "cial", "647": "▁ph", "648": "erm", "649": "▁love", "650": "ph", "651": "▁real", "652": "▁last", "653": "ict", "654": "▁bo", "655": "▁ra", "656": "ible", "657": "▁wr", "658": "mer", "659": "▁count", "660": "ities", "661": "▁always", "662": "inet", "663": "ments", "664": "uc", "665": "▁might", "666": "▁inter", "667": "▁video", "668": "gin", "669": "▁tell", "670": "▁never", "671": "vent", "672": "▁import", "673": "ied", "674": "▁sy", "675": "▁How", "676": "ically", "677": "ought", "678": "▁thir", "679": "▁rep", "680": "ks", "681": "ib", "682": "▁fam", "683": "ject", "684": "▁bas", "685": "▁She", "686": "▁give", "687": "akes", "688": "▁ninet", "689": "▁reg", "690": "▁min", "691": "▁op", "692": "▁def", "693": "▁didn", "694": "te", "695": "▁cour", "696": "▁why", "697": "▁ent", "698": "▁place", "699": "▁ins", "700": "▁car", "701": "ather", "702": "▁person", "703": "ular", "704": "▁inst", "705": "▁prod", "706": "lect", "707": "▁Al", "708": "▁today", "709": "▁bec", "710": "▁sur", "711": "▁All", "712": "▁another", "713": "▁bus", "714": "▁keep", "715": "ell", "716": "ese", "717": "riend", "718": "▁quest", "719": "▁talk", "720": "als", "721": "ings", "722": "▁mon", "723": "cond", "724": "old", "725": "▁acc", "726": "▁la", "727": "▁num", "728": "ident", "729": "▁che", "730": "iness", "731": "▁turn", "732": "▁ear", "733": "▁No", "734": "ousand", "735": "▁better", 
"736": "ific", "737": "▁loo", "738": "▁gl", "739": "oc", "740": "▁important", "741": "ited", "742": "▁An", "743": "▁thousand", "744": "ility", "745": "llow", "746": "▁used", "747": "▁gen", "748": "▁sim", "749": "li", "750": "▁happen", "751": "▁Un", "752": "▁Let", "753": "air", "754": "ock", "755": "ably", "756": "gg", "757": "▁watch", "758": "▁For", "759": "▁sw", "760": "ren", "761": "ute", "762": "ever", "763": "▁pol", "764": "▁sch", "765": "▁When", "766": "▁such", "767": "▁fif", "768": "▁home", "769": "▁cle", "770": "▁contin", "771": "ouse", "772": "▁friend", "773": "uring", "774": "▁Okay", "775": "gr", "776": "▁able", "777": "▁stud", "778": "▁eff", "779": "hip", "780": "body", "781": "▁top", "782": "ness", "783": "▁exper", "784": "▁pret", "785": "▁both", "786": "▁done", "787": "cri", "788": "▁mark", "789": "▁while", "790": "▁old", "791": "ros", "792": "ont", "793": "▁second", "794": "ative", "795": "▁thought", "796": "▁best", "797": "▁found", "798": "iew", "799": "▁belie", "800": "▁each", "801": "erest", "802": "▁tri", "803": "▁eas", "804": "▁ca", "805": "▁fact", "806": "▁care", "807": "▁fun", "808": "atter", "809": "ures", "810": "▁head", "811": "▁lear", "812": "▁water", "813": "▁hard", "814": "▁few", "815": "▁side", "816": "ween", "817": "▁exp", "818": "▁away", "819": "its", "820": "▁ext", "821": "lud", "822": "▁run", "823": "▁trans", "824": "ince", "825": "▁sk", "826": "▁open", "827": "cus", "828": "▁between", "829": "▁called", "830": "▁wee", "831": "▁pretty", "832": "ason", "833": "▁far", "834": "ember", "835": "omm", "836": "▁interest", "837": "any", "838": "ner", "839": "uff", "840": "▁pres", "841": "▁cur", "842": "▁child", "843": "ee", "844": "▁toget", "845": "▁together", "846": "olog", "847": "▁God", "848": "ond", "849": "▁char", "850": "▁looking", "851": "stem", "852": "az", "853": "cent", "854": "▁ob", "855": "▁ass", "856": "land", "857": "▁doesn", "858": "▁business", "859": "▁course", "860": "▁ten", "861": "ps", "862": "arch", "863": "ced", "864": "ms", "865": "ize", "866": "nce", "867": "▁ref", "868": "▁name", "869": "ross", "870": "▁grow", "871": "oney", "872": "▁went", "873": "ics", "874": "teen", "875": "▁cou", "876": "▁prob", "877": "▁ret", "878": "▁guys", "879": "▁came", "880": "ash", "881": "led", "882": "▁Eur", "883": "ues", "884": "▁ide", "885": "gan", "886": "▁everything", "887": "▁getting", "888": "▁ask", "889": "▁cor", "890": "▁build", "891": "▁sign", "892": "▁small", "893": "uck", "894": "▁el", "895": "▁col", "896": "▁Is", "897": "ational", "898": "stand", "899": "cy", "900": "▁conf", "901": "der", "902": "▁bre", "903": "▁cap", "904": "▁mod", "905": "ets", "906": "ike", "907": "▁number", "908": "▁comple", "909": "ertain", "910": "▁ever", "911": "▁coll", "912": "▁hum", "913": "▁Europe", "914": "▁cre", "915": "▁met", "916": "▁exam", "917": "▁move", "918": "▁pass", "919": "▁left", "920": "▁system", "921": "▁includ", "922": "▁Thank", "923": "cept", "924": "▁wom", "925": "▁product", "926": "ten", "927": "▁rest", "928": "▁probably", "929": "▁dri", "930": "▁Do", "931": "▁gener", "932": "▁anything", "933": "▁lar", "934": "▁My", "935": "▁school", "936": "▁lead", "937": "▁sub", "938": "▁ty", "939": "▁plan", "940": "▁seem", "941": "▁whole", "942": "irect", "943": "▁light", "944": "▁must", "945": "▁mom", "946": "▁opp", "947": "▁support", "948": "▁family", "949": "ices", "950": "amp", "951": "▁proble", "952": "▁dr", "953": "ready", "954": "▁using", "955": "ense", "956": "▁prov", "957": "ush", "958": "ax", "959": "▁power", "960": "▁Re", "961": "alth", "962": "▁ev", "963": 
"▁stand", "964": "οΏ½οΏ½war", "965": "ts", "966": "▁", "967": "e", "968": "t", "969": "o", "970": "a", "971": "n", "972": "i", "973": "s", "974": "r", "975": "h", "976": "l", "977": "d", "978": "u", "979": "c", "980": "m", "981": "y", "982": "g", "983": "w", "984": "f", "985": "p", "986": ".", "987": "b", "988": ",", "989": "v", "990": "k", "991": "'", "992": "I", "993": "T", "994": "A", "995": "S", "996": "x", "997": "W", "998": "j", "999": "B", "1000": "C", "1001": "H", "1002": "?", "1003": "M", "1004": "O", "1005": "Y", "1006": "N", "1007": "P", "1008": "E", "1009": "q", "1010": "L", "1011": "D", "1012": "z", "1013": "G", "1014": "F", "1015": "R", "1016": "!", "1017": "J", "1018": "U", "1019": "K", "1020": "V", "1021": "Q", "1022": "Z", "1023": "X"}
 
 
convert/parakeet-tdt-ctc-110m/coreml/convert-parakeet.py DELETED
@@ -1,697 +0,0 @@
- #!/usr/bin/env python3
- """CLI for exporting Parakeet TDT-CTC 110M Hybrid components to CoreML."""
- from __future__ import annotations
-
- import json
- from dataclasses import asdict
- from pathlib import Path
- from typing import Dict, Optional, Tuple
-
- import coremltools as ct
- import numpy as np
- import soundfile as sf
- import torch
- import typer
-
- import nemo.collections.asr as nemo_asr
-
- from individual_components import (
-     CTCHeadWrapper,
-     DecoderWrapper,
-     EncoderWrapper,
-     ExportSettings,
-     JointWrapper,
-     JointDecisionWrapper,
-     JointDecisionSingleStep,
-     PreprocessorWrapper,
-     MelEncoderWrapper,
-     _coreml_convert,
- )
-
- DEFAULT_MODEL_ID = "nvidia/parakeet-tdt_ctc-110m"
- AUTHOR = "Fluid Inference"
-
-
- def _compute_length(seconds: float, sample_rate: int) -> int:
-     return int(round(seconds * sample_rate))
-
-
- def _prepare_audio(
-     validation_audio: Optional[Path],
-     sample_rate: int,
-     max_samples: int,
-     seed: Optional[int],
- ) -> torch.Tensor:
-     if validation_audio is None:
-         if seed is not None:
-             torch.manual_seed(seed)
-         audio = torch.randn(1, max_samples, dtype=torch.float32)
-         return audio
-
-     data, sr = sf.read(str(validation_audio), dtype="float32")
-     if sr != sample_rate:
-         raise typer.BadParameter(
-             f"Validation audio sample rate {sr} does not match model rate {sample_rate}"
-         )
-
-     if data.ndim > 1:
-         data = data[:, 0]
-
-     if data.size == 0:
-         raise typer.BadParameter("Validation audio is empty")
-
-     if data.size < max_samples:
-         pad_width = max_samples - data.size
-         data = np.pad(data, (0, pad_width))
-     elif data.size > max_samples:
-         data = data[:max_samples]
-
-     audio = torch.from_numpy(data).unsqueeze(0).to(dtype=torch.float32)
-     return audio
-
-
- def _save_mlpackage(model: ct.models.MLModel, path: Path, description: str) -> None:
-     # Ensure iOS 17+ target for MLProgram ops and ANE readiness
-     try:
-         model.minimum_deployment_target = ct.target.iOS17
-     except Exception:
-         pass
-     model.short_description = description
-     model.author = AUTHOR
-     path.parent.mkdir(parents=True, exist_ok=True)
-     model.save(str(path))
-
-
- def _tensor_shape(tensor: torch.Tensor) -> Tuple[int, ...]:
-     return tuple(int(dim) for dim in tensor.shape)
-
-
- def _parse_compute_units(name: str) -> ct.ComputeUnit:
-     """Parse a human-friendly compute units string into ct.ComputeUnit.
-
-     Accepted (case-insensitive): ALL, CPU_ONLY, CPU_AND_GPU, CPU_AND_NE.
-     """
-     normalized = str(name).strip().upper()
-     mapping = {
-         "ALL": ct.ComputeUnit.ALL,
-         "CPU_ONLY": ct.ComputeUnit.CPU_ONLY,
-         "CPU_AND_GPU": ct.ComputeUnit.CPU_AND_GPU,
-         "CPU_AND_NE": ct.ComputeUnit.CPU_AND_NE,
-         "CPU_AND_NEURALENGINE": ct.ComputeUnit.CPU_AND_NE,
-     }
-     if normalized not in mapping:
-         raise typer.BadParameter(
-             f"Unknown compute units '{name}'. Choose from: " + ", ".join(mapping.keys())
-         )
-     return mapping[normalized]
-
-
- def _parse_compute_precision(name: Optional[str]) -> Optional[ct.precision]:
-     """Parse compute precision string into ct.precision or None.
-
-     Accepted (case-insensitive): FLOAT32, FLOAT16. If None/empty, returns None (tool default).
-     """
-     if name is None:
-         return None
-     normalized = str(name).strip().upper()
-     if normalized == "":
-         return None
-     mapping = {
-         "FLOAT32": ct.precision.FLOAT32,
-         "FLOAT16": ct.precision.FLOAT16,
-     }
-     if normalized not in mapping:
-         raise typer.BadParameter(
-             f"Unknown compute precision '{name}'. Choose from: " + ", ".join(mapping.keys())
-         )
-     return mapping[normalized]
-
-
- # Fixed export choices: CPU_ONLY + FP32, min target iOS17
-
-
- app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False)
-
-
- @app.command()
- def convert(
-     nemo_path: Optional[Path] = typer.Option(
-         None,
-         "--nemo-path",
-         exists=True,
-         resolve_path=True,
-         help="Path to parakeet-tdt_ctc-110m .nemo checkpoint (skip to auto-download)",
-     ),
-     model_id: str = typer.Option(
-         DEFAULT_MODEL_ID,
-         "--model-id",
-         help="Model identifier to download when --nemo-path is omitted",
-     ),
-     output_dir: Path = typer.Option(Path("parakeet_110m_coreml"), help="Directory where mlpackages and metadata will be written"),
-     preprocessor_cu: str = typer.Option(
-         "CPU_ONLY",
-         "--preprocessor-cu",
-         help="Compute units for preprocessor (default CPU_ONLY)",
-     ),
-     mel_encoder_cu: str = typer.Option(
-         "CPU_ONLY",
-         "--mel-encoder-cu",
-         help="Compute units for fused mel+encoder (default CPU_ONLY)",
-     ),
-     compute_precision: Optional[str] = typer.Option(
-         None,
-         "--compute-precision",
-         help="Export precision: FLOAT32 (default) or FLOAT16 to shrink non-quantized weights.",
-     ),
- ) -> None:
-     """Export all Parakeet TDT-CTC 110M Hybrid sub-modules to CoreML with a fixed 15-second window.
-
-     This exports both CTC and TDT components from the hybrid model.
-     """
-     # Runtime CoreML contract keeps U=1 so the prediction net matches the streaming decoder.
-     export_settings = ExportSettings(
-         output_dir=output_dir,
-         compute_units=ct.ComputeUnit.CPU_ONLY, # Default: CPU-only for all components
-         deployment_target=ct.target.iOS17, # iOS 17+ features and kernels
-         compute_precision=_parse_compute_precision(compute_precision),
-         max_audio_seconds=15.0,
-         max_symbol_steps=1,
-     )
-
-     typer.echo("Export configuration:")
-     typer.echo(asdict(export_settings))
-
-     output_dir.mkdir(parents=True, exist_ok=True)
-     pre_cu = _parse_compute_units(preprocessor_cu)
-     melenc_cu = _parse_compute_units(mel_encoder_cu)
-
-     if nemo_path is not None:
-         typer.echo(f"Loading NeMo model from {nemo_path}…")
-         # 110M is a hybrid model: EncDecHybridRNNTCTCBPEModel
-         asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(
-             str(nemo_path), map_location="cpu"
-         )
-         checkpoint_meta = {
-             "type": "file",
-             "path": str(nemo_path),
-         }
-     else:
-         typer.echo(f"Downloading NeMo model via {model_id}…")
-         # 110M is a hybrid model: EncDecHybridRNNTCTCBPEModel
-         asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
-             model_id, map_location="cpu"
-         )
-         checkpoint_meta = {
-             "type": "pretrained",
-             "model_id": model_id,
-         }
-     asr_model.eval()
-
-     sample_rate = int(asr_model.cfg.preprocessor.sample_rate)
-     max_samples = _compute_length(export_settings.max_audio_seconds, sample_rate)
-
-     # Look for a bundled 15s 16kHz audio file
-     default_audio = (Path(__file__).parent / "audio" / "yc_first_minute_16k_15s.wav").resolve()
-     if default_audio.exists():
-         typer.echo(f"Using trace audio: {default_audio}")
-         audio_tensor = _prepare_audio(default_audio, sample_rate, max_samples, seed=None)
-     else:
-         typer.echo("No trace audio found, using random noise for tracing")
-         audio_tensor = _prepare_audio(None, sample_rate, max_samples, seed=42)
-     audio_length = torch.tensor([max_samples], dtype=torch.int32)
-
-     preprocessor = PreprocessorWrapper(asr_model.preprocessor.eval())
-     encoder = EncoderWrapper(asr_model.encoder.eval())
-     decoder = DecoderWrapper(asr_model.decoder.eval())
-     joint = JointWrapper(asr_model.joint.eval())
-     # CTC head for hybrid model
-     ctc_head = CTCHeadWrapper(asr_model.ctc_decoder.eval())
-
-     decoder_export_flag = getattr(asr_model.decoder, "_rnnt_export", False)
-     asr_model.decoder._rnnt_export = True
-
-     try:
-         with torch.inference_mode():
-             mel_ref, mel_length_ref = preprocessor(audio_tensor, audio_length)
-             mel_length_ref = mel_length_ref.to(dtype=torch.int32)
-             encoder_ref, encoder_length_ref = encoder(mel_ref, mel_length_ref)
-             encoder_length_ref = encoder_length_ref.to(dtype=torch.int32)
-             # CTC log probs
-             ctc_log_probs_ref = ctc_head(encoder_ref)
-
-         # Clone Tensors to drop the inference tensor flag before tracing
-         mel_ref = mel_ref.clone()
-         mel_length_ref = mel_length_ref.clone()
-         encoder_ref = encoder_ref.clone()
-         encoder_length_ref = encoder_length_ref.clone()
-         ctc_log_probs_ref = ctc_log_probs_ref.clone()
-
-         vocab_size = int(asr_model.tokenizer.vocab_size)
-         num_extra = int(asr_model.joint.num_extra_outputs)
-         decoder_hidden = int(asr_model.decoder.pred_hidden)
-         decoder_layers = int(asr_model.decoder.pred_rnn_layers)
-
-         typer.echo("Model info:")
-         typer.echo(f" Vocab size: {vocab_size}")
-         typer.echo(f" Num extra (duration bins): {num_extra}")
-         typer.echo(f" Decoder hidden: {decoder_hidden}")
-         typer.echo(f" Decoder layers: {decoder_layers}")
-         typer.echo(f" Encoder output shape: {_tensor_shape(encoder_ref)}")
-
-         targets = torch.full(
-             (1, export_settings.max_symbol_steps),
-             fill_value=asr_model.decoder.blank_idx,
-             dtype=torch.int32,
-         )
-         target_lengths = torch.tensor(
-             [export_settings.max_symbol_steps], dtype=torch.int32
-         )
-         zero_state = torch.zeros(
-             decoder_layers,
-             1,
-             decoder_hidden,
-             dtype=torch.float32,
-         )
-
-         with torch.inference_mode():
-             decoder_ref, h_ref, c_ref = decoder(targets, target_lengths, zero_state, zero_state)
-             joint_ref = joint(encoder_ref, decoder_ref)
-
-         decoder_ref = decoder_ref.clone()
-         h_ref = h_ref.clone()
-         c_ref = c_ref.clone()
-         joint_ref = joint_ref.clone()
-
-         typer.echo(f" Decoder output shape: {_tensor_shape(decoder_ref)}")
-         typer.echo(f" Joint output shape: {_tensor_shape(joint_ref)}")
-         typer.echo(f" CTC log probs shape: {_tensor_shape(ctc_log_probs_ref)}")
-
-         typer.echo("Tracing and converting preprocessor…")
-         # Ensure tracing happens on CPU explicitly
-         preprocessor = preprocessor.cpu()
-         audio_tensor = audio_tensor.cpu()
-         audio_length = audio_length.cpu()
-         traced_preprocessor = torch.jit.trace(
-             preprocessor, (audio_tensor, audio_length), strict=False
-         )
-         traced_preprocessor.eval()
-         preprocessor_inputs = [
-             # Allow variable-length audio up to the fixed 15s window using RangeDim
-             ct.TensorType(
-                 name="audio",
-                 shape=(1, ct.RangeDim(1, max_samples)),
-                 dtype=np.float32,
-             ),
-             ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
-         ]
-         preprocessor_outputs = [
-             ct.TensorType(name="mel_features", dtype=np.float32),
-             ct.TensorType(name="mel_length", dtype=np.int32),
-         ]
-         # Preprocessor compute units (parametrized; default CPU_ONLY)
-         preprocessor_model = _coreml_convert(
-             traced_preprocessor,
-             preprocessor_inputs,
-             preprocessor_outputs,
-             export_settings,
-             compute_units_override=pre_cu,
-         )
-         preprocessor_path = output_dir / "parakeet_preprocessor.mlpackage"
-         _save_mlpackage(
-             preprocessor_model,
-             preprocessor_path,
-             "Parakeet 110M preprocessor (15 s window)",
-         )
-
-         typer.echo("Tracing and converting encoder…")
-         traced_encoder = torch.jit.trace(
-             encoder, (mel_ref, mel_length_ref), strict=False
-         )
-         traced_encoder.eval()
-         encoder_inputs = [
-             ct.TensorType(name="mel_features", shape=_tensor_shape(mel_ref), dtype=np.float32),
-             ct.TensorType(name="mel_length", shape=(1,), dtype=np.int32),
-         ]
-         encoder_outputs = [
-             ct.TensorType(name="encoder_output", dtype=np.float32),
-             ct.TensorType(name="encoder_length", dtype=np.int32),
-         ]
-         # Encoder: CPU only
-         encoder_model = _coreml_convert(
-             traced_encoder,
-             encoder_inputs,
-             encoder_outputs,
-             export_settings,
-             compute_units_override=ct.ComputeUnit.CPU_ONLY,
-         )
-         encoder_path = output_dir / "parakeet_encoder.mlpackage"
-         _save_mlpackage(
-             encoder_model,
-             encoder_path,
-             "Parakeet 110M encoder (15 s window)",
-         )
-
-         # CTC Head for hybrid model
-         typer.echo("Tracing and converting CTC head…")
-         traced_ctc_head = torch.jit.trace(
-             ctc_head, (encoder_ref,), strict=False
-         )
-         traced_ctc_head.eval()
-         ctc_head_inputs = [
-             ct.TensorType(name="encoder_output", shape=_tensor_shape(encoder_ref), dtype=np.float32),
-         ]
-         ctc_head_outputs = [
-             ct.TensorType(name="ctc_logits", dtype=np.float32),
-         ]
-         ctc_head_model = _coreml_convert(
-             traced_ctc_head,
-             ctc_head_inputs,
-             ctc_head_outputs,
-             export_settings,
-             compute_units_override=ct.ComputeUnit.CPU_ONLY,
-         )
-         ctc_head_path = output_dir / "parakeet_ctc_head.mlpackage"
-         _save_mlpackage(
-             ctc_head_model,
-             ctc_head_path,
-             "Parakeet 110M CTC decoder head",
-         )
-
-         # Optional fused export: Preprocessor + Encoder
-         typer.echo("Tracing and converting fused mel+encoder…")
-         mel_encoder = MelEncoderWrapper(preprocessor, encoder)
-         traced_mel_encoder = torch.jit.trace(
-             mel_encoder, (audio_tensor, audio_length), strict=False
-         )
-         traced_mel_encoder.eval()
-         mel_encoder_inputs = [
-             # Keep fixed 15s window for fused Mel+Encoder
-             ct.TensorType(name="audio", shape=(1, max_samples), dtype=np.float32),
-             ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
-         ]
-         mel_encoder_outputs = [
-             ct.TensorType(name="encoder_output", dtype=np.float32),
-             ct.TensorType(name="encoder_length", dtype=np.int32),
-         ]
-         # Fused mel+encoder compute units (parametrized; default CPU_ONLY)
-         mel_encoder_model = _coreml_convert(
-             traced_mel_encoder,
-             mel_encoder_inputs,
-             mel_encoder_outputs,
-             export_settings,
-             compute_units_override=melenc_cu,
-         )
-         mel_encoder_path = output_dir / "parakeet_mel_encoder.mlpackage"
-         _save_mlpackage(
-             mel_encoder_model,
-             mel_encoder_path,
-             "Parakeet 110M fused Mel+Encoder (15 s window)",
-         )
-
-         typer.echo("Tracing and converting decoder…")
-         traced_decoder = torch.jit.trace(
-             decoder,
-             (targets, target_lengths, zero_state, zero_state),
-             strict=False,
-         )
-         traced_decoder.eval()
-         decoder_inputs = [
-             ct.TensorType(name="targets", shape=_tensor_shape(targets), dtype=np.int32),
-             ct.TensorType(name="target_length", shape=(1,), dtype=np.int32),
-             ct.TensorType(name="h_in", shape=_tensor_shape(zero_state), dtype=np.float32),
-             ct.TensorType(name="c_in", shape=_tensor_shape(zero_state), dtype=np.float32),
-         ]
-         decoder_outputs = [
-             ct.TensorType(name="decoder", dtype=np.float32),
-             ct.TensorType(name="h_out", dtype=np.float32),
-             ct.TensorType(name="c_out", dtype=np.float32),
-         ]
-         # Decoder: CPU only
-         decoder_model = _coreml_convert(
-             traced_decoder,
-             decoder_inputs,
-             decoder_outputs,
-             export_settings,
-             compute_units_override=ct.ComputeUnit.CPU_ONLY,
-         )
-         decoder_path = output_dir / "parakeet_decoder.mlpackage"
-         _save_mlpackage(
-             decoder_model,
-             decoder_path,
-             "Parakeet 110M decoder (RNNT prediction network)",
-         )
-
-         typer.echo("Tracing and converting joint…")
-         traced_joint = torch.jit.trace(
-             joint,
-             (encoder_ref, decoder_ref),
-             strict=False,
-         )
-         traced_joint.eval()
-         joint_inputs = [
-             ct.TensorType(name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32),
-             ct.TensorType(name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32),
-         ]
-         joint_outputs = [
-             ct.TensorType(name="logits", dtype=np.float32),
-         ]
-         # Joint: CPU only
-         joint_model = _coreml_convert(
-             traced_joint,
-             joint_inputs,
-             joint_outputs,
-             export_settings,
-             compute_units_override=ct.ComputeUnit.CPU_ONLY,
-         )
-         joint_path = output_dir / "parakeet_joint.mlpackage"
-         _save_mlpackage(
-             joint_model,
-             joint_path,
-             "Parakeet 110M joint network (RNNT)",
-         )
-
-         # Joint + decision head (split logits, softmax, argmax)
-         typer.echo("Tracing and converting joint decision head…")
-         vocab_size = int(asr_model.tokenizer.vocab_size)
-         num_extra = int(asr_model.joint.num_extra_outputs)
-         joint_decision = JointDecisionWrapper(joint, vocab_size=vocab_size, num_extra=num_extra)
-         traced_joint_decision = torch.jit.trace(
-             joint_decision,
-             (encoder_ref, decoder_ref),
-             strict=False,
-         )
-         traced_joint_decision.eval()
-         joint_decision_inputs = [
-             ct.TensorType(name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32),
-             ct.TensorType(name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32),
-         ]
-         joint_decision_outputs = [
-             ct.TensorType(name="token_id", dtype=np.int32),
-             ct.TensorType(name="token_prob", dtype=np.float32),
-             ct.TensorType(name="duration", dtype=np.int32),
-         ]
-         # JointDecision: CPU only
-         joint_decision_model = _coreml_convert(
-             traced_joint_decision,
-             joint_decision_inputs,
-             joint_decision_outputs,
-             export_settings,
-             compute_units_override=ct.ComputeUnit.CPU_ONLY,
-         )
-         joint_decision_path = output_dir / "parakeet_joint_decision.mlpackage"
-         _save_mlpackage(
-             joint_decision_model,
-             joint_decision_path,
-             "Parakeet 110M joint + decision head (split, softmax, argmax)",
-         )
-
-         # Single-step JointDecision for [1,512,1] x [1,640,1] -> [1,1,1]
-         # Note: 110M encoder dim is 512 (not 1024 like 0.6B)
-         typer.echo("Tracing and converting single-step joint decision…")
-         jd_single = JointDecisionSingleStep(joint, vocab_size=vocab_size, num_extra=num_extra)
-         # Create single-step slices from refs
-         enc_step = encoder_ref[:, :, :1].contiguous()
-         dec_step = decoder_ref[:, :, :1].contiguous()
-         traced_jd_single = torch.jit.trace(
-             jd_single,
-             (enc_step, dec_step),
-             strict=False,
-         )
-         traced_jd_single.eval()
-         jd_single_inputs = [
-             ct.TensorType(name="encoder_step", shape=(1, enc_step.shape[1], 1), dtype=np.float32),
-             ct.TensorType(name="decoder_step", shape=(1, dec_step.shape[1], 1), dtype=np.float32),
-         ]
-         jd_single_outputs = [
-             ct.TensorType(name="token_id", dtype=np.int32),
-             ct.TensorType(name="token_prob", dtype=np.float32),
-             ct.TensorType(name="duration", dtype=np.int32),
-             ct.TensorType(name="top_k_ids", dtype=np.int32),
-             ct.TensorType(name="top_k_logits", dtype=np.float32),
-         ]
-         # Single-step JointDecision: CPU only
-         jd_single_model = _coreml_convert(
-             traced_jd_single,
-             jd_single_inputs,
-             jd_single_outputs,
-             export_settings,
-             compute_units_override=ct.ComputeUnit.CPU_ONLY,
-         )
-         jd_single_path = output_dir / "parakeet_joint_decision_single_step.mlpackage"
-         _save_mlpackage(
-             jd_single_model,
-             jd_single_path,
-             "Parakeet 110M single-step joint decision (current frame)",
-         )
-
-         # Export vocabulary
-         typer.echo("Exporting vocabulary…")
-         vocab_path = output_dir / "vocab.json"
-         vocab_dict = {
-             "vocab_size": vocab_size,
-             "blank_id": int(asr_model.decoder.blank_idx),
-             "tokens": asr_model.tokenizer.vocab,
-         }
-         vocab_path.write_text(json.dumps(vocab_dict, indent=2, ensure_ascii=False))
-
-         metadata: Dict[str, object] = {
-             "model_id": model_id,
-             "model_type": "hybrid_rnnt_ctc",
-             "sample_rate": sample_rate,
-             "max_audio_seconds": export_settings.max_audio_seconds,
-             "max_audio_samples": max_samples,
-             "max_symbol_steps": export_settings.max_symbol_steps,
-             "vocab_size": vocab_size,
-             "joint_extra_outputs": num_extra,
-             "encoder_dim": int(encoder_ref.shape[1]), # 512 for 110M
-             "decoder_dim": int(decoder_ref.shape[1]), # 640
-             "decoder_hidden": decoder_hidden,
-             "decoder_layers": decoder_layers,
-             "blank_id": int(asr_model.decoder.blank_idx),
-             "checkpoint": checkpoint_meta,
-             "coreml": {
-                 "compute_units": export_settings.compute_units.name,
-                 "compute_precision": (
-                     export_settings.compute_precision.name
-                     if export_settings.compute_precision is not None
-                     else "FLOAT32"
-                 ),
-             },
-             "components": {
-                 "preprocessor": {
-                     "inputs": {
-                         "audio_signal": list(_tensor_shape(audio_tensor)),
-                         "audio_length": [1],
-                     },
-                     "outputs": {
-                         "mel": list(_tensor_shape(mel_ref)),
-                         "mel_length": [1],
-                     },
-                     "path": preprocessor_path.name,
-                 },
-                 "encoder": {
-                     "inputs": {
-                         "mel": list(_tensor_shape(mel_ref)),
-                         "mel_length": [1],
-                     },
-                     "outputs": {
-                         "encoder": list(_tensor_shape(encoder_ref)),
-                         "encoder_length": [1],
-                     },
-                     "path": encoder_path.name,
-                 },
-                 "ctc_head": {
-                     "inputs": {
-                         "encoder": list(_tensor_shape(encoder_ref)),
-                     },
-                     "outputs": {
-                         "log_probs": list(_tensor_shape(ctc_log_probs_ref)),
-                     },
-                     "path": ctc_head_path.name,
-                 },
-                 "mel_encoder": {
-                     "inputs": {
-                         "audio_signal": [1, max_samples],
-                         "audio_length": [1],
-                     },
-                     "outputs": {
-                         "encoder": list(_tensor_shape(encoder_ref)),
-                         "encoder_length": [1],
-                     },
-                     "path": mel_encoder_path.name,
-                 },
-                 "decoder": {
-                     "inputs": {
-                         "targets": list(_tensor_shape(targets)),
-                         "target_length": [1],
-                         "h_in": list(_tensor_shape(zero_state)),
-                         "c_in": list(_tensor_shape(zero_state)),
-                     },
-                     "outputs": {
-                         "decoder": list(_tensor_shape(decoder_ref)),
-                         "h_out": list(_tensor_shape(h_ref)),
-                         "c_out": list(_tensor_shape(c_ref)),
-                     },
-                     "path": decoder_path.name,
-                 },
-                 "joint": {
-                     "inputs": {
-                         "encoder": list(_tensor_shape(encoder_ref)),
-                         "decoder": list(_tensor_shape(decoder_ref)),
-                     },
-                     "outputs": {
-                         "logits": list(_tensor_shape(joint_ref)),
-                     },
-                     "path": joint_path.name,
-                 },
-                 "joint_decision": {
-                     "inputs": {
-                         "encoder": list(_tensor_shape(encoder_ref)),
-                         "decoder": list(_tensor_shape(decoder_ref)),
-                     },
-                     "outputs": {
-                         "token_id": [
-                             _tensor_shape(encoder_ref)[0],
-                             _tensor_shape(encoder_ref)[2],
-                             _tensor_shape(decoder_ref)[2],
-                         ],
-                         "token_prob": [
-                             _tensor_shape(encoder_ref)[0],
-                             _tensor_shape(encoder_ref)[2],
-                             _tensor_shape(decoder_ref)[2],
-                         ],
-                         "duration": [
-                             _tensor_shape(encoder_ref)[0],
-                             _tensor_shape(encoder_ref)[2],
-                             _tensor_shape(decoder_ref)[2],
-                         ],
-                     },
-                     "path": joint_decision_path.name,
-                 },
-                 "joint_decision_single_step": {
-                     "inputs": {
-                         "encoder_step": [1, int(encoder_ref.shape[1]), 1],
-                         "decoder_step": [1, int(decoder_ref.shape[1]), 1],
-                     },
-                     "outputs": {
-                         "token_id": [1, 1, 1],
-                         "token_prob": [1, 1, 1],
-                         "duration": [1, 1, 1],
-                         "top_k_ids": [1, 1, 1, 64],
-                         "top_k_logits": [1, 1, 1, 64],
-                     },
-                     "path": jd_single_path.name,
-                 },
-             },
-         }
-
-         metadata_path = output_dir / "metadata.json"
-         metadata_path.write_text(json.dumps(metadata, indent=2))
-         typer.echo(f"Export complete. Metadata written to {metadata_path}")
-
-     finally:
-         asr_model.decoder._rnnt_export = decoder_export_flag
-
-
- if __name__ == "__main__":
-     app()
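
For reference, the exporter above declares fixed input/output names on each package. A minimal sketch of driving one of the resulting packages with coremltools (assumes the default output directory, a 16 kHz model as the bundled 15 s trace audio suggests, and a macOS host, since MLModel prediction only runs there):

import coremltools as ct
import numpy as np

# The fused mel+encoder was exported with a fixed 15 s window: (1, 15 * 16000) samples.
model = ct.models.MLModel(
    "parakeet_110m_coreml/parakeet_mel_encoder.mlpackage",
    compute_units=ct.ComputeUnit.CPU_ONLY,
)
audio = np.zeros((1, 240000), dtype=np.float32)  # silence, for illustration
out = model.predict({
    "audio": audio,
    "audio_length": np.array([240000], dtype=np.int32),
})
print(out["encoder_output"].shape, out["encoder_length"])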
 
 
convert/parakeet-tdt-ctc-110m/coreml/hybrid_earnings_benchmark.json DELETED
@@ -1,35 +0,0 @@
- {
-   "approach" : "single-encoder",
-   "model" : "parakeet-tdt-ctc-110m-hybrid",
-   "results" : [
-     {
-       "audioLength" : 15,
-       "ctcDetections" : [
-         {
-           "endTime" : 6.0800000000000001,
-           "inReference" : true,
-           "score" : -8.3699999999999992,
-           "source" : "ctc",
-           "startTime" : 4.96,
-           "word" : "LATAM Airlines"
-         }
-       ],
-       "dictFound" : 1,
-       "dictTotal" : 1,
-       "fileId" : "4329526_chunk0",
-       "hypothesis" : "goodday everyone and welcome to latam airlines group earnings release confonference call just as a reminder this conference is being recorded lat tam airlines group eararnings releaseed for the",
-       "processingTime" : 0.070000000000000007,
-       "reference" : "good day everyone and welcome to latam airlines group earnings release conference call just as a reminder this conference is being recorded latam airlines group earnings released for the",
-       "wer" : 24.140000000000001
-     }
-   ],
-   "summary" : {
-     "avgWer" : 24.140000000000001,
-     "dictPass" : 1,
-     "dictRate" : 100,
-     "dictTotal" : 1,
-     "totalAudioDuration" : 15,
-     "totalProcessingTime" : 0.070000000000000007,
-     "totalTests" : 1
-   }
- }
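
The "wer" figures in this deleted benchmark are standard word error rate: word-level Levenshtein distance as a percentage of reference length. A minimal sketch that reproduces the 24.14 above when applied to the reference/hypothesis pair in the JSON (7 edits over 29 reference words):

def wer(reference: str, hypothesis: str) -> float:
    ref, hyp = reference.split(), hypothesis.split()
    d = list(range(len(hyp) + 1))  # DP row: edit distance of ref[:i] vs hyp[:j]
    for i in range(1, len(ref) + 1):
        prev_diag, d[0] = d[0], i
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            prev_diag, d[j] = d[j], min(
                d[j] + 1,          # deletion
                d[j - 1] + 1,      # insertion
                prev_diag + cost,  # substitution or match
            )
    return 100.0 * d[-1] / len(ref)

# round(wer(reference, hypothesis), 2) on the strings above gives 24.14.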