Alex-Wengg
committed on
Commit
·
8036e60
1
Parent(s):
e7fd089
cleanup: flatten repo to CTC-only models at root
Browse files
Move AudioEncoder.mlmodelc, MelSpectrogram.mlmodelc, and vocab.json
from parakeet-ctc-110m-coreml/ subdirectory to repo root.
Remove everything not needed by the Swift FluidAudio library:
- TDT model copies (parakeet-tdt-0.6b-v2-coreml/, parakeet-tdt-v2-0.6b/)
- Conversion scripts (convert/, parakeet-tdt-ctc-110m/, scripts/)
- CLI benchmarks (cli/) — already in FluidAudio repo
- Duplicate model copies (models/)
- Python artifacts (pyproject.toml, uv.lock)
- HuggingFace tokenizer files (tokenizer.json, tokenizer.model, etc.)
- .DS_Store files
Matches the clean flat structure of parakeet-ctc-0.6b-coreml.
This view is limited to 50 files because it contains too many changes.
See raw diff
- {models/parakeet-ctc-110m-coreml/AudioEncoder.mlmodelc → AudioEncoder.mlmodelc}/analytics/coremldata.bin +0 -0
- {models/parakeet-ctc-110m-coreml/AudioEncoder.mlmodelc → AudioEncoder.mlmodelc}/coremldata.bin +0 -0
- {models/parakeet-ctc-110m-coreml/AudioEncoder.mlmodelc → AudioEncoder.mlmodelc}/metadata.json +0 -0
- {models/parakeet-ctc-110m-coreml/AudioEncoder.mlmodelc → AudioEncoder.mlmodelc}/model.mil +0 -0
- {models/parakeet-ctc-110m-coreml/AudioEncoder.mlmodelc → AudioEncoder.mlmodelc}/weights/weight.bin +0 -0
- {models/parakeet-ctc-110m-coreml/MelSpectrogram.mlmodelc → MelSpectrogram.mlmodelc}/analytics/coremldata.bin +0 -0
- {models/parakeet-ctc-110m-coreml/MelSpectrogram.mlmodelc → MelSpectrogram.mlmodelc}/coremldata.bin +0 -0
- {models/parakeet-ctc-110m-coreml/MelSpectrogram.mlmodelc → MelSpectrogram.mlmodelc}/metadata.json +0 -0
- {models/parakeet-ctc-110m-coreml/MelSpectrogram.mlmodelc → MelSpectrogram.mlmodelc}/model.mil +0 -0
- {models/parakeet-ctc-110m-coreml/MelSpectrogram.mlmodelc → MelSpectrogram.mlmodelc}/weights/weight.bin +0 -0
- cli/CtcEarningsBenchmark.swift +0 -1048
- cli/HybridEarningsBenchmark.swift +0 -554
- config.json +0 -1
- convert/.DS_Store +0 -0
- convert/parakeet-tdt-ctc-110m/convert_tdt_decoder.py +0 -323
- convert/parakeet-tdt-ctc-110m/coreml/audio/yc_first_minute_16k_15s.wav +0 -3
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/analytics/coremldata.bin +0 -3
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/coremldata.bin +0 -3
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/metadata.json +0 -66
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/model.mil +0 -24
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/weights/weight.bin +0 -3
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/analytics/coremldata.bin +0 -3
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/coremldata.bin +0 -3
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/metadata.json +0 -118
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/model.mil +0 -45
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/weights/weight.bin +0 -3
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/analytics/coremldata.bin +0 -3
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/coremldata.bin +0 -3
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/metadata.json +0 -105
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/model.mil +0 -0
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/weights/weight.bin +0 -3
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/analytics/coremldata.bin +0 -3
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/coremldata.bin +0 -3
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/metadata.json +0 -102
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/model.mil +0 -58
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/weights/weight.bin +0 -3
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/analytics/coremldata.bin +0 -3
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/coremldata.bin +0 -3
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/metadata.json +0 -123
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/model.mil +0 -69
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/weights/weight.bin +0 -3
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/analytics/coremldata.bin +0 -3
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/coremldata.bin +0 -3
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/metadata.json +0 -112
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/model.mil +0 -191
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/weights/weight.bin +0 -3
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/metadata.json +0 -247
- convert/parakeet-tdt-ctc-110m/coreml/compiled_models/vocab.json +0 -1
- convert/parakeet-tdt-ctc-110m/coreml/convert-parakeet.py +0 -697
- convert/parakeet-tdt-ctc-110m/coreml/hybrid_earnings_benchmark.json +0 -35
{models/parakeet-ctc-110m-coreml/AudioEncoder.mlmodelc → AudioEncoder.mlmodelc}/analytics/coremldata.bin
RENAMED
|
File without changes
|
{models/parakeet-ctc-110m-coreml/AudioEncoder.mlmodelc → AudioEncoder.mlmodelc}/coremldata.bin
RENAMED
|
File without changes
|
{models/parakeet-ctc-110m-coreml/AudioEncoder.mlmodelc → AudioEncoder.mlmodelc}/metadata.json
RENAMED
|
File without changes
|
{models/parakeet-ctc-110m-coreml/AudioEncoder.mlmodelc → AudioEncoder.mlmodelc}/model.mil
RENAMED
|
File without changes
|
{models/parakeet-ctc-110m-coreml/AudioEncoder.mlmodelc → AudioEncoder.mlmodelc}/weights/weight.bin
RENAMED
|
File without changes
|
{models/parakeet-ctc-110m-coreml/MelSpectrogram.mlmodelc → MelSpectrogram.mlmodelc}/analytics/coremldata.bin
RENAMED
|
File without changes
|
{models/parakeet-ctc-110m-coreml/MelSpectrogram.mlmodelc → MelSpectrogram.mlmodelc}/coremldata.bin
RENAMED
|
File without changes
|
{models/parakeet-ctc-110m-coreml/MelSpectrogram.mlmodelc → MelSpectrogram.mlmodelc}/metadata.json
RENAMED
|
File without changes
|
{models/parakeet-ctc-110m-coreml/MelSpectrogram.mlmodelc → MelSpectrogram.mlmodelc}/model.mil
RENAMED
|
File without changes
|
{models/parakeet-ctc-110m-coreml/MelSpectrogram.mlmodelc → MelSpectrogram.mlmodelc}/weights/weight.bin
RENAMED
|
File without changes
|
cli/CtcEarningsBenchmark.swift
DELETED
|
@@ -1,1048 +0,0 @@
|
|
| 1 |
-
#if os(macOS)
|
| 2 |
-
import AVFoundation
|
| 3 |
-
import CoreML
|
| 4 |
-
import FluidAudio
|
| 5 |
-
import Foundation
|
| 6 |
-
|
| 7 |
-
/// Earnings22 benchmark using TDT for transcription + CTC for keyword spotting.
|
| 8 |
-
/// TDT provides low WER transcription, CTC provides high recall dictionary detection.
|
| 9 |
-
public enum CtcEarningsBenchmark {
|
| 10 |
-
|
| 11 |
-
private enum KeywordMode: String {
|
| 12 |
-
case chunk
|
| 13 |
-
case file
|
| 14 |
-
}
|
| 15 |
-
|
| 16 |
-
/// Default CTC model directory
|
| 17 |
-
private static func defaultCtcModelPath() -> String? {
|
| 18 |
-
let appSupport = FileManager.default.urls(
|
| 19 |
-
for: .applicationSupportDirectory, in: .userDomainMask
|
| 20 |
-
).first!
|
| 21 |
-
let modelPath = appSupport.appendingPathComponent("FluidAudio/Models/parakeet-ctc-110m-coreml")
|
| 22 |
-
if FileManager.default.fileExists(atPath: modelPath.path) {
|
| 23 |
-
return modelPath.path
|
| 24 |
-
}
|
| 25 |
-
return nil
|
| 26 |
-
}
|
| 27 |
-
|
| 28 |
-
/// Default data directory (from download command)
|
| 29 |
-
private static func defaultDataDir() -> String? {
|
| 30 |
-
let dataDir = DatasetDownloader.getEarnings22Directory().appendingPathComponent("test-dataset")
|
| 31 |
-
if FileManager.default.fileExists(atPath: dataDir.path) {
|
| 32 |
-
return dataDir.path
|
| 33 |
-
}
|
| 34 |
-
return nil
|
| 35 |
-
}
|
| 36 |
-
|
| 37 |
-
public static func runCLI(arguments: [String]) async {
|
| 38 |
-
// Check for help
|
| 39 |
-
if arguments.contains("--help") || arguments.contains("-h") {
|
| 40 |
-
printUsage()
|
| 41 |
-
return
|
| 42 |
-
}
|
| 43 |
-
|
| 44 |
-
// Parse arguments
|
| 45 |
-
var dataDir: String? = nil
|
| 46 |
-
var outputFile = "ctc_earnings_benchmark.json"
|
| 47 |
-
var maxFiles: Int? = nil
|
| 48 |
-
var ctcModelPath: String? = nil
|
| 49 |
-
// Note: Using v2 by default because v3 has issues with certain audio files
|
| 50 |
-
// (returns empty transcription for ~7 files in Earnings22 dataset)
|
| 51 |
-
var tdtVersion: AsrModelVersion = .v2
|
| 52 |
-
var autoDownload = false
|
| 53 |
-
var keywordMode: KeywordMode = .chunk
|
| 54 |
-
|
| 55 |
-
var i = 0
|
| 56 |
-
while i < arguments.count {
|
| 57 |
-
switch arguments[i] {
|
| 58 |
-
case "--data-dir":
|
| 59 |
-
if i + 1 < arguments.count {
|
| 60 |
-
dataDir = arguments[i + 1]
|
| 61 |
-
i += 1
|
| 62 |
-
}
|
| 63 |
-
case "--output", "-o":
|
| 64 |
-
if i + 1 < arguments.count {
|
| 65 |
-
outputFile = arguments[i + 1]
|
| 66 |
-
i += 1
|
| 67 |
-
}
|
| 68 |
-
case "--max-files":
|
| 69 |
-
if i + 1 < arguments.count {
|
| 70 |
-
maxFiles = Int(arguments[i + 1])
|
| 71 |
-
i += 1
|
| 72 |
-
}
|
| 73 |
-
case "--ctc-model":
|
| 74 |
-
if i + 1 < arguments.count {
|
| 75 |
-
ctcModelPath = arguments[i + 1]
|
| 76 |
-
i += 1
|
| 77 |
-
}
|
| 78 |
-
case "--tdt-version":
|
| 79 |
-
if i + 1 < arguments.count {
|
| 80 |
-
if arguments[i + 1] == "v2" || arguments[i + 1] == "2" {
|
| 81 |
-
tdtVersion = .v2
|
| 82 |
-
}
|
| 83 |
-
i += 1
|
| 84 |
-
}
|
| 85 |
-
case "--auto-download":
|
| 86 |
-
autoDownload = true
|
| 87 |
-
case "--keyword-mode":
|
| 88 |
-
if i + 1 < arguments.count, let mode = parseKeywordMode(arguments[i + 1]) {
|
| 89 |
-
keywordMode = mode
|
| 90 |
-
i += 1
|
| 91 |
-
}
|
| 92 |
-
default:
|
| 93 |
-
break
|
| 94 |
-
}
|
| 95 |
-
i += 1
|
| 96 |
-
}
|
| 97 |
-
|
| 98 |
-
// Use defaults if not specified
|
| 99 |
-
if dataDir == nil {
|
| 100 |
-
dataDir = defaultDataDir()
|
| 101 |
-
}
|
| 102 |
-
if ctcModelPath == nil {
|
| 103 |
-
ctcModelPath = defaultCtcModelPath()
|
| 104 |
-
}
|
| 105 |
-
|
| 106 |
-
// Handle auto-download for dataset
|
| 107 |
-
if autoDownload && dataDir == nil {
|
| 108 |
-
print("📥 Downloading earnings22-kws dataset...")
|
| 109 |
-
await DatasetDownloader.downloadEarnings22KWS(force: false)
|
| 110 |
-
dataDir = defaultDataDir()
|
| 111 |
-
}
|
| 112 |
-
|
| 113 |
-
// Handle auto-download for CTC models
|
| 114 |
-
if autoDownload && ctcModelPath == nil {
|
| 115 |
-
print("📥 Downloading CTC models...")
|
| 116 |
-
do {
|
| 117 |
-
_ = try await CtcModels.download()
|
| 118 |
-
ctcModelPath = defaultCtcModelPath()
|
| 119 |
-
} catch {
|
| 120 |
-
print("ERROR: Failed to download CTC models: \(error)")
|
| 121 |
-
}
|
| 122 |
-
}
|
| 123 |
-
|
| 124 |
-
print("Earnings Benchmark (TDT transcription + CTC keyword spotting)")
|
| 125 |
-
print(" Data directory: \(dataDir ?? "not found")")
|
| 126 |
-
print(" Output file: \(outputFile)")
|
| 127 |
-
print(" TDT version: \(tdtVersion == .v2 ? "v2" : "v3")")
|
| 128 |
-
print(" CTC model: \(ctcModelPath ?? "not found")")
|
| 129 |
-
print(" Keyword mode: \(keywordMode.rawValue)")
|
| 130 |
-
|
| 131 |
-
guard let finalDataDir = dataDir else {
|
| 132 |
-
print("ERROR: Data directory not found")
|
| 133 |
-
print("💡 Download with: fluidaudio download --dataset earnings22-kws")
|
| 134 |
-
print(" Or specify: --data-dir <path>")
|
| 135 |
-
printUsage()
|
| 136 |
-
return
|
| 137 |
-
}
|
| 138 |
-
|
| 139 |
-
guard let modelPath = ctcModelPath else {
|
| 140 |
-
print("ERROR: CTC model not found")
|
| 141 |
-
print("💡 Download parakeet-ctc-110m-coreml model to:")
|
| 142 |
-
print(" ~/Library/Application Support/FluidAudio/Models/parakeet-ctc-110m-coreml/")
|
| 143 |
-
print(" Or specify: --ctc-model <path>")
|
| 144 |
-
printUsage()
|
| 145 |
-
return
|
| 146 |
-
}
|
| 147 |
-
|
| 148 |
-
let dataDirResolved = finalDataDir
|
| 149 |
-
|
| 150 |
-
do {
|
| 151 |
-
// Load TDT models for transcription
|
| 152 |
-
print("Loading TDT models (\(tdtVersion == .v2 ? "v2" : "v3")) for transcription...")
|
| 153 |
-
let tdtModels = try await AsrModels.downloadAndLoad(version: tdtVersion)
|
| 154 |
-
let asrManager = AsrManager(config: .default)
|
| 155 |
-
try await asrManager.initialize(models: tdtModels)
|
| 156 |
-
print("TDT models loaded successfully")
|
| 157 |
-
|
| 158 |
-
// Load CTC models for keyword spotting
|
| 159 |
-
print("Loading CTC models from: \(modelPath)")
|
| 160 |
-
let modelDir = URL(fileURLWithPath: modelPath)
|
| 161 |
-
let ctcModels = try await CtcModels.loadDirect(from: modelDir)
|
| 162 |
-
print("Loaded CTC vocabulary with \(ctcModels.vocabulary.count) tokens")
|
| 163 |
-
|
| 164 |
-
// Create keyword spotter
|
| 165 |
-
let vocabSize = ctcModels.vocabulary.count
|
| 166 |
-
let blankId = vocabSize // Blank is at index = vocab_size
|
| 167 |
-
let spotter = CtcKeywordSpotter(models: ctcModels, blankId: blankId)
|
| 168 |
-
print("Created CTC spotter with blankId=\(blankId)")
|
| 169 |
-
|
| 170 |
-
// Collect test files
|
| 171 |
-
let dataDirURL = URL(fileURLWithPath: dataDirResolved)
|
| 172 |
-
let fileIds = try collectFileIds(from: dataDirURL, maxFiles: maxFiles)
|
| 173 |
-
let keywordIndex = try buildKeywordIndex(dataDir: dataDirURL, keywordMode: keywordMode)
|
| 174 |
-
|
| 175 |
-
if fileIds.isEmpty {
|
| 176 |
-
print("ERROR: No test files found in \(dataDirResolved)")
|
| 177 |
-
return
|
| 178 |
-
}
|
| 179 |
-
|
| 180 |
-
print("Processing \(fileIds.count) test files...")
|
| 181 |
-
|
| 182 |
-
var results: [[String: Any]] = []
|
| 183 |
-
var totalWer = 0.0
|
| 184 |
-
var totalKeywordReference = 0
|
| 185 |
-
var totalKeywordPredicted = 0
|
| 186 |
-
var totalKeywordTruePositives = 0
|
| 187 |
-
var totalKeywordFalsePositives = 0
|
| 188 |
-
var totalKeywordFalseNegatives = 0
|
| 189 |
-
var totalAudioDuration = 0.0
|
| 190 |
-
var totalProcessingTime = 0.0
|
| 191 |
-
|
| 192 |
-
for (index, fileId) in fileIds.enumerated() {
|
| 193 |
-
print("[\(index + 1)/\(fileIds.count)] \(fileId)")
|
| 194 |
-
|
| 195 |
-
if let result = try await processFile(
|
| 196 |
-
fileId: fileId,
|
| 197 |
-
dataDir: dataDirURL,
|
| 198 |
-
asrManager: asrManager,
|
| 199 |
-
ctcModels: ctcModels,
|
| 200 |
-
spotter: spotter,
|
| 201 |
-
keywordMode: keywordMode,
|
| 202 |
-
keywordIndex: keywordIndex
|
| 203 |
-
) {
|
| 204 |
-
results.append(result)
|
| 205 |
-
totalWer += result["wer"] as? Double ?? 0
|
| 206 |
-
totalKeywordReference += result["keywordReference"] as? Int ?? 0
|
| 207 |
-
totalKeywordPredicted += result["keywordPredicted"] as? Int ?? 0
|
| 208 |
-
totalKeywordTruePositives += result["keywordTruePositives"] as? Int ?? 0
|
| 209 |
-
totalKeywordFalsePositives += result["keywordFalsePositives"] as? Int ?? 0
|
| 210 |
-
totalKeywordFalseNegatives += result["keywordFalseNegatives"] as? Int ?? 0
|
| 211 |
-
totalAudioDuration += result["audioLength"] as? Double ?? 0
|
| 212 |
-
totalProcessingTime += result["processingTime"] as? Double ?? 0
|
| 213 |
-
|
| 214 |
-
let wer = result["wer"] as? Double ?? 0
|
| 215 |
-
let precision = result["keywordPrecision"] as? Double ?? 0
|
| 216 |
-
let recall = result["keywordRecall"] as? Double ?? 0
|
| 217 |
-
let fscore = result["keywordFscore"] as? Double ?? 0
|
| 218 |
-
print(
|
| 219 |
-
" WER: \(String(format: "%.1f", wer))%, " +
|
| 220 |
-
"KW P/R/F: \(String(format: "%.2f", precision))/" +
|
| 221 |
-
"\(String(format: "%.2f", recall))/" +
|
| 222 |
-
"\(String(format: "%.2f", fscore))"
|
| 223 |
-
)
|
| 224 |
-
}
|
| 225 |
-
}
|
| 226 |
-
|
| 227 |
-
// Calculate summary
|
| 228 |
-
let avgWer = results.isEmpty ? 0.0 : totalWer / Double(results.count)
|
| 229 |
-
let keywordPrecision =
|
| 230 |
-
totalKeywordPredicted > 0
|
| 231 |
-
? Double(totalKeywordTruePositives) / Double(totalKeywordPredicted)
|
| 232 |
-
: 0
|
| 233 |
-
let keywordRecall =
|
| 234 |
-
totalKeywordReference > 0
|
| 235 |
-
? Double(totalKeywordTruePositives) / Double(totalKeywordReference)
|
| 236 |
-
: 0
|
| 237 |
-
let keywordFscore =
|
| 238 |
-
(keywordPrecision + keywordRecall) > 0
|
| 239 |
-
? 2 * keywordPrecision * keywordRecall / (keywordPrecision + keywordRecall)
|
| 240 |
-
: 0
|
| 241 |
-
|
| 242 |
-
// Print summary
|
| 243 |
-
print("\n" + String(repeating: "=", count: 60))
|
| 244 |
-
print("EARNINGS22 BENCHMARK (TDT + CTC)")
|
| 245 |
-
print(String(repeating: "=", count: 60))
|
| 246 |
-
print("Model: \(modelPath)")
|
| 247 |
-
print("Total tests: \(results.count)")
|
| 248 |
-
print("Average WER: \(String(format: "%.2f", avgWer))%")
|
| 249 |
-
print(
|
| 250 |
-
"Keyword Precision/Recall/F1: " +
|
| 251 |
-
"\(String(format: "%.2f", keywordPrecision))/" +
|
| 252 |
-
"\(String(format: "%.2f", keywordRecall))/" +
|
| 253 |
-
"\(String(format: "%.2f", keywordFscore))"
|
| 254 |
-
)
|
| 255 |
-
print("Total audio: \(String(format: "%.1f", totalAudioDuration))s")
|
| 256 |
-
print("Total processing: \(String(format: "%.1f", totalProcessingTime))s")
|
| 257 |
-
if totalProcessingTime > 0 {
|
| 258 |
-
print("RTFx: \(String(format: "%.2f", totalAudioDuration / totalProcessingTime))x")
|
| 259 |
-
}
|
| 260 |
-
print(String(repeating: "=", count: 60))
|
| 261 |
-
|
| 262 |
-
// Sort results by WER descending (worst first)
|
| 263 |
-
let sortedResults = results.sorted { r1, r2 in
|
| 264 |
-
let wer1 = r1["wer"] as? Double ?? 0
|
| 265 |
-
let wer2 = r2["wer"] as? Double ?? 0
|
| 266 |
-
return wer1 > wer2
|
| 267 |
-
}
|
| 268 |
-
|
| 269 |
-
// Save to JSON
|
| 270 |
-
let summaryDict: [String: Any] = [
|
| 271 |
-
"totalTests": results.count,
|
| 272 |
-
"avgWer": round(avgWer * 100) / 100,
|
| 273 |
-
"keywordTruePositives": totalKeywordTruePositives,
|
| 274 |
-
"keywordFalsePositives": totalKeywordFalsePositives,
|
| 275 |
-
"keywordFalseNegatives": totalKeywordFalseNegatives,
|
| 276 |
-
"keywordPredicted": totalKeywordPredicted,
|
| 277 |
-
"keywordReference": totalKeywordReference,
|
| 278 |
-
"keywordPrecision": round(keywordPrecision * 1000) / 1000,
|
| 279 |
-
"keywordRecall": round(keywordRecall * 1000) / 1000,
|
| 280 |
-
"keywordFscore": round(keywordFscore * 1000) / 1000,
|
| 281 |
-
"totalAudioDuration": round(totalAudioDuration * 100) / 100,
|
| 282 |
-
"totalProcessingTime": round(totalProcessingTime * 100) / 100,
|
| 283 |
-
]
|
| 284 |
-
|
| 285 |
-
let output: [String: Any] = [
|
| 286 |
-
"model": modelPath,
|
| 287 |
-
"keywordMode": keywordMode.rawValue,
|
| 288 |
-
"summary": summaryDict,
|
| 289 |
-
"results": sortedResults,
|
| 290 |
-
]
|
| 291 |
-
|
| 292 |
-
let jsonData = try JSONSerialization.data(withJSONObject: output, options: [.prettyPrinted, .sortedKeys])
|
| 293 |
-
try jsonData.write(to: URL(fileURLWithPath: outputFile))
|
| 294 |
-
print("\nResults written to: \(outputFile)")
|
| 295 |
-
|
| 296 |
-
} catch {
|
| 297 |
-
print("ERROR: Benchmark failed: \(error)")
|
| 298 |
-
}
|
| 299 |
-
}
|
| 300 |
-
|
| 301 |
-
private static func collectFileIds(from dataDir: URL, maxFiles: Int?) throws -> [String] {
|
| 302 |
-
var fileIds: [String] = []
|
| 303 |
-
let suffix = ".dictionary.txt"
|
| 304 |
-
|
| 305 |
-
let fileManager = FileManager.default
|
| 306 |
-
let contents = try fileManager.contentsOfDirectory(at: dataDir, includingPropertiesForKeys: nil)
|
| 307 |
-
|
| 308 |
-
for url in contents.sorted(by: { $0.path < $1.path }) {
|
| 309 |
-
let name = url.lastPathComponent
|
| 310 |
-
if name.hasSuffix(suffix) {
|
| 311 |
-
let data = try? Data(contentsOf: url)
|
| 312 |
-
if let data = data, !data.isEmpty {
|
| 313 |
-
let fileId = String(name.dropLast(suffix.count))
|
| 314 |
-
fileIds.append(fileId)
|
| 315 |
-
}
|
| 316 |
-
}
|
| 317 |
-
}
|
| 318 |
-
|
| 319 |
-
if let maxFiles = maxFiles {
|
| 320 |
-
return Array(fileIds.prefix(maxFiles))
|
| 321 |
-
}
|
| 322 |
-
return fileIds
|
| 323 |
-
}
|
| 324 |
-
|
| 325 |
-
private static func processFile(
|
| 326 |
-
fileId: String,
|
| 327 |
-
dataDir: URL,
|
| 328 |
-
asrManager: AsrManager,
|
| 329 |
-
ctcModels: CtcModels,
|
| 330 |
-
spotter: CtcKeywordSpotter,
|
| 331 |
-
keywordMode: KeywordMode,
|
| 332 |
-
keywordIndex: [String: [String]]
|
| 333 |
-
) async throws -> [String: Any]? {
|
| 334 |
-
let wavFile = dataDir.appendingPathComponent("\(fileId).wav")
|
| 335 |
-
let dictionaryFile = dataDir.appendingPathComponent("\(fileId).dictionary.txt")
|
| 336 |
-
let textFile = dataDir.appendingPathComponent("\(fileId).text.txt")
|
| 337 |
-
|
| 338 |
-
let fm = FileManager.default
|
| 339 |
-
guard fm.fileExists(atPath: wavFile.path),
|
| 340 |
-
fm.fileExists(atPath: dictionaryFile.path)
|
| 341 |
-
else {
|
| 342 |
-
return nil
|
| 343 |
-
}
|
| 344 |
-
|
| 345 |
-
// Load dictionary words (chunk or file keywords)
|
| 346 |
-
let dictionaryWords = try loadDictionaryWords(
|
| 347 |
-
fileId: fileId,
|
| 348 |
-
dictionaryFile: dictionaryFile,
|
| 349 |
-
keywordMode: keywordMode,
|
| 350 |
-
keywordIndex: keywordIndex
|
| 351 |
-
)
|
| 352 |
-
|
| 353 |
-
// Load reference text
|
| 354 |
-
let referenceRaw =
|
| 355 |
-
(try? String(contentsOf: textFile, encoding: .utf8))?.trimmingCharacters(in: .whitespacesAndNewlines) ?? ""
|
| 356 |
-
|
| 357 |
-
// Get audio samples
|
| 358 |
-
let audioFile = try AVAudioFile(forReading: wavFile)
|
| 359 |
-
let audioLength = Double(audioFile.length) / audioFile.processingFormat.sampleRate
|
| 360 |
-
let format = audioFile.processingFormat
|
| 361 |
-
let frameCount = AVAudioFrameCount(audioFile.length)
|
| 362 |
-
|
| 363 |
-
guard let buffer = AVAudioPCMBuffer(pcmFormat: format, frameCapacity: frameCount) else {
|
| 364 |
-
throw NSError(
|
| 365 |
-
domain: "CtcEarningsBenchmark", code: 1,
|
| 366 |
-
userInfo: [NSLocalizedDescriptionKey: "Failed to create audio buffer"])
|
| 367 |
-
}
|
| 368 |
-
try audioFile.read(into: buffer)
|
| 369 |
-
|
| 370 |
-
// Resample to 16kHz
|
| 371 |
-
let converter = AudioConverter()
|
| 372 |
-
let samples = try converter.resampleBuffer(buffer)
|
| 373 |
-
|
| 374 |
-
let startTime = Date()
|
| 375 |
-
|
| 376 |
-
// 1. TDT transcription for low WER
|
| 377 |
-
let tdtResult = try await asrManager.transcribe(wavFile)
|
| 378 |
-
|
| 379 |
-
// Skip files where TDT returns empty (some audio files cause model issues)
|
| 380 |
-
if tdtResult.text.isEmpty {
|
| 381 |
-
print(" SKIPPED: TDT returned empty transcription")
|
| 382 |
-
return nil
|
| 383 |
-
}
|
| 384 |
-
|
| 385 |
-
// 2. Build custom vocabulary for CTC keyword spotting
|
| 386 |
-
var vocabTerms: [CustomVocabularyTerm] = []
|
| 387 |
-
for word in dictionaryWords {
|
| 388 |
-
let tokenIds = tokenize(word, vocabulary: ctcModels.vocabulary)
|
| 389 |
-
if !tokenIds.isEmpty {
|
| 390 |
-
let term = CustomVocabularyTerm(
|
| 391 |
-
text: word,
|
| 392 |
-
weight: nil,
|
| 393 |
-
aliases: nil,
|
| 394 |
-
tokenIds: nil,
|
| 395 |
-
ctcTokenIds: tokenIds
|
| 396 |
-
)
|
| 397 |
-
vocabTerms.append(term)
|
| 398 |
-
}
|
| 399 |
-
}
|
| 400 |
-
let customVocab = CustomVocabularyContext(terms: vocabTerms)
|
| 401 |
-
|
| 402 |
-
// 3. CTC keyword spotting for high recall dictionary detection
|
| 403 |
-
let spotResult = try await spotter.spotKeywordsWithLogProbs(
|
| 404 |
-
audioSamples: samples,
|
| 405 |
-
customVocabulary: customVocab,
|
| 406 |
-
minScore: nil
|
| 407 |
-
)
|
| 408 |
-
|
| 409 |
-
// 4. Post-process: Use VocabularyRescorer with Argmax-style parameters
|
| 410 |
-
// Argmax uses cbw=3.0 (context-biasing weight) for boosting vocab terms
|
| 411 |
-
let useRescorer = ProcessInfo.processInfo.environment["NO_CTC_RESCORING"] != "1"
|
| 412 |
-
let hypothesis: String
|
| 413 |
-
if useRescorer {
|
| 414 |
-
let rescorerConfig = VocabularyRescorer.Config(
|
| 415 |
-
minScoreAdvantage: 1.0, // Lower threshold - rely more on CTC scoring
|
| 416 |
-
minVocabScore: -15.0, // Permissive to include more detections
|
| 417 |
-
maxOriginalScoreForReplacement: -2.0, // Don't replace very confident words
|
| 418 |
-
vocabBoostWeight: 3.0 // Argmax cbw=3.0
|
| 419 |
-
)
|
| 420 |
-
let rescorer = VocabularyRescorer(
|
| 421 |
-
spotter: spotter,
|
| 422 |
-
vocabulary: customVocab,
|
| 423 |
-
config: rescorerConfig
|
| 424 |
-
)
|
| 425 |
-
let rescoreResult = rescorer.rescore(transcript: tdtResult.text, spotResult: spotResult)
|
| 426 |
-
hypothesis = rescoreResult.text
|
| 427 |
-
} else {
|
| 428 |
-
hypothesis = tdtResult.text // Baseline: no CTC corrections
|
| 429 |
-
}
|
| 430 |
-
|
| 431 |
-
let processingTime = Date().timeIntervalSince(startTime)
|
| 432 |
-
|
| 433 |
-
// Normalize texts
|
| 434 |
-
let referenceNormalized = TextNormalizer.normalize(referenceRaw)
|
| 435 |
-
let hypothesisNormalized = TextNormalizer.normalize(hypothesis)
|
| 436 |
-
|
| 437 |
-
// Keyword sets for precision/recall
|
| 438 |
-
let referenceKeywords = keywordsInText(referenceNormalized, dictionaryWords: dictionaryWords)
|
| 439 |
-
let predictedKeywords = keywordsInText(hypothesisNormalized, dictionaryWords: dictionaryWords)
|
| 440 |
-
let truePositives = referenceKeywords.intersection(predictedKeywords)
|
| 441 |
-
let falsePositives = predictedKeywords.subtracting(referenceKeywords)
|
| 442 |
-
let falseNegatives = referenceKeywords.subtracting(predictedKeywords)
|
| 443 |
-
let keywordPrecision = predictedKeywords.isEmpty ? 0 : Double(truePositives.count) / Double(predictedKeywords.count)
|
| 444 |
-
let keywordRecall = referenceKeywords.isEmpty ? 0 : Double(truePositives.count) / Double(referenceKeywords.count)
|
| 445 |
-
let keywordFscore =
|
| 446 |
-
(keywordPrecision + keywordRecall) > 0
|
| 447 |
-
? 2 * keywordPrecision * keywordRecall / (keywordPrecision + keywordRecall)
|
| 448 |
-
: 0
|
| 449 |
-
|
| 450 |
-
let referenceWords = referenceNormalized.components(separatedBy: CharacterSet.whitespacesAndNewlines).filter {
|
| 451 |
-
!$0.isEmpty
|
| 452 |
-
}
|
| 453 |
-
let hypothesisWords = hypothesisNormalized.components(separatedBy: CharacterSet.whitespacesAndNewlines).filter {
|
| 454 |
-
!$0.isEmpty
|
| 455 |
-
}
|
| 456 |
-
|
| 457 |
-
// Calculate WER
|
| 458 |
-
let wer: Double
|
| 459 |
-
if referenceWords.isEmpty {
|
| 460 |
-
wer = hypothesisWords.isEmpty ? 0.0 : 1.0
|
| 461 |
-
} else {
|
| 462 |
-
wer = calculateWER(reference: referenceWords, hypothesis: hypothesisWords)
|
| 463 |
-
}
|
| 464 |
-
|
| 465 |
-
// Count dictionary detections (debug only)
|
| 466 |
-
let minCtcScore: Float = -15.0 // Permissive threshold for detection
|
| 467 |
-
var detectionDetails: [[String: Any]] = []
|
| 468 |
-
var ctcFoundWords: Set<String> = []
|
| 469 |
-
|
| 470 |
-
// 1. CTC detections
|
| 471 |
-
for detection in spotResult.detections {
|
| 472 |
-
let inRef = referenceKeywords.contains(detection.term.text.lowercased())
|
| 473 |
-
let detail: [String: Any] = [
|
| 474 |
-
"word": detection.term.text,
|
| 475 |
-
"score": round(Double(detection.score) * 100) / 100,
|
| 476 |
-
"startTime": round(detection.startTime * 100) / 100,
|
| 477 |
-
"endTime": round(detection.endTime * 100) / 100,
|
| 478 |
-
"source": "ctc",
|
| 479 |
-
"inReference": inRef,
|
| 480 |
-
]
|
| 481 |
-
detectionDetails.append(detail)
|
| 482 |
-
|
| 483 |
-
if detection.score >= minCtcScore { // Use >= to include edge cases
|
| 484 |
-
ctcFoundWords.insert(detection.term.text.lowercased())
|
| 485 |
-
}
|
| 486 |
-
}
|
| 487 |
-
|
| 488 |
-
// 2. Fallback: check hypothesis for dictionary words not found by CTC
|
| 489 |
-
let hypothesisLower = hypothesis.lowercased()
|
| 490 |
-
for word in dictionaryWords {
|
| 491 |
-
let wordLower = word.lowercased()
|
| 492 |
-
if !ctcFoundWords.contains(wordLower) {
|
| 493 |
-
// Check if word appears as whole word in hypothesis (avoid substring false positives)
|
| 494 |
-
let pattern = "\\b\(NSRegularExpression.escapedPattern(for: wordLower))\\b"
|
| 495 |
-
if let regex = try? NSRegularExpression(pattern: pattern, options: []),
|
| 496 |
-
regex.firstMatch(
|
| 497 |
-
in: hypothesisLower, options: [],
|
| 498 |
-
range: NSRange(hypothesisLower.startIndex..., in: hypothesisLower)) != nil
|
| 499 |
-
{
|
| 500 |
-
ctcFoundWords.insert(wordLower)
|
| 501 |
-
let inRef = referenceKeywords.contains(wordLower)
|
| 502 |
-
let detail: [String: Any] = [
|
| 503 |
-
"word": word,
|
| 504 |
-
"score": 0.0,
|
| 505 |
-
"startTime": 0.0,
|
| 506 |
-
"endTime": 0.0,
|
| 507 |
-
"source": "hypothesis",
|
| 508 |
-
"inReference": inRef,
|
| 509 |
-
]
|
| 510 |
-
detectionDetails.append(detail)
|
| 511 |
-
}
|
| 512 |
-
}
|
| 513 |
-
}
|
| 514 |
-
|
| 515 |
-
let result: [String: Any] = [
|
| 516 |
-
"fileId": fileId,
|
| 517 |
-
"reference": referenceNormalized,
|
| 518 |
-
"hypothesis": hypothesisNormalized,
|
| 519 |
-
"wer": round(wer * 10000) / 100,
|
| 520 |
-
"dictFound": predictedKeywords.count,
|
| 521 |
-
"dictTotal": referenceKeywords.count,
|
| 522 |
-
"keywordPredicted": predictedKeywords.count,
|
| 523 |
-
"keywordReference": referenceKeywords.count,
|
| 524 |
-
"keywordTruePositives": truePositives.count,
|
| 525 |
-
"keywordFalsePositives": falsePositives.count,
|
| 526 |
-
"keywordFalseNegatives": falseNegatives.count,
|
| 527 |
-
"keywordPrecision": round(keywordPrecision * 1000) / 1000,
|
| 528 |
-
"keywordRecall": round(keywordRecall * 1000) / 1000,
|
| 529 |
-
"keywordFscore": round(keywordFscore * 1000) / 1000,
|
| 530 |
-
"audioLength": round(audioLength * 100) / 100,
|
| 531 |
-
"processingTime": round(processingTime * 1000) / 1000,
|
| 532 |
-
"ctcDetections": detectionDetails,
|
| 533 |
-
]
|
| 534 |
-
return result
|
| 535 |
-
}
|
| 536 |
-
|
| 537 |
-
/// Simple tokenization using vocabulary lookup
|
| 538 |
-
private static func tokenize(_ text: String, vocabulary: [Int: String]) -> [Int] {
|
| 539 |
-
// Build reverse vocabulary (token -> id)
|
| 540 |
-
var tokenToId: [String: Int] = [:]
|
| 541 |
-
for (id, token) in vocabulary {
|
| 542 |
-
tokenToId[token] = id
|
| 543 |
-
}
|
| 544 |
-
|
| 545 |
-
let normalizedText = text.lowercased()
|
| 546 |
-
var result: [Int] = []
|
| 547 |
-
var position = normalizedText.startIndex
|
| 548 |
-
var isWordStart = true
|
| 549 |
-
|
| 550 |
-
while position < normalizedText.endIndex {
|
| 551 |
-
var matched = false
|
| 552 |
-
let remaining = normalizedText.distance(from: position, to: normalizedText.endIndex)
|
| 553 |
-
var matchLength = min(20, remaining)
|
| 554 |
-
|
| 555 |
-
while matchLength > 0 {
|
| 556 |
-
let endPos = normalizedText.index(position, offsetBy: matchLength)
|
| 557 |
-
let substring = String(normalizedText[position..<endPos])
|
| 558 |
-
|
| 559 |
-
// Try with SentencePiece prefix for word start
|
| 560 |
-
let withPrefix = isWordStart ? "β" + substring : substring
|
| 561 |
-
|
| 562 |
-
if let tokenId = tokenToId[withPrefix] {
|
| 563 |
-
result.append(tokenId)
|
| 564 |
-
position = endPos
|
| 565 |
-
isWordStart = false
|
| 566 |
-
matched = true
|
| 567 |
-
break
|
| 568 |
-
} else if let tokenId = tokenToId[substring] {
|
| 569 |
-
result.append(tokenId)
|
| 570 |
-
position = endPos
|
| 571 |
-
isWordStart = false
|
| 572 |
-
matched = true
|
| 573 |
-
break
|
| 574 |
-
}
|
| 575 |
-
|
| 576 |
-
matchLength -= 1
|
| 577 |
-
}
|
| 578 |
-
|
| 579 |
-
if !matched {
|
| 580 |
-
let char = normalizedText[position]
|
| 581 |
-
if char == " " {
|
| 582 |
-
isWordStart = true
|
| 583 |
-
position = normalizedText.index(after: position)
|
| 584 |
-
} else {
|
| 585 |
-
// Unknown character - skip
|
| 586 |
-
position = normalizedText.index(after: position)
|
| 587 |
-
isWordStart = false
|
| 588 |
-
}
|
| 589 |
-
}
|
| 590 |
-
}
|
| 591 |
-
|
| 592 |
-
return result
|
| 593 |
-
}
|
| 594 |
-
|
| 595 |
-
/// Apply CTC keyword corrections to a TDT transcription.
///
/// Only detections with `score >= minScore` are considered. Two passes run in order:
/// 1. Fuzzy matching: a transcript word (or a run of consecutive words, for multi-word
///    keywords) that `isSimilar` accepts is replaced by the keyword.
/// 2. Context pattern: for keywords not handled in pass 1 that look like proper nouns,
///    the word captured by the pattern "this is X" is replaced by the keyword.
///
/// - Parameters:
///   - tdtResult: Transcription result whose `text` is corrected.
///   - detections: Keyword detections to apply.
///   - minScore: Minimum detection score for a detection to be used.
/// - Returns: The corrected text, or `tdtResult.text` unchanged when no detection qualifies.
private static func applyKeywordCorrections(
    tdtResult: ASRResult,
    detections: [CtcKeywordSpotter.KeywordDetection],
    minScore: Float
) -> String {
    // Filter detections by score
    let validDetections = detections.filter { $0.score >= minScore }
    guard !validDetections.isEmpty else {
        return tdtResult.text
    }

    var text = tdtResult.text
    var usedDetections: Set<String> = []

    // PASS 1: Fuzzy matching for phonetically similar words
    for detection in validDetections {
        let keyword = detection.term.text
        let keywordLower = keyword.lowercased()
        let keywordParts = keywordLower.components(separatedBy: " ").filter { !$0.isEmpty }

        // Re-split on every detection because earlier replacements change `text`.
        let words = text.components(separatedBy: .whitespacesAndNewlines).filter { !$0.isEmpty }

        if keywordParts.count > 1 {
            // Multi-word keyword: slide a window of keywordParts.count words over the text.
            // A transcript shorter than the keyword has no window; without this guard the
            // range below would have upperBound < lowerBound and trap.
            guard words.count >= keywordParts.count else { continue }

            for start in 0...(words.count - keywordParts.count) {
                let window = words[start..<(start + keywordParts.count)]
                let allMatch = zip(window, keywordParts).allSatisfy { word, part in
                    isSimilar(word.trimmingCharacters(in: .punctuationCharacters).lowercased(), part)
                }

                if allMatch {
                    let originalPhrase = window.joined(separator: " ")
                    let replacement = matchCase(keyword, to: words[start])
                    text = text.replacingOccurrences(of: originalPhrase, with: replacement)
                    usedDetections.insert(keyword)
                    break
                }
            }
        } else {
            // Single word keyword
            for word in words {
                let wordClean = word.trimmingCharacters(in: .punctuationCharacters).lowercased()
                guard !wordClean.isEmpty else { continue }

                // Similar but not already identical to the keyword.
                if isSimilar(wordClean, keywordLower) && wordClean != keywordLower {
                    let replacement = matchCase(keyword, to: word)
                    text = text.replacingOccurrences(of: word, with: replacement)
                    usedDetections.insert(keyword)
                    break
                }
            }
        }
    }

    // PASS 2: Context pattern matching - specifically for "this is X" pattern.
    // The pattern is constant, so compile it once instead of once per detection.
    let thisIsRegex = try? NSRegularExpression(pattern: "this is ([A-Z][a-z]+)", options: [])

    for detection in validDetections {
        let keyword = detection.term.text
        guard !usedDetections.contains(keyword) else { continue }

        let keywordLower = keyword.lowercased()

        // Skip if keyword already exists in text (case-insensitive)
        if text.lowercased().contains(keywordLower) {
            usedDetections.insert(keyword)  // Mark as handled
            continue
        }

        // Check if keyword looks like a proper noun (starts with uppercase)
        let isProperNoun =
            keyword.first?.isUppercase == true
            && keyword.count >= 3
            && !stopWords.contains(keywordLower)
        guard isProperNoun else { continue }

        // Look for the first "this is X" occurrence in the current text.
        guard
            let regex = thisIsRegex,
            let match = regex.firstMatch(
                in: text, options: [], range: NSRange(text.startIndex..., in: text)),
            match.numberOfRanges > 1,
            let captureRange = Range(match.range(at: 1), in: text)
        else { continue }

        let capturedWord = String(text[captureRange])
        let capturedLower = capturedWord.lowercased()

        // Skip if captured word is already a detected keyword
        let isOtherKeyword = validDetections.contains { det in
            det.term.text.lowercased() == capturedLower
        }
        guard !isOtherKeyword, !stopWords.contains(capturedLower) else { continue }

        // Similar length check
        if abs(capturedWord.count - keyword.count) <= 3 {
            text = text.replacingOccurrences(of: capturedWord, with: keyword)
            usedDetections.insert(keyword)
        }
    }

    return text
}
|
| 712 |
-
|
| 713 |
-
/// Build word timings by merging subword tokens into words.
///
/// A token that starts with the SentencePiece word-start marker (U+2581) begins a new
/// word; so does the first usable token. Empty tokens, `<blank>` and `<pad>` are ignored.
///
/// - Parameter tokenTimings: Token-level timings in order.
/// - Returns: One entry per word, with the start time of its first token and the end
///   time of its last token.
private static func buildWordTimings(
    from tokenTimings: [TokenTiming]
) -> [(word: String, startTime: Double, endTime: Double)] {
    // The escape form keeps the literal immune to source-encoding damage.
    let wordStartMarker = "\u{2581}"

    var wordTimings: [(word: String, startTime: Double, endTime: Double)] = []
    var currentWord = ""
    var wordStart: Double = 0
    var wordEnd: Double = 0

    for timing in tokenTimings {
        let token = timing.token

        // Skip special tokens
        if token.isEmpty || token == "<blank>" || token == "<pad>" {
            continue
        }

        let hasMarker = token.hasPrefix(wordStartMarker)

        if hasMarker || currentWord.isEmpty {
            // Flush the word collected so far before starting the next one.
            if !currentWord.isEmpty {
                wordTimings.append((word: currentWord, startTime: wordStart, endTime: wordEnd))
            }
            currentWord = hasMarker ? String(token.dropFirst()) : token
            wordStart = timing.startTime
        } else {
            // Continuation piece of the current word.
            currentWord += token
        }
        wordEnd = timing.endTime
    }

    // Save final word
    if !currentWord.isEmpty {
        wordTimings.append((word: currentWord, startTime: wordStart, endTime: wordEnd))
    }

    return wordTimings
}
|
| 755 |
-
|
| 756 |
-
/// Common English words that keyword matching must never replace.
/// All entries are lowercase; the set is assembled once from per-category lists.
private static let stopWords: Set<String> = {
    let groups: [[String]] = [
        // Pronouns
        [
            "i", "you", "he", "she", "it", "we", "they", "me", "him", "her", "us", "them",
            "my", "your", "his", "its", "our", "their", "mine", "yours", "hers", "ours", "theirs",
            "this", "that", "these", "those", "who", "whom", "what", "which", "whose",
        ],
        // Common verbs
        [
            "is", "are", "was", "were", "be", "been", "being", "am",
            "have", "has", "had", "having", "do", "does", "did", "doing", "done",
            "will", "would", "shall", "should", "may", "might", "must", "can", "could",
            "get", "got", "getting", "go", "goes", "went", "going", "gone",
            "come", "came", "coming", "see", "saw", "seen", "know", "knew", "known",
            "think", "thought", "make", "made", "take", "took", "taken", "give", "gave", "given",
            "say", "said", "tell", "told", "ask", "asked", "use", "used", "want", "wanted",
            "need", "needed", "try", "tried", "let", "put", "keep", "kept", "look", "looked",
        ],
        // Articles and determiners
        [
            "a", "an", "the", "some", "any", "no", "every", "each", "all", "both", "few", "many",
            "much", "more", "most", "other", "another", "such",
        ],
        // Prepositions
        [
            "in", "on", "at", "to", "for", "of", "with", "by", "from", "up", "down", "out",
            "about", "into", "over", "after", "before", "between", "under", "through", "during",
        ],
        // Conjunctions
        [
            "and", "or", "but", "so", "yet", "nor", "if", "then", "than", "because", "while",
            "although", "unless", "since", "when", "where", "as",
        ],
        // Adverbs
        [
            "not", "very", "just", "also", "only", "even", "still", "already", "always", "never",
            "often", "sometimes", "usually", "really", "well", "now", "here", "there", "how", "why",
        ],
        // Common words
        [
            "yes", "no", "okay", "ok", "thank", "thanks", "please", "sorry", "hello", "hi", "bye",
            "good", "great", "bad", "new", "old", "first", "last", "long", "short", "big", "small",
            "high", "low", "right", "left", "next", "back", "same", "different", "own", "able",
            "way", "thing", "things", "time", "times", "year", "years", "day", "days", "week", "weeks",
            "part", "place", "case", "point", "fact", "end", "kind", "lot", "set",
        ],
        // Numbers
        [
            "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
            "hundred", "thousand", "million", "billion",
        ],
    ]
    return Set(groups.joined())
}()
|
| 793 |
-
|
| 794 |
-
/// Decide whether two words are close enough to be treated as the same term.
///
/// Rules, applied in order:
/// - either word being a stop word -> not similar;
/// - the shorter word having fewer than 3 characters -> not similar;
/// - a length gap larger than `max(3, longer / 2)` -> not similar;
/// - one word containing the other -> similar;
/// - otherwise the edit distance must not exceed `max(2, 40% of the longer length)`,
///   with one extra edit allowed when the words share a prefix or suffix of 2+ characters.
private static func isSimilar(_ a: String, _ b: String) -> Bool {
    // Stop words are too common to be candidates for replacement.
    guard !stopWords.contains(a), !stopWords.contains(b) else { return false }

    let lengthA = a.count
    let lengthB = b.count
    let longer = max(lengthA, lengthB)
    let shorter = min(lengthA, lengthB)
    guard longer > 0, shorter >= 3 else { return false }

    // Longer words tolerate a larger length gap.
    guard abs(lengthA - lengthB) <= max(3, longer / 2) else { return false }

    // Containment in either direction counts as a match outright.
    if a.contains(b) || b.contains(a) {
        return true
    }

    // Base budget: 40% of the longer length, never below 2 edits.
    let baseBudget = max(2, Int(Double(longer) * 0.4))
    let sharesAffix = commonPrefixLength(a, b) >= 2 || commonSuffixLength(a, b) >= 2
    let allowedEdits = sharesAffix ? baseBudget + 1 : baseBudget

    return editDistance(a, b) <= allowedEdits
}
|
| 829 |
-
|
| 830 |
-
/// Count how many leading characters `a` and `b` have in common.
private static func commonPrefixLength(_ a: String, _ b: String) -> Int {
    var shared = 0
    // zip stops at the shorter string, so no explicit bounds are needed.
    for (left, right) in zip(a, b) {
        guard left == right else { break }
        shared += 1
    }
    return shared
}
|
| 844 |
-
|
| 845 |
-
/// Count how many trailing characters `a` and `b` have in common.
private static func commonSuffixLength(_ a: String, _ b: String) -> Int {
    var shared = 0
    // Walk both strings from the end; zip stops at the shorter one.
    for (left, right) in zip(a.reversed(), b.reversed()) {
        guard left == right else { break }
        shared += 1
    }
    return shared
}
|
| 859 |
-
|
| 860 |
-
/// Levenshtein distance between two strings, counted in characters.
/// Insertions, deletions and substitutions each cost 1.
private static func editDistance(_ a: String, _ b: String) -> Int {
    let source = Array(a)
    let target = Array(b)

    guard !source.isEmpty else { return target.count }
    guard !target.isEmpty else { return source.count }

    // Only the previous row of the DP table is needed to fill in the current one.
    var previousRow = Array(0...target.count)
    var currentRow = [Int](repeating: 0, count: target.count + 1)

    for (row, sourceChar) in source.enumerated() {
        currentRow[0] = row + 1
        for (col, targetChar) in target.enumerated() {
            if sourceChar == targetChar {
                currentRow[col + 1] = previousRow[col]
            } else {
                let substitution = previousRow[col]
                let deletion = previousRow[col + 1]
                let insertion = currentRow[col]
                currentRow[col + 1] = 1 + min(substitution, deletion, insertion)
            }
        }
        swap(&previousRow, &currentRow)
    }

    return previousRow[target.count]
}
|
| 887 |
-
|
| 888 |
-
/// Capitalize the first letter of `keyword` when `original` starts with an uppercase
/// letter (after stripping surrounding punctuation); otherwise return `keyword` as is.
private static func matchCase(_ keyword: String, to original: String) -> String {
    let stripped = original.trimmingCharacters(in: .punctuationCharacters)

    guard let leading = stripped.first, leading.isUppercase else {
        return keyword
    }

    // Uppercase only the first character; the rest of the keyword is left untouched.
    return keyword.prefix(1).uppercased() + keyword.dropFirst()
}
|
| 899 |
-
|
| 900 |
-
/// Word error rate: word-level edit distance divided by the reference length.
///
/// - Parameters:
///   - reference: Reference words.
///   - hypothesis: Hypothesis words.
/// - Returns: A fraction (not a percentage). It can exceed 1.0 when the hypothesis has
///   many insertions. An empty reference yields 0.0 for an empty hypothesis and 1.0 otherwise.
private static func calculateWER(reference: [String], hypothesis: [String]) -> Double {
    if reference.isEmpty {
        return hypothesis.isEmpty ? 0.0 : 1.0
    }
    // With an empty hypothesis every reference word is a deletion. Returning here also
    // avoids forming the invalid range `1...0` below, which traps at runtime.
    if hypothesis.isEmpty {
        return 1.0
    }

    let m = reference.count
    let n = hypothesis.count

    // Only the previous row of the DP table is needed to fill in the current one.
    var previousRow = Array(0...n)
    var currentRow = [Int](repeating: 0, count: n + 1)

    for i in 1...m {
        currentRow[0] = i
        for j in 1...n {
            if reference[i - 1] == hypothesis[j - 1] {
                currentRow[j] = previousRow[j - 1]
            } else {
                currentRow[j] = min(previousRow[j - 1], min(previousRow[j], currentRow[j - 1])) + 1
            }
        }
        swap(&previousRow, &currentRow)
    }

    return Double(previousRow[n]) / Double(m)
}
|
| 924 |
-
|
| 925 |
-
/// Print the help text for the `ctc-earnings-benchmark` command to standard output.
private static func printUsage() {
    let usage = """
        CTC Earnings Benchmark (TDT + CTC keyword spotting)

        Usage: fluidaudio ctc-earnings-benchmark [options]

        Options:
        --data-dir <path> Path to earnings test dataset (auto-detected if downloaded)
        --ctc-model <path> Path to CTC model directory (auto-detected if in standard location)
        --max-files <n> Maximum number of files to process
        --output, -o <path> Output JSON file (default: ctc_earnings_benchmark.json)
        --auto-download Download earnings22-kws dataset if not found
        --keyword-mode <mode> Keyword mode: chunk or file (default: chunk)

        Default locations:
        Dataset: ~/Library/Application Support/FluidAudio/earnings22-kws/test-dataset/
        CTC Model: ~/Library/Application Support/FluidAudio/Models/parakeet-ctc-110m-coreml/

        Setup:
        1. Download dataset: fluidaudio download --dataset earnings22-kws
        2. Place CTC model in standard location
        3. Run: fluidaudio ctc-earnings-benchmark

        Examples:
        # Run with auto-detected paths
        fluidaudio ctc-earnings-benchmark

        # Run with auto-download
        fluidaudio ctc-earnings-benchmark --auto-download

        # Run with explicit paths
        fluidaudio ctc-earnings-benchmark \\
        --data-dir /path/to/test-dataset \\
        --ctc-model /path/to/parakeet-ctc-110m-coreml \\
        --max-files 100
        """
    print(usage)
}
|
| 963 |
-
|
| 964 |
-
/// Map a `--keyword-mode` argument to a `KeywordMode`, ignoring case.
/// Accepts "chunk" / "chunk-keywords" and "file" / "file-keywords"; anything else yields nil.
private static func parseKeywordMode(_ value: String) -> KeywordMode? {
    let aliases: [String: KeywordMode] = [
        "chunk": .chunk,
        "chunk-keywords": .chunk,
        "file": .file,
        "file-keywords": .file,
    ]
    return aliases[value.lowercased()]
}
|
| 974 |
-
|
| 975 |
-
/// Return the part of `fileId` before the first "_chunk", or `fileId` itself when
/// it contains no "_chunk".
private static func parentId(from fileId: String) -> String {
    if let chunkMarker = fileId.range(of: "_chunk") {
        return String(fileId[..<chunkMarker.lowerBound])
    }
    return fileId
}
|
| 981 |
-
|
| 982 |
-
/// Build a map from parent file id to the sorted union of dictionary words of all its chunks.
///
/// Only does work in `.file` keyword mode; any other mode returns an empty map.
/// Reads every `*.dictionary.txt` file directly inside `dataDir`.
///
/// - Throws: Errors from listing `dataDir` or from reading a dictionary file.
private static func buildKeywordIndex(dataDir: URL, keywordMode: KeywordMode) throws -> [String: [String]] {
    if keywordMode != .file {
        return [:]
    }

    let dictionarySuffix = ".dictionary.txt"
    let entries = try FileManager.default.contentsOfDirectory(at: dataDir, includingPropertiesForKeys: nil)

    var wordsByParent: [String: Set<String>] = [:]
    for entry in entries where entry.lastPathComponent.hasSuffix(dictionarySuffix) {
        // File name minus the suffix is the chunk-level id; group it under its parent id.
        let chunkId = String(entry.lastPathComponent.dropLast(dictionarySuffix.count))
        let parent = parentId(from: chunkId)
        let words = try loadDictionaryWords(from: entry)
        wordsByParent[parent, default: []].formUnion(words)
    }

    // Sort so each parent's word list has a stable order.
    return wordsByParent.mapValues { $0.sorted() }
}
|
| 1005 |
-
|
| 1006 |
-
/// Resolve the dictionary words to use for one file.
///
/// In `.file` mode the words indexed under the file's parent id are returned when present.
/// In `.chunk` mode, or when the index has no entry, the words are read from `dictionaryFile`.
///
/// - Throws: Errors from reading `dictionaryFile`.
private static func loadDictionaryWords(
    fileId: String,
    dictionaryFile: URL,
    keywordMode: KeywordMode,
    keywordIndex: [String: [String]]
) throws -> [String] {
    guard keywordMode == .file, let indexedWords = keywordIndex[parentId(from: fileId)] else {
        return try loadDictionaryWords(from: dictionaryFile)
    }
    return indexedWords
}
|
| 1023 |
-
|
| 1024 |
-
/// Read a UTF-8 dictionary file and return its non-empty lines with surrounding
/// whitespace trimmed, in file order.
///
/// - Throws: Errors from reading the file at `url`.
private static func loadDictionaryWords(from url: URL) throws -> [String] {
    let contents = try String(contentsOf: url, encoding: .utf8)

    var words: [String] = []
    for line in contents.components(separatedBy: .newlines) {
        let trimmed = line.trimmingCharacters(in: .whitespacesAndNewlines)
        if !trimmed.isEmpty {
            words.append(trimmed)
        }
    }
    return words
}
|
| 1031 |
-
|
| 1032 |
-
/// Return the lowercased dictionary words that occur in `text` as whole words.
///
/// Matching is case-insensitive (both sides are lowercased) and uses `\b` word
/// boundaries around the regex-escaped word.
private static func keywordsInText(_ text: String, dictionaryWords: [String]) -> Set<String> {
    let haystack = text.lowercased()
    let searchRange = NSRange(haystack.startIndex..., in: haystack)

    return dictionaryWords.reduce(into: Set<String>()) { found, word in
        let needle = word.lowercased()
        let pattern = "\\b\(NSRegularExpression.escapedPattern(for: needle))\\b"
        // A word whose pattern fails to compile is skipped.
        guard let regex = try? NSRegularExpression(pattern: pattern, options: []) else { return }
        if regex.firstMatch(in: haystack, options: [], range: searchRange) != nil {
            found.insert(needle)
        }
    }
}
|
| 1047 |
-
}
|
| 1048 |
-
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cli/HybridEarningsBenchmark.swift
DELETED
|
@@ -1,554 +0,0 @@
|
|
| 1 |
-
#if os(macOS)
|
| 2 |
-
import AVFoundation
|
| 3 |
-
import FluidAudio
|
| 4 |
-
import Foundation
|
| 5 |
-
|
| 6 |
-
/// Earnings22 benchmark using ONLY the Hybrid 110M model (single encoder).
|
| 7 |
-
/// CTC head provides both transcription AND keyword spotting from the same encoder.
|
| 8 |
-
public enum HybridEarningsBenchmark {
|
| 9 |
-
|
| 10 |
-
private enum KeywordMode: String {
|
| 11 |
-
case chunk
|
| 12 |
-
case file
|
| 13 |
-
}
|
| 14 |
-
|
| 15 |
-
public static func runCLI(arguments: [String]) async {
|
| 16 |
-
if arguments.contains("--help") || arguments.contains("-h") {
|
| 17 |
-
printUsage()
|
| 18 |
-
return
|
| 19 |
-
}
|
| 20 |
-
|
| 21 |
-
// Parse arguments
|
| 22 |
-
var outputFile = "hybrid_earnings_benchmark.json"
|
| 23 |
-
var maxFiles: Int? = nil
|
| 24 |
-
var decodingMode: HybridDecodingMode = .tdt
|
| 25 |
-
var useRescoring = false
|
| 26 |
-
var keywordMode: KeywordMode = .chunk
|
| 27 |
-
|
| 28 |
-
var i = 0
|
| 29 |
-
while i < arguments.count {
|
| 30 |
-
switch arguments[i] {
|
| 31 |
-
case "--output", "-o":
|
| 32 |
-
if i + 1 < arguments.count {
|
| 33 |
-
outputFile = arguments[i + 1]
|
| 34 |
-
i += 1
|
| 35 |
-
}
|
| 36 |
-
case "--max-files":
|
| 37 |
-
if i + 1 < arguments.count {
|
| 38 |
-
maxFiles = Int(arguments[i + 1])
|
| 39 |
-
i += 1
|
| 40 |
-
}
|
| 41 |
-
case "--ctc":
|
| 42 |
-
decodingMode = .ctc
|
| 43 |
-
case "--tdt":
|
| 44 |
-
decodingMode = .tdt
|
| 45 |
-
case "--rescore":
|
| 46 |
-
useRescoring = true
|
| 47 |
-
case "--keyword-mode":
|
| 48 |
-
if i + 1 < arguments.count, let mode = parseKeywordMode(arguments[i + 1]) {
|
| 49 |
-
keywordMode = mode
|
| 50 |
-
i += 1
|
| 51 |
-
}
|
| 52 |
-
default:
|
| 53 |
-
break
|
| 54 |
-
}
|
| 55 |
-
i += 1
|
| 56 |
-
}
|
| 57 |
-
|
| 58 |
-
let dataDir = DatasetDownloader.getEarnings22Directory().appendingPathComponent("test-dataset")
|
| 59 |
-
guard FileManager.default.fileExists(atPath: dataDir.path) else {
|
| 60 |
-
print("ERROR: Earnings dataset not found at \(dataDir.path)")
|
| 61 |
-
print("Download with: fluidaudio download --dataset earnings22-kws")
|
| 62 |
-
return
|
| 63 |
-
}
|
| 64 |
-
|
| 65 |
-
let modeStr = decodingMode == .ctc ? "CTC" : "TDT"
|
| 66 |
-
let rescoringStr = useRescoring ? " + Rescoring" : ""
|
| 67 |
-
print("Hybrid 110M Earnings Benchmark (Decoding: \(modeStr)\(rescoringStr))")
|
| 68 |
-
print(" Output file: \(outputFile)")
|
| 69 |
-
print(" Decoding mode: \(modeStr)")
|
| 70 |
-
print(" Rescoring: \(useRescoring ? "enabled" : "disabled")")
|
| 71 |
-
print(" Keyword mode: \(keywordMode.rawValue)")
|
| 72 |
-
|
| 73 |
-
do {
|
| 74 |
-
// Load Hybrid 110M model (single encoder with CTC head)
|
| 75 |
-
print("Loading Hybrid 110M model...")
|
| 76 |
-
let hybridModels = try await HybridAsrModels.downloadAndLoad()
|
| 77 |
-
let hybridManager = HybridAsrManager(models: hybridModels, decodingMode: decodingMode)
|
| 78 |
-
let spotter = HybridKeywordSpotter(vocabulary: hybridModels.vocabulary, blankId: hybridModels.blankId)
|
| 79 |
-
print(" Vocab size: \(hybridModels.vocabSize)")
|
| 80 |
-
|
| 81 |
-
// Collect test files
|
| 82 |
-
let fileIds = try collectFileIds(from: dataDir, maxFiles: maxFiles)
|
| 83 |
-
let keywordIndex = try buildKeywordIndex(dataDir: dataDir, keywordMode: keywordMode)
|
| 84 |
-
|
| 85 |
-
if fileIds.isEmpty {
|
| 86 |
-
print("ERROR: No test files found")
|
| 87 |
-
return
|
| 88 |
-
}
|
| 89 |
-
|
| 90 |
-
print("Processing \(fileIds.count) test files...")
|
| 91 |
-
|
| 92 |
-
var results: [[String: Any]] = []
|
| 93 |
-
var totalWer = 0.0
|
| 94 |
-
var totalKeywordReference = 0
|
| 95 |
-
var totalKeywordPredicted = 0
|
| 96 |
-
var totalKeywordTruePositives = 0
|
| 97 |
-
var totalKeywordFalsePositives = 0
|
| 98 |
-
var totalKeywordFalseNegatives = 0
|
| 99 |
-
var totalAudioDuration = 0.0
|
| 100 |
-
var totalProcessingTime = 0.0
|
| 101 |
-
|
| 102 |
-
for (index, fileId) in fileIds.enumerated() {
|
| 103 |
-
print("[\(index + 1)/\(fileIds.count)] \(fileId)")
|
| 104 |
-
|
| 105 |
-
if let result = try await processFile(
|
| 106 |
-
fileId: fileId,
|
| 107 |
-
dataDir: dataDir,
|
| 108 |
-
hybridManager: hybridManager,
|
| 109 |
-
spotter: spotter,
|
| 110 |
-
useRescoring: useRescoring,
|
| 111 |
-
keywordMode: keywordMode,
|
| 112 |
-
keywordIndex: keywordIndex
|
| 113 |
-
) {
|
| 114 |
-
results.append(result)
|
| 115 |
-
totalWer += result["wer"] as? Double ?? 0
|
| 116 |
-
totalKeywordReference += result["keywordReference"] as? Int ?? 0
|
| 117 |
-
totalKeywordPredicted += result["keywordPredicted"] as? Int ?? 0
|
| 118 |
-
totalKeywordTruePositives += result["keywordTruePositives"] as? Int ?? 0
|
| 119 |
-
totalKeywordFalsePositives += result["keywordFalsePositives"] as? Int ?? 0
|
| 120 |
-
totalKeywordFalseNegatives += result["keywordFalseNegatives"] as? Int ?? 0
|
| 121 |
-
totalAudioDuration += result["audioLength"] as? Double ?? 0
|
| 122 |
-
totalProcessingTime += result["processingTime"] as? Double ?? 0
|
| 123 |
-
|
| 124 |
-
let wer = result["wer"] as? Double ?? 0
|
| 125 |
-
let precision = result["keywordPrecision"] as? Double ?? 0
|
| 126 |
-
let recall = result["keywordRecall"] as? Double ?? 0
|
| 127 |
-
let fscore = result["keywordFscore"] as? Double ?? 0
|
| 128 |
-
print(
|
| 129 |
-
" WER: \(String(format: "%.1f", wer))%, " +
|
| 130 |
-
"KW P/R/F: \(String(format: "%.2f", precision))/" +
|
| 131 |
-
"\(String(format: "%.2f", recall))/" +
|
| 132 |
-
"\(String(format: "%.2f", fscore))"
|
| 133 |
-
)
|
| 134 |
-
}
|
| 135 |
-
}
|
| 136 |
-
|
| 137 |
-
// Calculate summary
|
| 138 |
-
let avgWer = results.isEmpty ? 0.0 : totalWer / Double(results.count)
|
| 139 |
-
let keywordPrecision =
|
| 140 |
-
totalKeywordPredicted > 0
|
| 141 |
-
? Double(totalKeywordTruePositives) / Double(totalKeywordPredicted)
|
| 142 |
-
: 0
|
| 143 |
-
let keywordRecall =
|
| 144 |
-
totalKeywordReference > 0
|
| 145 |
-
? Double(totalKeywordTruePositives) / Double(totalKeywordReference)
|
| 146 |
-
: 0
|
| 147 |
-
let keywordFscore =
|
| 148 |
-
(keywordPrecision + keywordRecall) > 0
|
| 149 |
-
? 2 * keywordPrecision * keywordRecall / (keywordPrecision + keywordRecall)
|
| 150 |
-
: 0
|
| 151 |
-
|
| 152 |
-
// Print summary
|
| 153 |
-
print("\n" + String(repeating: "=", count: 60))
|
| 154 |
-
print("HYBRID 110M BENCHMARK (\(modeStr)\(rescoringStr))")
|
| 155 |
-
print(String(repeating: "=", count: 60))
|
| 156 |
-
print("Model: parakeet-tdt-ctc-110m-hybrid")
|
| 157 |
-
print("Decoding: \(modeStr), Rescoring: \(useRescoring ? "yes" : "no")")
|
| 158 |
-
print("Total tests: \(results.count)")
|
| 159 |
-
print("Average WER: \(String(format: "%.2f", avgWer))%")
|
| 160 |
-
print(
|
| 161 |
-
"Keyword Precision/Recall/F1: " +
|
| 162 |
-
"\(String(format: "%.2f", keywordPrecision))/" +
|
| 163 |
-
"\(String(format: "%.2f", keywordRecall))/" +
|
| 164 |
-
"\(String(format: "%.2f", keywordFscore))"
|
| 165 |
-
)
|
| 166 |
-
print("Total audio: \(String(format: "%.1f", totalAudioDuration))s")
|
| 167 |
-
print("Total processing: \(String(format: "%.1f", totalProcessingTime))s")
|
| 168 |
-
if totalProcessingTime > 0 {
|
| 169 |
-
print("RTFx: \(String(format: "%.2f", totalAudioDuration / totalProcessingTime))x")
|
| 170 |
-
}
|
| 171 |
-
print(String(repeating: "=", count: 60))
|
| 172 |
-
|
| 173 |
-
// Sort results by WER descending (worst first)
|
| 174 |
-
let sortedResults = results.sorted { r1, r2 in
|
| 175 |
-
let wer1 = r1["wer"] as? Double ?? 0
|
| 176 |
-
let wer2 = r2["wer"] as? Double ?? 0
|
| 177 |
-
return wer1 > wer2
|
| 178 |
-
}
|
| 179 |
-
|
| 180 |
-
// Save to JSON
|
| 181 |
-
let summaryDict: [String: Any] = [
|
| 182 |
-
"totalTests": results.count,
|
| 183 |
-
"avgWer": round(avgWer * 100) / 100,
|
| 184 |
-
"keywordTruePositives": totalKeywordTruePositives,
|
| 185 |
-
"keywordFalsePositives": totalKeywordFalsePositives,
|
| 186 |
-
"keywordFalseNegatives": totalKeywordFalseNegatives,
|
| 187 |
-
"keywordPredicted": totalKeywordPredicted,
|
| 188 |
-
"keywordReference": totalKeywordReference,
|
| 189 |
-
"keywordPrecision": round(keywordPrecision * 1000) / 1000,
|
| 190 |
-
"keywordRecall": round(keywordRecall * 1000) / 1000,
|
| 191 |
-
"keywordFscore": round(keywordFscore * 1000) / 1000,
|
| 192 |
-
"totalAudioDuration": round(totalAudioDuration * 100) / 100,
|
| 193 |
-
"totalProcessingTime": round(totalProcessingTime * 100) / 100,
|
| 194 |
-
]
|
| 195 |
-
|
| 196 |
-
let output: [String: Any] = [
|
| 197 |
-
"model": "parakeet-tdt-ctc-110m-hybrid",
|
| 198 |
-
"approach": "single-encoder",
|
| 199 |
-
"decodingMode": modeStr,
|
| 200 |
-
"rescoring": useRescoring,
|
| 201 |
-
"keywordMode": keywordMode.rawValue,
|
| 202 |
-
"summary": summaryDict,
|
| 203 |
-
"results": sortedResults,
|
| 204 |
-
]
|
| 205 |
-
|
| 206 |
-
let jsonData = try JSONSerialization.data(withJSONObject: output, options: [.prettyPrinted, .sortedKeys])
|
| 207 |
-
try jsonData.write(to: URL(fileURLWithPath: outputFile))
|
| 208 |
-
print("\nResults written to: \(outputFile)")
|
| 209 |
-
|
| 210 |
-
} catch {
|
| 211 |
-
print("ERROR: \(error)")
|
| 212 |
-
}
|
| 213 |
-
}
|
| 214 |
-
|
| 215 |
-
/// Collects the IDs of the benchmark items found directly inside `dataDir`.
///
/// An item is any entry named `<id>.dictionary.txt` whose contents can be read and are
/// non-empty. IDs are returned in ascending order of the entries' full paths.
///
/// - Parameters:
///   - dataDir: Directory to scan (not recursive).
///   - maxFiles: Optional cap on the number of IDs returned. Zero or a negative value
///     yields an empty result instead of trapping in `prefix(_:)`.
/// - Returns: The matching file IDs (file name minus the `.dictionary.txt` suffix).
/// - Throws: Any error raised while listing `dataDir`.
private static func collectFileIds(from dataDir: URL, maxFiles: Int?) throws -> [String] {
    let suffix = ".dictionary.txt"
    // `prefix(_:)` requires a non-negative length, so clamp the limit up front.
    let limit = maxFiles.map { max(0, $0) }

    // List the directory even when the limit is zero so a bad path still throws.
    let entries = try FileManager.default
        .contentsOfDirectory(at: dataDir, includingPropertiesForKeys: nil)
        .sorted { $0.path < $1.path }

    var fileIds: [String] = []
    for url in entries {
        // Stop as soon as the cap is reached; later files never need to be read.
        if let limit = limit, fileIds.count >= limit {
            break
        }
        let name = url.lastPathComponent
        guard name.hasSuffix(suffix) else { continue }
        // Unreadable or empty dictionaries are skipped silently (best effort).
        guard let data = try? Data(contentsOf: url), !data.isEmpty else { continue }
        fileIds.append(String(name.dropLast(suffix.count)))
    }
    return fileIds
}
|
| 238 |
-
|
| 239 |
-
/// Transcribes one benchmark item with the hybrid manager and scores the result.
///
/// Reads `<fileId>.wav` and `<fileId>.dictionary.txt` (both required) and
/// `<fileId>.text.txt` (optional; a missing or unreadable reference is treated as empty)
/// from `dataDir`.
///
/// - Parameters:
///   - fileId: Base name shared by the item's files.
///   - dataDir: Directory holding the benchmark files.
///   - hybridManager: Manager whose `transcribeHybrid` supplies text, detections and timing.
///   - spotter: Not referenced by this function.
///   - useRescoring: When true `.default` rescoring config is passed, otherwise `nil`.
///   - keywordMode: Forwarded to `loadDictionaryWords`.
///   - keywordIndex: Forwarded to `loadDictionaryWords`.
/// - Returns: `nil` when the wav or dictionary file is missing, the PCM buffer cannot be
///   allocated, or the transcription is empty; otherwise a dictionary of per-file metrics.
/// - Throws: Errors from dictionary loading, audio reading, resampling or transcription.
private static func processFile(
    fileId: String,
    dataDir: URL,
    hybridManager: HybridAsrManager,
    spotter: HybridKeywordSpotter,
    useRescoring: Bool,
    keywordMode: KeywordMode,
    keywordIndex: [String: [String]]
) async throws -> [String: Any]? {
    let wavURL = dataDir.appendingPathComponent("\(fileId).wav")
    let dictionaryURL = dataDir.appendingPathComponent("\(fileId).dictionary.txt")
    let transcriptURL = dataDir.appendingPathComponent("\(fileId).text.txt")

    // Audio and dictionary are mandatory; the reference transcript is not.
    let fileManager = FileManager.default
    if !fileManager.fileExists(atPath: wavURL.path) || !fileManager.fileExists(atPath: dictionaryURL.path) {
        return nil
    }

    // Keywords for this item (per chunk or per parent file, depending on the mode).
    let dictionaryWords = try loadDictionaryWords(
        fileId: fileId,
        dictionaryFile: dictionaryURL,
        keywordMode: keywordMode,
        keywordIndex: keywordIndex
    )

    // Reference text; falls back to "" when the file cannot be read.
    let referenceRaw: String
    if let contents = try? String(contentsOf: transcriptURL, encoding: .utf8) {
        referenceRaw = contents.trimmingCharacters(in: .whitespacesAndNewlines)
    } else {
        referenceRaw = ""
    }

    // Decode the whole wav file into a single PCM buffer.
    let audioFile = try AVAudioFile(forReading: wavURL)
    let audioLength = Double(audioFile.length) / audioFile.processingFormat.sampleRate
    let pcmFormat = audioFile.processingFormat
    let capacity = AVAudioFrameCount(audioFile.length)
    guard let pcmBuffer = AVAudioPCMBuffer(pcmFormat: pcmFormat, frameCapacity: capacity) else {
        return nil
    }
    try audioFile.read(into: pcmBuffer)

    // Resample for the model (the original comment states 16 kHz).
    let samples = try AudioConverter().resampleBuffer(pcmBuffer)

    // One vocabulary term per dictionary word, with no weights, aliases or token IDs.
    let vocabTerms: [CustomVocabularyTerm] = dictionaryWords.map { word in
        CustomVocabularyTerm(
            text: word,
            weight: nil,
            aliases: nil,
            tokenIds: nil,
            ctcTokenIds: nil
        )
    }
    let customVocab = CustomVocabularyContext(terms: vocabTerms)

    // Transcription plus keyword detection in one call.
    let rescorerConfig: HybridTextRescorer.Config? = useRescoring ? .default : nil
    let hybridResult = try await hybridManager.transcribeHybrid(
        audioSamples: samples,
        customVocabulary: customVocab,
        rescorerConfig: rescorerConfig
    )

    guard !hybridResult.text.isEmpty else {
        print(" SKIPPED: Empty transcription")
        return nil
    }

    let detections = hybridResult.keywordDetections
    let processingTime = hybridResult.processingTime
    let hypothesis = hybridResult.text

    let referenceNormalized = TextNormalizer.normalize(referenceRaw)
    let hypothesisNormalized = TextNormalizer.normalize(hypothesis)

    // Keyword-level precision / recall / F-score over the dictionary words.
    let referenceKeywords = keywordsInText(referenceNormalized, dictionaryWords: dictionaryWords)
    let predictedKeywords = keywordsInText(hypothesisNormalized, dictionaryWords: dictionaryWords)
    let truePositives = referenceKeywords.intersection(predictedKeywords)
    let falsePositives = predictedKeywords.subtracting(referenceKeywords)
    let falseNegatives = referenceKeywords.subtracting(predictedKeywords)

    var keywordPrecision = 0.0
    if !predictedKeywords.isEmpty {
        keywordPrecision = Double(truePositives.count) / Double(predictedKeywords.count)
    }
    var keywordRecall = 0.0
    if !referenceKeywords.isEmpty {
        keywordRecall = Double(truePositives.count) / Double(referenceKeywords.count)
    }
    var keywordFscore = 0.0
    if (keywordPrecision + keywordRecall) > 0 {
        keywordFscore = 2 * keywordPrecision * keywordRecall / (keywordPrecision + keywordRecall)
    }

    // Word lists for WER.
    let splitWords: (String) -> [String] = { text in
        text.components(separatedBy: CharacterSet.whitespacesAndNewlines).filter { !$0.isEmpty }
    }
    let referenceWords = splitWords(referenceNormalized)
    let hypothesisWords = splitWords(hypothesisNormalized)

    let wer: Double
    switch (referenceWords.isEmpty, hypothesisWords.isEmpty) {
    case (true, true):
        wer = 0.0
    case (true, false):
        wer = 1.0
    case (false, _):
        wer = calculateWER(reference: referenceWords, hypothesis: hypothesisWords)
    }

    // Per-detection debug records. `foundWords` tracks what has already been recorded so
    // the hypothesis fallback below does not duplicate confident CTC hits.
    let minCtcScore: Float = -15.0
    var detectionDetails: [[String: Any]] = []
    var foundWords: Set<String> = []

    for detection in detections {
        let termLower = detection.term.text.lowercased()
        detectionDetails.append([
            "word": detection.term.text,
            "score": round(Double(detection.score) * 100) / 100,
            "startTime": round(detection.startTime * 100) / 100,
            "endTime": round(detection.endTime * 100) / 100,
            "source": "ctc",
            "inReference": referenceKeywords.contains(termLower),
        ])
        if detection.score >= minCtcScore {
            foundWords.insert(termLower)
        }
    }

    // Fallback: whole-word search of the raw (lowercased) hypothesis for remaining words.
    let hypothesisLower = hypothesis.lowercased()
    for word in dictionaryWords {
        let wordLower = word.lowercased()
        guard !foundWords.contains(wordLower) else { continue }

        let pattern = "\\b\(NSRegularExpression.escapedPattern(for: wordLower))\\b"
        guard let regex = try? NSRegularExpression(pattern: pattern, options: []),
            regex.firstMatch(
                in: hypothesisLower, options: [],
                range: NSRange(hypothesisLower.startIndex..., in: hypothesisLower)) != nil
        else { continue }

        foundWords.insert(wordLower)
        detectionDetails.append([
            "word": word,
            "score": 0.0,
            "startTime": 0.0,
            "endTime": 0.0,
            "source": "hypothesis",
            "inReference": referenceKeywords.contains(wordLower),
        ])
    }

    // WER is stored as a percentage with two decimals; ratios with three decimals.
    return [
        "fileId": fileId,
        "reference": referenceNormalized,
        "hypothesis": hypothesisNormalized,
        "wer": round(wer * 10000) / 100,
        "dictFound": predictedKeywords.count,
        "dictTotal": referenceKeywords.count,
        "keywordPredicted": predictedKeywords.count,
        "keywordReference": referenceKeywords.count,
        "keywordTruePositives": truePositives.count,
        "keywordFalsePositives": falsePositives.count,
        "keywordFalseNegatives": falseNegatives.count,
        "keywordPrecision": round(keywordPrecision * 1000) / 1000,
        "keywordRecall": round(keywordRecall * 1000) / 1000,
        "keywordFscore": round(keywordFscore * 1000) / 1000,
        "audioLength": round(audioLength * 100) / 100,
        "processingTime": round(processingTime * 1000) / 1000,
        "ctcDetections": detectionDetails,
    ]
}
|
| 422 |
-
|
| 423 |
-
/// Word error rate: word-level Levenshtein distance divided by the reference length.
///
/// - Parameters:
///   - reference: Reference words.
///   - hypothesis: Hypothesis words.
/// - Returns: `0.0` when both inputs are empty, `1.0` when only the reference is empty,
///   otherwise `editDistance / reference.count` (which may exceed `1.0`). An empty
///   hypothesis against a non-empty reference yields `1.0`; the previous `1...n` loop
///   trapped on that input because a closed range needs `lowerBound <= upperBound`.
private static func calculateWER(reference: [String], hypothesis: [String]) -> Double {
    if reference.isEmpty {
        return hypothesis.isEmpty ? 0.0 : 1.0
    }

    // Two rolling rows are enough: each cell only depends on the previous row and the
    // cell to its left. `previous[j]` is the distance between the reference prefix
    // processed so far and the first `j` hypothesis words.
    var previous = Array(0...hypothesis.count)
    var current = [Int](repeating: 0, count: hypothesis.count + 1)

    for (i, referenceWord) in reference.enumerated() {
        current[0] = i + 1  // deleting every reference word seen so far
        // Iterating the collection (rather than `1...n`) is safe for an empty hypothesis.
        for (j, hypothesisWord) in hypothesis.enumerated() {
            if referenceWord == hypothesisWord {
                current[j + 1] = previous[j]
            } else {
                // substitution, deletion, insertion
                current[j + 1] = min(previous[j], previous[j + 1], current[j]) + 1
            }
        }
        swap(&previous, &current)
    }

    return Double(previous[hypothesis.count]) / Double(reference.count)
}
|
| 447 |
-
|
| 448 |
-
/// Prints the command-line help text for the hybrid earnings benchmark to standard output.
private static func printUsage() {
    let usage = """
        Hybrid 110M Earnings Benchmark (Single Encoder)

        Usage: fluidaudio hybrid-earnings-benchmark [options]

        This benchmark uses ONLY the Hybrid 110M model:
        - Single encoder provides CTC log-probs
        - CTC greedy decode for transcription
        - CTC keyword spotting from same encoder output

        Options:
        --max-files <n> Maximum number of files to process
        --output, -o <path> Output JSON file (default: hybrid_earnings_benchmark.json)
        --keyword-mode <mode> Keyword mode: chunk or file (default: chunk)

        Compare with:
        fluidaudio ctc-earnings-benchmark (Canary-CTC + TDT 0.6B, two encoders)
        """
    print(usage)
}
|
| 469 |
-
|
| 470 |
-
/// Maps a command-line value to a `KeywordMode`, ignoring case.
///
/// - Parameter value: `"chunk"` / `"chunk-keywords"` or `"file"` / `"file-keywords"`.
/// - Returns: The matching mode, or `nil` for any other value.
private static func parseKeywordMode(_ value: String) -> KeywordMode? {
    let normalized = value.lowercased()
    if normalized == "chunk" || normalized == "chunk-keywords" {
        return .chunk
    }
    if normalized == "file" || normalized == "file-keywords" {
        return .file
    }
    return nil
}
|
| 480 |
-
|
| 481 |
-
/// Returns the part of `fileId` that precedes the first occurrence of `"_chunk"`.
///
/// - Parameter fileId: A file ID, optionally of the form `<parent>_chunk…`.
/// - Returns: The text before the first `"_chunk"`, or `fileId` itself when the marker
///   does not occur.
private static func parentId(from fileId: String) -> String {
    if let marker = fileId.range(of: "_chunk") {
        return String(fileId[fileId.startIndex..<marker.lowerBound])
    }
    return fileId
}
|
| 487 |
-
|
| 488 |
-
/// Builds a map from parent ID to the sorted union of its chunks' dictionary words.
///
/// - Parameters:
///   - dataDir: Directory scanned (non-recursively) for `*.dictionary.txt` files.
///   - keywordMode: The index is only built for `.file`; any other mode returns `[:]`
///     without touching the file system.
/// - Returns: For every parent ID (see `parentId(from:)`), the de-duplicated words of all
///   its dictionary files in ascending order.
/// - Throws: Errors from listing the directory or reading a dictionary file.
private static func buildKeywordIndex(dataDir: URL, keywordMode: KeywordMode) throws -> [String: [String]] {
    if keywordMode != .file {
        return [:]
    }

    let suffix = ".dictionary.txt"
    let entries = try FileManager.default.contentsOfDirectory(at: dataDir, includingPropertiesForKeys: nil)

    var wordsByParent: [String: Set<String>] = [:]
    for url in entries where url.lastPathComponent.hasSuffix(suffix) {
        let fileId = String(url.lastPathComponent.dropLast(suffix.count))
        let words = try loadDictionaryWords(from: url)
        // Merge this chunk's words into the set kept for its parent.
        wordsByParent[parentId(from: fileId), default: Set<String>()].formUnion(words)
    }

    var index: [String: [String]] = [:]
    for (parent, words) in wordsByParent {
        index[parent] = words.sorted()
    }
    return index
}
|
| 511 |
-
|
| 512 |
-
/// Resolves the keyword list for one benchmark item according to `keywordMode`.
///
/// - Parameters:
///   - fileId: ID of the item; its parent ID is the lookup key in `.file` mode.
///   - dictionaryFile: The item's own dictionary file.
///   - keywordMode: `.chunk` always reads `dictionaryFile`; `.file` prefers the entry of
///     `keywordIndex` for the item's parent and falls back to `dictionaryFile`.
///   - keywordIndex: Parent ID → words, as produced by `buildKeywordIndex`.
/// - Returns: The words to use for the item.
/// - Throws: Errors from reading `dictionaryFile`.
private static func loadDictionaryWords(
    fileId: String,
    dictionaryFile: URL,
    keywordMode: KeywordMode,
    keywordIndex: [String: [String]]
) throws -> [String] {
    if keywordMode == .file, let indexedWords = keywordIndex[parentId(from: fileId)] {
        return indexedWords
    }
    // `.chunk` mode, or `.file` mode without an index entry for this parent.
    return try loadDictionaryWords(from: dictionaryFile)
}
|
| 529 |
-
|
| 530 |
-
/// Reads a dictionary file containing one entry per line.
///
/// - Parameter url: UTF-8 text file to read.
/// - Returns: Each line trimmed of surrounding whitespace, with empty lines dropped,
///   in file order (duplicates are kept).
/// - Throws: Errors from reading or decoding the file.
private static func loadDictionaryWords(from url: URL) throws -> [String] {
    let contents = try String(contentsOf: url, encoding: .utf8)

    var words: [String] = []
    for line in contents.components(separatedBy: .newlines) {
        let trimmed = line.trimmingCharacters(in: .whitespacesAndNewlines)
        if !trimmed.isEmpty {
            words.append(trimmed)
        }
    }
    return words
}
|
| 537 |
-
|
| 538 |
-
/// Finds which dictionary words occur in `text` as whole words, ignoring case.
///
/// Matching uses the regular expression `\b<escaped word>\b` against the lowercased text.
///
/// - Parameters:
///   - text: Text to search.
///   - dictionaryWords: Candidate words or phrases.
/// - Returns: The lowercased forms of the words that were found.
private static func keywordsInText(_ text: String, dictionaryWords: [String]) -> Set<String> {
    let haystack = text.lowercased()

    let matches = dictionaryWords
        .map { $0.lowercased() }
        .filter { needle -> Bool in
            let pattern = "\\b\(NSRegularExpression.escapedPattern(for: needle))\\b"
            // A pattern that fails to compile simply counts as "not found".
            guard let regex = try? NSRegularExpression(pattern: pattern, options: []) else {
                return false
            }
            let fullRange = NSRange(haystack.startIndex..., in: haystack)
            return regex.firstMatch(in: haystack, options: [], range: fullRange) != nil
        }
    return Set(matches)
}
|
| 553 |
-
}
|
| 554 |
-
#endif
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
config.json
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
{}
|
|
|
|
|
|
convert/.DS_Store
DELETED
|
Binary file (10.2 kB)
|
|
|
convert/parakeet-tdt-ctc-110m/convert_tdt_decoder.py
DELETED
|
@@ -1,323 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""
|
| 3 |
-
Convert Parakeet TDT-CTC 110M decoder components to CoreML.
|
| 4 |
-
|
| 5 |
-
This script exports the TDT decoder (prediction network) and joint network
|
| 6 |
-
with the SAME format as the working 0.6B model:
|
| 7 |
-
- JointDecision outputs token_id, token_prob, duration (argmax done inside)
|
| 8 |
-
- Uses shape [1, dim, 1] for encoder/decoder steps
|
| 9 |
-
- Matches the interface expected by TdtDecoderV3
|
| 10 |
-
"""
|
| 11 |
-
|
| 12 |
-
import argparse
|
| 13 |
-
import os
|
| 14 |
-
import torch
|
| 15 |
-
import torch.nn.functional as F
|
| 16 |
-
import coremltools as ct
|
| 17 |
-
import numpy as np
|
| 18 |
-
from pathlib import Path
|
| 19 |
-
|
| 20 |
-
# NeMo imports
|
| 21 |
-
import nemo.collections.asr as nemo_asr
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
def get_model_config(model):
    """Collect the dimensions needed for export, falling back to defaults.

    Each value is read from ``model`` only when the corresponding attribute exists:

    - ``encoder_dim``: ``model.encoder.d_model``, else ``model.encoder._feat_out``,
      else ``None``.
    - ``pred_hidden``: ``model.decoder.pred_hidden`` (default 640).
    - ``num_layers``: ``model.decoder.pred_rnn_layers`` (default 1).
    - ``num_durations``: ``model.joint.num_extra_outputs`` (default 5).
    - ``vocab_size``: ``model.joint.num_classes - num_durations`` (default 1024).

    Returns:
        dict with the keys ``encoder_dim``, ``pred_hidden``, ``num_layers``,
        ``vocab_size`` and ``num_durations``.
    """
    config = {
        'encoder_dim': None,
        'pred_hidden': 640,  # Default for parakeet models
        'num_layers': 1,
        'vocab_size': 1024,
        'num_durations': 5,
    }

    # Encoder width: the first attribute that exists wins.
    if hasattr(model, 'encoder'):
        encoder = model.encoder
        for attr in ('d_model', '_feat_out'):
            if hasattr(encoder, attr):
                config['encoder_dim'] = getattr(encoder, attr)
                break

    # Prediction network size.
    if hasattr(model, 'decoder'):
        decoder = model.decoder
        for key, attr in (('pred_hidden', 'pred_hidden'), ('num_layers', 'pred_rnn_layers')):
            if hasattr(decoder, attr):
                config[key] = getattr(decoder, attr)

    # Joint network: durations must be resolved before the vocabulary size,
    # because the latter is derived from it.
    if hasattr(model, 'joint'):
        joint = model.joint
        if hasattr(joint, 'num_extra_outputs'):
            config['num_durations'] = joint.num_extra_outputs
        if hasattr(joint, 'num_classes'):
            config['vocab_size'] = joint.num_classes - config['num_durations']

    return config
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
class DecoderWrapper(torch.nn.Module):
    """Traceable adapter around the RNNT/TDT prediction network.

    Exposes a single decoding step with explicit LSTM state so it can be exported
    with the same interface as the 0.6B model:

    - Inputs: targets[1,1], target_lengths[1], h_in / c_in[num_layers,1,pred_hidden]
    - Outputs: decoder_output[1,pred_hidden,2], h_out, c_out
    """

    def __init__(self, decoder, pred_hidden):
        super().__init__()
        self.decoder = decoder
        self.pred_hidden = pred_hidden

    def forward(self, targets, target_lengths, h_in, c_in):
        """Run one prediction-network step.

        Args:
            targets: [1, 1] previous token ID.
            target_lengths: [1], always 1. Kept for interface parity; not read here.
            h_in: [num_layers, 1, pred_hidden] hidden state.
            c_in: [num_layers, 1, pred_hidden] cell state.

        Returns:
            Tuple ``(decoder_output, h_out, c_out)`` where ``decoder_output`` is the
            prediction output moved to [batch, pred_hidden, time] and repeated twice
            along the last axis, and ``h_out`` / ``c_out`` are the updated states.
        """
        # predict() yields [batch, time, pred_hidden] plus the new (h, c) pair.
        pred_output, (h_out, c_out) = self.decoder.predict(
            targets, state=(h_in, c_in), add_sos=False
        )

        # Channels-first layout, then duplicate the step so the last axis has the
        # length 2 that the 0.6B decoder output uses.
        step = pred_output.transpose(1, 2)
        decoder_output = torch.cat((step, step), dim=2)

        return decoder_output, h_out, c_out
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
class JointWrapper(torch.nn.Module):
    """Traceable adapter around the TDT joint network that performs the argmax itself.

    Matches the 0.6B interface:

    - Inputs: encoder_step[1,encoder_dim,1], decoder_step[1,pred_hidden,1]
    - Outputs: token_id[1,1,1], token_prob[1,1,1], duration[1,1,1]
    """

    def __init__(self, joint, vocab_size, num_durations=5):
        super().__init__()
        self.joint = joint
        self.vocab_size = vocab_size
        self.num_durations = num_durations

    def forward(self, encoder_step, decoder_step):
        """Pick the most likely token and duration bin for one (encoder, decoder) step.

        Args:
            encoder_step: [1, encoder_dim, 1]
            decoder_step: [1, pred_hidden, 1]

        Returns:
            Tuple ``(token_id, token_prob, duration)``, each [1, 1, 1]: the int32 argmax
            over the token logits (vocabulary plus blank), the softmax probability of
            that token, and the int32 argmax over the duration logits.
        """
        # The joint network takes [batch, 1, dim].
        joint_out = self.joint.joint(
            encoder_step.transpose(1, 2),
            decoder_step.transpose(1, 2),
        )

        # One-time shape report (fires on the first call only).
        if not getattr(self, '_debug_printed', False):
            self._debug_printed = True
            print(f" Joint output shape: {joint_out.shape}")
            print(f" Expected: vocab={self.vocab_size} + blank=1 + durations={self.num_durations} = {self.vocab_size + 1 + self.num_durations}")

        # Last axis layout: [vocab tokens | blank | duration bins].
        split_at = self.vocab_size + 1
        token_logits = joint_out[..., :split_at]
        duration_logits = joint_out[..., split_at:]

        # Best token and its probability under the token-only softmax.
        best = token_logits.argmax(dim=-1, keepdim=True)
        token_prob = F.softmax(token_logits, dim=-1).gather(-1, best).squeeze(-1)
        token_id = best.squeeze(-1)

        duration = duration_logits.argmax(dim=-1)

        return token_id.int(), token_prob, duration.int()
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
def convert_decoder(model, config, output_dir: Path):
    """Trace the prediction network and export it as ``Decoder.mlpackage``.

    Args:
        model: Model whose ``decoder`` attribute is wrapped in ``DecoderWrapper``.
        config: Dict providing ``pred_hidden`` and ``num_layers``.
        output_dir: Directory the package is written to.

    Returns:
        The converted Core ML model.
    """
    pred_hidden = config['pred_hidden']
    num_layers = config['num_layers']
    state_shape = (num_layers, 1, pred_hidden)

    print(f"Converting Decoder...")
    print(f" pred_hidden={pred_hidden}, num_layers={num_layers}")

    wrapper = DecoderWrapper(model.decoder, pred_hidden)
    wrapper.eval()

    # Example inputs for tracing: one token, zero-initialised LSTM state.
    example_inputs = (
        torch.zeros(1, 1, dtype=torch.long),   # targets
        torch.ones(1, dtype=torch.long),       # target_lengths
        torch.zeros(*state_shape),             # h_in
        torch.zeros(*state_shape),             # c_in
    )

    with torch.no_grad():
        traced = torch.jit.trace(wrapper, example_inputs)

    # Core ML interface: int32 token inputs, float32 state tensors.
    input_specs = [
        ct.TensorType(name="targets", shape=(1, 1), dtype=np.int32),
        ct.TensorType(name="target_lengths", shape=(1,), dtype=np.int32),
        ct.TensorType(name="h_in", shape=state_shape, dtype=np.float32),
        ct.TensorType(name="c_in", shape=state_shape, dtype=np.float32),
    ]
    output_specs = [
        ct.TensorType(name=output_name)
        for output_name in ("decoder_output", "h_out", "c_out")
    ]
    mlmodel = ct.convert(
        traced,
        inputs=input_specs,
        outputs=output_specs,
        minimum_deployment_target=ct.target.iOS17,
        compute_precision=ct.precision.FLOAT16,
    )

    # Metadata shown by Core ML tooling.
    mlmodel.author = "Fluid Inference"
    mlmodel.short_description = "Hybrid TDT Decoder (110M)"

    output_path = output_dir / "Decoder.mlpackage"
    mlmodel.save(str(output_path))
    print(f" Saved to {output_path}")

    return mlmodel
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
def convert_joint(model, config, output_dir: Path):
    """Trace the joint network (with built-in argmax) and export ``JointDecision.mlpackage``.

    Args:
        model: Model whose ``joint`` attribute is wrapped in ``JointWrapper``.
        config: Dict providing ``encoder_dim``, ``pred_hidden``, ``vocab_size`` and
            ``num_durations``.
        output_dir: Directory the package is written to.

    Returns:
        The converted Core ML model.
    """
    encoder_dim = config['encoder_dim']
    pred_hidden = config['pred_hidden']
    vocab_size = config['vocab_size']
    num_durations = config['num_durations']

    print(f"Converting JointDecision...")
    print(f" encoder_dim={encoder_dim}, pred_hidden={pred_hidden}")
    print(f" vocab_size={vocab_size}, num_durations={num_durations}")

    wrapper = JointWrapper(model.joint, vocab_size=vocab_size, num_durations=num_durations)
    wrapper.eval()

    # Random single-step example inputs, laid out as [1, dim, 1].
    encoder_shape = (1, encoder_dim, 1)
    decoder_shape = (1, pred_hidden, 1)
    encoder_step = torch.randn(*encoder_shape)
    decoder_step = torch.randn(*decoder_shape)

    with torch.no_grad():
        traced = torch.jit.trace(wrapper, (encoder_step, decoder_step))

    input_specs = [
        ct.TensorType(name="encoder_step", shape=encoder_shape, dtype=np.float32),
        ct.TensorType(name="decoder_step", shape=decoder_shape, dtype=np.float32),
    ]
    output_specs = [
        ct.TensorType(name=output_name)
        for output_name in ("token_id", "token_prob", "duration")
    ]
    mlmodel = ct.convert(
        traced,
        inputs=input_specs,
        outputs=output_specs,
        minimum_deployment_target=ct.target.iOS17,
        compute_precision=ct.precision.FLOAT16,
    )

    # Metadata shown by Core ML tooling.
    mlmodel.author = "Fluid Inference"
    mlmodel.short_description = "Hybrid Joint Decision (110M)"

    output_path = output_dir / "JointDecision.mlpackage"
    mlmodel.save(str(output_path))
    print(f" Saved to {output_path}")

    return mlmodel
|
| 261 |
-
|
| 262 |
-
|
| 263 |
-
def main():
|
| 264 |
-
parser = argparse.ArgumentParser(description="Convert TDT decoder to CoreML (0.6B format)")
|
| 265 |
-
parser.add_argument(
|
| 266 |
-
"--model-name",
|
| 267 |
-
default="nvidia/parakeet-tdt_ctc-110m",
|
| 268 |
-
help="NeMo model name or path"
|
| 269 |
-
)
|
| 270 |
-
parser.add_argument(
|
| 271 |
-
"--output-dir",
|
| 272 |
-
type=Path,
|
| 273 |
-
default=Path("./output"),
|
| 274 |
-
help="Output directory for CoreML models"
|
| 275 |
-
)
|
| 276 |
-
args = parser.parse_args()
|
| 277 |
-
|
| 278 |
-
# Create output directory
|
| 279 |
-
args.output_dir.mkdir(parents=True, exist_ok=True)
|
| 280 |
-
|
| 281 |
-
# Load model
|
| 282 |
-
print(f"Loading model: {args.model_name}")
|
| 283 |
-
model = nemo_asr.models.EncDecRNNTBPEModel.from_pretrained(args.model_name)
|
| 284 |
-
model.eval()
|
| 285 |
-
|
| 286 |
-
# Get model configuration
|
| 287 |
-
config = get_model_config(model)
|
| 288 |
-
|
| 289 |
-
# Auto-detect encoder dim if not found
|
| 290 |
-
if config['encoder_dim'] is None:
|
| 291 |
-
print("Auto-detecting encoder dimension...")
|
| 292 |
-
dummy_audio = torch.randn(1, 16000)
|
| 293 |
-
dummy_length = torch.tensor([16000])
|
| 294 |
-
with torch.no_grad():
|
| 295 |
-
enc_out, enc_len = model.encoder(
|
| 296 |
-
audio_signal=dummy_audio,
|
| 297 |
-
length=dummy_length
|
| 298 |
-
)
|
| 299 |
-
config['encoder_dim'] = enc_out.shape[-1]
|
| 300 |
-
|
| 301 |
-
print(f"\nModel config:")
|
| 302 |
-
for k, v in config.items():
|
| 303 |
-
print(f" {k}: {v}")
|
| 304 |
-
|
| 305 |
-
# Convert components
|
| 306 |
-
print()
|
| 307 |
-
convert_decoder(model, config, args.output_dir)
|
| 308 |
-
convert_joint(model, config, args.output_dir)
|
| 309 |
-
|
| 310 |
-
print("\nConversion complete!")
|
| 311 |
-
print(f"Models saved to: {args.output_dir}")
|
| 312 |
-
print("\nNext steps:")
|
| 313 |
-
print("1. Compile to .mlmodelc:")
|
| 314 |
-
print(f" cd {args.output_dir}")
|
| 315 |
-
print(" xcrun coremlcompiler compile Decoder.mlpackage .")
|
| 316 |
-
print(" xcrun coremlcompiler compile JointDecision.mlpackage .")
|
| 317 |
-
print("2. Copy to model cache:")
|
| 318 |
-
print(" cp -r Decoder.mlmodelc JointDecision.mlmodelc ~/Library/Application\\ Support/FluidAudio/Models/parakeet-ctc-110m-coreml/")
|
| 319 |
-
print("3. Test with: swift run fluidaudio hybrid-earnings-benchmark --max-files 1")
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
if __name__ == "__main__":
|
| 323 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/audio/yc_first_minute_16k_15s.wav
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:c79c8bc763b4efccb3e12f199ec0a59aa2edc5e9e4d21ca70fde8f36762d4147
|
| 3 |
-
size 480078
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/analytics/coremldata.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:fc681823d92eca3dbece3a30c975afa7251eedae0e718b07ffbf1a8b4313b87e
|
| 3 |
-
size 243
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/coremldata.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:2ebec8fc38c063de4b2159e21b1f981309fa5947c24d7e4883aca20f7c15fbb9
|
| 3 |
-
size 377
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/metadata.json
DELETED
|
@@ -1,66 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"metadataOutputVersion" : "3.0",
|
| 4 |
-
"shortDescription" : "Parakeet 110M CTC decoder head",
|
| 5 |
-
"outputSchema" : [
|
| 6 |
-
{
|
| 7 |
-
"hasShapeFlexibility" : "0",
|
| 8 |
-
"isOptional" : "0",
|
| 9 |
-
"dataType" : "Float32",
|
| 10 |
-
"formattedType" : "MultiArray (Float32 1 Γ 188 Γ 1025)",
|
| 11 |
-
"shortDescription" : "",
|
| 12 |
-
"shape" : "[1, 188, 1025]",
|
| 13 |
-
"name" : "ctc_logits",
|
| 14 |
-
"type" : "MultiArray"
|
| 15 |
-
}
|
| 16 |
-
],
|
| 17 |
-
"storagePrecision" : "Float16",
|
| 18 |
-
"modelParameters" : [
|
| 19 |
-
|
| 20 |
-
],
|
| 21 |
-
"author" : "Fluid Inference",
|
| 22 |
-
"specificationVersion" : 8,
|
| 23 |
-
"mlProgramOperationTypeHistogram" : {
|
| 24 |
-
"Ios17.cast" : 2,
|
| 25 |
-
"Ios17.conv" : 1,
|
| 26 |
-
"Ios17.transpose" : 1,
|
| 27 |
-
"Ios16.softmax" : 1,
|
| 28 |
-
"Ios17.log" : 1
|
| 29 |
-
},
|
| 30 |
-
"computePrecision" : "Mixed (Float16, Float32, Int32)",
|
| 31 |
-
"isUpdatable" : "0",
|
| 32 |
-
"stateSchema" : [
|
| 33 |
-
|
| 34 |
-
],
|
| 35 |
-
"availability" : {
|
| 36 |
-
"macOS" : "14.0",
|
| 37 |
-
"tvOS" : "17.0",
|
| 38 |
-
"visionOS" : "1.0",
|
| 39 |
-
"watchOS" : "10.0",
|
| 40 |
-
"iOS" : "17.0",
|
| 41 |
-
"macCatalyst" : "17.0"
|
| 42 |
-
},
|
| 43 |
-
"modelType" : {
|
| 44 |
-
"name" : "MLModelType_mlProgram"
|
| 45 |
-
},
|
| 46 |
-
"inputSchema" : [
|
| 47 |
-
{
|
| 48 |
-
"hasShapeFlexibility" : "0",
|
| 49 |
-
"isOptional" : "0",
|
| 50 |
-
"dataType" : "Float32",
|
| 51 |
-
"formattedType" : "MultiArray (Float32 1 Γ 512 Γ 188)",
|
| 52 |
-
"shortDescription" : "",
|
| 53 |
-
"shape" : "[1, 512, 188]",
|
| 54 |
-
"name" : "encoder_output",
|
| 55 |
-
"type" : "MultiArray"
|
| 56 |
-
}
|
| 57 |
-
],
|
| 58 |
-
"userDefinedMetadata" : {
|
| 59 |
-
"com.github.apple.coremltools.source_dialect" : "TorchScript",
|
| 60 |
-
"com.github.apple.coremltools.source" : "torch==2.9.0",
|
| 61 |
-
"com.github.apple.coremltools.version" : "8.3.0"
|
| 62 |
-
},
|
| 63 |
-
"generatedClassName" : "parakeet_ctc_head",
|
| 64 |
-
"method" : "predict"
|
| 65 |
-
}
|
| 66 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/model.mil
DELETED
|
@@ -1,24 +0,0 @@
|
|
| 1 |
-
program(1.0)
|
| 2 |
-
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
|
| 3 |
-
{
|
| 4 |
-
func main<ios17>(tensor<fp32, [1, 512, 188]> encoder_output) {
|
| 5 |
-
tensor<int32, []> var_4 = const()[name = tensor<string, []>("op_4"), val = tensor<int32, []>(-1)];
|
| 6 |
-
tensor<string, []> var_18_pad_type_0 = const()[name = tensor<string, []>("op_18_pad_type_0"), val = tensor<string, []>("valid")];
|
| 7 |
-
tensor<int32, [1]> var_18_strides_0 = const()[name = tensor<string, []>("op_18_strides_0"), val = tensor<int32, [1]>([1])];
|
| 8 |
-
tensor<int32, [2]> var_18_pad_0 = const()[name = tensor<string, []>("op_18_pad_0"), val = tensor<int32, [2]>([0, 0])];
|
| 9 |
-
tensor<int32, [1]> var_18_dilations_0 = const()[name = tensor<string, []>("op_18_dilations_0"), val = tensor<int32, [1]>([1])];
|
| 10 |
-
tensor<int32, []> var_18_groups_0 = const()[name = tensor<string, []>("op_18_groups_0"), val = tensor<int32, []>(1)];
|
| 11 |
-
tensor<string, []> encoder_output_to_fp16_dtype_0 = const()[name = tensor<string, []>("encoder_output_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
|
| 12 |
-
tensor<fp16, [1025, 512, 1]> module_decoder_layers_0_weight_to_fp16 = const()[name = tensor<string, []>("module_decoder_layers_0_weight_to_fp16"), val = tensor<fp16, [1025, 512, 1]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
|
| 13 |
-
tensor<fp16, [1025]> module_decoder_layers_0_bias_to_fp16 = const()[name = tensor<string, []>("module_decoder_layers_0_bias_to_fp16"), val = tensor<fp16, [1025]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1049728)))];
|
| 14 |
-
tensor<fp16, [1, 512, 188]> encoder_output_to_fp16 = cast(dtype = encoder_output_to_fp16_dtype_0, x = encoder_output)[name = tensor<string, []>("cast_1")];
|
| 15 |
-
tensor<fp16, [1, 1025, 188]> var_18_cast_fp16 = conv(bias = module_decoder_layers_0_bias_to_fp16, dilations = var_18_dilations_0, groups = var_18_groups_0, pad = var_18_pad_0, pad_type = var_18_pad_type_0, strides = var_18_strides_0, weight = module_decoder_layers_0_weight_to_fp16, x = encoder_output_to_fp16)[name = tensor<string, []>("op_18_cast_fp16")];
|
| 16 |
-
tensor<int32, [3]> input_perm_0 = const()[name = tensor<string, []>("input_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
|
| 17 |
-
tensor<fp16, [1, 188, 1025]> input_cast_fp16 = transpose(perm = input_perm_0, x = var_18_cast_fp16)[name = tensor<string, []>("transpose_0")];
|
| 18 |
-
tensor<fp16, [1, 188, 1025]> out_objects_softmax_cast_fp16 = softmax(axis = var_4, x = input_cast_fp16)[name = tensor<string, []>("out_objects_softmax_cast_fp16")];
|
| 19 |
-
tensor<fp32, []> out_objects_epsilon_0 = const()[name = tensor<string, []>("out_objects_epsilon_0"), val = tensor<fp32, []>(0x1p-149)];
|
| 20 |
-
tensor<fp16, [1, 188, 1025]> out_objects_cast_fp16 = log(epsilon = out_objects_epsilon_0, x = out_objects_softmax_cast_fp16)[name = tensor<string, []>("out_objects_cast_fp16")];
|
| 21 |
-
tensor<string, []> out_objects_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("out_objects_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
|
| 22 |
-
tensor<fp32, [1, 188, 1025]> ctc_logits = cast(dtype = out_objects_cast_fp16_to_fp32_dtype_0, x = out_objects_cast_fp16)[name = tensor<string, []>("cast_0")];
|
| 23 |
-
} -> (ctc_logits);
|
| 24 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/CTCHead.mlmodelc/weights/weight.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:fb9bead064427ffcb7529c0e3f378e421b4dde8e6d81447b6d1ca3352ca850e1
|
| 3 |
-
size 1051842
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/analytics/coremldata.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:990455f6431342750254f66edf27bfb41be62a7ba17a18e1dd6afd4f5f56e9eb
|
| 3 |
-
size 243
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/coremldata.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:29009727821ad8551ab5fe9271e93c597d92a9714f64b94aa533a9ceb6e22b93
|
| 3 |
-
size 498
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/metadata.json
DELETED
|
@@ -1,118 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"metadataOutputVersion" : "3.0",
|
| 4 |
-
"shortDescription" : "Parakeet 110M decoder (RNNT prediction network)",
|
| 5 |
-
"outputSchema" : [
|
| 6 |
-
{
|
| 7 |
-
"hasShapeFlexibility" : "0",
|
| 8 |
-
"isOptional" : "0",
|
| 9 |
-
"dataType" : "Float32",
|
| 10 |
-
"formattedType" : "MultiArray (Float32 1 Γ 640 Γ 1)",
|
| 11 |
-
"shortDescription" : "",
|
| 12 |
-
"shape" : "[1, 640, 1]",
|
| 13 |
-
"name" : "decoder",
|
| 14 |
-
"type" : "MultiArray"
|
| 15 |
-
},
|
| 16 |
-
{
|
| 17 |
-
"hasShapeFlexibility" : "0",
|
| 18 |
-
"isOptional" : "0",
|
| 19 |
-
"dataType" : "Float32",
|
| 20 |
-
"formattedType" : "MultiArray (Float32 1 Γ 1 Γ 640)",
|
| 21 |
-
"shortDescription" : "",
|
| 22 |
-
"shape" : "[1, 1, 640]",
|
| 23 |
-
"name" : "h_out",
|
| 24 |
-
"type" : "MultiArray"
|
| 25 |
-
},
|
| 26 |
-
{
|
| 27 |
-
"hasShapeFlexibility" : "0",
|
| 28 |
-
"isOptional" : "0",
|
| 29 |
-
"dataType" : "Float32",
|
| 30 |
-
"formattedType" : "MultiArray (Float32 1 Γ 1 Γ 640)",
|
| 31 |
-
"shortDescription" : "",
|
| 32 |
-
"shape" : "[1, 1, 640]",
|
| 33 |
-
"name" : "c_out",
|
| 34 |
-
"type" : "MultiArray"
|
| 35 |
-
}
|
| 36 |
-
],
|
| 37 |
-
"storagePrecision" : "Float16",
|
| 38 |
-
"modelParameters" : [
|
| 39 |
-
|
| 40 |
-
],
|
| 41 |
-
"author" : "Fluid Inference",
|
| 42 |
-
"specificationVersion" : 8,
|
| 43 |
-
"mlProgramOperationTypeHistogram" : {
|
| 44 |
-
"Ios17.squeeze" : 2,
|
| 45 |
-
"Ios17.gather" : 1,
|
| 46 |
-
"Ios17.cast" : 6,
|
| 47 |
-
"Ios17.lstm" : 1,
|
| 48 |
-
"Ios17.transpose" : 2,
|
| 49 |
-
"Identity" : 1,
|
| 50 |
-
"Ios17.expandDims" : 2
|
| 51 |
-
},
|
| 52 |
-
"computePrecision" : "Mixed (Float16, Float32, Int16, Int32)",
|
| 53 |
-
"isUpdatable" : "0",
|
| 54 |
-
"stateSchema" : [
|
| 55 |
-
|
| 56 |
-
],
|
| 57 |
-
"availability" : {
|
| 58 |
-
"macOS" : "14.0",
|
| 59 |
-
"tvOS" : "17.0",
|
| 60 |
-
"visionOS" : "1.0",
|
| 61 |
-
"watchOS" : "10.0",
|
| 62 |
-
"iOS" : "17.0",
|
| 63 |
-
"macCatalyst" : "17.0"
|
| 64 |
-
},
|
| 65 |
-
"modelType" : {
|
| 66 |
-
"name" : "MLModelType_mlProgram"
|
| 67 |
-
},
|
| 68 |
-
"inputSchema" : [
|
| 69 |
-
{
|
| 70 |
-
"hasShapeFlexibility" : "0",
|
| 71 |
-
"isOptional" : "0",
|
| 72 |
-
"dataType" : "Int32",
|
| 73 |
-
"formattedType" : "MultiArray (Int32 1 Γ 1)",
|
| 74 |
-
"shortDescription" : "",
|
| 75 |
-
"shape" : "[1, 1]",
|
| 76 |
-
"name" : "targets",
|
| 77 |
-
"type" : "MultiArray"
|
| 78 |
-
},
|
| 79 |
-
{
|
| 80 |
-
"hasShapeFlexibility" : "0",
|
| 81 |
-
"isOptional" : "0",
|
| 82 |
-
"dataType" : "Int32",
|
| 83 |
-
"formattedType" : "MultiArray (Int32 1)",
|
| 84 |
-
"shortDescription" : "",
|
| 85 |
-
"shape" : "[1]",
|
| 86 |
-
"name" : "target_length",
|
| 87 |
-
"type" : "MultiArray"
|
| 88 |
-
},
|
| 89 |
-
{
|
| 90 |
-
"hasShapeFlexibility" : "0",
|
| 91 |
-
"isOptional" : "0",
|
| 92 |
-
"dataType" : "Float32",
|
| 93 |
-
"formattedType" : "MultiArray (Float32 1 Γ 1 Γ 640)",
|
| 94 |
-
"shortDescription" : "",
|
| 95 |
-
"shape" : "[1, 1, 640]",
|
| 96 |
-
"name" : "h_in",
|
| 97 |
-
"type" : "MultiArray"
|
| 98 |
-
},
|
| 99 |
-
{
|
| 100 |
-
"hasShapeFlexibility" : "0",
|
| 101 |
-
"isOptional" : "0",
|
| 102 |
-
"dataType" : "Float32",
|
| 103 |
-
"formattedType" : "MultiArray (Float32 1 Γ 1 Γ 640)",
|
| 104 |
-
"shortDescription" : "",
|
| 105 |
-
"shape" : "[1, 1, 640]",
|
| 106 |
-
"name" : "c_in",
|
| 107 |
-
"type" : "MultiArray"
|
| 108 |
-
}
|
| 109 |
-
],
|
| 110 |
-
"userDefinedMetadata" : {
|
| 111 |
-
"com.github.apple.coremltools.version" : "8.3.0",
|
| 112 |
-
"com.github.apple.coremltools.source_dialect" : "TorchScript",
|
| 113 |
-
"com.github.apple.coremltools.source" : "torch==2.9.0"
|
| 114 |
-
},
|
| 115 |
-
"generatedClassName" : "parakeet_decoder",
|
| 116 |
-
"method" : "predict"
|
| 117 |
-
}
|
| 118 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/model.mil
DELETED
|
@@ -1,45 +0,0 @@
|
|
| 1 |
-
program(1.0)
|
| 2 |
-
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
|
| 3 |
-
{
|
| 4 |
-
func main<ios17>(tensor<fp32, [1, 1, 640]> c_in, tensor<fp32, [1, 1, 640]> h_in, tensor<int32, [1]> target_length, tensor<int32, [1, 1]> targets) {
|
| 5 |
-
tensor<int32, []> y_axis_0 = const()[name = tensor<string, []>("y_axis_0"), val = tensor<int32, []>(0)];
|
| 6 |
-
tensor<int32, []> y_batch_dims_0 = const()[name = tensor<string, []>("y_batch_dims_0"), val = tensor<int32, []>(0)];
|
| 7 |
-
tensor<bool, []> y_validate_indices_0 = const()[name = tensor<string, []>("y_validate_indices_0"), val = tensor<bool, []>(false)];
|
| 8 |
-
tensor<fp16, [1025, 640]> module_prediction_embed_weight_to_fp16 = const()[name = tensor<string, []>("module_prediction_embed_weight_to_fp16"), val = tensor<fp16, [1025, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
|
| 9 |
-
tensor<string, []> targets_to_int16_dtype_0 = const()[name = tensor<string, []>("targets_to_int16_dtype_0"), val = tensor<string, []>("int16")];
|
| 10 |
-
tensor<int16, [1, 1]> targets_to_int16 = cast(dtype = targets_to_int16_dtype_0, x = targets)[name = tensor<string, []>("cast_8")];
|
| 11 |
-
tensor<fp16, [1, 1, 640]> y_cast_fp16_cast_uint16 = gather(axis = y_axis_0, batch_dims = y_batch_dims_0, indices = targets_to_int16, validate_indices = y_validate_indices_0, x = module_prediction_embed_weight_to_fp16)[name = tensor<string, []>("y_cast_fp16_cast_uint16")];
|
| 12 |
-
tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([1, 0, 2])];
|
| 13 |
-
tensor<int32, [1]> input_lstm_h0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_h0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
|
| 14 |
-
tensor<string, []> h_in_to_fp16_dtype_0 = const()[name = tensor<string, []>("h_in_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
|
| 15 |
-
tensor<fp16, [1, 1, 640]> h_in_to_fp16 = cast(dtype = h_in_to_fp16_dtype_0, x = h_in)[name = tensor<string, []>("cast_7")];
|
| 16 |
-
tensor<fp16, [1, 640]> input_lstm_h0_squeeze_cast_fp16 = squeeze(axes = input_lstm_h0_squeeze_axes_0, x = h_in_to_fp16)[name = tensor<string, []>("input_lstm_h0_squeeze_cast_fp16")];
|
| 17 |
-
tensor<int32, [1]> input_lstm_c0_squeeze_axes_0 = const()[name = tensor<string, []>("input_lstm_c0_squeeze_axes_0"), val = tensor<int32, [1]>([0])];
|
| 18 |
-
tensor<string, []> c_in_to_fp16_dtype_0 = const()[name = tensor<string, []>("c_in_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
|
| 19 |
-
tensor<fp16, [1, 1, 640]> c_in_to_fp16 = cast(dtype = c_in_to_fp16_dtype_0, x = c_in)[name = tensor<string, []>("cast_6")];
|
| 20 |
-
tensor<fp16, [1, 640]> input_lstm_c0_squeeze_cast_fp16 = squeeze(axes = input_lstm_c0_squeeze_axes_0, x = c_in_to_fp16)[name = tensor<string, []>("input_lstm_c0_squeeze_cast_fp16")];
|
| 21 |
-
tensor<string, []> input_direction_0 = const()[name = tensor<string, []>("input_direction_0"), val = tensor<string, []>("forward")];
|
| 22 |
-
tensor<bool, []> input_output_sequence_0 = const()[name = tensor<string, []>("input_output_sequence_0"), val = tensor<bool, []>(true)];
|
| 23 |
-
tensor<string, []> input_recurrent_activation_0 = const()[name = tensor<string, []>("input_recurrent_activation_0"), val = tensor<string, []>("sigmoid")];
|
| 24 |
-
tensor<string, []> input_cell_activation_0 = const()[name = tensor<string, []>("input_cell_activation_0"), val = tensor<string, []>("tanh")];
|
| 25 |
-
tensor<string, []> input_activation_0 = const()[name = tensor<string, []>("input_activation_0"), val = tensor<string, []>("tanh")];
|
| 26 |
-
tensor<fp16, [2560, 640]> concat_1_to_fp16 = const()[name = tensor<string, []>("concat_1_to_fp16"), val = tensor<fp16, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1312128)))];
|
| 27 |
-
tensor<fp16, [2560, 640]> concat_2_to_fp16 = const()[name = tensor<string, []>("concat_2_to_fp16"), val = tensor<fp16, [2560, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(4588992)))];
|
| 28 |
-
tensor<fp16, [2560]> concat_0_to_fp16 = const()[name = tensor<string, []>("concat_0_to_fp16"), val = tensor<fp16, [2560]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(7865856)))];
|
| 29 |
-
tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = y_cast_fp16_cast_uint16)[name = tensor<string, []>("transpose_2")];
|
| 30 |
-
tensor<fp16, [1, 1, 640]> input_cast_fp16_0, tensor<fp16, [1, 640]> input_cast_fp16_1, tensor<fp16, [1, 640]> input_cast_fp16_2 = lstm(activation = input_activation_0, bias = concat_0_to_fp16, cell_activation = input_cell_activation_0, direction = input_direction_0, initial_c = input_lstm_c0_squeeze_cast_fp16, initial_h = input_lstm_h0_squeeze_cast_fp16, output_sequence = input_output_sequence_0, recurrent_activation = input_recurrent_activation_0, weight_hh = concat_2_to_fp16, weight_ih = concat_1_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
|
| 31 |
-
tensor<int32, [1]> obj_3_axes_0 = const()[name = tensor<string, []>("obj_3_axes_0"), val = tensor<int32, [1]>([0])];
|
| 32 |
-
tensor<fp16, [1, 1, 640]> obj_3_cast_fp16 = expand_dims(axes = obj_3_axes_0, x = input_cast_fp16_1)[name = tensor<string, []>("obj_3_cast_fp16")];
|
| 33 |
-
tensor<string, []> obj_3_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("obj_3_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
|
| 34 |
-
tensor<int32, [1]> obj_axes_0 = const()[name = tensor<string, []>("obj_axes_0"), val = tensor<int32, [1]>([0])];
|
| 35 |
-
tensor<fp16, [1, 1, 640]> obj_cast_fp16 = expand_dims(axes = obj_axes_0, x = input_cast_fp16_2)[name = tensor<string, []>("obj_cast_fp16")];
|
| 36 |
-
tensor<string, []> obj_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("obj_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
|
| 37 |
-
tensor<int32, [3]> transpose_0_perm_0 = const()[name = tensor<string, []>("transpose_0_perm_0"), val = tensor<int32, [3]>([1, 2, 0])];
|
| 38 |
-
tensor<string, []> transpose_0_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("transpose_0_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
|
| 39 |
-
tensor<fp16, [1, 640, 1]> transpose_0_cast_fp16 = transpose(perm = transpose_0_perm_0, x = input_cast_fp16_0)[name = tensor<string, []>("transpose_1")];
|
| 40 |
-
tensor<fp32, [1, 640, 1]> decoder = cast(dtype = transpose_0_cast_fp16_to_fp32_dtype_0, x = transpose_0_cast_fp16)[name = tensor<string, []>("cast_3")];
|
| 41 |
-
tensor<fp32, [1, 1, 640]> c_out = cast(dtype = obj_cast_fp16_to_fp32_dtype_0, x = obj_cast_fp16)[name = tensor<string, []>("cast_4")];
|
| 42 |
-
tensor<fp32, [1, 1, 640]> h_out = cast(dtype = obj_3_cast_fp16_to_fp32_dtype_0, x = obj_3_cast_fp16)[name = tensor<string, []>("cast_5")];
|
| 43 |
-
tensor<int32, [1]> target_length_tmp = identity(x = target_length)[name = tensor<string, []>("target_length_tmp")];
|
| 44 |
-
} -> (decoder, h_out, c_out);
|
| 45 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Decoder.mlmodelc/weights/weight.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:dd90b58597ee2c172c672dffe13b1110898ba07394c1a15efc96cc8c6b18411b
|
| 3 |
-
size 7871040
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/analytics/coremldata.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:b7ae65e2af616df46066b7efca2d7c19941666ac0685f4ed005666890a052b0d
|
| 3 |
-
size 243
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/coremldata.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:0713c2d6ac5f8f6fb9582be250351ebd8efc925f71f4261191165f1406f2ee5d
|
| 3 |
-
size 437
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/metadata.json
DELETED
|
@@ -1,105 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"metadataOutputVersion" : "3.0",
|
| 4 |
-
"shortDescription" : "Parakeet 110M encoder (15 s window)",
|
| 5 |
-
"outputSchema" : [
|
| 6 |
-
{
|
| 7 |
-
"hasShapeFlexibility" : "0",
|
| 8 |
-
"isOptional" : "0",
|
| 9 |
-
"dataType" : "Float32",
|
| 10 |
-
"formattedType" : "MultiArray (Float32 1 Γ 512 Γ 188)",
|
| 11 |
-
"shortDescription" : "",
|
| 12 |
-
"shape" : "[1, 512, 188]",
|
| 13 |
-
"name" : "encoder_output",
|
| 14 |
-
"type" : "MultiArray"
|
| 15 |
-
},
|
| 16 |
-
{
|
| 17 |
-
"hasShapeFlexibility" : "0",
|
| 18 |
-
"isOptional" : "0",
|
| 19 |
-
"dataType" : "Int32",
|
| 20 |
-
"formattedType" : "MultiArray (Int32 1)",
|
| 21 |
-
"shortDescription" : "",
|
| 22 |
-
"shape" : "[1]",
|
| 23 |
-
"name" : "encoder_length",
|
| 24 |
-
"type" : "MultiArray"
|
| 25 |
-
}
|
| 26 |
-
],
|
| 27 |
-
"storagePrecision" : "Float16",
|
| 28 |
-
"modelParameters" : [
|
| 29 |
-
|
| 30 |
-
],
|
| 31 |
-
"author" : "Fluid Inference",
|
| 32 |
-
"specificationVersion" : 8,
|
| 33 |
-
"mlProgramOperationTypeHistogram" : {
|
| 34 |
-
"Ios17.logicalAnd" : 2,
|
| 35 |
-
"Ios17.reshape" : 103,
|
| 36 |
-
"Ios16.softmax" : 17,
|
| 37 |
-
"Ios17.matmul" : 51,
|
| 38 |
-
"Ios17.transpose" : 123,
|
| 39 |
-
"Split" : 17,
|
| 40 |
-
"Ios17.expandDims" : 17,
|
| 41 |
-
"Select" : 51,
|
| 42 |
-
"Ios17.add" : 128,
|
| 43 |
-
"Tile" : 8,
|
| 44 |
-
"Ios17.sliceByIndex" : 34,
|
| 45 |
-
"Ios16.sigmoid" : 17,
|
| 46 |
-
"Pad" : 34,
|
| 47 |
-
"Ios17.logicalNot" : 2,
|
| 48 |
-
"Ios17.layerNorm" : 85,
|
| 49 |
-
"Ios16.silu" : 51,
|
| 50 |
-
"Ios17.less" : 5,
|
| 51 |
-
"Ios17.sub" : 3,
|
| 52 |
-
"Ios17.conv" : 56,
|
| 53 |
-
"Ios16.relu" : 3,
|
| 54 |
-
"Ios17.linear" : 137,
|
| 55 |
-
"Ios17.cast" : 11,
|
| 56 |
-
"Ios17.floorDiv" : 3,
|
| 57 |
-
"Ios17.mul" : 77
|
| 58 |
-
},
|
| 59 |
-
"computePrecision" : "Mixed (Float16, Float32, Int32)",
|
| 60 |
-
"isUpdatable" : "0",
|
| 61 |
-
"stateSchema" : [
|
| 62 |
-
|
| 63 |
-
],
|
| 64 |
-
"availability" : {
|
| 65 |
-
"macOS" : "14.0",
|
| 66 |
-
"tvOS" : "17.0",
|
| 67 |
-
"visionOS" : "1.0",
|
| 68 |
-
"watchOS" : "10.0",
|
| 69 |
-
"iOS" : "17.0",
|
| 70 |
-
"macCatalyst" : "17.0"
|
| 71 |
-
},
|
| 72 |
-
"modelType" : {
|
| 73 |
-
"name" : "MLModelType_mlProgram"
|
| 74 |
-
},
|
| 75 |
-
"inputSchema" : [
|
| 76 |
-
{
|
| 77 |
-
"hasShapeFlexibility" : "0",
|
| 78 |
-
"isOptional" : "0",
|
| 79 |
-
"dataType" : "Float32",
|
| 80 |
-
"formattedType" : "MultiArray (Float32 1 Γ 80 Γ 1501)",
|
| 81 |
-
"shortDescription" : "",
|
| 82 |
-
"shape" : "[1, 80, 1501]",
|
| 83 |
-
"name" : "mel_features",
|
| 84 |
-
"type" : "MultiArray"
|
| 85 |
-
},
|
| 86 |
-
{
|
| 87 |
-
"hasShapeFlexibility" : "0",
|
| 88 |
-
"isOptional" : "0",
|
| 89 |
-
"dataType" : "Int32",
|
| 90 |
-
"formattedType" : "MultiArray (Int32 1)",
|
| 91 |
-
"shortDescription" : "",
|
| 92 |
-
"shape" : "[1]",
|
| 93 |
-
"name" : "mel_length",
|
| 94 |
-
"type" : "MultiArray"
|
| 95 |
-
}
|
| 96 |
-
],
|
| 97 |
-
"userDefinedMetadata" : {
|
| 98 |
-
"com.github.apple.coremltools.source_dialect" : "TorchScript",
|
| 99 |
-
"com.github.apple.coremltools.source" : "torch==2.9.0",
|
| 100 |
-
"com.github.apple.coremltools.version" : "8.3.0"
|
| 101 |
-
},
|
| 102 |
-
"generatedClassName" : "parakeet_encoder",
|
| 103 |
-
"method" : "predict"
|
| 104 |
-
}
|
| 105 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/model.mil
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Encoder.mlmodelc/weights/weight.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:cecf7994b2758397d992802a4f6e5d656e3a1aeb7bbedc2aa430b1316d62474c
|
| 3 |
-
size 215143424
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/analytics/coremldata.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:983ba26dd9276b8d2d4f75f3475aefb1817c542df87dbd0fdac95bd63647494f
|
| 3 |
-
size 243
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/coremldata.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:0800e3bdf4ecb1bd46fd27e1826d33125cd574f9ae1e15dd9ff70ea42944ca2d
|
| 3 |
-
size 476
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/metadata.json
DELETED
|
@@ -1,102 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"metadataOutputVersion" : "3.0",
|
| 4 |
-
"shortDescription" : "Parakeet 110M joint + decision head (split, softmax, argmax)",
|
| 5 |
-
"outputSchema" : [
|
| 6 |
-
{
|
| 7 |
-
"hasShapeFlexibility" : "0",
|
| 8 |
-
"isOptional" : "0",
|
| 9 |
-
"dataType" : "Int32",
|
| 10 |
-
"formattedType" : "MultiArray (Int32 1 Γ 188 Γ 1)",
|
| 11 |
-
"shortDescription" : "",
|
| 12 |
-
"shape" : "[1, 188, 1]",
|
| 13 |
-
"name" : "token_id",
|
| 14 |
-
"type" : "MultiArray"
|
| 15 |
-
},
|
| 16 |
-
{
|
| 17 |
-
"hasShapeFlexibility" : "0",
|
| 18 |
-
"isOptional" : "0",
|
| 19 |
-
"dataType" : "Float32",
|
| 20 |
-
"formattedType" : "MultiArray (Float32 1 Γ 188 Γ 1)",
|
| 21 |
-
"shortDescription" : "",
|
| 22 |
-
"shape" : "[1, 188, 1]",
|
| 23 |
-
"name" : "token_prob",
|
| 24 |
-
"type" : "MultiArray"
|
| 25 |
-
},
|
| 26 |
-
{
|
| 27 |
-
"hasShapeFlexibility" : "0",
|
| 28 |
-
"isOptional" : "0",
|
| 29 |
-
"dataType" : "Int32",
|
| 30 |
-
"formattedType" : "MultiArray (Int32 1 Γ 188 Γ 1)",
|
| 31 |
-
"shortDescription" : "",
|
| 32 |
-
"shape" : "[1, 188, 1]",
|
| 33 |
-
"name" : "duration",
|
| 34 |
-
"type" : "MultiArray"
|
| 35 |
-
}
|
| 36 |
-
],
|
| 37 |
-
"storagePrecision" : "Float16",
|
| 38 |
-
"modelParameters" : [
|
| 39 |
-
|
| 40 |
-
],
|
| 41 |
-
"author" : "Fluid Inference",
|
| 42 |
-
"specificationVersion" : 8,
|
| 43 |
-
"mlProgramOperationTypeHistogram" : {
|
| 44 |
-
"Ios17.reduceArgmax" : 2,
|
| 45 |
-
"Ios17.squeeze" : 1,
|
| 46 |
-
"Ios17.cast" : 4,
|
| 47 |
-
"Ios17.linear" : 3,
|
| 48 |
-
"Ios17.transpose" : 2,
|
| 49 |
-
"Ios17.sliceByIndex" : 2,
|
| 50 |
-
"Ios17.add" : 1,
|
| 51 |
-
"Ios16.relu" : 1,
|
| 52 |
-
"Ios16.softmax" : 1,
|
| 53 |
-
"Ios17.gatherAlongAxis" : 1,
|
| 54 |
-
"Ios17.expandDims" : 3
|
| 55 |
-
},
|
| 56 |
-
"computePrecision" : "Mixed (Float16, Float32, Int16, Int32)",
|
| 57 |
-
"isUpdatable" : "0",
|
| 58 |
-
"stateSchema" : [
|
| 59 |
-
|
| 60 |
-
],
|
| 61 |
-
"availability" : {
|
| 62 |
-
"macOS" : "14.0",
|
| 63 |
-
"tvOS" : "17.0",
|
| 64 |
-
"visionOS" : "1.0",
|
| 65 |
-
"watchOS" : "10.0",
|
| 66 |
-
"iOS" : "17.0",
|
| 67 |
-
"macCatalyst" : "17.0"
|
| 68 |
-
},
|
| 69 |
-
"modelType" : {
|
| 70 |
-
"name" : "MLModelType_mlProgram"
|
| 71 |
-
},
|
| 72 |
-
"inputSchema" : [
|
| 73 |
-
{
|
| 74 |
-
"hasShapeFlexibility" : "0",
|
| 75 |
-
"isOptional" : "0",
|
| 76 |
-
"dataType" : "Float32",
|
| 77 |
-
"formattedType" : "MultiArray (Float32 1 Γ 512 Γ 188)",
|
| 78 |
-
"shortDescription" : "",
|
| 79 |
-
"shape" : "[1, 512, 188]",
|
| 80 |
-
"name" : "encoder",
|
| 81 |
-
"type" : "MultiArray"
|
| 82 |
-
},
|
| 83 |
-
{
|
| 84 |
-
"hasShapeFlexibility" : "0",
|
| 85 |
-
"isOptional" : "0",
|
| 86 |
-
"dataType" : "Float32",
|
| 87 |
-
"formattedType" : "MultiArray (Float32 1 Γ 640 Γ 1)",
|
| 88 |
-
"shortDescription" : "",
|
| 89 |
-
"shape" : "[1, 640, 1]",
|
| 90 |
-
"name" : "decoder",
|
| 91 |
-
"type" : "MultiArray"
|
| 92 |
-
}
|
| 93 |
-
],
|
| 94 |
-
"userDefinedMetadata" : {
|
| 95 |
-
"com.github.apple.coremltools.version" : "8.3.0",
|
| 96 |
-
"com.github.apple.coremltools.source_dialect" : "TorchScript",
|
| 97 |
-
"com.github.apple.coremltools.source" : "torch==2.9.0"
|
| 98 |
-
},
|
| 99 |
-
"generatedClassName" : "parakeet_joint_decision",
|
| 100 |
-
"method" : "predict"
|
| 101 |
-
}
|
| 102 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/model.mil
DELETED
|
@@ -1,58 +0,0 @@
|
|
| 1 |
-
program(1.0)
|
| 2 |
-
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
|
| 3 |
-
{
|
| 4 |
-
func main<ios17>(tensor<fp32, [1, 640, 1]> decoder, tensor<fp32, [1, 512, 188]> encoder) {
|
| 5 |
-
tensor<int32, [3]> input_1_perm_0 = const()[name = tensor<string, []>("input_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
|
| 6 |
-
tensor<string, []> encoder_to_fp16_dtype_0 = const()[name = tensor<string, []>("encoder_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
|
| 7 |
-
tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
|
| 8 |
-
tensor<string, []> decoder_to_fp16_dtype_0 = const()[name = tensor<string, []>("decoder_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
|
| 9 |
-
tensor<fp16, [640, 512]> joint_module_enc_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_weight_to_fp16"), val = tensor<fp16, [640, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
|
| 10 |
-
tensor<fp16, [640]> joint_module_enc_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(655488)))];
|
| 11 |
-
tensor<fp16, [1, 512, 188]> encoder_to_fp16 = cast(dtype = encoder_to_fp16_dtype_0, x = encoder)[name = tensor<string, []>("cast_6")];
|
| 12 |
-
tensor<fp16, [1, 188, 512]> input_1_cast_fp16 = transpose(perm = input_1_perm_0, x = encoder_to_fp16)[name = tensor<string, []>("transpose_1")];
|
| 13 |
-
tensor<fp16, [1, 188, 640]> linear_0_cast_fp16 = linear(bias = joint_module_enc_bias_to_fp16, weight = joint_module_enc_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("linear_0_cast_fp16")];
|
| 14 |
-
tensor<fp16, [640, 640]> joint_module_pred_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_weight_to_fp16"), val = tensor<fp16, [640, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(656832)))];
|
| 15 |
-
tensor<fp16, [640]> joint_module_pred_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1476096)))];
|
| 16 |
-
tensor<fp16, [1, 640, 1]> decoder_to_fp16 = cast(dtype = decoder_to_fp16_dtype_0, x = decoder)[name = tensor<string, []>("cast_5")];
|
| 17 |
-
tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = decoder_to_fp16)[name = tensor<string, []>("transpose_0")];
|
| 18 |
-
tensor<fp16, [1, 1, 640]> linear_1_cast_fp16 = linear(bias = joint_module_pred_bias_to_fp16, weight = joint_module_pred_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("linear_1_cast_fp16")];
|
| 19 |
-
tensor<int32, [1]> var_23_axes_0 = const()[name = tensor<string, []>("op_23_axes_0"), val = tensor<int32, [1]>([2])];
|
| 20 |
-
tensor<fp16, [1, 188, 1, 640]> var_23_cast_fp16 = expand_dims(axes = var_23_axes_0, x = linear_0_cast_fp16)[name = tensor<string, []>("op_23_cast_fp16")];
|
| 21 |
-
tensor<int32, [1]> var_24_axes_0 = const()[name = tensor<string, []>("op_24_axes_0"), val = tensor<int32, [1]>([1])];
|
| 22 |
-
tensor<fp16, [1, 1, 1, 640]> var_24_cast_fp16 = expand_dims(axes = var_24_axes_0, x = linear_1_cast_fp16)[name = tensor<string, []>("op_24_cast_fp16")];
|
| 23 |
-
tensor<fp16, [1, 188, 1, 640]> input_5_cast_fp16 = add(x = var_23_cast_fp16, y = var_24_cast_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
|
| 24 |
-
tensor<fp16, [1, 188, 1, 640]> input_7_cast_fp16 = relu(x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
|
| 25 |
-
tensor<fp16, [1030, 640]> joint_module_joint_net_2_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_weight_to_fp16"), val = tensor<fp16, [1030, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1477440)))];
|
| 26 |
-
tensor<fp16, [1030]> joint_module_joint_net_2_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_bias_to_fp16"), val = tensor<fp16, [1030]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2795904)))];
|
| 27 |
-
tensor<fp16, [1, 188, 1, 1030]> linear_2_cast_fp16 = linear(bias = joint_module_joint_net_2_bias_to_fp16, weight = joint_module_joint_net_2_weight_to_fp16, x = input_7_cast_fp16)[name = tensor<string, []>("linear_2_cast_fp16")];
|
| 28 |
-
tensor<int32, [4]> token_logits_begin_0 = const()[name = tensor<string, []>("token_logits_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
|
| 29 |
-
tensor<int32, [4]> token_logits_end_0 = const()[name = tensor<string, []>("token_logits_end_0"), val = tensor<int32, [4]>([1, 188, 1, 1025])];
|
| 30 |
-
tensor<bool, [4]> token_logits_end_mask_0 = const()[name = tensor<string, []>("token_logits_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
|
| 31 |
-
tensor<fp16, [1, 188, 1, 1025]> token_logits_cast_fp16 = slice_by_index(begin = token_logits_begin_0, end = token_logits_end_0, end_mask = token_logits_end_mask_0, x = linear_2_cast_fp16)[name = tensor<string, []>("token_logits_cast_fp16")];
|
| 32 |
-
tensor<int32, [4]> duration_logits_begin_0 = const()[name = tensor<string, []>("duration_logits_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1025])];
|
| 33 |
-
tensor<int32, [4]> duration_logits_end_0 = const()[name = tensor<string, []>("duration_logits_end_0"), val = tensor<int32, [4]>([1, 188, 1, 1030])];
|
| 34 |
-
tensor<bool, [4]> duration_logits_end_mask_0 = const()[name = tensor<string, []>("duration_logits_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
|
| 35 |
-
tensor<fp16, [1, 188, 1, 5]> duration_logits_cast_fp16 = slice_by_index(begin = duration_logits_begin_0, end = duration_logits_end_0, end_mask = duration_logits_end_mask_0, x = linear_2_cast_fp16)[name = tensor<string, []>("duration_logits_cast_fp16")];
|
| 36 |
-
tensor<int32, []> var_43_axis_0 = const()[name = tensor<string, []>("op_43_axis_0"), val = tensor<int32, []>(-1)];
|
| 37 |
-
tensor<bool, []> var_43_keep_dims_0 = const()[name = tensor<string, []>("op_43_keep_dims_0"), val = tensor<bool, []>(false)];
|
| 38 |
-
tensor<string, []> var_43_output_dtype_0 = const()[name = tensor<string, []>("op_43_output_dtype_0"), val = tensor<string, []>("int32")];
|
| 39 |
-
tensor<int32, [1, 188, 1]> token_id = reduce_argmax(axis = var_43_axis_0, keep_dims = var_43_keep_dims_0, output_dtype = var_43_output_dtype_0, x = token_logits_cast_fp16)[name = tensor<string, []>("op_43_cast_fp16")];
|
| 40 |
-
tensor<int32, []> var_49 = const()[name = tensor<string, []>("op_49"), val = tensor<int32, []>(-1)];
|
| 41 |
-
tensor<fp16, [1, 188, 1, 1025]> token_probs_all_cast_fp16 = softmax(axis = var_49, x = token_logits_cast_fp16)[name = tensor<string, []>("token_probs_all_cast_fp16")];
|
| 42 |
-
tensor<int32, [1]> var_58_axes_0 = const()[name = tensor<string, []>("op_58_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 43 |
-
tensor<int32, [1, 188, 1, 1]> var_58 = expand_dims(axes = var_58_axes_0, x = token_id)[name = tensor<string, []>("op_58")];
|
| 44 |
-
tensor<int32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<int32, []>(-1)];
|
| 45 |
-
tensor<bool, []> var_61_validate_indices_0 = const()[name = tensor<string, []>("op_61_validate_indices_0"), val = tensor<bool, []>(false)];
|
| 46 |
-
tensor<string, []> var_58_to_int16_dtype_0 = const()[name = tensor<string, []>("op_58_to_int16_dtype_0"), val = tensor<string, []>("int16")];
|
| 47 |
-
tensor<int16, [1, 188, 1, 1]> var_58_to_int16 = cast(dtype = var_58_to_int16_dtype_0, x = var_58)[name = tensor<string, []>("cast_4")];
|
| 48 |
-
tensor<fp16, [1, 188, 1, 1]> var_61_cast_fp16_cast_int16 = gather_along_axis(axis = var_59, indices = var_58_to_int16, validate_indices = var_61_validate_indices_0, x = token_probs_all_cast_fp16)[name = tensor<string, []>("op_61_cast_fp16_cast_int16")];
|
| 49 |
-
tensor<int32, [1]> var_63_axes_0 = const()[name = tensor<string, []>("op_63_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 50 |
-
tensor<fp16, [1, 188, 1]> var_63_cast_fp16 = squeeze(axes = var_63_axes_0, x = var_61_cast_fp16_cast_int16)[name = tensor<string, []>("op_63_cast_fp16")];
|
| 51 |
-
tensor<string, []> var_63_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
|
| 52 |
-
tensor<int32, []> var_66_axis_0 = const()[name = tensor<string, []>("op_66_axis_0"), val = tensor<int32, []>(-1)];
|
| 53 |
-
tensor<bool, []> var_66_keep_dims_0 = const()[name = tensor<string, []>("op_66_keep_dims_0"), val = tensor<bool, []>(false)];
|
| 54 |
-
tensor<string, []> var_66_output_dtype_0 = const()[name = tensor<string, []>("op_66_output_dtype_0"), val = tensor<string, []>("int32")];
|
| 55 |
-
tensor<int32, [1, 188, 1]> duration = reduce_argmax(axis = var_66_axis_0, keep_dims = var_66_keep_dims_0, output_dtype = var_66_output_dtype_0, x = duration_logits_cast_fp16)[name = tensor<string, []>("op_66_cast_fp16")];
|
| 56 |
-
tensor<fp32, [1, 188, 1]> token_prob = cast(dtype = var_63_cast_fp16_to_fp32_dtype_0, x = var_63_cast_fp16)[name = tensor<string, []>("cast_3")];
|
| 57 |
-
} -> (token_id, token_prob, duration);
|
| 58 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecision.mlmodelc/weights/weight.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:b3f771cb65b190f1873e39629676ed79b65a8361522f451b37bdba8b1106e6ff
|
| 3 |
-
size 2798028
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/analytics/coremldata.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:c7c11c6bb985fab7f835ba687a575f1eb04f4c93b0783155d634adbc49f0e797
|
| 3 |
-
size 243
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/coremldata.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:1af2cb9bcc13eec83ce006e4f1c2cf158393745cd9187428333fbcb6917da244
|
| 3 |
-
size 535
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/metadata.json
DELETED
|
@@ -1,123 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"metadataOutputVersion" : "3.0",
|
| 4 |
-
"shortDescription" : "Parakeet 110M single-step joint decision (current frame)",
|
| 5 |
-
"outputSchema" : [
|
| 6 |
-
{
|
| 7 |
-
"hasShapeFlexibility" : "0",
|
| 8 |
-
"isOptional" : "0",
|
| 9 |
-
"dataType" : "Int32",
|
| 10 |
-
"formattedType" : "MultiArray (Int32 1 Γ 1 Γ 1)",
|
| 11 |
-
"shortDescription" : "",
|
| 12 |
-
"shape" : "[1, 1, 1]",
|
| 13 |
-
"name" : "token_id",
|
| 14 |
-
"type" : "MultiArray"
|
| 15 |
-
},
|
| 16 |
-
{
|
| 17 |
-
"hasShapeFlexibility" : "0",
|
| 18 |
-
"isOptional" : "0",
|
| 19 |
-
"dataType" : "Float32",
|
| 20 |
-
"formattedType" : "MultiArray (Float32 1 Γ 1 Γ 1)",
|
| 21 |
-
"shortDescription" : "",
|
| 22 |
-
"shape" : "[1, 1, 1]",
|
| 23 |
-
"name" : "token_prob",
|
| 24 |
-
"type" : "MultiArray"
|
| 25 |
-
},
|
| 26 |
-
{
|
| 27 |
-
"hasShapeFlexibility" : "0",
|
| 28 |
-
"isOptional" : "0",
|
| 29 |
-
"dataType" : "Int32",
|
| 30 |
-
"formattedType" : "MultiArray (Int32 1 Γ 1 Γ 1)",
|
| 31 |
-
"shortDescription" : "",
|
| 32 |
-
"shape" : "[1, 1, 1]",
|
| 33 |
-
"name" : "duration",
|
| 34 |
-
"type" : "MultiArray"
|
| 35 |
-
},
|
| 36 |
-
{
|
| 37 |
-
"hasShapeFlexibility" : "0",
|
| 38 |
-
"isOptional" : "0",
|
| 39 |
-
"dataType" : "Int32",
|
| 40 |
-
"formattedType" : "MultiArray (Int32 1 Γ 1 Γ 1 Γ 64)",
|
| 41 |
-
"shortDescription" : "",
|
| 42 |
-
"shape" : "[1, 1, 1, 64]",
|
| 43 |
-
"name" : "top_k_ids",
|
| 44 |
-
"type" : "MultiArray"
|
| 45 |
-
},
|
| 46 |
-
{
|
| 47 |
-
"hasShapeFlexibility" : "0",
|
| 48 |
-
"isOptional" : "0",
|
| 49 |
-
"dataType" : "Float32",
|
| 50 |
-
"formattedType" : "MultiArray (Float32 1 Γ 1 Γ 1 Γ 64)",
|
| 51 |
-
"shortDescription" : "",
|
| 52 |
-
"shape" : "[1, 1, 1, 64]",
|
| 53 |
-
"name" : "top_k_logits",
|
| 54 |
-
"type" : "MultiArray"
|
| 55 |
-
}
|
| 56 |
-
],
|
| 57 |
-
"storagePrecision" : "Float16",
|
| 58 |
-
"modelParameters" : [
|
| 59 |
-
|
| 60 |
-
],
|
| 61 |
-
"author" : "Fluid Inference",
|
| 62 |
-
"specificationVersion" : 8,
|
| 63 |
-
"mlProgramOperationTypeHistogram" : {
|
| 64 |
-
"Ios17.reduceArgmax" : 2,
|
| 65 |
-
"Ios17.linear" : 3,
|
| 66 |
-
"Ios17.transpose" : 2,
|
| 67 |
-
"Ios17.sliceByIndex" : 2,
|
| 68 |
-
"Ios17.add" : 1,
|
| 69 |
-
"Ios17.topk" : 1,
|
| 70 |
-
"Ios16.relu" : 1,
|
| 71 |
-
"Ios16.softmax" : 1,
|
| 72 |
-
"Ios17.expandDims" : 3,
|
| 73 |
-
"Ios17.squeeze" : 1,
|
| 74 |
-
"Ios17.cast" : 6,
|
| 75 |
-
"Ios17.gatherAlongAxis" : 1
|
| 76 |
-
},
|
| 77 |
-
"computePrecision" : "Mixed (Float16, Float32, Int16, Int32, UInt16)",
|
| 78 |
-
"isUpdatable" : "0",
|
| 79 |
-
"stateSchema" : [
|
| 80 |
-
|
| 81 |
-
],
|
| 82 |
-
"availability" : {
|
| 83 |
-
"macOS" : "14.0",
|
| 84 |
-
"tvOS" : "17.0",
|
| 85 |
-
"visionOS" : "1.0",
|
| 86 |
-
"watchOS" : "10.0",
|
| 87 |
-
"iOS" : "17.0",
|
| 88 |
-
"macCatalyst" : "17.0"
|
| 89 |
-
},
|
| 90 |
-
"modelType" : {
|
| 91 |
-
"name" : "MLModelType_mlProgram"
|
| 92 |
-
},
|
| 93 |
-
"inputSchema" : [
|
| 94 |
-
{
|
| 95 |
-
"hasShapeFlexibility" : "0",
|
| 96 |
-
"isOptional" : "0",
|
| 97 |
-
"dataType" : "Float32",
|
| 98 |
-
"formattedType" : "MultiArray (Float32 1 Γ 512 Γ 1)",
|
| 99 |
-
"shortDescription" : "",
|
| 100 |
-
"shape" : "[1, 512, 1]",
|
| 101 |
-
"name" : "encoder_step",
|
| 102 |
-
"type" : "MultiArray"
|
| 103 |
-
},
|
| 104 |
-
{
|
| 105 |
-
"hasShapeFlexibility" : "0",
|
| 106 |
-
"isOptional" : "0",
|
| 107 |
-
"dataType" : "Float32",
|
| 108 |
-
"formattedType" : "MultiArray (Float32 1 Γ 640 Γ 1)",
|
| 109 |
-
"shortDescription" : "",
|
| 110 |
-
"shape" : "[1, 640, 1]",
|
| 111 |
-
"name" : "decoder_step",
|
| 112 |
-
"type" : "MultiArray"
|
| 113 |
-
}
|
| 114 |
-
],
|
| 115 |
-
"userDefinedMetadata" : {
|
| 116 |
-
"com.github.apple.coremltools.source_dialect" : "TorchScript",
|
| 117 |
-
"com.github.apple.coremltools.source" : "torch==2.9.0",
|
| 118 |
-
"com.github.apple.coremltools.version" : "8.3.0"
|
| 119 |
-
},
|
| 120 |
-
"generatedClassName" : "parakeet_joint_decision_single_step",
|
| 121 |
-
"method" : "predict"
|
| 122 |
-
}
|
| 123 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/model.mil
DELETED
|
@@ -1,69 +0,0 @@
|
|
| 1 |
-
program(1.0)
|
| 2 |
-
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
|
| 3 |
-
{
|
| 4 |
-
func main<ios17>(tensor<fp32, [1, 640, 1]> decoder_step, tensor<fp32, [1, 512, 1]> encoder_step) {
|
| 5 |
-
tensor<int32, [3]> input_1_perm_0 = const()[name = tensor<string, []>("input_1_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
|
| 6 |
-
tensor<string, []> encoder_step_to_fp16_dtype_0 = const()[name = tensor<string, []>("encoder_step_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
|
| 7 |
-
tensor<int32, [3]> input_3_perm_0 = const()[name = tensor<string, []>("input_3_perm_0"), val = tensor<int32, [3]>([0, 2, 1])];
|
| 8 |
-
tensor<string, []> decoder_step_to_fp16_dtype_0 = const()[name = tensor<string, []>("decoder_step_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
|
| 9 |
-
tensor<fp16, [640, 512]> joint_module_enc_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_weight_to_fp16"), val = tensor<fp16, [640, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
|
| 10 |
-
tensor<fp16, [640]> joint_module_enc_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_enc_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(655488)))];
|
| 11 |
-
tensor<fp16, [1, 512, 1]> encoder_step_to_fp16 = cast(dtype = encoder_step_to_fp16_dtype_0, x = encoder_step)[name = tensor<string, []>("cast_9")];
|
| 12 |
-
tensor<fp16, [1, 1, 512]> input_1_cast_fp16 = transpose(perm = input_1_perm_0, x = encoder_step_to_fp16)[name = tensor<string, []>("transpose_1")];
|
| 13 |
-
tensor<fp16, [1, 1, 640]> linear_0_cast_fp16 = linear(bias = joint_module_enc_bias_to_fp16, weight = joint_module_enc_weight_to_fp16, x = input_1_cast_fp16)[name = tensor<string, []>("linear_0_cast_fp16")];
|
| 14 |
-
tensor<fp16, [640, 640]> joint_module_pred_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_weight_to_fp16"), val = tensor<fp16, [640, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(656832)))];
|
| 15 |
-
tensor<fp16, [640]> joint_module_pred_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_pred_bias_to_fp16"), val = tensor<fp16, [640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1476096)))];
|
| 16 |
-
tensor<fp16, [1, 640, 1]> decoder_step_to_fp16 = cast(dtype = decoder_step_to_fp16_dtype_0, x = decoder_step)[name = tensor<string, []>("cast_8")];
|
| 17 |
-
tensor<fp16, [1, 1, 640]> input_3_cast_fp16 = transpose(perm = input_3_perm_0, x = decoder_step_to_fp16)[name = tensor<string, []>("transpose_0")];
|
| 18 |
-
tensor<fp16, [1, 1, 640]> linear_1_cast_fp16 = linear(bias = joint_module_pred_bias_to_fp16, weight = joint_module_pred_weight_to_fp16, x = input_3_cast_fp16)[name = tensor<string, []>("linear_1_cast_fp16")];
|
| 19 |
-
tensor<int32, [1]> var_23_axes_0 = const()[name = tensor<string, []>("op_23_axes_0"), val = tensor<int32, [1]>([2])];
|
| 20 |
-
tensor<fp16, [1, 1, 1, 640]> var_23_cast_fp16 = expand_dims(axes = var_23_axes_0, x = linear_0_cast_fp16)[name = tensor<string, []>("op_23_cast_fp16")];
|
| 21 |
-
tensor<int32, [1]> var_24_axes_0 = const()[name = tensor<string, []>("op_24_axes_0"), val = tensor<int32, [1]>([1])];
|
| 22 |
-
tensor<fp16, [1, 1, 1, 640]> var_24_cast_fp16 = expand_dims(axes = var_24_axes_0, x = linear_1_cast_fp16)[name = tensor<string, []>("op_24_cast_fp16")];
|
| 23 |
-
tensor<fp16, [1, 1, 1, 640]> input_5_cast_fp16 = add(x = var_23_cast_fp16, y = var_24_cast_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
|
| 24 |
-
tensor<fp16, [1, 1, 1, 640]> input_7_cast_fp16 = relu(x = input_5_cast_fp16)[name = tensor<string, []>("input_7_cast_fp16")];
|
| 25 |
-
tensor<fp16, [1030, 640]> joint_module_joint_net_2_weight_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_weight_to_fp16"), val = tensor<fp16, [1030, 640]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(1477440)))];
|
| 26 |
-
tensor<fp16, [1030]> joint_module_joint_net_2_bias_to_fp16 = const()[name = tensor<string, []>("joint_module_joint_net_2_bias_to_fp16"), val = tensor<fp16, [1030]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(2795904)))];
|
| 27 |
-
tensor<fp16, [1, 1, 1, 1030]> linear_2_cast_fp16 = linear(bias = joint_module_joint_net_2_bias_to_fp16, weight = joint_module_joint_net_2_weight_to_fp16, x = input_7_cast_fp16)[name = tensor<string, []>("linear_2_cast_fp16")];
|
| 28 |
-
tensor<int32, [4]> token_logits_begin_0 = const()[name = tensor<string, []>("token_logits_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 0])];
|
| 29 |
-
tensor<int32, [4]> token_logits_end_0 = const()[name = tensor<string, []>("token_logits_end_0"), val = tensor<int32, [4]>([1, 1, 1, 1025])];
|
| 30 |
-
tensor<bool, [4]> token_logits_end_mask_0 = const()[name = tensor<string, []>("token_logits_end_mask_0"), val = tensor<bool, [4]>([true, true, true, false])];
|
| 31 |
-
tensor<fp16, [1, 1, 1, 1025]> token_logits_cast_fp16 = slice_by_index(begin = token_logits_begin_0, end = token_logits_end_0, end_mask = token_logits_end_mask_0, x = linear_2_cast_fp16)[name = tensor<string, []>("token_logits_cast_fp16")];
|
| 32 |
-
tensor<int32, [4]> duration_logits_begin_0 = const()[name = tensor<string, []>("duration_logits_begin_0"), val = tensor<int32, [4]>([0, 0, 0, 1025])];
|
| 33 |
-
tensor<int32, [4]> duration_logits_end_0 = const()[name = tensor<string, []>("duration_logits_end_0"), val = tensor<int32, [4]>([1, 1, 1, 1030])];
|
| 34 |
-
tensor<bool, [4]> duration_logits_end_mask_0 = const()[name = tensor<string, []>("duration_logits_end_mask_0"), val = tensor<bool, [4]>([true, true, true, true])];
|
| 35 |
-
tensor<fp16, [1, 1, 1, 5]> duration_logits_cast_fp16 = slice_by_index(begin = duration_logits_begin_0, end = duration_logits_end_0, end_mask = duration_logits_end_mask_0, x = linear_2_cast_fp16)[name = tensor<string, []>("duration_logits_cast_fp16")];
|
| 36 |
-
tensor<int32, []> var_43_axis_0 = const()[name = tensor<string, []>("op_43_axis_0"), val = tensor<int32, []>(-1)];
|
| 37 |
-
tensor<bool, []> var_43_keep_dims_0 = const()[name = tensor<string, []>("op_43_keep_dims_0"), val = tensor<bool, []>(false)];
|
| 38 |
-
tensor<string, []> var_43_output_dtype_0 = const()[name = tensor<string, []>("op_43_output_dtype_0"), val = tensor<string, []>("int32")];
|
| 39 |
-
tensor<int32, [1, 1, 1]> token_id = reduce_argmax(axis = var_43_axis_0, keep_dims = var_43_keep_dims_0, output_dtype = var_43_output_dtype_0, x = token_logits_cast_fp16)[name = tensor<string, []>("op_43_cast_fp16")];
|
| 40 |
-
tensor<int32, []> var_49 = const()[name = tensor<string, []>("op_49"), val = tensor<int32, []>(-1)];
|
| 41 |
-
tensor<fp16, [1, 1, 1, 1025]> token_probs_all_cast_fp16 = softmax(axis = var_49, x = token_logits_cast_fp16)[name = tensor<string, []>("token_probs_all_cast_fp16")];
|
| 42 |
-
tensor<int32, [1]> var_58_axes_0 = const()[name = tensor<string, []>("op_58_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 43 |
-
tensor<int32, [1, 1, 1, 1]> var_58 = expand_dims(axes = var_58_axes_0, x = token_id)[name = tensor<string, []>("op_58")];
|
| 44 |
-
tensor<int32, []> var_59 = const()[name = tensor<string, []>("op_59"), val = tensor<int32, []>(-1)];
|
| 45 |
-
tensor<bool, []> var_61_validate_indices_0 = const()[name = tensor<string, []>("op_61_validate_indices_0"), val = tensor<bool, []>(false)];
|
| 46 |
-
tensor<string, []> var_58_to_int16_dtype_0 = const()[name = tensor<string, []>("op_58_to_int16_dtype_0"), val = tensor<string, []>("int16")];
|
| 47 |
-
tensor<int16, [1, 1, 1, 1]> var_58_to_int16 = cast(dtype = var_58_to_int16_dtype_0, x = var_58)[name = tensor<string, []>("cast_7")];
|
| 48 |
-
tensor<fp16, [1, 1, 1, 1]> var_61_cast_fp16_cast_int16 = gather_along_axis(axis = var_59, indices = var_58_to_int16, validate_indices = var_61_validate_indices_0, x = token_probs_all_cast_fp16)[name = tensor<string, []>("op_61_cast_fp16_cast_int16")];
|
| 49 |
-
tensor<int32, [1]> var_63_axes_0 = const()[name = tensor<string, []>("op_63_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 50 |
-
tensor<fp16, [1, 1, 1]> var_63_cast_fp16 = squeeze(axes = var_63_axes_0, x = var_61_cast_fp16_cast_int16)[name = tensor<string, []>("op_63_cast_fp16")];
|
| 51 |
-
tensor<string, []> var_63_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_63_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
|
| 52 |
-
tensor<int32, []> var_66_axis_0 = const()[name = tensor<string, []>("op_66_axis_0"), val = tensor<int32, []>(-1)];
|
| 53 |
-
tensor<bool, []> var_66_keep_dims_0 = const()[name = tensor<string, []>("op_66_keep_dims_0"), val = tensor<bool, []>(false)];
|
| 54 |
-
tensor<string, []> var_66_output_dtype_0 = const()[name = tensor<string, []>("op_66_output_dtype_0"), val = tensor<string, []>("int32")];
|
| 55 |
-
tensor<int32, [1, 1, 1]> duration = reduce_argmax(axis = var_66_axis_0, keep_dims = var_66_keep_dims_0, output_dtype = var_66_output_dtype_0, x = duration_logits_cast_fp16)[name = tensor<string, []>("op_66_cast_fp16")];
|
| 56 |
-
tensor<int32, []> var_72 = const()[name = tensor<string, []>("op_72"), val = tensor<int32, []>(64)];
|
| 57 |
-
tensor<int32, []> var_76_axis_0 = const()[name = tensor<string, []>("op_76_axis_0"), val = tensor<int32, []>(-1)];
|
| 58 |
-
tensor<bool, []> var_76_ascending_0 = const()[name = tensor<string, []>("op_76_ascending_0"), val = tensor<bool, []>(false)];
|
| 59 |
-
tensor<bool, []> var_76_sort_0 = const()[name = tensor<string, []>("op_76_sort_0"), val = tensor<bool, []>(true)];
|
| 60 |
-
tensor<bool, []> var_76_return_indices_0 = const()[name = tensor<string, []>("op_76_return_indices_0"), val = tensor<bool, []>(true)];
|
| 61 |
-
tensor<string, []> var_76_cast_fp16_cast_int16_output_indices_dtype_0 = const()[name = tensor<string, []>("op_76_cast_fp16_cast_int16_output_indices_dtype_0"), val = tensor<string, []>("uint16")];
|
| 62 |
-
tensor<fp16, [1, 1, 1, 64]> var_76_cast_fp16_cast_int16_0, tensor<uint16, [1, 1, 1, 64]> var_76_cast_fp16_cast_int16_1 = topk(ascending = var_76_ascending_0, axis = var_76_axis_0, k = var_72, output_indices_dtype = var_76_cast_fp16_cast_int16_output_indices_dtype_0, return_indices = var_76_return_indices_0, sort = var_76_sort_0, x = token_logits_cast_fp16)[name = tensor<string, []>("op_76_cast_fp16_cast_int16")];
|
| 63 |
-
tensor<string, []> var_76_cast_fp16_cast_int16_1_to_int32_dtype_0 = const()[name = tensor<string, []>("op_76_cast_fp16_cast_int16_1_to_int32_dtype_0"), val = tensor<string, []>("int32")];
|
| 64 |
-
tensor<string, []> var_76_cast_fp16_0_to_fp32_dtype_0 = const()[name = tensor<string, []>("op_76_cast_fp16_0_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
|
| 65 |
-
tensor<fp32, [1, 1, 1, 64]> top_k_logits = cast(dtype = var_76_cast_fp16_0_to_fp32_dtype_0, x = var_76_cast_fp16_cast_int16_0)[name = tensor<string, []>("cast_4")];
|
| 66 |
-
tensor<int32, [1, 1, 1, 64]> top_k_ids = cast(dtype = var_76_cast_fp16_cast_int16_1_to_int32_dtype_0, x = var_76_cast_fp16_cast_int16_1)[name = tensor<string, []>("cast_5")];
|
| 67 |
-
tensor<fp32, [1, 1, 1]> token_prob = cast(dtype = var_63_cast_fp16_to_fp32_dtype_0, x = var_63_cast_fp16)[name = tensor<string, []>("cast_6")];
|
| 68 |
-
} -> (token_id, token_prob, duration, top_k_ids, top_k_logits);
|
| 69 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/JointDecisionSingleStep.mlmodelc/weights/weight.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:b3f771cb65b190f1873e39629676ed79b65a8361522f451b37bdba8b1106e6ff
|
| 3 |
-
size 2798028
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/analytics/coremldata.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:a1ac15543fbb9301fba5f018b147e44d767479dec352aaa91dfe7bcf65949693
|
| 3 |
-
size 243
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/coremldata.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:4940877938cc1b6d8830bbdd68ac8a49377cc57d75b61308883da5235b6a1914
|
| 3 |
-
size 439
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/metadata.json
DELETED
|
@@ -1,112 +0,0 @@
|
|
| 1 |
-
[
|
| 2 |
-
{
|
| 3 |
-
"metadataOutputVersion" : "3.0",
|
| 4 |
-
"shortDescription" : "Parakeet 110M preprocessor (15 s window)",
|
| 5 |
-
"outputSchema" : [
|
| 6 |
-
{
|
| 7 |
-
"hasShapeFlexibility" : "0",
|
| 8 |
-
"isOptional" : "0",
|
| 9 |
-
"dataType" : "Float32",
|
| 10 |
-
"formattedType" : "MultiArray (Float32)",
|
| 11 |
-
"shortDescription" : "",
|
| 12 |
-
"shape" : "[]",
|
| 13 |
-
"name" : "mel_features",
|
| 14 |
-
"type" : "MultiArray"
|
| 15 |
-
},
|
| 16 |
-
{
|
| 17 |
-
"hasShapeFlexibility" : "0",
|
| 18 |
-
"isOptional" : "0",
|
| 19 |
-
"dataType" : "Int32",
|
| 20 |
-
"formattedType" : "MultiArray (Int32 1)",
|
| 21 |
-
"shortDescription" : "",
|
| 22 |
-
"shape" : "[1]",
|
| 23 |
-
"name" : "mel_length",
|
| 24 |
-
"type" : "MultiArray"
|
| 25 |
-
}
|
| 26 |
-
],
|
| 27 |
-
"storagePrecision" : "Float16",
|
| 28 |
-
"modelParameters" : [
|
| 29 |
-
|
| 30 |
-
],
|
| 31 |
-
"author" : "Fluid Inference",
|
| 32 |
-
"specificationVersion" : 8,
|
| 33 |
-
"mlProgramOperationTypeHistogram" : {
|
| 34 |
-
"Range1d" : 3,
|
| 35 |
-
"Ios17.equal" : 1,
|
| 36 |
-
"Ios17.notEqual" : 1,
|
| 37 |
-
"Ios17.reshape" : 2,
|
| 38 |
-
"Identity" : 1,
|
| 39 |
-
"Ios17.matmul" : 1,
|
| 40 |
-
"Select" : 6,
|
| 41 |
-
"Ios17.expandDims" : 12,
|
| 42 |
-
"Ios17.add" : 3,
|
| 43 |
-
"Tile" : 2,
|
| 44 |
-
"Ios17.sliceByIndex" : 3,
|
| 45 |
-
"Ios16.reduceSum" : 4,
|
| 46 |
-
"Shape" : 4,
|
| 47 |
-
"Ios17.gather" : 4,
|
| 48 |
-
"Ios17.logicalNot" : 1,
|
| 49 |
-
"Pad" : 1,
|
| 50 |
-
"Ios17.log" : 1,
|
| 51 |
-
"Ios17.less" : 2,
|
| 52 |
-
"Ios17.sub" : 4,
|
| 53 |
-
"Ios17.conv" : 2,
|
| 54 |
-
"Ios17.pow" : 2,
|
| 55 |
-
"Ios17.cast" : 10,
|
| 56 |
-
"Ios17.concat" : 3,
|
| 57 |
-
"Stack" : 1,
|
| 58 |
-
"Ios17.floorDiv" : 1,
|
| 59 |
-
"Ios17.realDiv" : 4,
|
| 60 |
-
"Ios17.sqrt" : 1,
|
| 61 |
-
"Ios17.greaterEqual" : 1,
|
| 62 |
-
"Ios17.mul" : 1
|
| 63 |
-
},
|
| 64 |
-
"computePrecision" : "Mixed (Float16, Float32, Int16, Int32, UInt16)",
|
| 65 |
-
"isUpdatable" : "0",
|
| 66 |
-
"stateSchema" : [
|
| 67 |
-
|
| 68 |
-
],
|
| 69 |
-
"availability" : {
|
| 70 |
-
"macOS" : "14.0",
|
| 71 |
-
"tvOS" : "17.0",
|
| 72 |
-
"visionOS" : "1.0",
|
| 73 |
-
"watchOS" : "10.0",
|
| 74 |
-
"iOS" : "17.0",
|
| 75 |
-
"macCatalyst" : "17.0"
|
| 76 |
-
},
|
| 77 |
-
"modelType" : {
|
| 78 |
-
"name" : "MLModelType_mlProgram"
|
| 79 |
-
},
|
| 80 |
-
"inputSchema" : [
|
| 81 |
-
{
|
| 82 |
-
"dataType" : "Float32",
|
| 83 |
-
"hasShapeFlexibility" : "1",
|
| 84 |
-
"isOptional" : "0",
|
| 85 |
-
"shapeFlexibility" : "1 Γ 1...240000",
|
| 86 |
-
"shapeRange" : "[[1, 1], [1, 240000]]",
|
| 87 |
-
"formattedType" : "MultiArray (Float32 1 Γ 1)",
|
| 88 |
-
"type" : "MultiArray",
|
| 89 |
-
"shape" : "[1, 1]",
|
| 90 |
-
"name" : "audio",
|
| 91 |
-
"shortDescription" : ""
|
| 92 |
-
},
|
| 93 |
-
{
|
| 94 |
-
"hasShapeFlexibility" : "0",
|
| 95 |
-
"isOptional" : "0",
|
| 96 |
-
"dataType" : "Int32",
|
| 97 |
-
"formattedType" : "MultiArray (Int32 1)",
|
| 98 |
-
"shortDescription" : "",
|
| 99 |
-
"shape" : "[1]",
|
| 100 |
-
"name" : "audio_length",
|
| 101 |
-
"type" : "MultiArray"
|
| 102 |
-
}
|
| 103 |
-
],
|
| 104 |
-
"userDefinedMetadata" : {
|
| 105 |
-
"com.github.apple.coremltools.source_dialect" : "TorchScript",
|
| 106 |
-
"com.github.apple.coremltools.source" : "torch==2.9.0",
|
| 107 |
-
"com.github.apple.coremltools.version" : "8.3.0"
|
| 108 |
-
},
|
| 109 |
-
"generatedClassName" : "parakeet_preprocessor",
|
| 110 |
-
"method" : "predict"
|
| 111 |
-
}
|
| 112 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/model.mil
DELETED
|
@@ -1,191 +0,0 @@
|
|
| 1 |
-
program(1.0)
|
| 2 |
-
[buildInfo = dict<tensor<string, []>, tensor<string, []>>({{"coremlc-component-MIL", "3510.2.1"}, {"coremlc-version", "3500.32.1"}, {"coremltools-component-torch", "2.9.0"}, {"coremltools-source-dialect", "TorchScript"}, {"coremltools-version", "8.3.0"}})]
|
| 3 |
-
{
|
| 4 |
-
func main<ios17>(tensor<fp32, [1, ?]> audio, tensor<int32, [1]> audio_length) [FlexibleShapeInformation = tuple<tuple<tensor<string, []>, dict<tensor<string, []>, tensor<int32, [?]>>>, tuple<tensor<string, []>, dict<tensor<string, []>, list<tensor<int32, [2]>, ?>>>>((("DefaultShapes", {{"audio", [1, 1]}}), ("RangeDims", {{"audio", [[1, 1], [1, 240000]]}})))] {
|
| 5 |
-
tensor<int32, []> var_9 = const()[name = tensor<string, []>("op_9"), val = tensor<int32, []>(1)];
|
| 6 |
-
tensor<int32, []> var_10 = const()[name = tensor<string, []>("op_10"), val = tensor<int32, []>(160)];
|
| 7 |
-
tensor<int32, []> var_12 = const()[name = tensor<string, []>("op_12"), val = tensor<int32, []>(0)];
|
| 8 |
-
tensor<int32, []> var_34 = const()[name = tensor<string, []>("op_34"), val = tensor<int32, []>(512)];
|
| 9 |
-
tensor<int32, [1]> var_35 = add(x = audio_length, y = var_34)[name = tensor<string, []>("op_35")];
|
| 10 |
-
tensor<int32, []> var_36 = const()[name = tensor<string, []>("op_36"), val = tensor<int32, []>(512)];
|
| 11 |
-
tensor<int32, [1]> var_37 = sub(x = var_35, y = var_36)[name = tensor<string, []>("op_37")];
|
| 12 |
-
tensor<int32, [1]> floor_div_0 = floor_div(x = var_37, y = var_10)[name = tensor<string, []>("floor_div_0")];
|
| 13 |
-
tensor<bool, [1]> var_40 = equal(x = audio_length, y = var_12)[name = tensor<string, []>("op_40")];
|
| 14 |
-
tensor<int32, [1]> var_41 = const()[name = tensor<string, []>("op_41"), val = tensor<int32, [1]>([0])];
|
| 15 |
-
tensor<int32, [1]> mel_length = select(a = var_41, b = floor_div_0, cond = var_40)[name = tensor<string, []>("seq_len")];
|
| 16 |
-
tensor<string, []> audio_to_fp16_dtype_0 = const()[name = tensor<string, []>("audio_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
|
| 17 |
-
tensor<fp16, [1, ?]> audio_to_fp16 = cast(dtype = audio_to_fp16_dtype_0, x = audio)[name = tensor<string, []>("cast_27")];
|
| 18 |
-
tensor<int32, [2]> var_43_shape_cast_fp16 = shape(x = audio_to_fp16)[name = tensor<string, []>("op_43_shape_cast_fp16")];
|
| 19 |
-
tensor<int32, []> gather_0_axis_0 = const()[name = tensor<string, []>("gather_0_axis_0"), val = tensor<int32, []>(0)];
|
| 20 |
-
tensor<int32, []> gather_0_batch_dims_0 = const()[name = tensor<string, []>("gather_0_batch_dims_0"), val = tensor<int32, []>(0)];
|
| 21 |
-
tensor<bool, []> gather_0_validate_indices_0 = const()[name = tensor<string, []>("gather_0_validate_indices_0"), val = tensor<bool, []>(false)];
|
| 22 |
-
tensor<string, []> var_43_shape_cast_fp16_to_int16_dtype_0 = const()[name = tensor<string, []>("op_43_shape_cast_fp16_to_int16_dtype_0"), val = tensor<string, []>("int16")];
|
| 23 |
-
tensor<uint16, []> select_0_to_uint16 = const()[name = tensor<string, []>("select_0_to_uint16"), val = tensor<uint16, []>(1)];
|
| 24 |
-
tensor<int16, [2]> var_43_shape_cast_fp16_to_int16 = cast(dtype = var_43_shape_cast_fp16_to_int16_dtype_0, x = var_43_shape_cast_fp16)[name = tensor<string, []>("cast_26")];
|
| 25 |
-
tensor<int16, []> gather_0_cast_uint16 = gather(axis = gather_0_axis_0, batch_dims = gather_0_batch_dims_0, indices = select_0_to_uint16, validate_indices = gather_0_validate_indices_0, x = var_43_shape_cast_fp16_to_int16)[name = tensor<string, []>("gather_0_cast_uint16")];
|
| 26 |
-
tensor<string, []> gather_0_cast_uint16_to_int32_dtype_0 = const()[name = tensor<string, []>("gather_0_cast_uint16_to_int32_dtype_0"), val = tensor<string, []>("int32")];
|
| 27 |
-
tensor<int32, []> const_0 = const()[name = tensor<string, []>("const_0"), val = tensor<int32, []>(0)];
|
| 28 |
-
tensor<int32, []> const_1 = const()[name = tensor<string, []>("const_1"), val = tensor<int32, []>(1)];
|
| 29 |
-
tensor<int32, []> gather_0_cast_uint16_to_int32 = cast(dtype = gather_0_cast_uint16_to_int32_dtype_0, x = gather_0_cast_uint16)[name = tensor<string, []>("cast_25")];
|
| 30 |
-
tensor<int32, [?]> var_44 = range_1d(end = gather_0_cast_uint16_to_int32, start = const_0, step = const_1)[name = tensor<string, []>("op_44")];
|
| 31 |
-
tensor<int32, [1]> var_45_axes_0 = const()[name = tensor<string, []>("op_45_axes_0"), val = tensor<int32, [1]>([0])];
|
| 32 |
-
tensor<int32, [1, ?]> var_45 = expand_dims(axes = var_45_axes_0, x = var_44)[name = tensor<string, []>("op_45")];
|
| 33 |
-
tensor<int32, [1]> var_46_axes_0 = const()[name = tensor<string, []>("op_46_axes_0"), val = tensor<int32, [1]>([1])];
|
| 34 |
-
tensor<int32, [1, 1]> var_46 = expand_dims(axes = var_46_axes_0, x = audio_length)[name = tensor<string, []>("op_46")];
|
| 35 |
-
tensor<bool, [1, ?]> timemask = less(x = var_45, y = var_46)[name = tensor<string, []>("timemask")];
|
| 36 |
-
tensor<int32, [2]> var_49_begin_0 = const()[name = tensor<string, []>("op_49_begin_0"), val = tensor<int32, [2]>([0, 0])];
|
| 37 |
-
tensor<int32, [2]> var_49_end_0 = const()[name = tensor<string, []>("op_49_end_0"), val = tensor<int32, [2]>([1, 1])];
|
| 38 |
-
tensor<bool, [2]> var_49_end_mask_0 = const()[name = tensor<string, []>("op_49_end_mask_0"), val = tensor<bool, [2]>([true, false])];
|
| 39 |
-
tensor<bool, [2]> var_49_squeeze_mask_0 = const()[name = tensor<string, []>("op_49_squeeze_mask_0"), val = tensor<bool, [2]>([false, true])];
|
| 40 |
-
tensor<fp16, [1]> var_49_cast_fp16 = slice_by_index(begin = var_49_begin_0, end = var_49_end_0, end_mask = var_49_end_mask_0, squeeze_mask = var_49_squeeze_mask_0, x = audio_to_fp16)[name = tensor<string, []>("op_49_cast_fp16")];
|
| 41 |
-
tensor<int32, [1]> var_50_axes_0 = const()[name = tensor<string, []>("op_50_axes_0"), val = tensor<int32, [1]>([1])];
|
| 42 |
-
tensor<fp16, [1, 1]> var_50_cast_fp16 = expand_dims(axes = var_50_axes_0, x = var_49_cast_fp16)[name = tensor<string, []>("op_50_cast_fp16")];
|
| 43 |
-
tensor<int32, [2]> var_52_begin_0 = const()[name = tensor<string, []>("op_52_begin_0"), val = tensor<int32, [2]>([0, 1])];
|
| 44 |
-
tensor<int32, [2]> var_52_end_0 = const()[name = tensor<string, []>("op_52_end_0"), val = tensor<int32, [2]>([1, 0])];
|
| 45 |
-
tensor<bool, [2]> var_52_end_mask_0 = const()[name = tensor<string, []>("op_52_end_mask_0"), val = tensor<bool, [2]>([true, true])];
|
| 46 |
-
tensor<fp16, [1, ?]> var_52_cast_fp16 = slice_by_index(begin = var_52_begin_0, end = var_52_end_0, end_mask = var_52_end_mask_0, x = audio_to_fp16)[name = tensor<string, []>("op_52_cast_fp16")];
|
| 47 |
-
tensor<int32, [2]> var_54_begin_0 = const()[name = tensor<string, []>("op_54_begin_0"), val = tensor<int32, [2]>([0, 0])];
|
| 48 |
-
tensor<int32, [2]> var_54_end_0 = const()[name = tensor<string, []>("op_54_end_0"), val = tensor<int32, [2]>([1, -1])];
|
| 49 |
-
tensor<bool, [2]> var_54_end_mask_0 = const()[name = tensor<string, []>("op_54_end_mask_0"), val = tensor<bool, [2]>([true, false])];
|
| 50 |
-
tensor<fp16, [1, ?]> var_54_cast_fp16 = slice_by_index(begin = var_54_begin_0, end = var_54_end_0, end_mask = var_54_end_mask_0, x = audio_to_fp16)[name = tensor<string, []>("op_54_cast_fp16")];
|
| 51 |
-
tensor<fp16, []> var_55_to_fp16 = const()[name = tensor<string, []>("op_55_to_fp16"), val = tensor<fp16, []>(0x1.f0cp-1)];
|
| 52 |
-
tensor<fp16, [1, ?]> var_56_cast_fp16 = mul(x = var_54_cast_fp16, y = var_55_to_fp16)[name = tensor<string, []>("op_56_cast_fp16")];
|
| 53 |
-
tensor<fp16, [1, ?]> var_57_cast_fp16 = sub(x = var_52_cast_fp16, y = var_56_cast_fp16)[name = tensor<string, []>("op_57_cast_fp16")];
|
| 54 |
-
tensor<bool, []> x_3_interleave_0 = const()[name = tensor<string, []>("x_3_interleave_0"), val = tensor<bool, []>(false)];
|
| 55 |
-
tensor<fp16, [1, ?]> x_3_cast_fp16 = concat(axis = var_9, interleave = x_3_interleave_0, values = (var_50_cast_fp16, var_57_cast_fp16))[name = tensor<string, []>("x_3_cast_fp16")];
|
| 56 |
-
tensor<bool, [1, ?]> var_60 = logical_not(x = timemask)[name = tensor<string, []>("op_60")];
|
| 57 |
-
tensor<fp16, []> var_16_to_fp16 = const()[name = tensor<string, []>("op_16_to_fp16"), val = tensor<fp16, []>(0x0p+0)];
|
| 58 |
-
tensor<fp16, [1, ?]> input_1_cast_fp16 = select(a = var_16_to_fp16, b = x_3_cast_fp16, cond = var_60)[name = tensor<string, []>("input_1_cast_fp16")];
|
| 59 |
-
tensor<int32, [3]> concat_1x = const()[name = tensor<string, []>("concat_1x"), val = tensor<int32, [3]>([1, 1, -1])];
|
| 60 |
-
tensor<fp16, [1, 1, ?]> input_3_cast_fp16 = reshape(shape = concat_1x, x = input_1_cast_fp16)[name = tensor<string, []>("input_3_cast_fp16")];
|
| 61 |
-
tensor<int32, [6]> input_5_pad_0 = const()[name = tensor<string, []>("input_5_pad_0"), val = tensor<int32, [6]>([0, 0, 0, 0, 256, 256])];
|
| 62 |
-
tensor<string, []> input_5_mode_0 = const()[name = tensor<string, []>("input_5_mode_0"), val = tensor<string, []>("constant")];
|
| 63 |
-
tensor<fp16, []> const_3_to_fp16 = const()[name = tensor<string, []>("const_3_to_fp16"), val = tensor<fp16, []>(0x0p+0)];
|
| 64 |
-
tensor<fp16, [1, 1, ?]> input_5_cast_fp16 = pad(constant_val = const_3_to_fp16, mode = input_5_mode_0, pad = input_5_pad_0, x = input_3_cast_fp16)[name = tensor<string, []>("input_5_cast_fp16")];
|
| 65 |
-
tensor<int32, [2]> concat_2x = const()[name = tensor<string, []>("concat_2x"), val = tensor<int32, [2]>([1, -1])];
|
| 66 |
-
tensor<fp16, [1, ?]> input_cast_fp16 = reshape(shape = concat_2x, x = input_5_cast_fp16)[name = tensor<string, []>("input_cast_fp16")];
|
| 67 |
-
tensor<int32, [1]> expand_dims_3 = const()[name = tensor<string, []>("expand_dims_3"), val = tensor<int32, [1]>([160])];
|
| 68 |
-
tensor<int32, [1]> expand_dims_4_axes_0 = const()[name = tensor<string, []>("expand_dims_4_axes_0"), val = tensor<int32, [1]>([1])];
|
| 69 |
-
tensor<fp16, [1, 1, ?]> expand_dims_4_cast_fp16 = expand_dims(axes = expand_dims_4_axes_0, x = input_cast_fp16)[name = tensor<string, []>("expand_dims_4_cast_fp16")];
|
| 70 |
-
tensor<string, []> conv_0_pad_type_0 = const()[name = tensor<string, []>("conv_0_pad_type_0"), val = tensor<string, []>("valid")];
|
| 71 |
-
tensor<int32, [2]> conv_0_pad_0 = const()[name = tensor<string, []>("conv_0_pad_0"), val = tensor<int32, [2]>([0, 0])];
|
| 72 |
-
tensor<int32, [1]> conv_0_dilations_0 = const()[name = tensor<string, []>("conv_0_dilations_0"), val = tensor<int32, [1]>([1])];
|
| 73 |
-
tensor<int32, []> conv_0_groups_0 = const()[name = tensor<string, []>("conv_0_groups_0"), val = tensor<int32, []>(1)];
|
| 74 |
-
tensor<fp16, [257, 1, 512]> expand_dims_1_to_fp16 = const()[name = tensor<string, []>("expand_dims_1_to_fp16"), val = tensor<fp16, [257, 1, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(64)))];
|
| 75 |
-
tensor<fp16, [1, 257, ?]> conv_0_cast_fp16 = conv(dilations = conv_0_dilations_0, groups = conv_0_groups_0, pad = conv_0_pad_0, pad_type = conv_0_pad_type_0, strides = expand_dims_3, weight = expand_dims_1_to_fp16, x = expand_dims_4_cast_fp16)[name = tensor<string, []>("conv_0_cast_fp16")];
|
| 76 |
-
tensor<string, []> conv_1_pad_type_0 = const()[name = tensor<string, []>("conv_1_pad_type_0"), val = tensor<string, []>("valid")];
|
| 77 |
-
tensor<int32, [2]> conv_1_pad_0 = const()[name = tensor<string, []>("conv_1_pad_0"), val = tensor<int32, [2]>([0, 0])];
|
| 78 |
-
tensor<int32, [1]> conv_1_dilations_0 = const()[name = tensor<string, []>("conv_1_dilations_0"), val = tensor<int32, [1]>([1])];
|
| 79 |
-
tensor<int32, []> conv_1_groups_0 = const()[name = tensor<string, []>("conv_1_groups_0"), val = tensor<int32, []>(1)];
|
| 80 |
-
tensor<fp16, [257, 1, 512]> expand_dims_2_to_fp16 = const()[name = tensor<string, []>("expand_dims_2_to_fp16"), val = tensor<fp16, [257, 1, 512]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(263296)))];
|
| 81 |
-
tensor<fp16, [1, 257, ?]> conv_1_cast_fp16 = conv(dilations = conv_1_dilations_0, groups = conv_1_groups_0, pad = conv_1_pad_0, pad_type = conv_1_pad_type_0, strides = expand_dims_3, weight = expand_dims_2_to_fp16, x = expand_dims_4_cast_fp16)[name = tensor<string, []>("conv_1_cast_fp16")];
|
| 82 |
-
tensor<int32, []> stack_0_axis_0 = const()[name = tensor<string, []>("stack_0_axis_0"), val = tensor<int32, []>(-1)];
|
| 83 |
-
tensor<fp16, [1, 257, ?, 2]> stack_0_cast_fp16 = stack(axis = stack_0_axis_0, values = (conv_0_cast_fp16, conv_1_cast_fp16))[name = tensor<string, []>("stack_0_cast_fp16")];
|
| 84 |
-
tensor<fp16, []> var_19_promoted_to_fp16 = const()[name = tensor<string, []>("op_19_promoted_to_fp16"), val = tensor<fp16, []>(0x1p+1)];
|
| 85 |
-
tensor<fp16, [1, 257, ?, 2]> var_75_cast_fp16 = pow(x = stack_0_cast_fp16, y = var_19_promoted_to_fp16)[name = tensor<string, []>("op_75_cast_fp16")];
|
| 86 |
-
tensor<int32, [1]> var_77_axes_0 = const()[name = tensor<string, []>("op_77_axes_0"), val = tensor<int32, [1]>([-1])];
|
| 87 |
-
tensor<bool, []> var_77_keep_dims_0 = const()[name = tensor<string, []>("op_77_keep_dims_0"), val = tensor<bool, []>(false)];
|
| 88 |
-
tensor<fp16, [1, 257, ?]> var_77_cast_fp16 = reduce_sum(axes = var_77_axes_0, keep_dims = var_77_keep_dims_0, x = var_75_cast_fp16)[name = tensor<string, []>("op_77_cast_fp16")];
|
| 89 |
-
tensor<fp16, [1, 257, ?]> x_11_cast_fp16 = identity(x = var_77_cast_fp16)[name = tensor<string, []>("x_11_cast_fp16")];
|
| 90 |
-
tensor<bool, []> x_13_transpose_x_0 = const()[name = tensor<string, []>("x_13_transpose_x_0"), val = tensor<bool, []>(false)];
|
| 91 |
-
tensor<bool, []> x_13_transpose_y_0 = const()[name = tensor<string, []>("x_13_transpose_y_0"), val = tensor<bool, []>(false)];
|
| 92 |
-
tensor<fp16, [1, 80, 257]> const_4_to_fp16 = const()[name = tensor<string, []>("const_4_to_fp16"), val = tensor<fp16, [1, 80, 257]>(BLOBFILE(path = tensor<string, []>("@model_path/weights/weight.bin"), offset = tensor<uint64, []>(526528)))];
|
| 93 |
-
tensor<fp16, [1, 80, ?]> x_13_cast_fp16 = matmul(transpose_x = x_13_transpose_x_0, transpose_y = x_13_transpose_y_0, x = const_4_to_fp16, y = x_11_cast_fp16)[name = tensor<string, []>("x_13_cast_fp16")];
|
| 94 |
-
tensor<fp16, []> var_84_to_fp16 = const()[name = tensor<string, []>("op_84_to_fp16"), val = tensor<fp16, []>(0x1p-24)];
|
| 95 |
-
tensor<fp16, [1, 80, ?]> var_85_cast_fp16 = add(x = x_13_cast_fp16, y = var_84_to_fp16)[name = tensor<string, []>("op_85_cast_fp16")];
|
| 96 |
-
tensor<fp32, []> x_15_epsilon_0 = const()[name = tensor<string, []>("x_15_epsilon_0"), val = tensor<fp32, []>(0x1p-149)];
|
| 97 |
-
tensor<fp16, [1, 80, ?]> x_15_cast_fp16 = log(epsilon = x_15_epsilon_0, x = var_85_cast_fp16)[name = tensor<string, []>("x_15_cast_fp16")];
|
| 98 |
-
tensor<int32, [3]> var_87_shape_cast_fp16 = shape(x = x_15_cast_fp16)[name = tensor<string, []>("op_87_shape_cast_fp16")];
|
| 99 |
-
tensor<int32, []> gather_5 = const()[name = tensor<string, []>("gather_5"), val = tensor<int32, []>(1)];
|
| 100 |
-
tensor<int32, []> gather_6_axis_0 = const()[name = tensor<string, []>("gather_6_axis_0"), val = tensor<int32, []>(0)];
|
| 101 |
-
tensor<int32, []> gather_6_batch_dims_0 = const()[name = tensor<string, []>("gather_6_batch_dims_0"), val = tensor<int32, []>(0)];
|
| 102 |
-
tensor<bool, []> gather_6_validate_indices_0 = const()[name = tensor<string, []>("gather_6_validate_indices_0"), val = tensor<bool, []>(false)];
|
| 103 |
-
tensor<string, []> var_87_shape_cast_fp16_to_uint16_dtype_0 = const()[name = tensor<string, []>("op_87_shape_cast_fp16_to_uint16_dtype_0"), val = tensor<string, []>("uint16")];
|
| 104 |
-
tensor<uint16, []> select_6_to_uint16 = const()[name = tensor<string, []>("select_6_to_uint16"), val = tensor<uint16, []>(2)];
|
| 105 |
-
tensor<uint16, [3]> var_87_shape_cast_fp16_to_uint16 = cast(dtype = var_87_shape_cast_fp16_to_uint16_dtype_0, x = var_87_shape_cast_fp16)[name = tensor<string, []>("cast_24")];
|
| 106 |
-
tensor<uint16, []> gather_6_cast_uint16 = gather(axis = gather_6_axis_0, batch_dims = gather_6_batch_dims_0, indices = select_6_to_uint16, validate_indices = gather_6_validate_indices_0, x = var_87_shape_cast_fp16_to_uint16)[name = tensor<string, []>("gather_6_cast_uint16")];
|
| 107 |
-
tensor<string, []> gather_6_cast_uint16_to_int32_dtype_0 = const()[name = tensor<string, []>("gather_6_cast_uint16_to_int32_dtype_0"), val = tensor<string, []>("int32")];
|
| 108 |
-
tensor<int32, []> const_5 = const()[name = tensor<string, []>("const_5"), val = tensor<int32, []>(0)];
|
| 109 |
-
tensor<int32, []> const_6 = const()[name = tensor<string, []>("const_6"), val = tensor<int32, []>(1)];
|
| 110 |
-
tensor<int32, []> gather_6_cast_uint16_to_int32 = cast(dtype = gather_6_cast_uint16_to_int32_dtype_0, x = gather_6_cast_uint16)[name = tensor<string, []>("cast_23")];
|
| 111 |
-
tensor<int32, [?]> var_89 = range_1d(end = gather_6_cast_uint16_to_int32, start = const_5, step = const_6)[name = tensor<string, []>("op_89")];
|
| 112 |
-
tensor<int32, [1]> var_90_axes_0 = const()[name = tensor<string, []>("op_90_axes_0"), val = tensor<int32, [1]>([0])];
|
| 113 |
-
tensor<int32, [1, ?]> var_90 = expand_dims(axes = var_90_axes_0, x = var_89)[name = tensor<string, []>("op_90")];
|
| 114 |
-
tensor<int32, []> concat_3_axis_0 = const()[name = tensor<string, []>("concat_3_axis_0"), val = tensor<int32, []>(0)];
|
| 115 |
-
tensor<bool, []> concat_3_interleave_0 = const()[name = tensor<string, []>("concat_3_interleave_0"), val = tensor<bool, []>(false)];
|
| 116 |
-
tensor<int32, [2]> concat_3 = concat(axis = concat_3_axis_0, interleave = concat_3_interleave_0, values = (gather_5, gather_6_cast_uint16_to_int32))[name = tensor<string, []>("concat_3")];
|
| 117 |
-
tensor<int32, [2]> shape_8 = shape(x = var_90)[name = tensor<string, []>("shape_8")];
|
| 118 |
-
tensor<int32, [2]> real_div_0 = real_div(x = concat_3, y = shape_8)[name = tensor<string, []>("real_div_0")];
|
| 119 |
-
tensor<int32, [?, ?]> time_steps = tile(reps = real_div_0, x = var_90)[name = tensor<string, []>("time_steps")];
|
| 120 |
-
tensor<int32, [1]> var_93_axes_0 = const()[name = tensor<string, []>("op_93_axes_0"), val = tensor<int32, [1]>([1])];
|
| 121 |
-
tensor<int32, [1, 1]> var_93 = expand_dims(axes = var_93_axes_0, x = mel_length)[name = tensor<string, []>("op_93")];
|
| 122 |
-
tensor<bool, [?, ?]> valid_mask = less(x = time_steps, y = var_93)[name = tensor<string, []>("valid_mask")];
|
| 123 |
-
tensor<int32, [1]> var_95_axes_0 = const()[name = tensor<string, []>("op_95_axes_0"), val = tensor<int32, [1]>([1])];
|
| 124 |
-
tensor<bool, [?, 1, ?]> var_95 = expand_dims(axes = var_95_axes_0, x = valid_mask)[name = tensor<string, []>("op_95")];
|
| 125 |
-
tensor<fp16, [1, 80, ?]> var_96_cast_fp16 = select(a = x_15_cast_fp16, b = var_16_to_fp16, cond = var_95)[name = tensor<string, []>("op_96_cast_fp16")];
|
| 126 |
-
tensor<int32, [1]> x_mean_numerator_axes_0 = const()[name = tensor<string, []>("x_mean_numerator_axes_0"), val = tensor<int32, [1]>([2])];
|
| 127 |
-
tensor<bool, []> x_mean_numerator_keep_dims_0 = const()[name = tensor<string, []>("x_mean_numerator_keep_dims_0"), val = tensor<bool, []>(false)];
|
| 128 |
-
tensor<fp16, [1, 80]> x_mean_numerator_cast_fp16 = reduce_sum(axes = x_mean_numerator_axes_0, keep_dims = x_mean_numerator_keep_dims_0, x = var_96_cast_fp16)[name = tensor<string, []>("x_mean_numerator_cast_fp16")];
|
| 129 |
-
tensor<int32, [1]> x_mean_denominator_axes_0 = const()[name = tensor<string, []>("x_mean_denominator_axes_0"), val = tensor<int32, [1]>([1])];
|
| 130 |
-
tensor<bool, []> x_mean_denominator_keep_dims_0 = const()[name = tensor<string, []>("x_mean_denominator_keep_dims_0"), val = tensor<bool, []>(false)];
|
| 131 |
-
tensor<string, []> cast_6_to_fp16_dtype_0 = const()[name = tensor<string, []>("cast_6_to_fp16_dtype_0"), val = tensor<string, []>("fp16")];
|
| 132 |
-
tensor<fp16, [?, ?]> valid_mask_to_fp16 = cast(dtype = cast_6_to_fp16_dtype_0, x = valid_mask)[name = tensor<string, []>("cast_22")];
|
| 133 |
-
tensor<fp16, [?]> x_mean_denominator_cast_fp16 = reduce_sum(axes = x_mean_denominator_axes_0, keep_dims = x_mean_denominator_keep_dims_0, x = valid_mask_to_fp16)[name = tensor<string, []>("x_mean_denominator_cast_fp16")];
|
| 134 |
-
tensor<int32, [1]> var_101_axes_0 = const()[name = tensor<string, []>("op_101_axes_0"), val = tensor<int32, [1]>([1])];
|
| 135 |
-
tensor<fp16, [?, 1]> var_101_cast_fp16 = expand_dims(axes = var_101_axes_0, x = x_mean_denominator_cast_fp16)[name = tensor<string, []>("op_101_cast_fp16")];
|
| 136 |
-
tensor<fp16, [?, 80]> x_mean_cast_fp16 = real_div(x = x_mean_numerator_cast_fp16, y = var_101_cast_fp16)[name = tensor<string, []>("x_mean_cast_fp16")];
|
| 137 |
-
tensor<int32, [1]> var_104_axes_0 = const()[name = tensor<string, []>("op_104_axes_0"), val = tensor<int32, [1]>([2])];
|
| 138 |
-
tensor<fp16, [?, 80, 1]> var_104_cast_fp16 = expand_dims(axes = var_104_axes_0, x = x_mean_cast_fp16)[name = tensor<string, []>("op_104_cast_fp16")];
|
| 139 |
-
tensor<fp16, [?, 80, ?]> var_105_cast_fp16 = sub(x = x_15_cast_fp16, y = var_104_cast_fp16)[name = tensor<string, []>("op_105_cast_fp16")];
|
| 140 |
-
tensor<fp16, [?, 80, ?]> var_106_cast_fp16 = select(a = var_105_cast_fp16, b = var_16_to_fp16, cond = var_95)[name = tensor<string, []>("op_106_cast_fp16")];
|
| 141 |
-
tensor<fp16, []> var_19_promoted_1_to_fp16 = const()[name = tensor<string, []>("op_19_promoted_1_to_fp16"), val = tensor<fp16, []>(0x1p+1)];
|
| 142 |
-
tensor<fp16, [?, 80, ?]> var_107_cast_fp16 = pow(x = var_106_cast_fp16, y = var_19_promoted_1_to_fp16)[name = tensor<string, []>("op_107_cast_fp16")];
|
| 143 |
-
tensor<int32, [1]> var_109_axes_0 = const()[name = tensor<string, []>("op_109_axes_0"), val = tensor<int32, [1]>([2])];
|
| 144 |
-
tensor<bool, []> var_109_keep_dims_0 = const()[name = tensor<string, []>("op_109_keep_dims_0"), val = tensor<bool, []>(false)];
|
| 145 |
-
tensor<fp16, [?, 80]> var_109_cast_fp16 = reduce_sum(axes = var_109_axes_0, keep_dims = var_109_keep_dims_0, x = var_107_cast_fp16)[name = tensor<string, []>("op_109_cast_fp16")];
|
| 146 |
-
tensor<fp16, []> var_111_to_fp16 = const()[name = tensor<string, []>("op_111_to_fp16"), val = tensor<fp16, []>(0x1p+0)];
|
| 147 |
-
tensor<fp16, [?, 1]> var_112_cast_fp16 = sub(x = var_101_cast_fp16, y = var_111_to_fp16)[name = tensor<string, []>("op_112_cast_fp16")];
|
| 148 |
-
tensor<fp16, [?, 80]> var_113_cast_fp16 = real_div(x = var_109_cast_fp16, y = var_112_cast_fp16)[name = tensor<string, []>("op_113_cast_fp16")];
|
| 149 |
-
tensor<fp16, [?, 80]> x_std_1_cast_fp16 = sqrt(x = var_113_cast_fp16)[name = tensor<string, []>("x_std_1_cast_fp16")];
|
| 150 |
-
tensor<bool, [?, 80]> var_115_cast_fp16 = not_equal(x = x_std_1_cast_fp16, y = x_std_1_cast_fp16)[name = tensor<string, []>("op_115_cast_fp16")];
|
| 151 |
-
tensor<fp16, [?, 80]> x_std_3_cast_fp16 = select(a = var_16_to_fp16, b = x_std_1_cast_fp16, cond = var_115_cast_fp16)[name = tensor<string, []>("x_std_3_cast_fp16")];
|
| 152 |
-
tensor<fp16, []> var_25_to_fp16 = const()[name = tensor<string, []>("op_25_to_fp16"), val = tensor<fp16, []>(0x1.5p-17)];
|
| 153 |
-
tensor<fp16, [?, 80]> x_std_cast_fp16 = add(x = x_std_3_cast_fp16, y = var_25_to_fp16)[name = tensor<string, []>("x_std_cast_fp16")];
|
| 154 |
-
tensor<int32, [1]> var_120_axes_0 = const()[name = tensor<string, []>("op_120_axes_0"), val = tensor<int32, [1]>([2])];
|
| 155 |
-
tensor<fp16, [?, 80, 1]> var_120_cast_fp16 = expand_dims(axes = var_120_axes_0, x = x_std_cast_fp16)[name = tensor<string, []>("op_120_cast_fp16")];
|
| 156 |
-
tensor<fp16, [?, 80, ?]> x_cast_fp16 = real_div(x = var_105_cast_fp16, y = var_120_cast_fp16)[name = tensor<string, []>("x_cast_fp16")];
|
| 157 |
-
tensor<int32, [3]> var_122_shape_cast_fp16 = shape(x = x_cast_fp16)[name = tensor<string, []>("op_122_shape_cast_fp16")];
|
| 158 |
-
tensor<int32, []> gather_7_axis_0 = const()[name = tensor<string, []>("gather_7_axis_0"), val = tensor<int32, []>(0)];
|
| 159 |
-
tensor<int32, []> gather_7_batch_dims_0 = const()[name = tensor<string, []>("gather_7_batch_dims_0"), val = tensor<int32, []>(0)];
|
| 160 |
-
tensor<bool, []> gather_7_validate_indices_0 = const()[name = tensor<string, []>("gather_7_validate_indices_0"), val = tensor<bool, []>(false)];
|
| 161 |
-
tensor<string, []> var_122_shape_cast_fp16_to_uint16_dtype_0 = const()[name = tensor<string, []>("op_122_shape_cast_fp16_to_uint16_dtype_0"), val = tensor<string, []>("uint16")];
|
| 162 |
-
tensor<uint16, []> select_7_to_uint16 = const()[name = tensor<string, []>("select_7_to_uint16"), val = tensor<uint16, []>(2)];
|
| 163 |
-
tensor<uint16, [3]> var_122_shape_cast_fp16_to_uint16 = cast(dtype = var_122_shape_cast_fp16_to_uint16_dtype_0, x = var_122_shape_cast_fp16)[name = tensor<string, []>("cast_21")];
|
| 164 |
-
tensor<uint16, []> gather_7_cast_uint16 = gather(axis = gather_7_axis_0, batch_dims = gather_7_batch_dims_0, indices = select_7_to_uint16, validate_indices = gather_7_validate_indices_0, x = var_122_shape_cast_fp16_to_uint16)[name = tensor<string, []>("gather_7_cast_uint16")];
|
| 165 |
-
tensor<string, []> gather_7_cast_uint16_to_int32_dtype_0 = const()[name = tensor<string, []>("gather_7_cast_uint16_to_int32_dtype_0"), val = tensor<string, []>("int32")];
|
| 166 |
-
tensor<int32, []> const_7 = const()[name = tensor<string, []>("const_7"), val = tensor<int32, []>(0)];
|
| 167 |
-
tensor<int32, []> const_8 = const()[name = tensor<string, []>("const_8"), val = tensor<int32, []>(1)];
|
| 168 |
-
tensor<int32, []> gather_7_cast_uint16_to_int32 = cast(dtype = gather_7_cast_uint16_to_int32_dtype_0, x = gather_7_cast_uint16)[name = tensor<string, []>("cast_20")];
|
| 169 |
-
tensor<int32, [?]> mask_1 = range_1d(end = gather_7_cast_uint16_to_int32, start = const_7, step = const_8)[name = tensor<string, []>("mask_1")];
|
| 170 |
-
tensor<int32, []> gather_8_axis_0 = const()[name = tensor<string, []>("gather_8_axis_0"), val = tensor<int32, []>(0)];
|
| 171 |
-
tensor<int32, []> gather_8_batch_dims_0 = const()[name = tensor<string, []>("gather_8_batch_dims_0"), val = tensor<int32, []>(0)];
|
| 172 |
-
tensor<bool, []> gather_8_validate_indices_0 = const()[name = tensor<string, []>("gather_8_validate_indices_0"), val = tensor<bool, []>(false)];
|
| 173 |
-
tensor<uint16, []> select_8_to_uint16 = const()[name = tensor<string, []>("select_8_to_uint16"), val = tensor<uint16, []>(0)];
|
| 174 |
-
tensor<uint16, []> gather_8_cast_uint16 = gather(axis = gather_8_axis_0, batch_dims = gather_8_batch_dims_0, indices = select_8_to_uint16, validate_indices = gather_8_validate_indices_0, x = var_122_shape_cast_fp16_to_uint16)[name = tensor<string, []>("gather_8_cast_uint16")];
|
| 175 |
-
tensor<string, []> gather_8_cast_uint16_to_int32_dtype_0 = const()[name = tensor<string, []>("gather_8_cast_uint16_to_int32_dtype_0"), val = tensor<string, []>("int32")];
|
| 176 |
-
tensor<int32, []> concat_4_axis_0 = const()[name = tensor<string, []>("concat_4_axis_0"), val = tensor<int32, []>(0)];
|
| 177 |
-
tensor<bool, []> concat_4_interleave_0 = const()[name = tensor<string, []>("concat_4_interleave_0"), val = tensor<bool, []>(false)];
|
| 178 |
-
tensor<int32, []> gather_8_cast_uint16_to_int32 = cast(dtype = gather_8_cast_uint16_to_int32_dtype_0, x = gather_8_cast_uint16)[name = tensor<string, []>("cast_19")];
|
| 179 |
-
tensor<int32, [2]> concat_4 = concat(axis = concat_4_axis_0, interleave = concat_4_interleave_0, values = (gather_8_cast_uint16_to_int32, var_9))[name = tensor<string, []>("concat_4")];
|
| 180 |
-
tensor<int32, [1]> expand_dims_0_axes_0 = const()[name = tensor<string, []>("expand_dims_0_axes_0"), val = tensor<int32, [1]>([0])];
|
| 181 |
-
tensor<int32, [1, ?]> expand_dims_0 = expand_dims(axes = expand_dims_0_axes_0, x = mask_1)[name = tensor<string, []>("expand_dims_0")];
|
| 182 |
-
tensor<int32, [?, ?]> var_126 = tile(reps = concat_4, x = expand_dims_0)[name = tensor<string, []>("op_126")];
|
| 183 |
-
tensor<bool, [?, ?]> mask = greater_equal(x = var_126, y = var_93)[name = tensor<string, []>("mask")];
|
| 184 |
-
tensor<int32, [1]> var_129_axes_0 = const()[name = tensor<string, []>("op_129_axes_0"), val = tensor<int32, [1]>([1])];
|
| 185 |
-
tensor<bool, [?, 1, ?]> var_129 = expand_dims(axes = var_129_axes_0, x = mask)[name = tensor<string, []>("op_129")];
|
| 186 |
-
tensor<fp16, []> cast_15_to_fp16 = const()[name = tensor<string, []>("cast_15_to_fp16"), val = tensor<fp16, []>(0x0p+0)];
|
| 187 |
-
tensor<fp16, [?, 80, ?]> processed_signal_cast_fp16 = select(a = cast_15_to_fp16, b = x_cast_fp16, cond = var_129)[name = tensor<string, []>("processed_signal_cast_fp16")];
|
| 188 |
-
tensor<string, []> processed_signal_cast_fp16_to_fp32_dtype_0 = const()[name = tensor<string, []>("processed_signal_cast_fp16_to_fp32_dtype_0"), val = tensor<string, []>("fp32")];
|
| 189 |
-
tensor<fp32, [?, 80, ?]> mel_features = cast(dtype = processed_signal_cast_fp16_to_fp32_dtype_0, x = processed_signal_cast_fp16)[name = tensor<string, []>("cast_18")];
|
| 190 |
-
} -> (mel_features, mel_length);
|
| 191 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/Preprocessor.mlmodelc/weights/weight.bin
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:c062338de852a26607ce4101f74e6895de3a4134a57b07232bd72bfc6f1d7f1a
|
| 3 |
-
size 567712
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/metadata.json
DELETED
|
@@ -1,247 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"model_id": "nvidia/parakeet-tdt_ctc-110m",
|
| 3 |
-
"model_type": "hybrid_rnnt_ctc",
|
| 4 |
-
"sample_rate": 16000,
|
| 5 |
-
"max_audio_seconds": 15.0,
|
| 6 |
-
"max_audio_samples": 240000,
|
| 7 |
-
"max_symbol_steps": 1,
|
| 8 |
-
"vocab_size": 1024,
|
| 9 |
-
"joint_extra_outputs": 5,
|
| 10 |
-
"encoder_dim": 512,
|
| 11 |
-
"decoder_dim": 640,
|
| 12 |
-
"decoder_hidden": 640,
|
| 13 |
-
"decoder_layers": 1,
|
| 14 |
-
"blank_id": 1024,
|
| 15 |
-
"checkpoint": {
|
| 16 |
-
"type": "pretrained",
|
| 17 |
-
"model_id": "nvidia/parakeet-tdt_ctc-110m"
|
| 18 |
-
},
|
| 19 |
-
"coreml": {
|
| 20 |
-
"compute_units": "CPU_ONLY",
|
| 21 |
-
"compute_precision": "FLOAT32"
|
| 22 |
-
},
|
| 23 |
-
"components": {
|
| 24 |
-
"preprocessor": {
|
| 25 |
-
"inputs": {
|
| 26 |
-
"audio_signal": [
|
| 27 |
-
1,
|
| 28 |
-
240000
|
| 29 |
-
],
|
| 30 |
-
"audio_length": [
|
| 31 |
-
1
|
| 32 |
-
]
|
| 33 |
-
},
|
| 34 |
-
"outputs": {
|
| 35 |
-
"mel": [
|
| 36 |
-
1,
|
| 37 |
-
80,
|
| 38 |
-
1501
|
| 39 |
-
],
|
| 40 |
-
"mel_length": [
|
| 41 |
-
1
|
| 42 |
-
]
|
| 43 |
-
},
|
| 44 |
-
"path": "parakeet_preprocessor.mlpackage"
|
| 45 |
-
},
|
| 46 |
-
"encoder": {
|
| 47 |
-
"inputs": {
|
| 48 |
-
"mel": [
|
| 49 |
-
1,
|
| 50 |
-
80,
|
| 51 |
-
1501
|
| 52 |
-
],
|
| 53 |
-
"mel_length": [
|
| 54 |
-
1
|
| 55 |
-
]
|
| 56 |
-
},
|
| 57 |
-
"outputs": {
|
| 58 |
-
"encoder": [
|
| 59 |
-
1,
|
| 60 |
-
512,
|
| 61 |
-
188
|
| 62 |
-
],
|
| 63 |
-
"encoder_length": [
|
| 64 |
-
1
|
| 65 |
-
]
|
| 66 |
-
},
|
| 67 |
-
"path": "parakeet_encoder.mlpackage"
|
| 68 |
-
},
|
| 69 |
-
"ctc_head": {
|
| 70 |
-
"inputs": {
|
| 71 |
-
"encoder": [
|
| 72 |
-
1,
|
| 73 |
-
512,
|
| 74 |
-
188
|
| 75 |
-
]
|
| 76 |
-
},
|
| 77 |
-
"outputs": {
|
| 78 |
-
"log_probs": [
|
| 79 |
-
1,
|
| 80 |
-
188,
|
| 81 |
-
1025
|
| 82 |
-
]
|
| 83 |
-
},
|
| 84 |
-
"path": "parakeet_ctc_head.mlpackage"
|
| 85 |
-
},
|
| 86 |
-
"mel_encoder": {
|
| 87 |
-
"inputs": {
|
| 88 |
-
"audio_signal": [
|
| 89 |
-
1,
|
| 90 |
-
240000
|
| 91 |
-
],
|
| 92 |
-
"audio_length": [
|
| 93 |
-
1
|
| 94 |
-
]
|
| 95 |
-
},
|
| 96 |
-
"outputs": {
|
| 97 |
-
"encoder": [
|
| 98 |
-
1,
|
| 99 |
-
512,
|
| 100 |
-
188
|
| 101 |
-
],
|
| 102 |
-
"encoder_length": [
|
| 103 |
-
1
|
| 104 |
-
]
|
| 105 |
-
},
|
| 106 |
-
"path": "parakeet_mel_encoder.mlpackage"
|
| 107 |
-
},
|
| 108 |
-
"decoder": {
|
| 109 |
-
"inputs": {
|
| 110 |
-
"targets": [
|
| 111 |
-
1,
|
| 112 |
-
1
|
| 113 |
-
],
|
| 114 |
-
"target_length": [
|
| 115 |
-
1
|
| 116 |
-
],
|
| 117 |
-
"h_in": [
|
| 118 |
-
1,
|
| 119 |
-
1,
|
| 120 |
-
640
|
| 121 |
-
],
|
| 122 |
-
"c_in": [
|
| 123 |
-
1,
|
| 124 |
-
1,
|
| 125 |
-
640
|
| 126 |
-
]
|
| 127 |
-
},
|
| 128 |
-
"outputs": {
|
| 129 |
-
"decoder": [
|
| 130 |
-
1,
|
| 131 |
-
640,
|
| 132 |
-
1
|
| 133 |
-
],
|
| 134 |
-
"h_out": [
|
| 135 |
-
1,
|
| 136 |
-
1,
|
| 137 |
-
640
|
| 138 |
-
],
|
| 139 |
-
"c_out": [
|
| 140 |
-
1,
|
| 141 |
-
1,
|
| 142 |
-
640
|
| 143 |
-
]
|
| 144 |
-
},
|
| 145 |
-
"path": "parakeet_decoder.mlpackage"
|
| 146 |
-
},
|
| 147 |
-
"joint": {
|
| 148 |
-
"inputs": {
|
| 149 |
-
"encoder": [
|
| 150 |
-
1,
|
| 151 |
-
512,
|
| 152 |
-
188
|
| 153 |
-
],
|
| 154 |
-
"decoder": [
|
| 155 |
-
1,
|
| 156 |
-
640,
|
| 157 |
-
1
|
| 158 |
-
]
|
| 159 |
-
},
|
| 160 |
-
"outputs": {
|
| 161 |
-
"logits": [
|
| 162 |
-
1,
|
| 163 |
-
188,
|
| 164 |
-
1,
|
| 165 |
-
1030
|
| 166 |
-
]
|
| 167 |
-
},
|
| 168 |
-
"path": "parakeet_joint.mlpackage"
|
| 169 |
-
},
|
| 170 |
-
"joint_decision": {
|
| 171 |
-
"inputs": {
|
| 172 |
-
"encoder": [
|
| 173 |
-
1,
|
| 174 |
-
512,
|
| 175 |
-
188
|
| 176 |
-
],
|
| 177 |
-
"decoder": [
|
| 178 |
-
1,
|
| 179 |
-
640,
|
| 180 |
-
1
|
| 181 |
-
]
|
| 182 |
-
},
|
| 183 |
-
"outputs": {
|
| 184 |
-
"token_id": [
|
| 185 |
-
1,
|
| 186 |
-
188,
|
| 187 |
-
1
|
| 188 |
-
],
|
| 189 |
-
"token_prob": [
|
| 190 |
-
1,
|
| 191 |
-
188,
|
| 192 |
-
1
|
| 193 |
-
],
|
| 194 |
-
"duration": [
|
| 195 |
-
1,
|
| 196 |
-
188,
|
| 197 |
-
1
|
| 198 |
-
]
|
| 199 |
-
},
|
| 200 |
-
"path": "parakeet_joint_decision.mlpackage"
|
| 201 |
-
},
|
| 202 |
-
"joint_decision_single_step": {
|
| 203 |
-
"inputs": {
|
| 204 |
-
"encoder_step": [
|
| 205 |
-
1,
|
| 206 |
-
512,
|
| 207 |
-
1
|
| 208 |
-
],
|
| 209 |
-
"decoder_step": [
|
| 210 |
-
1,
|
| 211 |
-
640,
|
| 212 |
-
1
|
| 213 |
-
]
|
| 214 |
-
},
|
| 215 |
-
"outputs": {
|
| 216 |
-
"token_id": [
|
| 217 |
-
1,
|
| 218 |
-
1,
|
| 219 |
-
1
|
| 220 |
-
],
|
| 221 |
-
"token_prob": [
|
| 222 |
-
1,
|
| 223 |
-
1,
|
| 224 |
-
1
|
| 225 |
-
],
|
| 226 |
-
"duration": [
|
| 227 |
-
1,
|
| 228 |
-
1,
|
| 229 |
-
1
|
| 230 |
-
],
|
| 231 |
-
"top_k_ids": [
|
| 232 |
-
1,
|
| 233 |
-
1,
|
| 234 |
-
1,
|
| 235 |
-
64
|
| 236 |
-
],
|
| 237 |
-
"top_k_logits": [
|
| 238 |
-
1,
|
| 239 |
-
1,
|
| 240 |
-
1,
|
| 241 |
-
64
|
| 242 |
-
]
|
| 243 |
-
},
|
| 244 |
-
"path": "parakeet_joint_decision_single_step.mlpackage"
|
| 245 |
-
}
|
| 246 |
-
}
|
| 247 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/compiled_models/vocab.json
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
{"0": "<unk>", "1": "βt", "2": "βth", "3": "βa", "4": "in", "5": "re", "6": "βthe", "7": "βw", "8": "βs", "9": "βo", "10": "er", "11": "ou", "12": "at", "13": "nd", "14": "it", "15": "βh", "16": "βc", "17": "βb", "18": "is", "19": "en", "20": "on", "21": "ing", "22": "βf", "23": "βto", "24": "βm", "25": "es", "26": "βp", "27": "or", "28": "an", "29": "βd", "30": "ll", "31": "βI", "32": "ed", "33": "βand", "34": "βl", "35": "βof", "36": "βin", "37": "βy", "38": "ar", "39": "βg", "40": "βyou", "41": "as", "42": "om", "43": "βn", "44": "ve", "45": "βthat", "46": "le", "47": "ic", "48": "us", "49": "ow", "50": "et", "51": "al", "52": "βe", "53": "ut", "54": "βit", "55": "ot", "56": "βbe", "57": "βT", "58": "ion", "59": "βis", "60": "βwh", "61": "βre", "62": "βon", "63": "βwe", "64": "ent", "65": "βA", "66": "ay", "67": "βha", "68": "βTh", "69": "id", "70": "βS", "71": "ac", "72": "gh", "73": "ver", "74": "ke", "75": "βfor", "76": "im", "77": "ly", "78": "ur", "79": "ld", "80": "βhe", "81": "βst", "82": "all", "83": "ro", "84": "st", "85": "se", "86": "ct", "87": "ith", "88": "ir", "89": "am", "90": "βthis", "91": "if", "92": "βW", "93": "oo", "94": "ri", "95": "βwas", "96": "ght", "97": "βu", "98": "βwith", "99": "ad", "100": "ch", "101": "βse", "102": "βk", "103": "βan", "104": "βThe", "105": "βli", "106": "βdo", "107": "βB", "108": "βhave", "109": "βas", "110": "th", "111": "βare", "112": "βsh", "113": "ust", "114": "ce", "115": "ally", "116": "ill", "117": "βH", "118": "βj", "119": "ter", "120": "βgo", "121": "βAnd", "122": "ation", "123": "βC", "124": "βso", "125": "ome", "126": "βnot", "127": "op", "128": "il", "129": "ore", "130": "βne", "131": "βcan", "132": "βme", "133": "βat", "134": "ould", "135": "ant", "136": "βM", "137": "βlike", "138": "ere", "139": "βthey", "140": "ra", "141": "ers", "142": "βab", "143": "βde", "144": "βkn", "145": "ge", "146": "βY", "147": "βch", "148": "ul", "149": "pp", "150": "βor", "151": "βal", "152": "βcon", "153": "βcom", "154": 
"ess", "155": "βsu", "156": "out", "157": "βyour", "158": "βSo", "159": "ate", "160": "βone", "161": "βall", "162": "βex", "163": "est", "164": "βfr", "165": "βjust", "166": "βpro", "167": "βknow", "168": "βO", "169": "ain", "170": "βbut", "171": "ol", "172": "ive", "173": "βv", "174": "use", "175": "very", "176": "art", "177": "qu", "178": "βmy", "179": "el", "180": "βN", "181": "nt", "182": "βIt", "183": "βwhat", "184": "ab", "185": "βP", "186": "βwor", "187": "βout", "188": "βthere", "189": "βup", "190": "um", "191": "βfrom", "192": "pe", "193": "βtw", "194": "βr", "195": "and", "196": "ight", "197": "ort", "198": "un", "199": "βL", "200": "ist", "201": "βabout", "202": "ide", "203": "ig", "204": "ake", "205": "βD", "206": "em", "207": "os", "208": "king", "209": "rou", "210": "ind", "211": "our", "212": "res", "213": "βWe", "214": "βget", "215": "βE", "216": "βG", "217": "ack", "218": "βle", "219": "ity", "220": "od", "221": "βF", "222": "ard", "223": "βpl", "224": "βour", "225": "βint", "226": "ment", "227": "βwill", "228": "ies", "229": "βby", "230": "ink", "231": "ca", "232": "βif", "233": "red", "234": "her", "235": "ie", "236": "βus", "237": "βsome", "238": "βdon", "239": "ven", "240": "ood", "241": "ast", "242": "βR", "243": "βhis", "244": "βtim", "245": "βtr", "246": "βmore", "247": "ich", "248": "ous", "249": "ame", "250": "βgoing", "251": "βhad", "252": "βthem", "253": "ook", "254": "βpe", "255": "βWh", "256": "βYou", "257": "βBut", "258": "ine", "259": "βhere", "260": "βwould", "261": "cause", "262": "right", "263": "so", "264": "ost", "265": "ure", "266": "βhas", "267": "ect", "268": "βthink", "269": "βfe", "270": "ong", "271": "βsee", "272": "βwhen", "273": "βwho", "274": "βwere", "275": "βreally", "276": "βtheir", "277": "βwant", "278": "one", "279": "ople", "280": "βthen", "281": "βtime", "282": "βsa", "283": "ap", "284": "βte", "285": "βHe", "286": "βye", "287": "ck", "288": "βher", "289": "βthing", "290": "βright", "291": "βwhich", "292": "itt", 
"293": "ice", "294": "act", "295": "βpeople", "296": "ty", "297": "βtwo", "298": "βJ", "299": "βim", "300": "ther", "301": "ci", "302": "ose", "303": "βcl", "304": "βqu", "305": "βman", "306": "βalso", "307": "ree", "308": "βen", "309": "ud", "310": "βhow", "311": "reat", "312": "ak", "313": "hing", "314": "ag", "315": "βany", "316": "ff", "317": "ace", "318": "per", "319": "βbecause", "320": "βvery", "321": "own", "322": "βad", "323": "βact", "324": "βbeen", "325": "βnow", "326": "βag", "327": "βinto", "328": "βcomp", "329": "ars", "330": "ions", "331": "are", "332": "ite", "333": "iv", "334": "βthese", "335": "ays", "336": "ep", "337": "βThis", "338": "βshe", "339": "ans", "340": "ah", "341": "een", "342": "βover", "343": "ry", "344": "βlo", "345": "age", "346": "βpr", "347": "βsp", "348": "ue", "349": "βco", "350": "ick", "351": "ber", "352": "βdid", "353": "ip", "354": "ach", "355": "βback", "356": "βno", "357": "βcont", "358": "βother", "359": "βevery", "360": "pt", "361": "βneed", "362": "βhim", "363": "βU", "364": "βIn", "365": "βwork", "366": "irst", "367": "βpart", "368": "βlook", "369": "ittle", "370": "ble", "371": "iz", "372": "βun", "373": "βmake", "374": "omet", "375": "nder", "376": "ish", "377": "na", "378": "βlittle", "379": "βoff", "380": "βthan", "381": "βgot", "382": "ually", "383": "βper", "384": "βgood", "385": "βway", "386": "βcould", "387": "βac", "388": "βimp", "389": "able", "390": "βwhere", "391": "iff", "392": "βThat", "393": "βres", "394": "ount", "395": "pl", "396": "ance", "397": "βfirst", "398": "βro", "399": "βpre", "400": "ass", "401": "βsay", "402": "int", "403": "ated", "404": "ire", "405": "uch", "406": "ase", "407": "βsomet", "408": "ound", "409": "βdown", "410": "βdiff", "411": "sel", "412": "βgu", "413": "βam", "414": "ress", "415": "βlot", "416": "ence", "417": "βdis", "418": "orm", "419": "ix", "420": "βpo", "421": "ving", "422": "enty", "423": "βK", "424": "βspe", "425": "und", "426": "he", "427": "βmuch", "428": "βar", 
"429": "round", "430": "βapp", "431": "co", "432": "ark", "433": "βnew", "434": "ater", "435": "ult", "436": "end", "437": "βeven", "438": "βstart", "439": "ations", "440": "rough", "441": "ile", "442": "fter", "443": "βwell", "444": "be", "445": "βThey", "446": "βthree", "447": "ign", "448": "ild", "449": "βsaid", "450": "ough", "451": "ang", "452": "βtoo", "453": "ade", "454": "βbl", "455": "ens", "456": "βinc", "457": "ia", "458": "βthose", "459": "βmo", "460": "βtake", "461": "βthrough", "462": "βfl", "463": "βkind", "464": "βthings", "465": "βbet", "466": "βonly", "467": "βSt", "468": "βlet", "469": "cess", "470": "βCh", "471": "ary", "472": "vel", "473": "βIf", "474": "xt", "475": "other", "476": "av", "477": "ical", "478": "ord", "479": "βagain", "480": "βsomething", "481": "onna", "482": "fore", "483": "βmay", "484": "ting", "485": "βbu", "486": "βdiffere", "487": "urn", "488": "βgonna", "489": "βdoes", "490": "uct", "491": "og", "492": "βtwenty", "493": "βgr", "494": "βYe", "495": "wn", "496": "βshould", "497": "βcomm", "498": "ition", "499": "βunder", "500": "βhel", "501": "ory", "502": "βfo", "503": "βuse", "504": "igh", "505": "ife", "506": "βactually", "507": "βtal", "508": "βcall", "509": "ents", "510": "ious", "511": "ull", "512": "βThere", "513": "βYeah", "514": "βmost", "515": "βke", "516": "ors", "517": "ved", "518": "ys", "519": "βsc", "520": "βhapp", "521": "ope", "522": "βhelp", "523": "atch", "524": "βWhat", "525": "βrem", "526": "ple", "527": "βNow", "528": "βbr", "529": "ool", "530": "oth", "531": "βfour", "532": "self", "533": "βstr", "534": "ne", "535": "thing", "536": "βput", "537": "ial", "538": "βgreat", "539": "ail", "540": "ub", "541": "ning", "542": "βsm", "543": "βfeel", "544": "βfive", "545": "ody", "546": "undred", "547": "iss", "548": "ank", "549": "get", "550": "aking", "551": "βmany", "552": "βhundred", "553": "βyears", "554": "βbeing", "555": "βcome", "556": "βmean", "557": "ily", "558": "βdifferent", "559": "βafter", "560": 
"βser", "561": "βshow", "562": "form", "563": "ful", "564": "oy", "565": "βsix", "566": "βvide", "567": "βV", "568": "βits", "569": "βpoint", "570": "βday", "571": "βdes", "572": "ons", "573": "βbit", "574": "βbel", "575": "βbefore", "576": "βaw", "577": "βend", "578": "βOh", "579": "βstill", "580": "ath", "581": "βlong", "582": "β'", "583": "ise", "584": "ob", "585": "day", "586": "βadd", "587": "ft", "588": "ves", "589": "ces", "590": "ady", "591": "βcr", "592": "βaround", "593": "βtry", "594": "les", "595": "vers", "596": "kay", "597": "ian", "598": "ates", "599": "βfind", "600": "ward", "601": "βAs", "602": "βeight", "603": "lic", "604": "βsame", "605": "βpos", "606": "βem", "607": "βmade", "608": "βsupp", "609": "βlife", "610": "βBe", "611": "pect", "612": "βdec", "613": "βplay", "614": "ange", "615": "βatt", "616": "βpers", "617": "ways", "618": "βhigh", "619": "βhand", "620": "βnext", "621": "βcons", "622": "βown", "623": "βinv", "624": "ower", "625": "βind", "626": "ert", "627": "ng", "628": "ave", "629": "βyear", "630": "βbig", "631": "ating", "632": "βworld", "633": "βrel", "634": "βsure", "635": "βtra", "636": "ew", "637": "ered", "638": "βfin", "639": "βWell", "640": "βsl", "641": "βdoing", "642": "bs", "643": "βset", "644": "βrec", "645": "ual", "646": "cial", "647": "βph", "648": "erm", "649": "βlove", "650": "ph", "651": "βreal", "652": "βlast", "653": "ict", "654": "βbo", "655": "βra", "656": "ible", "657": "βwr", "658": "mer", "659": "βcount", "660": "ities", "661": "βalways", "662": "inet", "663": "ments", "664": "uc", "665": "βmight", "666": "βinter", "667": "βvideo", "668": "gin", "669": "βtell", "670": "βnever", "671": "vent", "672": "βimport", "673": "ied", "674": "βsy", "675": "βHow", "676": "ically", "677": "ought", "678": "βthir", "679": "βrep", "680": "ks", "681": "ib", "682": "βfam", "683": "ject", "684": "βbas", "685": "βShe", "686": "βgive", "687": "akes", "688": "βninet", "689": "βreg", "690": "βmin", "691": "βop", "692": "βdef", 
"693": "βdidn", "694": "te", "695": "βcour", "696": "βwhy", "697": "βent", "698": "βplace", "699": "βins", "700": "βcar", "701": "ather", "702": "βperson", "703": "ular", "704": "βinst", "705": "βprod", "706": "lect", "707": "βAl", "708": "βtoday", "709": "βbec", "710": "βsur", "711": "βAll", "712": "βanother", "713": "βbus", "714": "βkeep", "715": "ell", "716": "ese", "717": "riend", "718": "βquest", "719": "βtalk", "720": "als", "721": "ings", "722": "βmon", "723": "cond", "724": "old", "725": "βacc", "726": "βla", "727": "βnum", "728": "ident", "729": "βche", "730": "iness", "731": "βturn", "732": "βear", "733": "βNo", "734": "ousand", "735": "βbetter", "736": "ific", "737": "βloo", "738": "βgl", "739": "oc", "740": "βimportant", "741": "ited", "742": "βAn", "743": "βthousand", "744": "ility", "745": "llow", "746": "βused", "747": "βgen", "748": "βsim", "749": "li", "750": "βhappen", "751": "βUn", "752": "βLet", "753": "air", "754": "ock", "755": "ably", "756": "gg", "757": "βwatch", "758": "βFor", "759": "βsw", "760": "ren", "761": "ute", "762": "ever", "763": "βpol", "764": "βsch", "765": "βWhen", "766": "βsuch", "767": "βfif", "768": "βhome", "769": "βcle", "770": "βcontin", "771": "ouse", "772": "βfriend", "773": "uring", "774": "βOkay", "775": "gr", "776": "βable", "777": "βstud", "778": "βeff", "779": "hip", "780": "body", "781": "βtop", "782": "ness", "783": "βexper", "784": "βpret", "785": "βboth", "786": "βdone", "787": "cri", "788": "βmark", "789": "βwhile", "790": "βold", "791": "ros", "792": "ont", "793": "βsecond", "794": "ative", "795": "βthought", "796": "βbest", "797": "βfound", "798": "iew", "799": "βbelie", "800": "βeach", "801": "erest", "802": "βtri", "803": "βeas", "804": "βca", "805": "βfact", "806": "βcare", "807": "βfun", "808": "atter", "809": "ures", "810": "βhead", "811": "βlear", "812": "βwater", "813": "βhard", "814": "βfew", "815": "βside", "816": "ween", "817": "βexp", "818": "βaway", "819": "its", "820": "βext", "821": "lud", 
"822": "βrun", "823": "βtrans", "824": "ince", "825": "βsk", "826": "βopen", "827": "cus", "828": "βbetween", "829": "βcalled", "830": "βwee", "831": "βpretty", "832": "ason", "833": "βfar", "834": "ember", "835": "omm", "836": "βinterest", "837": "any", "838": "ner", "839": "uff", "840": "βpres", "841": "βcur", "842": "βchild", "843": "ee", "844": "βtoget", "845": "βtogether", "846": "olog", "847": "βGod", "848": "ond", "849": "βchar", "850": "βlooking", "851": "stem", "852": "az", "853": "cent", "854": "βob", "855": "βass", "856": "land", "857": "βdoesn", "858": "βbusiness", "859": "βcourse", "860": "βten", "861": "ps", "862": "arch", "863": "ced", "864": "ms", "865": "ize", "866": "nce", "867": "βref", "868": "βname", "869": "ross", "870": "βgrow", "871": "oney", "872": "βwent", "873": "ics", "874": "teen", "875": "βcou", "876": "βprob", "877": "βret", "878": "βguys", "879": "βcame", "880": "ash", "881": "led", "882": "βEur", "883": "ues", "884": "βide", "885": "gan", "886": "βeverything", "887": "βgetting", "888": "βask", "889": "βcor", "890": "βbuild", "891": "βsign", "892": "βsmall", "893": "uck", "894": "βel", "895": "βcol", "896": "βIs", "897": "ational", "898": "stand", "899": "cy", "900": "βconf", "901": "der", "902": "βbre", "903": "βcap", "904": "βmod", "905": "ets", "906": "ike", "907": "βnumber", "908": "βcomple", "909": "ertain", "910": "βever", "911": "βcoll", "912": "βhum", "913": "βEurope", "914": "βcre", "915": "βmet", "916": "βexam", "917": "βmove", "918": "βpass", "919": "βleft", "920": "βsystem", "921": "βinclud", "922": "βThank", "923": "cept", "924": "βwom", "925": "βproduct", "926": "ten", "927": "βrest", "928": "βprobably", "929": "βdri", "930": "βDo", "931": "βgener", "932": "βanything", "933": "βlar", "934": "βMy", "935": "βschool", "936": "βlead", "937": "βsub", "938": "βty", "939": "βplan", "940": "βseem", "941": "βwhole", "942": "irect", "943": "βlight", "944": "βmust", "945": "βmom", "946": "βopp", "947": "βsupport", "948": 
"βfamily", "949": "ices", "950": "amp", "951": "βproble", "952": "βdr", "953": "ready", "954": "βusing", "955": "ense", "956": "βprov", "957": "ush", "958": "ax", "959": "βpower", "960": "βRe", "961": "alth", "962": "βev", "963": "βstand", "964": "οΏ½οΏ½war", "965": "ts", "966": "β", "967": "e", "968": "t", "969": "o", "970": "a", "971": "n", "972": "i", "973": "s", "974": "r", "975": "h", "976": "l", "977": "d", "978": "u", "979": "c", "980": "m", "981": "y", "982": "g", "983": "w", "984": "f", "985": "p", "986": ".", "987": "b", "988": ",", "989": "v", "990": "k", "991": "'", "992": "I", "993": "T", "994": "A", "995": "S", "996": "x", "997": "W", "998": "j", "999": "B", "1000": "C", "1001": "H", "1002": "?", "1003": "M", "1004": "O", "1005": "Y", "1006": "N", "1007": "P", "1008": "E", "1009": "q", "1010": "L", "1011": "D", "1012": "z", "1013": "G", "1014": "F", "1015": "R", "1016": "!", "1017": "J", "1018": "U", "1019": "K", "1020": "V", "1021": "Q", "1022": "Z", "1023": "X"}
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/convert-parakeet.py
DELETED
|
@@ -1,697 +0,0 @@
|
|
| 1 |
-
#!/usr/bin/env python3
|
| 2 |
-
"""CLI for exporting Parakeet TDT-CTC 110M Hybrid components to CoreML."""
|
| 3 |
-
from __future__ import annotations
|
| 4 |
-
|
| 5 |
-
import json
|
| 6 |
-
from dataclasses import asdict
|
| 7 |
-
from pathlib import Path
|
| 8 |
-
from typing import Dict, Optional, Tuple
|
| 9 |
-
|
| 10 |
-
import coremltools as ct
|
| 11 |
-
import numpy as np
|
| 12 |
-
import soundfile as sf
|
| 13 |
-
import torch
|
| 14 |
-
import typer
|
| 15 |
-
|
| 16 |
-
import nemo.collections.asr as nemo_asr
|
| 17 |
-
|
| 18 |
-
from individual_components import (
|
| 19 |
-
CTCHeadWrapper,
|
| 20 |
-
DecoderWrapper,
|
| 21 |
-
EncoderWrapper,
|
| 22 |
-
ExportSettings,
|
| 23 |
-
JointWrapper,
|
| 24 |
-
JointDecisionWrapper,
|
| 25 |
-
JointDecisionSingleStep,
|
| 26 |
-
PreprocessorWrapper,
|
| 27 |
-
MelEncoderWrapper,
|
| 28 |
-
_coreml_convert,
|
| 29 |
-
)
|
| 30 |
-
|
| 31 |
-
# Default value of the --model-id option: the model identifier to download when --nemo-path is omitted.
DEFAULT_MODEL_ID = "nvidia/parakeet-tdt_ctc-110m"
|
| 32 |
-
# Author string that _save_mlpackage assigns to each saved model's `author` field.
AUTHOR = "Fluid Inference"
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
def _compute_length(seconds: float, sample_rate: int) -> int:
|
| 36 |
-
return int(round(seconds * sample_rate))
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
def _prepare_audio(
|
| 40 |
-
validation_audio: Optional[Path],
|
| 41 |
-
sample_rate: int,
|
| 42 |
-
max_samples: int,
|
| 43 |
-
seed: Optional[int],
|
| 44 |
-
) -> torch.Tensor:
|
| 45 |
-
if validation_audio is None:
|
| 46 |
-
if seed is not None:
|
| 47 |
-
torch.manual_seed(seed)
|
| 48 |
-
audio = torch.randn(1, max_samples, dtype=torch.float32)
|
| 49 |
-
return audio
|
| 50 |
-
|
| 51 |
-
data, sr = sf.read(str(validation_audio), dtype="float32")
|
| 52 |
-
if sr != sample_rate:
|
| 53 |
-
raise typer.BadParameter(
|
| 54 |
-
f"Validation audio sample rate {sr} does not match model rate {sample_rate}"
|
| 55 |
-
)
|
| 56 |
-
|
| 57 |
-
if data.ndim > 1:
|
| 58 |
-
data = data[:, 0]
|
| 59 |
-
|
| 60 |
-
if data.size == 0:
|
| 61 |
-
raise typer.BadParameter("Validation audio is empty")
|
| 62 |
-
|
| 63 |
-
if data.size < max_samples:
|
| 64 |
-
pad_width = max_samples - data.size
|
| 65 |
-
data = np.pad(data, (0, pad_width))
|
| 66 |
-
elif data.size > max_samples:
|
| 67 |
-
data = data[:max_samples]
|
| 68 |
-
|
| 69 |
-
audio = torch.from_numpy(data).unsqueeze(0).to(dtype=torch.float32)
|
| 70 |
-
return audio
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
def _save_mlpackage(model: ct.models.MLModel, path: Path, description: str) -> None:
    """Stamp metadata on ``model`` and write it to ``path``.

    Sets the model's short description and author, creates the parent
    directory of ``path`` if it does not exist, and saves the model with
    ``model.save``.

    Args:
        model: Converted CoreML model to save.
        path: Destination path; missing parent directories are created.
        description: Text stored as the model's short description.
    """
    # Ensure iOS 17+ target for MLProgram ops and ANE readiness. Setting it is
    # best-effort (a failure must not abort the export), but the failure is
    # reported on stderr instead of being silently discarded.
    try:
        model.minimum_deployment_target = ct.target.iOS17
    except Exception as exc:
        typer.echo(
            f"Warning: could not set minimum deployment target for {path.name}: {exc}",
            err=True,
        )
    model.short_description = description
    model.author = AUTHOR
    path.parent.mkdir(parents=True, exist_ok=True)
    model.save(str(path))
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
def _tensor_shape(tensor: torch.Tensor) -> Tuple[int, ...]:
|
| 86 |
-
return tuple(int(dim) for dim in tensor.shape)
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
def _parse_compute_units(name: str) -> ct.ComputeUnit:
    """Translate a compute-units name into the matching ``ct.ComputeUnit``.

    Matching ignores case and surrounding whitespace. Recognised names are
    ALL, CPU_ONLY, CPU_AND_GPU, CPU_AND_NE and CPU_AND_NEURALENGINE (which
    maps to the same member as CPU_AND_NE).

    Raises:
        typer.BadParameter: if ``name`` is not one of the recognised names.
    """
    key = str(name).strip().upper()
    units_by_name = {
        "ALL": ct.ComputeUnit.ALL,
        "CPU_ONLY": ct.ComputeUnit.CPU_ONLY,
        "CPU_AND_GPU": ct.ComputeUnit.CPU_AND_GPU,
        "CPU_AND_NE": ct.ComputeUnit.CPU_AND_NE,
        "CPU_AND_NEURALENGINE": ct.ComputeUnit.CPU_AND_NE,
    }
    if key in units_by_name:
        return units_by_name[key]
    raise typer.BadParameter(
        f"Unknown compute units '{name}'. Choose from: " + ", ".join(units_by_name.keys())
    )
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
def _parse_compute_precision(name: Optional[str]) -> Optional[ct.precision]:
|
| 110 |
-
"""Parse compute precision string into ct.precision or None.
|
| 111 |
-
|
| 112 |
-
Accepted (case-insensitive): FLOAT32, FLOAT16. If None/empty, returns None (tool default).
|
| 113 |
-
"""
|
| 114 |
-
if name is None:
|
| 115 |
-
return None
|
| 116 |
-
normalized = str(name).strip().upper()
|
| 117 |
-
if normalized == "":
|
| 118 |
-
return None
|
| 119 |
-
mapping = {
|
| 120 |
-
"FLOAT32": ct.precision.FLOAT32,
|
| 121 |
-
"FLOAT16": ct.precision.FLOAT16,
|
| 122 |
-
}
|
| 123 |
-
if normalized not in mapping:
|
| 124 |
-
raise typer.BadParameter(
|
| 125 |
-
f"Unknown compute precision '{name}'. Choose from: " + ", ".join(mapping.keys())
|
| 126 |
-
)
|
| 127 |
-
return mapping[normalized]
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
# Fixed export choices: CPU_ONLY + FP32, min target iOS17
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
# Typer application object; the `convert` function below is registered on it via @app.command().
app = typer.Typer(add_completion=False, pretty_exceptions_show_locals=False)
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
@app.command()
def convert(
    nemo_path: Optional[Path] = typer.Option(
        None,
        "--nemo-path",
        exists=True,
        resolve_path=True,
        help="Path to parakeet-tdt_ctc-110m .nemo checkpoint (skip to auto-download)",
    ),
    model_id: str = typer.Option(
        DEFAULT_MODEL_ID,
        "--model-id",
        help="Model identifier to download when --nemo-path is omitted",
    ),
    output_dir: Path = typer.Option(Path("parakeet_110m_coreml"), help="Directory where mlpackages and metadata will be written"),
    preprocessor_cu: str = typer.Option(
        "CPU_ONLY",
        "--preprocessor-cu",
        help="Compute units for preprocessor (default CPU_ONLY)",
    ),
    mel_encoder_cu: str = typer.Option(
        "CPU_ONLY",
        "--mel-encoder-cu",
        help="Compute units for fused mel+encoder (default CPU_ONLY)",
    ),
    compute_precision: Optional[str] = typer.Option(
        None,
        "--compute-precision",
        help="Export precision: FLOAT32 (default) or FLOAT16 to shrink non-quantized weights.",
    ),
) -> None:
    """Export all Parakeet TDT-CTC 110M Hybrid sub-modules to CoreML with a fixed 15-second window.

    This exports both CTC and TDT components from the hybrid model.
    """
    # NOTE: Typer uses the docstring above as the command's --help text, so
    # implementation notes live in comments instead.
    #
    # Outputs written to `output_dir`:
    #   parakeet_preprocessor.mlpackage, parakeet_encoder.mlpackage,
    #   parakeet_ctc_head.mlpackage, parakeet_mel_encoder.mlpackage,
    #   parakeet_decoder.mlpackage, parakeet_joint.mlpackage,
    #   parakeet_joint_decision.mlpackage,
    #   parakeet_joint_decision_single_step.mlpackage,
    #   vocab.json and metadata.json.
    #
    # Raises typer.BadParameter (via the _parse_* helpers) for unknown
    # compute-unit or precision strings.

    # Runtime CoreML contract keeps U=1 so the prediction net matches the streaming decoder.
    export_settings = ExportSettings(
        output_dir=output_dir,
        compute_units=ct.ComputeUnit.CPU_ONLY,  # Default: CPU-only for all components
        deployment_target=ct.target.iOS17,  # iOS 17+ features and kernels
        compute_precision=_parse_compute_precision(compute_precision),
        max_audio_seconds=15.0,
        max_symbol_steps=1,
    )

    typer.echo("Export configuration:")
    typer.echo(asdict(export_settings))

    output_dir.mkdir(parents=True, exist_ok=True)
    pre_cu = _parse_compute_units(preprocessor_cu)
    melenc_cu = _parse_compute_units(mel_encoder_cu)

    if nemo_path is not None:
        typer.echo(f"Loading NeMo model from {nemo_path}…")
        # 110M is a hybrid model: EncDecHybridRNNTCTCBPEModel
        asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(
            str(nemo_path), map_location="cpu"
        )
        checkpoint_meta = {
            "type": "file",
            "path": str(nemo_path),
        }
    else:
        typer.echo(f"Downloading NeMo model via {model_id}…")
        # 110M is a hybrid model: EncDecHybridRNNTCTCBPEModel
        asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
            model_id, map_location="cpu"
        )
        checkpoint_meta = {
            "type": "pretrained",
            "model_id": model_id,
        }
    asr_model.eval()

    sample_rate = int(asr_model.cfg.preprocessor.sample_rate)
    max_samples = _compute_length(export_settings.max_audio_seconds, sample_rate)

    # Look for a bundled 15s 16kHz audio file
    default_audio = (Path(__file__).parent / "audio" / "yc_first_minute_16k_15s.wav").resolve()
    if default_audio.exists():
        typer.echo(f"Using trace audio: {default_audio}")
        audio_tensor = _prepare_audio(default_audio, sample_rate, max_samples, seed=None)
    else:
        typer.echo("No trace audio found, using random noise for tracing")
        audio_tensor = _prepare_audio(None, sample_rate, max_samples, seed=42)
    audio_length = torch.tensor([max_samples], dtype=torch.int32)

    preprocessor = PreprocessorWrapper(asr_model.preprocessor.eval())
    encoder = EncoderWrapper(asr_model.encoder.eval())
    decoder = DecoderWrapper(asr_model.decoder.eval())
    joint = JointWrapper(asr_model.joint.eval())
    # CTC head for hybrid model
    ctc_head = CTCHeadWrapper(asr_model.ctc_decoder.eval())

    # Remember the decoder's export flag so it can be restored in `finally`,
    # whether or not the export succeeds.
    decoder_export_flag = getattr(asr_model.decoder, "_rnnt_export", False)
    asr_model.decoder._rnnt_export = True

    try:
        # Reference forward pass: its outputs are used as example inputs for
        # tracing the downstream modules and as shapes for the metadata.
        with torch.inference_mode():
            mel_ref, mel_length_ref = preprocessor(audio_tensor, audio_length)
            mel_length_ref = mel_length_ref.to(dtype=torch.int32)
            encoder_ref, encoder_length_ref = encoder(mel_ref, mel_length_ref)
            encoder_length_ref = encoder_length_ref.to(dtype=torch.int32)
            # CTC log probs
            ctc_log_probs_ref = ctc_head(encoder_ref)

        # Clone Tensors to drop the inference tensor flag before tracing
        mel_ref = mel_ref.clone()
        mel_length_ref = mel_length_ref.clone()
        encoder_ref = encoder_ref.clone()
        encoder_length_ref = encoder_length_ref.clone()
        ctc_log_probs_ref = ctc_log_probs_ref.clone()

        vocab_size = int(asr_model.tokenizer.vocab_size)
        num_extra = int(asr_model.joint.num_extra_outputs)
        decoder_hidden = int(asr_model.decoder.pred_hidden)
        decoder_layers = int(asr_model.decoder.pred_rnn_layers)

        typer.echo("Model info:")
        typer.echo(f" Vocab size: {vocab_size}")
        typer.echo(f" Num extra (duration bins): {num_extra}")
        typer.echo(f" Decoder hidden: {decoder_hidden}")
        typer.echo(f" Decoder layers: {decoder_layers}")
        typer.echo(f" Encoder output shape: {_tensor_shape(encoder_ref)}")

        # Example decoder inputs: blank-filled targets of length
        # max_symbol_steps and an all-zero state reused for both h and c.
        targets = torch.full(
            (1, export_settings.max_symbol_steps),
            fill_value=asr_model.decoder.blank_idx,
            dtype=torch.int32,
        )
        target_lengths = torch.tensor(
            [export_settings.max_symbol_steps], dtype=torch.int32
        )
        zero_state = torch.zeros(
            decoder_layers,
            1,
            decoder_hidden,
            dtype=torch.float32,
        )

        with torch.inference_mode():
            decoder_ref, h_ref, c_ref = decoder(targets, target_lengths, zero_state, zero_state)
            joint_ref = joint(encoder_ref, decoder_ref)

        decoder_ref = decoder_ref.clone()
        h_ref = h_ref.clone()
        c_ref = c_ref.clone()
        joint_ref = joint_ref.clone()

        typer.echo(f" Decoder output shape: {_tensor_shape(decoder_ref)}")
        typer.echo(f" Joint output shape: {_tensor_shape(joint_ref)}")
        typer.echo(f" CTC log probs shape: {_tensor_shape(ctc_log_probs_ref)}")

        typer.echo("Tracing and converting preprocessor…")
        # Ensure tracing happens on CPU explicitly
        preprocessor = preprocessor.cpu()
        audio_tensor = audio_tensor.cpu()
        audio_length = audio_length.cpu()
        traced_preprocessor = torch.jit.trace(
            preprocessor, (audio_tensor, audio_length), strict=False
        )
        traced_preprocessor.eval()
        preprocessor_inputs = [
            # Allow variable-length audio up to the fixed 15s window using RangeDim
            ct.TensorType(
                name="audio",
                shape=(1, ct.RangeDim(1, max_samples)),
                dtype=np.float32,
            ),
            ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
        ]
        preprocessor_outputs = [
            ct.TensorType(name="mel_features", dtype=np.float32),
            ct.TensorType(name="mel_length", dtype=np.int32),
        ]
        # Preprocessor compute units (parametrized; default CPU_ONLY)
        preprocessor_model = _coreml_convert(
            traced_preprocessor,
            preprocessor_inputs,
            preprocessor_outputs,
            export_settings,
            compute_units_override=pre_cu,
        )
        preprocessor_path = output_dir / "parakeet_preprocessor.mlpackage"
        _save_mlpackage(
            preprocessor_model,
            preprocessor_path,
            "Parakeet 110M preprocessor (15 s window)",
        )

        typer.echo("Tracing and converting encoder…")
        traced_encoder = torch.jit.trace(
            encoder, (mel_ref, mel_length_ref), strict=False
        )
        traced_encoder.eval()
        encoder_inputs = [
            ct.TensorType(name="mel_features", shape=_tensor_shape(mel_ref), dtype=np.float32),
            ct.TensorType(name="mel_length", shape=(1,), dtype=np.int32),
        ]
        encoder_outputs = [
            ct.TensorType(name="encoder_output", dtype=np.float32),
            ct.TensorType(name="encoder_length", dtype=np.int32),
        ]
        # Encoder: CPU only
        encoder_model = _coreml_convert(
            traced_encoder,
            encoder_inputs,
            encoder_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        encoder_path = output_dir / "parakeet_encoder.mlpackage"
        _save_mlpackage(
            encoder_model,
            encoder_path,
            "Parakeet 110M encoder (15 s window)",
        )

        # CTC Head for hybrid model
        typer.echo("Tracing and converting CTC head…")
        traced_ctc_head = torch.jit.trace(
            ctc_head, (encoder_ref,), strict=False
        )
        traced_ctc_head.eval()
        ctc_head_inputs = [
            ct.TensorType(name="encoder_output", shape=_tensor_shape(encoder_ref), dtype=np.float32),
        ]
        ctc_head_outputs = [
            ct.TensorType(name="ctc_logits", dtype=np.float32),
        ]
        ctc_head_model = _coreml_convert(
            traced_ctc_head,
            ctc_head_inputs,
            ctc_head_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        ctc_head_path = output_dir / "parakeet_ctc_head.mlpackage"
        _save_mlpackage(
            ctc_head_model,
            ctc_head_path,
            "Parakeet 110M CTC decoder head",
        )

        # Optional fused export: Preprocessor + Encoder
        typer.echo("Tracing and converting fused mel+encoder…")
        mel_encoder = MelEncoderWrapper(preprocessor, encoder)
        traced_mel_encoder = torch.jit.trace(
            mel_encoder, (audio_tensor, audio_length), strict=False
        )
        traced_mel_encoder.eval()
        mel_encoder_inputs = [
            # Keep fixed 15s window for fused Mel+Encoder
            ct.TensorType(name="audio", shape=(1, max_samples), dtype=np.float32),
            ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
        ]
        mel_encoder_outputs = [
            ct.TensorType(name="encoder_output", dtype=np.float32),
            ct.TensorType(name="encoder_length", dtype=np.int32),
        ]
        # Fused mel+encoder compute units (parametrized; default CPU_ONLY)
        mel_encoder_model = _coreml_convert(
            traced_mel_encoder,
            mel_encoder_inputs,
            mel_encoder_outputs,
            export_settings,
            compute_units_override=melenc_cu,
        )
        mel_encoder_path = output_dir / "parakeet_mel_encoder.mlpackage"
        _save_mlpackage(
            mel_encoder_model,
            mel_encoder_path,
            "Parakeet 110M fused Mel+Encoder (15 s window)",
        )

        typer.echo("Tracing and converting decoder…")
        traced_decoder = torch.jit.trace(
            decoder,
            (targets, target_lengths, zero_state, zero_state),
            strict=False,
        )
        traced_decoder.eval()
        decoder_inputs = [
            ct.TensorType(name="targets", shape=_tensor_shape(targets), dtype=np.int32),
            ct.TensorType(name="target_length", shape=(1,), dtype=np.int32),
            ct.TensorType(name="h_in", shape=_tensor_shape(zero_state), dtype=np.float32),
            ct.TensorType(name="c_in", shape=_tensor_shape(zero_state), dtype=np.float32),
        ]
        decoder_outputs = [
            ct.TensorType(name="decoder", dtype=np.float32),
            ct.TensorType(name="h_out", dtype=np.float32),
            ct.TensorType(name="c_out", dtype=np.float32),
        ]
        # Decoder: CPU only
        decoder_model = _coreml_convert(
            traced_decoder,
            decoder_inputs,
            decoder_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        decoder_path = output_dir / "parakeet_decoder.mlpackage"
        _save_mlpackage(
            decoder_model,
            decoder_path,
            "Parakeet 110M decoder (RNNT prediction network)",
        )

        typer.echo("Tracing and converting joint…")
        traced_joint = torch.jit.trace(
            joint,
            (encoder_ref, decoder_ref),
            strict=False,
        )
        traced_joint.eval()
        joint_inputs = [
            ct.TensorType(name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32),
            ct.TensorType(name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32),
        ]
        joint_outputs = [
            ct.TensorType(name="logits", dtype=np.float32),
        ]
        # Joint: CPU only
        joint_model = _coreml_convert(
            traced_joint,
            joint_inputs,
            joint_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        joint_path = output_dir / "parakeet_joint.mlpackage"
        _save_mlpackage(
            joint_model,
            joint_path,
            "Parakeet 110M joint network (RNNT)",
        )

        # Joint + decision head (split logits, softmax, argmax)
        # vocab_size / num_extra were already read from the model above.
        typer.echo("Tracing and converting joint decision head…")
        joint_decision = JointDecisionWrapper(joint, vocab_size=vocab_size, num_extra=num_extra)
        traced_joint_decision = torch.jit.trace(
            joint_decision,
            (encoder_ref, decoder_ref),
            strict=False,
        )
        traced_joint_decision.eval()
        joint_decision_inputs = [
            ct.TensorType(name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32),
            ct.TensorType(name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32),
        ]
        joint_decision_outputs = [
            ct.TensorType(name="token_id", dtype=np.int32),
            ct.TensorType(name="token_prob", dtype=np.float32),
            ct.TensorType(name="duration", dtype=np.int32),
        ]
        # JointDecision: CPU only
        joint_decision_model = _coreml_convert(
            traced_joint_decision,
            joint_decision_inputs,
            joint_decision_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        joint_decision_path = output_dir / "parakeet_joint_decision.mlpackage"
        _save_mlpackage(
            joint_decision_model,
            joint_decision_path,
            "Parakeet 110M joint + decision head (split, softmax, argmax)",
        )

        # Single-step JointDecision for [1,512,1] x [1,640,1] -> [1,1,1]
        # Note: 110M encoder dim is 512 (not 1024 like 0.6B)
        typer.echo("Tracing and converting single-step joint decision…")
        jd_single = JointDecisionSingleStep(joint, vocab_size=vocab_size, num_extra=num_extra)
        # Create single-step slices from refs
        enc_step = encoder_ref[:, :, :1].contiguous()
        dec_step = decoder_ref[:, :, :1].contiguous()
        traced_jd_single = torch.jit.trace(
            jd_single,
            (enc_step, dec_step),
            strict=False,
        )
        traced_jd_single.eval()
        jd_single_inputs = [
            ct.TensorType(name="encoder_step", shape=(1, enc_step.shape[1], 1), dtype=np.float32),
            ct.TensorType(name="decoder_step", shape=(1, dec_step.shape[1], 1), dtype=np.float32),
        ]
        jd_single_outputs = [
            ct.TensorType(name="token_id", dtype=np.int32),
            ct.TensorType(name="token_prob", dtype=np.float32),
            ct.TensorType(name="duration", dtype=np.int32),
            ct.TensorType(name="top_k_ids", dtype=np.int32),
            ct.TensorType(name="top_k_logits", dtype=np.float32),
        ]
        # Single-step JointDecision: CPU only
        jd_single_model = _coreml_convert(
            traced_jd_single,
            jd_single_inputs,
            jd_single_outputs,
            export_settings,
            compute_units_override=ct.ComputeUnit.CPU_ONLY,
        )
        jd_single_path = output_dir / "parakeet_joint_decision_single_step.mlpackage"
        _save_mlpackage(
            jd_single_model,
            jd_single_path,
            "Parakeet 110M single-step joint decision (current frame)",
        )

        # Export vocabulary
        typer.echo("Exporting vocabulary…")
        vocab_path = output_dir / "vocab.json"
        vocab_dict = {
            "vocab_size": vocab_size,
            "blank_id": int(asr_model.decoder.blank_idx),
            "tokens": asr_model.tokenizer.vocab,
        }
        # ensure_ascii=False emits raw non-ASCII characters, so the file must be
        # written as UTF-8 explicitly rather than with the locale's encoding.
        vocab_path.write_text(
            json.dumps(vocab_dict, indent=2, ensure_ascii=False), encoding="utf-8"
        )

        # NOTE(review): "coreml.compute_units" below records the ExportSettings
        # default, not the per-component overrides (pre_cu / melenc_cu) that
        # were passed to _coreml_convert. Also, some I/O keys here (e.g.
        # "audio_signal", "mel") differ from the CoreML tensor names declared
        # above ("audio", "mel_features") - confirm what consumers expect.
        metadata: Dict[str, object] = {
            "model_id": model_id,
            "model_type": "hybrid_rnnt_ctc",
            "sample_rate": sample_rate,
            "max_audio_seconds": export_settings.max_audio_seconds,
            "max_audio_samples": max_samples,
            "max_symbol_steps": export_settings.max_symbol_steps,
            "vocab_size": vocab_size,
            "joint_extra_outputs": num_extra,
            "encoder_dim": int(encoder_ref.shape[1]),  # 512 for 110M
            "decoder_dim": int(decoder_ref.shape[1]),  # 640
            "decoder_hidden": decoder_hidden,
            "decoder_layers": decoder_layers,
            "blank_id": int(asr_model.decoder.blank_idx),
            "checkpoint": checkpoint_meta,
            "coreml": {
                "compute_units": export_settings.compute_units.name,
                "compute_precision": (
                    export_settings.compute_precision.name
                    if export_settings.compute_precision is not None
                    else "FLOAT32"
                ),
            },
            "components": {
                "preprocessor": {
                    "inputs": {
                        "audio_signal": list(_tensor_shape(audio_tensor)),
                        "audio_length": [1],
                    },
                    "outputs": {
                        "mel": list(_tensor_shape(mel_ref)),
                        "mel_length": [1],
                    },
                    "path": preprocessor_path.name,
                },
                "encoder": {
                    "inputs": {
                        "mel": list(_tensor_shape(mel_ref)),
                        "mel_length": [1],
                    },
                    "outputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "encoder_length": [1],
                    },
                    "path": encoder_path.name,
                },
                "ctc_head": {
                    "inputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                    },
                    "outputs": {
                        "log_probs": list(_tensor_shape(ctc_log_probs_ref)),
                    },
                    "path": ctc_head_path.name,
                },
                "mel_encoder": {
                    "inputs": {
                        "audio_signal": [1, max_samples],
                        "audio_length": [1],
                    },
                    "outputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "encoder_length": [1],
                    },
                    "path": mel_encoder_path.name,
                },
                "decoder": {
                    "inputs": {
                        "targets": list(_tensor_shape(targets)),
                        "target_length": [1],
                        "h_in": list(_tensor_shape(zero_state)),
                        "c_in": list(_tensor_shape(zero_state)),
                    },
                    "outputs": {
                        "decoder": list(_tensor_shape(decoder_ref)),
                        "h_out": list(_tensor_shape(h_ref)),
                        "c_out": list(_tensor_shape(c_ref)),
                    },
                    "path": decoder_path.name,
                },
                "joint": {
                    "inputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "decoder": list(_tensor_shape(decoder_ref)),
                    },
                    "outputs": {
                        "logits": list(_tensor_shape(joint_ref)),
                    },
                    "path": joint_path.name,
                },
                "joint_decision": {
                    "inputs": {
                        "encoder": list(_tensor_shape(encoder_ref)),
                        "decoder": list(_tensor_shape(decoder_ref)),
                    },
                    "outputs": {
                        "token_id": [
                            _tensor_shape(encoder_ref)[0],
                            _tensor_shape(encoder_ref)[2],
                            _tensor_shape(decoder_ref)[2],
                        ],
                        "token_prob": [
                            _tensor_shape(encoder_ref)[0],
                            _tensor_shape(encoder_ref)[2],
                            _tensor_shape(decoder_ref)[2],
                        ],
                        "duration": [
                            _tensor_shape(encoder_ref)[0],
                            _tensor_shape(encoder_ref)[2],
                            _tensor_shape(decoder_ref)[2],
                        ],
                    },
                    "path": joint_decision_path.name,
                },
                "joint_decision_single_step": {
                    "inputs": {
                        "encoder_step": [1, int(encoder_ref.shape[1]), 1],
                        "decoder_step": [1, int(decoder_ref.shape[1]), 1],
                    },
                    "outputs": {
                        "token_id": [1, 1, 1],
                        "token_prob": [1, 1, 1],
                        "duration": [1, 1, 1],
                        "top_k_ids": [1, 1, 1, 64],
                        "top_k_logits": [1, 1, 1, 64],
                    },
                    "path": jd_single_path.name,
                },
            },
        }

        metadata_path = output_dir / "metadata.json"
        metadata_path.write_text(json.dumps(metadata, indent=2), encoding="utf-8")
        typer.echo(f"Export complete. Metadata written to {metadata_path}")

    finally:
        # Restore the decoder's original export flag even if conversion failed.
        asr_model.decoder._rnnt_export = decoder_export_flag
|
| 188 |
-
if nemo_path is not None:
|
| 189 |
-
typer.echo(f"Loading NeMo model from {nemo_path}β¦")
|
| 190 |
-
# 110M is a hybrid model: EncDecHybridRNNTCTCBPEModel
|
| 191 |
-
asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.restore_from(
|
| 192 |
-
str(nemo_path), map_location="cpu"
|
| 193 |
-
)
|
| 194 |
-
checkpoint_meta = {
|
| 195 |
-
"type": "file",
|
| 196 |
-
"path": str(nemo_path),
|
| 197 |
-
}
|
| 198 |
-
else:
|
| 199 |
-
typer.echo(f"Downloading NeMo model via {model_id}β¦")
|
| 200 |
-
# 110M is a hybrid model: EncDecHybridRNNTCTCBPEModel
|
| 201 |
-
asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
|
| 202 |
-
model_id, map_location="cpu"
|
| 203 |
-
)
|
| 204 |
-
checkpoint_meta = {
|
| 205 |
-
"type": "pretrained",
|
| 206 |
-
"model_id": model_id,
|
| 207 |
-
}
|
| 208 |
-
asr_model.eval()
|
| 209 |
-
|
| 210 |
-
sample_rate = int(asr_model.cfg.preprocessor.sample_rate)
|
| 211 |
-
max_samples = _compute_length(export_settings.max_audio_seconds, sample_rate)
|
| 212 |
-
|
| 213 |
-
# Look for a bundled 15s 16kHz audio file
|
| 214 |
-
default_audio = (Path(__file__).parent / "audio" / "yc_first_minute_16k_15s.wav").resolve()
|
| 215 |
-
if default_audio.exists():
|
| 216 |
-
typer.echo(f"Using trace audio: {default_audio}")
|
| 217 |
-
audio_tensor = _prepare_audio(default_audio, sample_rate, max_samples, seed=None)
|
| 218 |
-
else:
|
| 219 |
-
typer.echo("No trace audio found, using random noise for tracing")
|
| 220 |
-
audio_tensor = _prepare_audio(None, sample_rate, max_samples, seed=42)
|
| 221 |
-
audio_length = torch.tensor([max_samples], dtype=torch.int32)
|
| 222 |
-
|
| 223 |
-
preprocessor = PreprocessorWrapper(asr_model.preprocessor.eval())
|
| 224 |
-
encoder = EncoderWrapper(asr_model.encoder.eval())
|
| 225 |
-
decoder = DecoderWrapper(asr_model.decoder.eval())
|
| 226 |
-
joint = JointWrapper(asr_model.joint.eval())
|
| 227 |
-
# CTC head for hybrid model
|
| 228 |
-
ctc_head = CTCHeadWrapper(asr_model.ctc_decoder.eval())
|
| 229 |
-
|
| 230 |
-
decoder_export_flag = getattr(asr_model.decoder, "_rnnt_export", False)
|
| 231 |
-
asr_model.decoder._rnnt_export = True
|
| 232 |
-
|
| 233 |
-
try:
|
| 234 |
-
with torch.inference_mode():
|
| 235 |
-
mel_ref, mel_length_ref = preprocessor(audio_tensor, audio_length)
|
| 236 |
-
mel_length_ref = mel_length_ref.to(dtype=torch.int32)
|
| 237 |
-
encoder_ref, encoder_length_ref = encoder(mel_ref, mel_length_ref)
|
| 238 |
-
encoder_length_ref = encoder_length_ref.to(dtype=torch.int32)
|
| 239 |
-
# CTC log probs
|
| 240 |
-
ctc_log_probs_ref = ctc_head(encoder_ref)
|
| 241 |
-
|
| 242 |
-
# Clone Tensors to drop the inference tensor flag before tracing
|
| 243 |
-
mel_ref = mel_ref.clone()
|
| 244 |
-
mel_length_ref = mel_length_ref.clone()
|
| 245 |
-
encoder_ref = encoder_ref.clone()
|
| 246 |
-
encoder_length_ref = encoder_length_ref.clone()
|
| 247 |
-
ctc_log_probs_ref = ctc_log_probs_ref.clone()
|
| 248 |
-
|
| 249 |
-
vocab_size = int(asr_model.tokenizer.vocab_size)
|
| 250 |
-
num_extra = int(asr_model.joint.num_extra_outputs)
|
| 251 |
-
decoder_hidden = int(asr_model.decoder.pred_hidden)
|
| 252 |
-
decoder_layers = int(asr_model.decoder.pred_rnn_layers)
|
| 253 |
-
|
| 254 |
-
typer.echo(f"Model info:")
|
| 255 |
-
typer.echo(f" Vocab size: {vocab_size}")
|
| 256 |
-
typer.echo(f" Num extra (duration bins): {num_extra}")
|
| 257 |
-
typer.echo(f" Decoder hidden: {decoder_hidden}")
|
| 258 |
-
typer.echo(f" Decoder layers: {decoder_layers}")
|
| 259 |
-
typer.echo(f" Encoder output shape: {_tensor_shape(encoder_ref)}")
|
| 260 |
-
|
| 261 |
-
targets = torch.full(
|
| 262 |
-
(1, export_settings.max_symbol_steps),
|
| 263 |
-
fill_value=asr_model.decoder.blank_idx,
|
| 264 |
-
dtype=torch.int32,
|
| 265 |
-
)
|
| 266 |
-
target_lengths = torch.tensor(
|
| 267 |
-
[export_settings.max_symbol_steps], dtype=torch.int32
|
| 268 |
-
)
|
| 269 |
-
zero_state = torch.zeros(
|
| 270 |
-
decoder_layers,
|
| 271 |
-
1,
|
| 272 |
-
decoder_hidden,
|
| 273 |
-
dtype=torch.float32,
|
| 274 |
-
)
|
| 275 |
-
|
| 276 |
-
with torch.inference_mode():
|
| 277 |
-
decoder_ref, h_ref, c_ref = decoder(targets, target_lengths, zero_state, zero_state)
|
| 278 |
-
joint_ref = joint(encoder_ref, decoder_ref)
|
| 279 |
-
|
| 280 |
-
decoder_ref = decoder_ref.clone()
|
| 281 |
-
h_ref = h_ref.clone()
|
| 282 |
-
c_ref = c_ref.clone()
|
| 283 |
-
joint_ref = joint_ref.clone()
|
| 284 |
-
|
| 285 |
-
typer.echo(f" Decoder output shape: {_tensor_shape(decoder_ref)}")
|
| 286 |
-
typer.echo(f" Joint output shape: {_tensor_shape(joint_ref)}")
|
| 287 |
-
typer.echo(f" CTC log probs shape: {_tensor_shape(ctc_log_probs_ref)}")
|
| 288 |
-
|
| 289 |
-
typer.echo("Tracing and converting preprocessorβ¦")
|
| 290 |
-
# Ensure tracing happens on CPU explicitly
|
| 291 |
-
preprocessor = preprocessor.cpu()
|
| 292 |
-
audio_tensor = audio_tensor.cpu()
|
| 293 |
-
audio_length = audio_length.cpu()
|
| 294 |
-
traced_preprocessor = torch.jit.trace(
|
| 295 |
-
preprocessor, (audio_tensor, audio_length), strict=False
|
| 296 |
-
)
|
| 297 |
-
traced_preprocessor.eval()
|
| 298 |
-
preprocessor_inputs = [
|
| 299 |
-
# Allow variable-length audio up to the fixed 15s window using RangeDim
|
| 300 |
-
ct.TensorType(
|
| 301 |
-
name="audio",
|
| 302 |
-
shape=(1, ct.RangeDim(1, max_samples)),
|
| 303 |
-
dtype=np.float32,
|
| 304 |
-
),
|
| 305 |
-
ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
|
| 306 |
-
]
|
| 307 |
-
preprocessor_outputs = [
|
| 308 |
-
ct.TensorType(name="mel_features", dtype=np.float32),
|
| 309 |
-
ct.TensorType(name="mel_length", dtype=np.int32),
|
| 310 |
-
]
|
| 311 |
-
# Preprocessor compute units (parametrized; default CPU_ONLY)
|
| 312 |
-
preprocessor_model = _coreml_convert(
|
| 313 |
-
traced_preprocessor,
|
| 314 |
-
preprocessor_inputs,
|
| 315 |
-
preprocessor_outputs,
|
| 316 |
-
export_settings,
|
| 317 |
-
compute_units_override=pre_cu,
|
| 318 |
-
)
|
| 319 |
-
preprocessor_path = output_dir / "parakeet_preprocessor.mlpackage"
|
| 320 |
-
_save_mlpackage(
|
| 321 |
-
preprocessor_model,
|
| 322 |
-
preprocessor_path,
|
| 323 |
-
"Parakeet 110M preprocessor (15 s window)",
|
| 324 |
-
)
|
| 325 |
-
|
| 326 |
-
typer.echo("Tracing and converting encoderβ¦")
|
| 327 |
-
traced_encoder = torch.jit.trace(
|
| 328 |
-
encoder, (mel_ref, mel_length_ref), strict=False
|
| 329 |
-
)
|
| 330 |
-
traced_encoder.eval()
|
| 331 |
-
encoder_inputs = [
|
| 332 |
-
ct.TensorType(name="mel_features", shape=_tensor_shape(mel_ref), dtype=np.float32),
|
| 333 |
-
ct.TensorType(name="mel_length", shape=(1,), dtype=np.int32),
|
| 334 |
-
]
|
| 335 |
-
encoder_outputs = [
|
| 336 |
-
ct.TensorType(name="encoder_output", dtype=np.float32),
|
| 337 |
-
ct.TensorType(name="encoder_length", dtype=np.int32),
|
| 338 |
-
]
|
| 339 |
-
# Encoder: CPU only
|
| 340 |
-
encoder_model = _coreml_convert(
|
| 341 |
-
traced_encoder,
|
| 342 |
-
encoder_inputs,
|
| 343 |
-
encoder_outputs,
|
| 344 |
-
export_settings,
|
| 345 |
-
compute_units_override=ct.ComputeUnit.CPU_ONLY,
|
| 346 |
-
)
|
| 347 |
-
encoder_path = output_dir / "parakeet_encoder.mlpackage"
|
| 348 |
-
_save_mlpackage(
|
| 349 |
-
encoder_model,
|
| 350 |
-
encoder_path,
|
| 351 |
-
"Parakeet 110M encoder (15 s window)",
|
| 352 |
-
)
|
| 353 |
-
|
| 354 |
-
# CTC Head for hybrid model
|
| 355 |
-
typer.echo("Tracing and converting CTC headβ¦")
|
| 356 |
-
traced_ctc_head = torch.jit.trace(
|
| 357 |
-
ctc_head, (encoder_ref,), strict=False
|
| 358 |
-
)
|
| 359 |
-
traced_ctc_head.eval()
|
| 360 |
-
ctc_head_inputs = [
|
| 361 |
-
ct.TensorType(name="encoder_output", shape=_tensor_shape(encoder_ref), dtype=np.float32),
|
| 362 |
-
]
|
| 363 |
-
ctc_head_outputs = [
|
| 364 |
-
ct.TensorType(name="ctc_logits", dtype=np.float32),
|
| 365 |
-
]
|
| 366 |
-
ctc_head_model = _coreml_convert(
|
| 367 |
-
traced_ctc_head,
|
| 368 |
-
ctc_head_inputs,
|
| 369 |
-
ctc_head_outputs,
|
| 370 |
-
export_settings,
|
| 371 |
-
compute_units_override=ct.ComputeUnit.CPU_ONLY,
|
| 372 |
-
)
|
| 373 |
-
ctc_head_path = output_dir / "parakeet_ctc_head.mlpackage"
|
| 374 |
-
_save_mlpackage(
|
| 375 |
-
ctc_head_model,
|
| 376 |
-
ctc_head_path,
|
| 377 |
-
"Parakeet 110M CTC decoder head",
|
| 378 |
-
)
|
| 379 |
-
|
| 380 |
-
# Optional fused export: Preprocessor + Encoder
|
| 381 |
-
typer.echo("Tracing and converting fused mel+encoderβ¦")
|
| 382 |
-
mel_encoder = MelEncoderWrapper(preprocessor, encoder)
|
| 383 |
-
traced_mel_encoder = torch.jit.trace(
|
| 384 |
-
mel_encoder, (audio_tensor, audio_length), strict=False
|
| 385 |
-
)
|
| 386 |
-
traced_mel_encoder.eval()
|
| 387 |
-
mel_encoder_inputs = [
|
| 388 |
-
# Keep fixed 15s window for fused Mel+Encoder
|
| 389 |
-
ct.TensorType(name="audio", shape=(1, max_samples), dtype=np.float32),
|
| 390 |
-
ct.TensorType(name="audio_length", shape=(1,), dtype=np.int32),
|
| 391 |
-
]
|
| 392 |
-
mel_encoder_outputs = [
|
| 393 |
-
ct.TensorType(name="encoder_output", dtype=np.float32),
|
| 394 |
-
ct.TensorType(name="encoder_length", dtype=np.int32),
|
| 395 |
-
]
|
| 396 |
-
# Fused mel+encoder compute units (parametrized; default CPU_ONLY)
|
| 397 |
-
mel_encoder_model = _coreml_convert(
|
| 398 |
-
traced_mel_encoder,
|
| 399 |
-
mel_encoder_inputs,
|
| 400 |
-
mel_encoder_outputs,
|
| 401 |
-
export_settings,
|
| 402 |
-
compute_units_override=melenc_cu,
|
| 403 |
-
)
|
| 404 |
-
mel_encoder_path = output_dir / "parakeet_mel_encoder.mlpackage"
|
| 405 |
-
_save_mlpackage(
|
| 406 |
-
mel_encoder_model,
|
| 407 |
-
mel_encoder_path,
|
| 408 |
-
"Parakeet 110M fused Mel+Encoder (15 s window)",
|
| 409 |
-
)
|
| 410 |
-
|
| 411 |
-
typer.echo("Tracing and converting decoderβ¦")
|
| 412 |
-
traced_decoder = torch.jit.trace(
|
| 413 |
-
decoder,
|
| 414 |
-
(targets, target_lengths, zero_state, zero_state),
|
| 415 |
-
strict=False,
|
| 416 |
-
)
|
| 417 |
-
traced_decoder.eval()
|
| 418 |
-
decoder_inputs = [
|
| 419 |
-
ct.TensorType(name="targets", shape=_tensor_shape(targets), dtype=np.int32),
|
| 420 |
-
ct.TensorType(name="target_length", shape=(1,), dtype=np.int32),
|
| 421 |
-
ct.TensorType(name="h_in", shape=_tensor_shape(zero_state), dtype=np.float32),
|
| 422 |
-
ct.TensorType(name="c_in", shape=_tensor_shape(zero_state), dtype=np.float32),
|
| 423 |
-
]
|
| 424 |
-
decoder_outputs = [
|
| 425 |
-
ct.TensorType(name="decoder", dtype=np.float32),
|
| 426 |
-
ct.TensorType(name="h_out", dtype=np.float32),
|
| 427 |
-
ct.TensorType(name="c_out", dtype=np.float32),
|
| 428 |
-
]
|
| 429 |
-
# Decoder: CPU only
|
| 430 |
-
decoder_model = _coreml_convert(
|
| 431 |
-
traced_decoder,
|
| 432 |
-
decoder_inputs,
|
| 433 |
-
decoder_outputs,
|
| 434 |
-
export_settings,
|
| 435 |
-
compute_units_override=ct.ComputeUnit.CPU_ONLY,
|
| 436 |
-
)
|
| 437 |
-
decoder_path = output_dir / "parakeet_decoder.mlpackage"
|
| 438 |
-
_save_mlpackage(
|
| 439 |
-
decoder_model,
|
| 440 |
-
decoder_path,
|
| 441 |
-
"Parakeet 110M decoder (RNNT prediction network)",
|
| 442 |
-
)
|
| 443 |
-
|
| 444 |
-
typer.echo("Tracing and converting jointβ¦")
|
| 445 |
-
traced_joint = torch.jit.trace(
|
| 446 |
-
joint,
|
| 447 |
-
(encoder_ref, decoder_ref),
|
| 448 |
-
strict=False,
|
| 449 |
-
)
|
| 450 |
-
traced_joint.eval()
|
| 451 |
-
joint_inputs = [
|
| 452 |
-
ct.TensorType(name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32),
|
| 453 |
-
ct.TensorType(name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32),
|
| 454 |
-
]
|
| 455 |
-
joint_outputs = [
|
| 456 |
-
ct.TensorType(name="logits", dtype=np.float32),
|
| 457 |
-
]
|
| 458 |
-
# Joint: CPU only
|
| 459 |
-
joint_model = _coreml_convert(
|
| 460 |
-
traced_joint,
|
| 461 |
-
joint_inputs,
|
| 462 |
-
joint_outputs,
|
| 463 |
-
export_settings,
|
| 464 |
-
compute_units_override=ct.ComputeUnit.CPU_ONLY,
|
| 465 |
-
)
|
| 466 |
-
joint_path = output_dir / "parakeet_joint.mlpackage"
|
| 467 |
-
_save_mlpackage(
|
| 468 |
-
joint_model,
|
| 469 |
-
joint_path,
|
| 470 |
-
"Parakeet 110M joint network (RNNT)",
|
| 471 |
-
)
|
| 472 |
-
|
| 473 |
-
# Joint + decision head (split logits, softmax, argmax)
|
| 474 |
-
typer.echo("Tracing and converting joint decision headβ¦")
|
| 475 |
-
vocab_size = int(asr_model.tokenizer.vocab_size)
|
| 476 |
-
num_extra = int(asr_model.joint.num_extra_outputs)
|
| 477 |
-
joint_decision = JointDecisionWrapper(joint, vocab_size=vocab_size, num_extra=num_extra)
|
| 478 |
-
traced_joint_decision = torch.jit.trace(
|
| 479 |
-
joint_decision,
|
| 480 |
-
(encoder_ref, decoder_ref),
|
| 481 |
-
strict=False,
|
| 482 |
-
)
|
| 483 |
-
traced_joint_decision.eval()
|
| 484 |
-
joint_decision_inputs = [
|
| 485 |
-
ct.TensorType(name="encoder", shape=_tensor_shape(encoder_ref), dtype=np.float32),
|
| 486 |
-
ct.TensorType(name="decoder", shape=_tensor_shape(decoder_ref), dtype=np.float32),
|
| 487 |
-
]
|
| 488 |
-
joint_decision_outputs = [
|
| 489 |
-
ct.TensorType(name="token_id", dtype=np.int32),
|
| 490 |
-
ct.TensorType(name="token_prob", dtype=np.float32),
|
| 491 |
-
ct.TensorType(name="duration", dtype=np.int32),
|
| 492 |
-
]
|
| 493 |
-
# JointDecision: CPU only
|
| 494 |
-
joint_decision_model = _coreml_convert(
|
| 495 |
-
traced_joint_decision,
|
| 496 |
-
joint_decision_inputs,
|
| 497 |
-
joint_decision_outputs,
|
| 498 |
-
export_settings,
|
| 499 |
-
compute_units_override=ct.ComputeUnit.CPU_ONLY,
|
| 500 |
-
)
|
| 501 |
-
joint_decision_path = output_dir / "parakeet_joint_decision.mlpackage"
|
| 502 |
-
_save_mlpackage(
|
| 503 |
-
joint_decision_model,
|
| 504 |
-
joint_decision_path,
|
| 505 |
-
"Parakeet 110M joint + decision head (split, softmax, argmax)",
|
| 506 |
-
)
|
| 507 |
-
|
| 508 |
-
# Single-step JointDecision for [1,512,1] x [1,640,1] -> [1,1,1]
|
| 509 |
-
# Note: 110M encoder dim is 512 (not 1024 like 0.6B)
|
| 510 |
-
typer.echo("Tracing and converting single-step joint decisionβ¦")
|
| 511 |
-
jd_single = JointDecisionSingleStep(joint, vocab_size=vocab_size, num_extra=num_extra)
|
| 512 |
-
# Create single-step slices from refs
|
| 513 |
-
enc_step = encoder_ref[:, :, :1].contiguous()
|
| 514 |
-
dec_step = decoder_ref[:, :, :1].contiguous()
|
| 515 |
-
traced_jd_single = torch.jit.trace(
|
| 516 |
-
jd_single,
|
| 517 |
-
(enc_step, dec_step),
|
| 518 |
-
strict=False,
|
| 519 |
-
)
|
| 520 |
-
traced_jd_single.eval()
|
| 521 |
-
jd_single_inputs = [
|
| 522 |
-
ct.TensorType(name="encoder_step", shape=(1, enc_step.shape[1], 1), dtype=np.float32),
|
| 523 |
-
ct.TensorType(name="decoder_step", shape=(1, dec_step.shape[1], 1), dtype=np.float32),
|
| 524 |
-
]
|
| 525 |
-
jd_single_outputs = [
|
| 526 |
-
ct.TensorType(name="token_id", dtype=np.int32),
|
| 527 |
-
ct.TensorType(name="token_prob", dtype=np.float32),
|
| 528 |
-
ct.TensorType(name="duration", dtype=np.int32),
|
| 529 |
-
ct.TensorType(name="top_k_ids", dtype=np.int32),
|
| 530 |
-
ct.TensorType(name="top_k_logits", dtype=np.float32),
|
| 531 |
-
]
|
| 532 |
-
# Single-step JointDecision: CPU only
|
| 533 |
-
jd_single_model = _coreml_convert(
|
| 534 |
-
traced_jd_single,
|
| 535 |
-
jd_single_inputs,
|
| 536 |
-
jd_single_outputs,
|
| 537 |
-
export_settings,
|
| 538 |
-
compute_units_override=ct.ComputeUnit.CPU_ONLY,
|
| 539 |
-
)
|
| 540 |
-
jd_single_path = output_dir / "parakeet_joint_decision_single_step.mlpackage"
|
| 541 |
-
_save_mlpackage(
|
| 542 |
-
jd_single_model,
|
| 543 |
-
jd_single_path,
|
| 544 |
-
"Parakeet 110M single-step joint decision (current frame)",
|
| 545 |
-
)
|
| 546 |
-
|
| 547 |
-
# Export vocabulary
|
| 548 |
-
typer.echo("Exporting vocabularyβ¦")
|
| 549 |
-
vocab_path = output_dir / "vocab.json"
|
| 550 |
-
vocab_dict = {
|
| 551 |
-
"vocab_size": vocab_size,
|
| 552 |
-
"blank_id": int(asr_model.decoder.blank_idx),
|
| 553 |
-
"tokens": asr_model.tokenizer.vocab,
|
| 554 |
-
}
|
| 555 |
-
vocab_path.write_text(json.dumps(vocab_dict, indent=2, ensure_ascii=False))
|
| 556 |
-
|
| 557 |
-
metadata: Dict[str, object] = {
|
| 558 |
-
"model_id": model_id,
|
| 559 |
-
"model_type": "hybrid_rnnt_ctc",
|
| 560 |
-
"sample_rate": sample_rate,
|
| 561 |
-
"max_audio_seconds": export_settings.max_audio_seconds,
|
| 562 |
-
"max_audio_samples": max_samples,
|
| 563 |
-
"max_symbol_steps": export_settings.max_symbol_steps,
|
| 564 |
-
"vocab_size": vocab_size,
|
| 565 |
-
"joint_extra_outputs": num_extra,
|
| 566 |
-
"encoder_dim": int(encoder_ref.shape[1]), # 512 for 110M
|
| 567 |
-
"decoder_dim": int(decoder_ref.shape[1]), # 640
|
| 568 |
-
"decoder_hidden": decoder_hidden,
|
| 569 |
-
"decoder_layers": decoder_layers,
|
| 570 |
-
"blank_id": int(asr_model.decoder.blank_idx),
|
| 571 |
-
"checkpoint": checkpoint_meta,
|
| 572 |
-
"coreml": {
|
| 573 |
-
"compute_units": export_settings.compute_units.name,
|
| 574 |
-
"compute_precision": (
|
| 575 |
-
export_settings.compute_precision.name
|
| 576 |
-
if export_settings.compute_precision is not None
|
| 577 |
-
else "FLOAT32"
|
| 578 |
-
),
|
| 579 |
-
},
|
| 580 |
-
"components": {
|
| 581 |
-
"preprocessor": {
|
| 582 |
-
"inputs": {
|
| 583 |
-
"audio_signal": list(_tensor_shape(audio_tensor)),
|
| 584 |
-
"audio_length": [1],
|
| 585 |
-
},
|
| 586 |
-
"outputs": {
|
| 587 |
-
"mel": list(_tensor_shape(mel_ref)),
|
| 588 |
-
"mel_length": [1],
|
| 589 |
-
},
|
| 590 |
-
"path": preprocessor_path.name,
|
| 591 |
-
},
|
| 592 |
-
"encoder": {
|
| 593 |
-
"inputs": {
|
| 594 |
-
"mel": list(_tensor_shape(mel_ref)),
|
| 595 |
-
"mel_length": [1],
|
| 596 |
-
},
|
| 597 |
-
"outputs": {
|
| 598 |
-
"encoder": list(_tensor_shape(encoder_ref)),
|
| 599 |
-
"encoder_length": [1],
|
| 600 |
-
},
|
| 601 |
-
"path": encoder_path.name,
|
| 602 |
-
},
|
| 603 |
-
"ctc_head": {
|
| 604 |
-
"inputs": {
|
| 605 |
-
"encoder": list(_tensor_shape(encoder_ref)),
|
| 606 |
-
},
|
| 607 |
-
"outputs": {
|
| 608 |
-
"log_probs": list(_tensor_shape(ctc_log_probs_ref)),
|
| 609 |
-
},
|
| 610 |
-
"path": ctc_head_path.name,
|
| 611 |
-
},
|
| 612 |
-
"mel_encoder": {
|
| 613 |
-
"inputs": {
|
| 614 |
-
"audio_signal": [1, max_samples],
|
| 615 |
-
"audio_length": [1],
|
| 616 |
-
},
|
| 617 |
-
"outputs": {
|
| 618 |
-
"encoder": list(_tensor_shape(encoder_ref)),
|
| 619 |
-
"encoder_length": [1],
|
| 620 |
-
},
|
| 621 |
-
"path": mel_encoder_path.name,
|
| 622 |
-
},
|
| 623 |
-
"decoder": {
|
| 624 |
-
"inputs": {
|
| 625 |
-
"targets": list(_tensor_shape(targets)),
|
| 626 |
-
"target_length": [1],
|
| 627 |
-
"h_in": list(_tensor_shape(zero_state)),
|
| 628 |
-
"c_in": list(_tensor_shape(zero_state)),
|
| 629 |
-
},
|
| 630 |
-
"outputs": {
|
| 631 |
-
"decoder": list(_tensor_shape(decoder_ref)),
|
| 632 |
-
"h_out": list(_tensor_shape(h_ref)),
|
| 633 |
-
"c_out": list(_tensor_shape(c_ref)),
|
| 634 |
-
},
|
| 635 |
-
"path": decoder_path.name,
|
| 636 |
-
},
|
| 637 |
-
"joint": {
|
| 638 |
-
"inputs": {
|
| 639 |
-
"encoder": list(_tensor_shape(encoder_ref)),
|
| 640 |
-
"decoder": list(_tensor_shape(decoder_ref)),
|
| 641 |
-
},
|
| 642 |
-
"outputs": {
|
| 643 |
-
"logits": list(_tensor_shape(joint_ref)),
|
| 644 |
-
},
|
| 645 |
-
"path": joint_path.name,
|
| 646 |
-
},
|
| 647 |
-
"joint_decision": {
|
| 648 |
-
"inputs": {
|
| 649 |
-
"encoder": list(_tensor_shape(encoder_ref)),
|
| 650 |
-
"decoder": list(_tensor_shape(decoder_ref)),
|
| 651 |
-
},
|
| 652 |
-
"outputs": {
|
| 653 |
-
"token_id": [
|
| 654 |
-
_tensor_shape(encoder_ref)[0],
|
| 655 |
-
_tensor_shape(encoder_ref)[2],
|
| 656 |
-
_tensor_shape(decoder_ref)[2],
|
| 657 |
-
],
|
| 658 |
-
"token_prob": [
|
| 659 |
-
_tensor_shape(encoder_ref)[0],
|
| 660 |
-
_tensor_shape(encoder_ref)[2],
|
| 661 |
-
_tensor_shape(decoder_ref)[2],
|
| 662 |
-
],
|
| 663 |
-
"duration": [
|
| 664 |
-
_tensor_shape(encoder_ref)[0],
|
| 665 |
-
_tensor_shape(encoder_ref)[2],
|
| 666 |
-
_tensor_shape(decoder_ref)[2],
|
| 667 |
-
],
|
| 668 |
-
},
|
| 669 |
-
"path": joint_decision_path.name,
|
| 670 |
-
},
|
| 671 |
-
"joint_decision_single_step": {
|
| 672 |
-
"inputs": {
|
| 673 |
-
"encoder_step": [1, int(encoder_ref.shape[1]), 1],
|
| 674 |
-
"decoder_step": [1, int(decoder_ref.shape[1]), 1],
|
| 675 |
-
},
|
| 676 |
-
"outputs": {
|
| 677 |
-
"token_id": [1, 1, 1],
|
| 678 |
-
"token_prob": [1, 1, 1],
|
| 679 |
-
"duration": [1, 1, 1],
|
| 680 |
-
"top_k_ids": [1, 1, 1, 64],
|
| 681 |
-
"top_k_logits": [1, 1, 1, 64],
|
| 682 |
-
},
|
| 683 |
-
"path": jd_single_path.name,
|
| 684 |
-
},
|
| 685 |
-
},
|
| 686 |
-
}
|
| 687 |
-
|
| 688 |
-
metadata_path = output_dir / "metadata.json"
|
| 689 |
-
metadata_path.write_text(json.dumps(metadata, indent=2))
|
| 690 |
-
typer.echo(f"Export complete. Metadata written to {metadata_path}")
|
| 691 |
-
|
| 692 |
-
finally:
|
| 693 |
-
asr_model.decoder._rnnt_export = decoder_export_flag
|
| 694 |
-
|
| 695 |
-
|
| 696 |
-
if __name__ == "__main__":
|
| 697 |
-
app()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
convert/parakeet-tdt-ctc-110m/coreml/hybrid_earnings_benchmark.json
DELETED
|
@@ -1,35 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"approach" : "single-encoder",
|
| 3 |
-
"model" : "parakeet-tdt-ctc-110m-hybrid",
|
| 4 |
-
"results" : [
|
| 5 |
-
{
|
| 6 |
-
"audioLength" : 15,
|
| 7 |
-
"ctcDetections" : [
|
| 8 |
-
{
|
| 9 |
-
"endTime" : 6.0800000000000001,
|
| 10 |
-
"inReference" : true,
|
| 11 |
-
"score" : -8.3699999999999992,
|
| 12 |
-
"source" : "ctc",
|
| 13 |
-
"startTime" : 4.96,
|
| 14 |
-
"word" : "LATAM Airlines"
|
| 15 |
-
}
|
| 16 |
-
],
|
| 17 |
-
"dictFound" : 1,
|
| 18 |
-
"dictTotal" : 1,
|
| 19 |
-
"fileId" : "4329526_chunk0",
|
| 20 |
-
"hypothesis" : "goodday everyone and welcome to latam airlines group earnings release confonference call just as a reminder this conference is being recorded lat tam airlines group eararnings releaseed for the",
|
| 21 |
-
"processingTime" : 0.070000000000000007,
|
| 22 |
-
"reference" : "good day everyone and welcome to latam airlines group earnings release conference call just as a reminder this conference is being recorded latam airlines group earnings released for the",
|
| 23 |
-
"wer" : 24.140000000000001
|
| 24 |
-
}
|
| 25 |
-
],
|
| 26 |
-
"summary" : {
|
| 27 |
-
"avgWer" : 24.140000000000001,
|
| 28 |
-
"dictPass" : 1,
|
| 29 |
-
"dictRate" : 100,
|
| 30 |
-
"dictTotal" : 1,
|
| 31 |
-
"totalAudioDuration" : 15,
|
| 32 |
-
"totalProcessingTime" : 0.070000000000000007,
|
| 33 |
-
"totalTests" : 1
|
| 34 |
-
}
|
| 35 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|