import ArgumentParser import CoreML import Foundation import Tokenizers // MARK: - CLI Arguments @main struct PlapreCLI: AsyncParsableCommand { static var configuration = CommandConfiguration( commandName: "plapre-cli", abstract: "Plapre Pico CoreML TTS Pipeline", discussion: "Danish text-to-speech using CoreML models." ) @Argument(help: "Text to synthesize") var text: String = "Hej, mit navn er Daniel." @Option(name: .shortAndLong, help: "Speaker voice (available: tor, ida, liv, ask, kaj)") var speaker: String = "tor" @Option(name: .shortAndLong, help: "Output WAV file path") var output: String = "output.wav" @Flag(name: .long, help: "Use int8 quantized model (smaller)") var int8 = false // MARK: - Run Pipeline mutating func run() async throws { print("Plapre Pico CoreML TTS Pipeline") print("================================\n") print("Text: \(text)") print("Speaker: \(speaker)") print("Output: \(output)\n") let pipelineStart = CFAbsoluteTimeGetCurrent() // Load speaker and tokenize let speakerEmb = try loadSpeaker(speaker) print("Loaded speaker embedding (\(speakerEmb.count) dims)") let tokenizer = try await measureAsync("Tokenizer load") { try await AutoTokenizer.from(modelFolder: PlapreConfig.repoRoot) } let textTokens = tokenizer.encode(text: text, addSpecialTokens: false).map { Int32($0) } print("Tokenized: \(textTokens.count) tokens: \(textTokens)") // Build input sequence: [EOS, text_marker, tokens..., audio_marker] var inputSeq: [Int32] = [PlapreConfig.eosToken, PlapreConfig.textMarkerToken] + textTokens + [ PlapreConfig.audioMarkerToken ] let inputLen = inputSeq.count print("Input sequence: \(inputLen) tokens") while inputSeq.count < PlapreConfig.prefillSequenceLength { inputSeq.append(PlapreConfig.eosToken) } // Load RoPE tables print("\nLoading RoPE tables...") let ropeCosF32 = try loadRopeTable("rope_cos.npy") let ropeSinF32 = try loadRopeTable("rope_sin.npy") let ropeCos16: [Float16] = ropeCosF32.map { Float16($0) } let ropeSin16: [Float16] = ropeSinF32.map { Float16($0) } print("RoPE cos: \(ropeCos16.count) values, sin: \(ropeSin16.count) values") // Compile models print("\nCompiling models...") let decodeModel = try measure("Compile PlaprePico") { try compileModel( at: PlapreConfig.modelURL(for: "PlaprePico", useInt8: int8)) } let kanadeModel = try measure("Compile KanadeDecoder") { try compileModel( at: PlapreConfig.modelURL(for: "KanadeDecoder", useInt8: false)) } let vocoderModel = try measure("Compile Vocoder") { try compileModel( at: PlapreConfig.modelURL(for: "Vocoder", useInt8: false)) } // Pre-allocate MLMultiArrays (performance-critical: single allocation, reused for all steps) let pInputIds = try! MLMultiArray(shape: [1, 1], dataType: .int32) let pCausalMask = try! MLMultiArray( shape: [1, 1, 1, NSNumber(value: PlapreConfig.maxContextLength)], dataType: .float16) let pCos = try! MLMultiArray( shape: [1, 1, 1, NSNumber(value: PlapreConfig.headDimension)], dataType: .float16) let pSin = try! MLMultiArray( shape: [1, 1, 1, NSNumber(value: PlapreConfig.headDimension)], dataType: .float16) let pUpdateMask = try! MLMultiArray( shape: [1, 1, NSNumber(value: PlapreConfig.maxContextLength), 1], dataType: .float16) let pSpeakerEmb = makeFloat16Array( speakerEmb, shape: [1, PlapreConfig.speakerEmbeddingDimension]) let pIsSpeaker = try! MLMultiArray(shape: [1], dataType: .float16) // Initialize causal mask to all -inf pCausalMask.withUnsafeMutableBufferPointer(ofType: Float16.self) { ptr, _ in for i in 0.. 0 { ptr[pos - 1] = Float16(0.0) } ptr[pos] = Float16(1.0) } pIsSpeaker.withUnsafeMutableBufferPointer(ofType: Float16.self) { ptr, _ in ptr[0] = isSpeaker ? Float16(1.0) : Float16(0.0) } let output = try decodeModel.prediction(from: inputProvider, using: state) let arr = output.featureValue(for: "logits")!.multiArrayValue! arr.withUnsafeBufferPointer(ofType: Float16.self) { ptr in for i in 0..= PlapreConfig.audioTokenOffset && nextToken <= PlapreConfig.audioTokenMax { consecutiveNonAudio = 0 } else { consecutiveNonAudio += 1 if consecutiveNonAudio >= PlapreConfig.nonAudioStopThreshold { print( " Stopping: \(PlapreConfig.nonAudioStopThreshold) consecutive non-audio tokens at step \(step)" ) break } } if step % 25 == 0 { let elapsed = CFAbsoluteTimeGetCurrent() - decodeStart let tokPerSec = Double(step) / elapsed print( " Step \(step) (\(Float(step) / Float(PlapreConfig.audioTokensPerSecond))s audio) — \(formatTime(elapsed)) elapsed, \(String(format: "%.1f", tokPerSec)) tok/s" ) } } let decodeElapsed = CFAbsoluteTimeGetCurrent() - decodeStart let decodeSteps = generatedTokens.count - 1 let decodeTokPerSec = Double(decodeSteps) / decodeElapsed let audioSeconds = Float( generatedTokens.filter { $0 >= PlapreConfig.audioTokenOffset && $0 <= PlapreConfig.audioTokenMax }.count) / Float(PlapreConfig.audioTokensPerSecond) let rtf = Float(decodeElapsed) / audioSeconds print( " ⏱ Decode: \(formatTime(decodeElapsed)) (\(decodeSteps) steps, \(String(format: "%.1f", decodeTokPerSec)) tok/s)" ) print( " ⏱ Audio generated: \(String(format: "%.1f", audioSeconds))s — RTF \(String(format: "%.2f", rtf))x (1.0 = realtime)" ) // === Audio synthesis === let audioTokens = generatedTokens.filter { $0 >= PlapreConfig.audioTokenOffset && $0 <= PlapreConfig.audioTokenMax } print( "\nGenerated \(generatedTokens.count) tokens, \(audioTokens.count) audio (\(Float(audioTokens.count) / Float(PlapreConfig.audioTokensPerSecond))s)" ) guard !audioTokens.isEmpty else { throw PipelineError.noAudioTokensGenerated } // === Kanade + Vocoder in chunks === let numChunks = (audioTokens.count + PlapreConfig.kanadeChunkSize - 1) / PlapreConfig.kanadeChunkSize print("\n--- Kanade + Vocoder (\(numChunks) chunk\(numChunks == 1 ? "" : "s")) ---") var waveform: [Float] = [] let audioDecodeStart = CFAbsoluteTimeGetCurrent() for chunkIdx in 0..