Upload VibeVoicePipeline.swift with huggingface_hub
VibeVoicePipeline.swift
ADDED
@@ -0,0 +1,212 @@
//
//  VibeVoicePipeline.swift
//  VibeVoice CoreML Pipeline
//
//  Auto-generated interface for the VibeVoice TTS model.
//

import Foundation
import CoreML
import Accelerate

/// VibeVoice TTS pipeline for CoreML.
public class VibeVoicePipeline {

    // MARK: - Models
    private var acousticEncoder: MLModel?
    private var acousticDecoder: MLModel?
    private var semanticEncoder: MLModel?
    private var acousticConnector: MLModel?
    private var semanticConnector: MLModel?
    private var llm: MLModel?
    private var diffusionHead: MLModel?

    // MARK: - Configuration
    public let sampleRate: Int = 24000
    public let downsampleFactor: Int = 3200
    public let latentDim: Int = 64
    public let semanticDim: Int = 128
    public let hiddenDim: Int = 1536
    public let diffusionSteps: Int = 20

    // MARK: - Initialization

    public init(modelDirectory: URL, configuration: MLModelConfiguration = .init()) throws {
        // MLModelConfiguration is a reference type, so this mutates the caller's
        // instance; prefer all available compute units (CPU, GPU, ANE).
        let config = configuration
        config.computeUnits = .all

        // Load models. The encoders and connectors are optional; only the
        // decoder, LLM, and diffusion head are required for synthesis.
        acousticEncoder = try? MLModel(contentsOf: modelDirectory.appendingPathComponent("vibevoice_acoustic_encoder.mlpackage"), configuration: config)
        acousticDecoder = try? MLModel(contentsOf: modelDirectory.appendingPathComponent("vibevoice_acoustic_decoder.mlpackage"), configuration: config)
        semanticEncoder = try? MLModel(contentsOf: modelDirectory.appendingPathComponent("vibevoice_semantic_encoder.mlpackage"), configuration: config)
        acousticConnector = try? MLModel(contentsOf: modelDirectory.appendingPathComponent("vibevoice_acoustic_connector.mlpackage"), configuration: config)
        semanticConnector = try? MLModel(contentsOf: modelDirectory.appendingPathComponent("vibevoice_semantic_connector.mlpackage"), configuration: config)
        llm = try? MLModel(contentsOf: modelDirectory.appendingPathComponent("vibevoice_llm.mlpackage"), configuration: config)
        diffusionHead = try? MLModel(contentsOf: modelDirectory.appendingPathComponent("vibevoice_diffusion_head.mlpackage"), configuration: config)

        guard acousticDecoder != nil, llm != nil, diffusionHead != nil else {
            throw PipelineError.modelLoadFailed
        }
    }

    // MARK: - Inference

    /// Encode audio into its acoustic latent representation.
    /// The input must be a fixed-length (1, 1, 24000) array, i.e. 1 s at 24 kHz;
    /// trim or pad before calling (see the helper sketch below).
    public func encodeAcoustic(_ audio: MLMultiArray) throws -> MLMultiArray {
        guard let encoder = acousticEncoder else {
            throw PipelineError.modelNotLoaded("acousticEncoder")
        }

        let input = try MLDictionaryFeatureProvider(dictionary: ["audio": audio])
        let output = try encoder.prediction(from: input)

        guard let latent = output.featureValue(for: "acoustic_latent")?.multiArrayValue else {
            throw PipelineError.outputMissing("acoustic_latent")
        }

        return latent
    }
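
    /// A minimal sketch of the trim/pad step that `encodeAcoustic` requires;
    /// the helper name is illustrative, not part of the exported model interface.
    public func makeFixedLengthAudioInput(_ samples: [Float]) throws -> MLMultiArray {
        let window = sampleRate // 24000 samples = 1 s
        let array = try MLMultiArray(shape: [1, 1, NSNumber(value: window)], dataType: .float32)
        for i in 0..<window {
            // Copy what fits in the window; zero-pad when the input is short.
            array[i] = NSNumber(value: i < samples.count ? samples[i] : 0)
        }
        return array
    }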

    /// Decode an acoustic latent back to an audio waveform.
    public func decodeAcoustic(_ latent: MLMultiArray) throws -> MLMultiArray {
        guard let decoder = acousticDecoder else {
            throw PipelineError.modelNotLoaded("acousticDecoder")
        }

        let input = try MLDictionaryFeatureProvider(dictionary: ["acoustic_latent": latent])
        let output = try decoder.prediction(from: input)

        guard let audio = output.featureValue(for: "audio")?.multiArrayValue else {
            throw PipelineError.outputMissing("audio")
        }

        return audio
    }

    /// Run one LLM forward pass.
    public func runLLM(inputIds: MLMultiArray, attentionMask: MLMultiArray) throws -> (hiddenStates: MLMultiArray, logits: MLMultiArray) {
        guard let model = llm else {
            throw PipelineError.modelNotLoaded("llm")
        }

        let input = try MLDictionaryFeatureProvider(dictionary: [
            "input_ids": inputIds,
            "attention_mask": attentionMask
        ])
        let output = try model.prediction(from: input)

        guard let hiddenStates = output.featureValue(for: "hidden_states")?.multiArrayValue,
              let logits = output.featureValue(for: "logits")?.multiArrayValue else {
            throw PipelineError.outputMissing("hidden_states or logits")
        }

        return (hiddenStates, logits)
    }

    /// Run a single diffusion denoising step through the diffusion head.
    public func diffusionStep(noisyLatent: MLMultiArray, timestep: Float, condition: MLMultiArray) throws -> MLMultiArray {
        guard let head = diffusionHead else {
            throw PipelineError.modelNotLoaded("diffusionHead")
        }

        let timestepArray = try MLMultiArray(shape: [1], dataType: .float32)
        timestepArray[0] = NSNumber(value: timestep)

        let input = try MLDictionaryFeatureProvider(dictionary: [
            "noisy_latent": noisyLatent,
            "timestep": timestepArray,
            "condition": condition
        ])
        let output = try head.prediction(from: input)

        guard let prediction = output.featureValue(for: "prediction")?.multiArrayValue else {
            throw PipelineError.outputMissing("prediction")
        }

        return prediction
    }

    // MARK: - Errors

    public enum PipelineError: Error {
        case modelLoadFailed
        case modelNotLoaded(String)
        case outputMissing(String)
        case invalidInput(String)
    }
}

// MARK: - DPM-Solver Scheduler

/// Simple DPM-Solver-style scheduler for diffusion inference.
public class DPMSolverScheduler {

    public let numTrainTimesteps: Int
    public let numInferenceSteps: Int
    public let betaSchedule: String

    private var timesteps: [Float] = []
    private var alphasCumprod: [Float] = []

    public init(numTrainTimesteps: Int = 1000, numInferenceSteps: Int = 20, betaSchedule: String = "cosine") {
        self.numTrainTimesteps = numTrainTimesteps
        self.numInferenceSteps = numInferenceSteps
        self.betaSchedule = betaSchedule

        setupScheduler()
    }

    private func setupScheduler() {
        // Compute betas from the chosen schedule.
        var betas: [Float] = []

        if betaSchedule == "cosine" {
            // Cosine schedule (Nichol & Dhariwal):
            // beta_i = 1 - alphaBar(t_{i+1}) / alphaBar(t_i), clipped at 0.999.
            func alphaBar(_ t: Float) -> Float {
                return cos((t + 0.008) / 1.008 * Float.pi / 2).pow(2)
            }
            for i in 0..<numTrainTimesteps {
                let t1 = Float(i) / Float(numTrainTimesteps)
                let t2 = Float(i + 1) / Float(numTrainTimesteps)
                betas.append(min(1 - alphaBar(t2) / alphaBar(t1), 0.999))
            }
        } else {
            // Standard DDPM linear schedule as a fallback, so other values of
            // `betaSchedule` do not leave the scheduler empty.
            let betaStart: Float = 1e-4
            let betaEnd: Float = 0.02
            for i in 0..<numTrainTimesteps {
                betas.append(betaStart + (betaEnd - betaStart) * Float(i) / Float(numTrainTimesteps - 1))
            }
        }

        // Accumulate alphas_cumprod = prod(1 - beta).
        var alphaCumprod: Float = 1.0
        for beta in betas {
            alphaCumprod *= (1 - beta)
            alphasCumprod.append(alphaCumprod)
        }

        // Evenly spaced, descending timesteps for inference.
        let stepRatio = Float(numTrainTimesteps) / Float(numInferenceSteps)
        timesteps = (0..<numInferenceSteps).map { Float(numTrainTimesteps - 1) - Float($0) * stepRatio }
    }

    public func getTimesteps() -> [Float] {
        return timesteps
    }

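    // Worked example: with the defaults (numTrainTimesteps = 1000,
    // numInferenceSteps = 20), stepRatio is 50, so getTimesteps() yields
    // 999, 949, 899, ..., 49.
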
    public func step(modelOutput: MLMultiArray, timestep: Float, sample: MLMultiArray) throws -> MLMultiArray {
        // Simplified deterministic update for a v-prediction model (a DDIM-style
        // step); a full implementation would use DPM-Solver++.
        let timestepIdx = Int(timestep)
        let alphaProd = alphasCumprod[min(timestepIdx, alphasCumprod.count - 1)]

        // The previous inference timestep is one stepRatio back, not one train step.
        let prevIdx = timestepIdx - numTrainTimesteps / numInferenceSteps
        let alphaProdPrev: Float = prevIdx >= 0 ? alphasCumprod[prevIdx] : 1.0

        let sqrtAlphaProd = sqrt(alphaProd)
        let sqrtOneMinusAlphaProd = sqrt(1 - alphaProd)
        let sqrtAlphaProdPrev = sqrt(alphaProdPrev)
        let sqrtOneMinusAlphaProdPrev = sqrt(1 - alphaProdPrev)

        // v-prediction identities:
        //   x0  = sqrt(alpha) * x_t - sqrt(1 - alpha) * v
        //   eps = sqrt(1 - alpha) * x_t + sqrt(alpha) * v
        // Deterministic previous sample:
        //   x_prev = sqrt(alpha_prev) * x0 + sqrt(1 - alpha_prev) * eps
        let prev = try MLMultiArray(shape: sample.shape, dataType: .float32)
        for i in 0..<sample.count {
            let x = sample[i].floatValue
            let v = modelOutput[i].floatValue
            let x0 = sqrtAlphaProd * x - sqrtOneMinusAlphaProd * v
            let eps = sqrtOneMinusAlphaProd * x + sqrtAlphaProd * v
            prev[i] = NSNumber(value: sqrtAlphaProdPrev * x0 + sqrtOneMinusAlphaProdPrev * eps)
        }

        return prev
    }
}

extension Float {
    /// Raise the value to `exponent` (a thin wrapper over Foundation.pow).
    func pow(_ exponent: Float) -> Float {
        return Foundation.pow(self, exponent)
    }
}
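
// MARK: - Example Usage (sketch)

// A minimal, illustrative wiring of the pieces above, not a definitive
// implementation: it assumes the caller already has tokenized `inputIds` and
// `attentionMask`, plus an `initialLatent` of Gaussian noise whose shape
// matches the exported diffusion head, and it guesses that the LLM hidden
// states feed the diffusion head's `condition` input directly.
func exampleSynthesis(modelDirectory: URL,
                      inputIds: MLMultiArray,
                      attentionMask: MLMultiArray,
                      initialLatent: MLMultiArray) throws -> MLMultiArray {
    let pipeline = try VibeVoicePipeline(modelDirectory: modelDirectory)
    let scheduler = DPMSolverScheduler(numInferenceSteps: pipeline.diffusionSteps)

    // Condition on the LLM's hidden states.
    let (hiddenStates, _) = try pipeline.runLLM(inputIds: inputIds, attentionMask: attentionMask)

    // Iteratively denoise the latent over the scheduler's timesteps.
    var latent = initialLatent
    for t in scheduler.getTimesteps() {
        let prediction = try pipeline.diffusionStep(noisyLatent: latent, timestep: t, condition: hiddenStates)
        latent = try scheduler.step(modelOutput: prediction, timestep: t, sample: latent)
    }

    // Decode the denoised latent back to a 24 kHz waveform.
    return try pipeline.decodeAcoustic(latent)
}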