 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Prediction interface for Cog ⚙️
# https://cog.run/python
import os
import subprocess
import time

import torch
import torchaudio
from cog import BasePredictor, Input, Path

from boson_multimodal.data_types import ChatMLSample, Message, AudioContent
from boson_multimodal.serve.serve_engine import HiggsAudioResponse, HiggsAudioServeEngine


# Local directory names the weight tarballs are extracted into (relative to the
# working directory); setup() skips the download when they already exist.
MODEL_PATH = "higgs-audio-v2-generation-3B-base"
AUDIO_TOKENIZER_PATH = "higgs-audio-v2-tokenizer"
# Replicate-hosted tarballs holding the model and audio-tokenizer weights.
MODEL_URL = "https://weights.replicate.delivery/default/bosonai/higgs-audio-v2-generation-3B-base/model.tar"
TOKENIZER_URL = "https://weights.replicate.delivery/default/bosonai/higgs-audio-v2-tokenizer/model.tar"


def download_weights(url, dest):
    """Fetch the tarball at *url* with pget and extract it into *dest*.

    Raises subprocess.CalledProcessError if the download or extraction fails.
    """
    began_at = time.time()
    print("downloading url: ", url)
    print("downloading to: ", dest)
    # pget -x extracts the tar stream in flight; -f overwrites an existing dest.
    subprocess.check_call(["pget", "-xf", url, dest], close_fds=False)
    print("downloading took: ", time.time() - began_at)


class Predictor(BasePredictor):
    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""
        # Download weights on first run; later runs reuse the extracted dirs.
        if not os.path.exists(MODEL_PATH):
            download_weights(MODEL_URL, MODEL_PATH)
        if not os.path.exists(AUDIO_TOKENIZER_PATH):
            download_weights(TOKENIZER_URL, AUDIO_TOKENIZER_PATH)

        # Prefer GPU when available; fall back to CPU otherwise.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        # Initialize the serve engine
        self.serve_engine = HiggsAudioServeEngine(
            MODEL_PATH,
            AUDIO_TOKENIZER_PATH,
            device=self.device)
        print("Higgs Audio V2 model loaded successfully")

    def _build_messages(self, text, scene_description, system_message, ref_audio):
        """Assemble the ChatML message list for one generation request.

        Order matters: system prompt first, then (optionally) an assistant turn
        carrying the reference audio for voice cloning, then the user's text.
        """
        # A non-empty custom system message overrides the scene-based default.
        if system_message:
            system_prompt = system_message
        else:
            system_prompt = f"Generate audio following instruction.\n\n<|scene_desc_start|>\n{scene_description}\n<|scene_desc_end|>"

        messages = [
            Message(
                role="system",
                content=system_prompt,
            ),
        ]

        # A prior assistant turn containing audio steers the model to imitate
        # that voice (voice cloning).
        if ref_audio is not None:
            messages.append(
                Message(
                    role="assistant",
                    content=AudioContent(audio_url=str(ref_audio)),
                )
            )

        messages.append(
            Message(
                role="user",
                content=text,
            )
        )
        return messages

    def predict(
        self,
        text: str = Input(
            description="Text to convert to speech",
            default="The sun rises in the east and sets in the west",
        ),
        temperature: float = Input(
            description="Controls randomness in generation. Lower values are more deterministic.",
            ge=0.1,
            le=1.0,
            default=0.3,
        ),
        top_p: float = Input(
            description="Nucleus sampling parameter. Controls diversity of generated audio.",
            ge=0.1,
            le=1.0,
            default=0.95,
        ),
        top_k: int = Input(
            description="Top-k sampling parameter. Limits vocabulary to top k tokens.", ge=1, le=100, default=50
        ),
        max_new_tokens: int = Input(
            description="Maximum number of audio tokens to generate", ge=256, le=2048, default=1024
        ),
        scene_description: str = Input(
            description="Scene description for audio context", default="Audio is recorded from a quiet room."
        ),
        system_message: str = Input(description="Custom system message (optional)", default=""),
        ref_audio: Path = Input(
            description="Reference audio file for voice cloning (optional). Supports WAV, MP3, etc.",
            default=None,
        ),
    ) -> Path:
        """Generate speech for *text* and return the path to a WAV file.

        Raises RuntimeError (chained to the underlying exception) on any
        failure during prompt assembly, generation, or file writing.
        """
        try:
            messages = self._build_messages(text, scene_description, system_message, ref_audio)

            # Generate audio
            output: HiggsAudioResponse = self.serve_engine.generate(
                chat_ml_sample=ChatMLSample(messages=messages),
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                stop_strings=["<|end_of_text|>", "<|eot_id|>"],
            )

            # Save output audio to a temporary file with a clear filename.
            output_path = "/tmp/audio_output.wav"
            # output.audio is a 1-D numpy waveform; torchaudio.save expects a
            # (channels, samples) tensor, so add a leading channel axis.
            audio_tensor = torch.from_numpy(output.audio)[None, :]
            torchaudio.save(output_path, audio_tensor, output.sampling_rate, format="wav")
            return Path(output_path)

        except Exception as e:
            # `from e` chains the original exception explicitly so the real
            # traceback is preserved in Cog's error reporting.
            raise RuntimeError(f"Audio generation failed: {str(e)}") from e