| |
|
|
| import torch |
| import numpy as np |
| from huggingface_hub import hf_hub_download |
| from moshi.models import loaders, LMGen |
| import sentencepiece |
|
|
# Hugging Face Hub repository holding the PersonaPlex checkpoints
# (Mimi codec weights, Moshi LM weights, text tokenizer, voices archive).
HF_REPO = "nvidia/personaplex-7b-v1"
# Device all model weights and tensors are placed on.
DEVICE = "cuda"


# Stem of the voice-prompt embedding file ("<VOICE>.pt") expected inside
# the extracted voices.tgz archive.
VOICE = "NATM2"
# System persona injected as the text prompt before generation starts.
PERSONA = (
    "You are a warm, friendly male assistant. "
    "Speak naturally, be engaging, supportive, and conversational."
)
|
|
class PersonaPlexBrain:
    """PersonaPlex speech-to-speech stack.

    Wires together two Mimi audio codecs (one per direction, so each keeps
    its own streaming state), the Moshi language model wrapped in ``LMGen``,
    a SentencePiece text tokenizer, a voice-prompt embedding, and the text
    persona prompt.
    """

    def __init__(self):
        self._load_models()

    def _load_models(self):
        """Download all artifacts from the HF Hub and build the model stack."""
        # Function-local imports kept from the original; `os` was imported
        # but never used and has been dropped.
        import tarfile
        import pathlib

        mimi_w = hf_hub_download(HF_REPO, loaders.MIMI_NAME)
        moshi_w = hf_hub_download(HF_REPO, loaders.MOSHI_NAME)
        tok_path = hf_hub_download(HF_REPO, loaders.TEXT_TOKENIZER_NAME)
        voices = hf_hub_download(HF_REPO, "voices.tgz")

        self.tokenizer = sentencepiece.SentencePieceProcessor(tok_path)

        # Two independent Mimi instances from the same weights: one encodes
        # the incoming (user) audio, the other decodes the generated audio,
        # so their streaming states never interfere.
        self.mimi = loaders.get_mimi(mimi_w, DEVICE)
        self.other_mimi = loaders.get_mimi(mimi_w, DEVICE)

        lm = loaders.get_moshi_lm(moshi_w, device=DEVICE)
        lm.eval()  # inference only; gradients are disabled per-frame below

        self.lm_gen = LMGen(
            lm,
            sample_rate=self.mimi.sample_rate,
            device=DEVICE,
            frame_rate=self.mimi.frame_rate,
        )

        # Extract the voice archive next to the downloaded file; skip the
        # extraction when a previous run already produced the directory.
        vdir = pathlib.Path(voices).parent / "voices"
        if not vdir.exists():
            with tarfile.open(voices) as t:
                # NOTE(review): extractall() on a downloaded archive is
                # vulnerable to path traversal from a malicious tar; on
                # Python 3.12+ pass filter="data" here.
                t.extractall(vdir.parent)

        voice_path = vdir / f"{VOICE}.pt"
        self.lm_gen.load_voice_prompt_embeddings(str(voice_path))

        # Prime the LM with the persona wrapped in <system> markers
        # (string format kept exactly as the model expects it).
        text = f"<system> {PERSONA} <system>"
        self.lm_gen.text_prompt_tokens = self.tokenizer.encode(text)

    @torch.inference_mode()
    def process_audio_frame(self, frame: np.ndarray):
        """Process one audio frame → return response frame.

        Runs under ``torch.inference_mode()``: the original built autograd
        graphs on every streamed frame despite ``lm.eval()``, leaking memory
        and adding latency.

        Args:
            frame: 1-D float array holding one frame of mono PCM — assumed
                to be at the Mimi sample rate; confirm against the caller.

        Returns:
            1-D float32 numpy array with the response audio, or ``None``
            while ``LMGen.step`` has not produced output yet.
        """
        # Shape to (batch=1, channels=1, samples) for the codec.
        x = torch.from_numpy(frame).float().to(DEVICE)[None, None, :]
        codes = self.mimi.encode(x)

        # Feed a single timestep of codes; step() yields None during warm-up.
        tokens = self.lm_gen.step(codes[:, :, 0:1])
        if tokens is None:
            return None

        # Channels 1..8 are taken as audio codebooks (channel 0 is skipped —
        # presumably the text stream; verify against LMGen's output layout).
        audio_tokens = tokens[:, 1:9, :]
        pcm = self.other_mimi.decode(audio_tokens)

        return pcm[0, 0].cpu().numpy()