# Source: HUMAN_STREAMING_AI/app/ai_brain.py
# Uploaded by drrobot9 via huggingface_hub (commit 63730eb, verified)
# app/ai_brain.py
import torch
import numpy as np
from huggingface_hub import hf_hub_download
from moshi.models import loaders, LMGen
import sentencepiece
# Model repository and runtime configuration for the PersonaPlex brain.
HF_REPO = "nvidia/personaplex-7b-v1"
# Prefer CUDA, but fall back to CPU instead of crashing on GPU-less hosts.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Name of the voice-prompt embedding file (without .pt) inside voices.tgz.
VOICE = "NATM2"
# System persona injected as a text prompt before generation starts.
PERSONA = (
    "You are a warm, friendly male assistant. "
    "Speak naturally, be engaging, supportive, and conversational."
)
class PersonaPlexBrain:
    """Streaming speech-to-speech engine built on the PersonaPlex/Moshi stack.

    Downloads and wires together the Mimi audio codec, the Moshi language
    model, a SentencePiece text tokenizer, and a voice-prompt embedding,
    then converts incoming audio frames into response audio frames one
    generation step at a time via ``process_audio_frame``.
    """

    def __init__(self):
        self._load_models()

    def _load_models(self):
        """Download all model artifacts from the HF hub and initialize them.

        Raises:
            FileNotFoundError: if the requested voice embedding is not
                present in the extracted ``voices.tgz`` archive.
        """
        mimi_w = hf_hub_download(HF_REPO, loaders.MIMI_NAME)
        moshi_w = hf_hub_download(HF_REPO, loaders.MOSHI_NAME)
        tok_path = hf_hub_download(HF_REPO, loaders.TEXT_TOKENIZER_NAME)
        voices = hf_hub_download(HF_REPO, "voices.tgz")

        self.tokenizer = sentencepiece.SentencePieceProcessor(tok_path)
        # Two independent Mimi instances: one holds encoder streaming state
        # for the user's incoming audio, the other holds decoder state for
        # the generated reply, so the two streams never share state.
        self.mimi = loaders.get_mimi(mimi_w, DEVICE)
        self.other_mimi = loaders.get_mimi(mimi_w, DEVICE)

        lm = loaders.get_moshi_lm(moshi_w, device=DEVICE)
        lm.eval()
        self.lm_gen = LMGen(
            lm,
            sample_rate=self.mimi.sample_rate,
            device=DEVICE,
            frame_rate=self.mimi.frame_rate,
        )

        # Load the male voice embedding from the bundled voices archive.
        import tarfile
        import pathlib

        vdir = pathlib.Path(voices).parent / "voices"
        if not vdir.exists():
            with tarfile.open(voices) as t:
                # filter="data" rejects path-traversal members
                # (CVE-2007-4559); older Pythons lack the parameter,
                # so fall back to the legacy behavior there.
                try:
                    t.extractall(vdir.parent, filter="data")
                except TypeError:
                    t.extractall(vdir.parent)
        voice_path = vdir / f"{VOICE}.pt"
        if not voice_path.exists():
            # Fail early with a precise message rather than letting the
            # downstream loader raise an opaque error.
            raise FileNotFoundError(f"Voice embedding not found: {voice_path}")
        self.lm_gen.load_voice_prompt_embeddings(str(voice_path))

        # Persona conditioning: wrap the persona text in <system> markers
        # and stash the token ids for the generator's text prompt.
        text = f"<system> {PERSONA} <system>"
        self.lm_gen.text_prompt_tokens = self.tokenizer.encode(text)

    def process_audio_frame(self, frame: np.ndarray):
        """Process one mono audio frame and return the response frame.

        Args:
            frame: 1-D float audio samples (presumably at the Mimi sample
                rate — confirm against the caller).

        Returns:
            A 1-D numpy array of response samples, or ``None`` when the
            generator has not yet produced output for this step.
        """
        # no_grad: pure inference — without it every streamed frame would
        # extend an autograd graph and steadily leak GPU memory, since
        # lm.eval() alone does not disable gradient tracking.
        with torch.no_grad():
            x = torch.from_numpy(frame).float().to(DEVICE)[None, None, :]
            codes = self.mimi.encode(x)
            tokens = self.lm_gen.step(codes[:, :, 0:1])
            if tokens is None:
                return None
            # Row 0 of the token tensor is the text stream; rows 1-8 are
            # the audio codebooks Mimi decodes back to PCM.
            audio_tokens = tokens[:, 1:9, :]
            pcm = self.other_mimi.decode(audio_tokens)
            return pcm[0, 0].cpu().numpy()