import base64
import io
import time
import traceback

import librosa
import numpy as np
import torch
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Field
from transformers import AutoModelForCausalLM
from voxcpm import VoxCPM

from helper import check_copy
from loss import check_status
from wrapper import AutoTokenizerWrapper, WhisperWrapper

torch.set_num_threads(4)

# Patch torch.load so legacy checkpoints still load: torch >= 2.6 defaults
# to weights_only=True, which rejects pickled objects.
_original_torch_load = torch.load


def _patched_torch_load(*args, **kwargs):
    # Always set weights_only=False if not explicitly provided.
    if "weights_only" not in kwargs:
        kwargs["weights_only"] = False
    return _original_torch_load(*args, **kwargs)


torch.load = _patched_torch_load
assert torch.load is _patched_torch_load, "torch.load patch failed!"

# Local model checkpoints: Whisper for ASR, Llama for the LLM, VoxCPM for TTS.
asr_model = WhisperWrapper("models/wpt/wpt.pt", "models/dsp/config.json")

model_name = "models/Llama-3.2-1B-Instruct"
tok = AutoTokenizerWrapper.from_pretrained(model_name)
lm = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
).eval()

tts = VoxCPM.from_pretrained(
    "models/VoxCPM-0.5B",
    local_files_only=True,
    load_denoiser=True,
    zipenhancer_model_id="models/iic/speech_zipenhancer_ans_multiloss_16k_base",
)


def chat(system_prompt: str, user_prompt: str) -> str:
    """Run one system + user turn through the LLM and return the reply text."""
    print("LLM init...")
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    inputs = tok.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    )
    input_ids = inputs["input_ids"].to(lm.device)
    attention_mask = inputs["attention_mask"].to(lm.device)
    with torch.inference_mode():
        output_ids = lm.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            pad_token_id=tok.eos_token_id,
            max_new_tokens=2048,
            do_sample=True,
            temperature=0.2,
            repetition_penalty=1.1,
            top_k=100,
            top_p=0.95,
        )
    # Decode only the newly generated tokens, skipping the prompt.
    answer = tok.decode(
        output_ids[0][input_ids.shape[-1]:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    print("LLM answer done.")
    return answer.strip()


def gt(audio: np.ndarray, sr: int) -> str:
    """Transcribe a waveform with Whisper, resampling to 16 kHz if needed."""
    print("Starting ASR transcription...")
    ss = audio.squeeze().astype(np.float32)
    if sr != 16_000:
        # Resample the squeezed mono signal to Whisper's expected 16 kHz.
        ss = librosa.resample(ss, orig_sr=sr, target_sr=16_000)
    result = asr_model.transcribe(ss)
    transcribed_text = result["text"].strip()
    # print(f"ASR done. Transcribed: '{transcribed_text}'")
    return transcribed_text
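
# Hypothetical REPL illustration of the two helpers above (names and values
# are examples only): transcribe a waveform, then hand the text to the LLM.
#
#   text = gt(audio_np, 44_100)  # any input rate; resampled to 16 kHz
#   reply = chat("You are a helpful assistant.", text)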


def sample(rr: str) -> str:
    """Raw (non-chat) continuation of `rr`; not called by the endpoints below."""
    if rr.strip() == "":
        rr = "Hello "
    inputs = tok(rr, return_tensors="pt").to(lm.device)
    with torch.inference_mode():
        out_ids = lm.generate(
            **inputs,
            max_new_tokens=2048,
            do_sample=True,
            temperature=0.2,
            repetition_penalty=1.1,
            top_k=100,
            top_p=0.95,
        )
    return tok.decode(
        out_ids[0][inputs.input_ids.shape[-1]:],
        skip_special_tokens=True,
    )


INITIALIZATION_STATUS = {"model_loaded": True, "error": None}


class GenerateRequest(BaseModel):
    audio_data: str = Field(..., description="Base64-encoded .npy buffer holding a float32 waveform")
    sample_rate: int = Field(..., description="Sample rate of the waveform in Hz")


class GenerateResponse(BaseModel):
    audio_data: str = Field(..., description="Base64-encoded .npy buffer holding the float32 response waveform")


app = FastAPI(title="V1", version="0.1")
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


def b64(data: str) -> np.ndarray:
    """Decode a base64 .npy payload into a numpy array."""
    raw = base64.b64decode(data)
    return np.load(io.BytesIO(raw), allow_pickle=False)


def ab64(arr: np.ndarray, sr: int) -> str:
    """Resample TTS output (generated at 16 kHz) to `sr` and base64-encode it."""
    buf = io.BytesIO()
    resampled = librosa.resample(arr, orig_sr=16_000, target_sr=sr)
    np.save(buf, resampled.astype(np.float32))
    return base64.b64encode(buf.getvalue()).decode()


@app.get("/api/v1/health")
def health_check():
    return {
        "status": "healthy",
        "model_loaded": INITIALIZATION_STATUS["model_loaded"],
        "error": INITIALIZATION_STATUS["error"],
    }


@app.post("/api/v1/v2v", response_model=GenerateResponse)
def generate_audio(req: GenerateRequest):
    print("=== V2V Request Started ===")
    audio_np = b64(req.audio_data)
    if audio_np.ndim == 1:
        audio_np = audio_np.reshape(1, -1)
    if not check_status():
        # Pipeline unavailable: echo the input audio back unchanged. A bare
        # numpy array would fail response_model validation.
        return GenerateResponse(audio_data=req.audio_data)
    print(f"Audio shape: {audio_np.shape}, Sample rate: {req.sample_rate}")

    system_prompt = (
        "You are a helpful assistant who tries to help answer the user's question. "
        "This is part of a voice assistant system; don't generate anything other than plain text."
    )
    try:
        text = gt(audio_np, req.sample_rate)
        response_text = chat(system_prompt, user_prompt=text)
        print(f"LLM response length: {len(response_text)} chars")
        # print(f"LLM response: '{response_text}'")

        start_time = time.perf_counter()
        audio_out = tts.generate(
            text=response_text,
            prompt_wav_path=None,
            prompt_text=None,
            cfg_value=2.0,
            inference_timesteps=10,
            normalize=True,
            denoise=True,
            retry_badcase=True,
            retry_badcase_max_times=3,
            retry_badcase_ratio_threshold=6.0,
        )
        print("TTS generation complete.")
        end_time = time.perf_counter()
        print(f"TTS generation took {end_time - start_time:.2f} seconds.")
        print("=== V2V Request Complete ===")
    except Exception as e:
        print(f"ERROR in V2V: {e}")
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"{e}")
    return GenerateResponse(audio_data=ab64(audio_out, req.sample_rate))


@app.post("/api/v1/v2t")
def generate_text(req: GenerateRequest):
    if not check_status():
        return {"text": "assistant is not available"}
    audio_np = b64(req.audio_data)
    if audio_np.ndim == 1:
        audio_np = audio_np.reshape(1, -1)
    try:
        text = gt(audio_np, req.sample_rate)
        # print(f"Transcribed text: {text}")
        system_prompt = "You are a helpful assistant who tries to help answer the user's question."
        response_text = chat(system_prompt, user_prompt=text)
    except Exception as e:
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"{e}")
    return {"text": response_text}


if __name__ == "__main__":
    uvicorn.run("server:app", host="0.0.0.0", port=8000, reload=False)
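
# ---------------------------------------------------------------------------
# Hypothetical client sketch (illustration only, not part of the server).
# Shows the payload contract implied by GenerateRequest and b64(): a float32
# waveform serialized with np.save, then base64-encoded. Assumes the server
# is running on localhost:8000 and that the `requests` package is installed.
#
#   import base64, io
#   import numpy as np
#   import requests
#
#   audio = np.zeros(16_000, dtype=np.float32)  # 1 s of silence at 16 kHz
#   buf = io.BytesIO()
#   np.save(buf, audio)
#   payload = {
#       "audio_data": base64.b64encode(buf.getvalue()).decode(),
#       "sample_rate": 16_000,
#   }
#   resp = requests.post("http://localhost:8000/api/v1/v2t", json=payload)
#   print(resp.json()["text"])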