# server.py — FastAPI speech-to-speech inference server (MiniCPM checkpoint).
import base64
import io
import logging
import time
from typing import Optional

import librosa
import numpy as np
import torch
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from transformers import AutoModel, AutoTokenizer
# Module-level logger; endpoints and model init report through it.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

app = FastAPI()

# Add CORS middleware
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive — tighten for production deployments.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
class AudioRequest(BaseModel):
    """Request body for /api/v1/inference."""
    audio_data: str   # base64-encoded bytes of an np.save()-serialized audio array
    sample_rate: int  # sample rate of the submitted audio, in Hz
class AudioResponse(BaseModel):
    """Response body for /api/v1/inference."""
    audio_data: str  # base64-encoded bytes of the np.save()-serialized response audio
    text: str = ""   # text returned by the model alongside the generated audio
class ConfigRequest(BaseModel):
    """Partial configuration update; fields left as None are unchanged."""
    temperature: Optional[float] = None   # clamped to [0.1, 1.0] by Model.update_config
    max_new_tokens: Optional[int] = None  # clamped to [50, 1024] by Model.update_config
    system_prompt: Optional[str] = None   # replaces the podcast system prompt
    voice_path: Optional[str] = None      # path to a new reference audio file for voice cloning
class ConfigResponse(BaseModel):
    """Result of a configuration read or update."""
    success: bool
    message: str
    current_config: dict  # snapshot produced by Model.get_current_config()
# Global model instance (populated by initialize_model() at startup; None until then)
model = None
# Startup progress flags read by the endpoints to report readiness / errors.
INITIALIZATION_STATUS = {
    "model_loaded": False,
    "error": None
}
class Model:
    """Loads the MiniCPM checkpoint and exposes speech-to-speech inference
    plus runtime configuration for podcast-style conversations."""

    def __init__(self):
        # Load the checkpoint once. nn.Module.eval() and .cuda() return the
        # same module, so chain them and keep a single reference on self
        # (the original kept a local that shadowed the module-level `model`).
        self.model = AutoModel.from_pretrained(
            './models/checkpoint',
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
            attn_implementation='sdpa'
        ).eval().cuda()
        self.tokenizer = AutoTokenizer.from_pretrained(
            './models/checkpoint',
            trust_remote_code=True
        )
        # Initialize TTS
        self.model.init_tts()
        self.model.tts.float()  # Convert TTS to float32 if needed
        self.model_in_sr = 16000   # sample rate the model expects on input
        self.model_out_sr = 24000  # sample rate the model produces
        # Reference audio used for voice cloning.
        self.ref_audio, _ = librosa.load('./ref_audios/female.wav', sr=self.model_in_sr, mono=True)
        # Configurable generation parameters (see update_config for clamping).
        self.temperature = 0.7
        self.max_new_tokens = 150
        self.top_p = 0.92
        self.repetition_penalty = 1.2
        # Custom podcast conversation system prompt
        self.podcast_prompt = "You are Speaker 2 in a podcast conversation. Listen carefully to Speaker 1 and respond naturally as if continuing a podcast dialogue. Keep your responses concise, engaging, and conversational. Maintain the flow and topic of the conversation. Avoid sounding like an assistant - you are a podcast co-host having a natural conversation."
        # System prompt combines the voice-cloning instruction, reference
        # audio and the conversational prompt; rebuilt whenever either changes.
        self.sys_prompt = self._build_sys_prompt()
        # Warmup: run one inference so the first real request is not slow.
        logger.info("Performing model warmup for podcast conversation...")
        audio_data = librosa.load('./ref_audios/female.wav', sr=self.model_in_sr, mono=True)[0]
        _ = self.inference(audio_data, self.model_in_sr)
        logger.info("Warmup complete. Model ready for podcast conversation.")

    def _build_sys_prompt(self) -> dict:
        """Assemble the system-prompt message from the current reference
        audio and podcast prompt."""
        return {
            "role": "user",
            "content": [
                "Clone the voice in the provided audio prompt.",
                self.ref_audio,
                self.podcast_prompt
            ]
        }

    def update_config(self, config_request: "ConfigRequest") -> dict:
        """Update model configuration based on request.

        Only non-None fields are applied; numeric fields are clamped to safe
        ranges. Returns a dict with a success flag, a human-readable message,
        and the resulting configuration snapshot.
        """
        changes = []
        if config_request.temperature is not None:
            # Clamp to a sane sampling range.
            self.temperature = max(0.1, min(1.0, config_request.temperature))
            changes.append(f"Temperature set to {self.temperature}")
        if config_request.max_new_tokens is not None:
            self.max_new_tokens = max(50, min(1024, config_request.max_new_tokens))
            changes.append(f"Max new tokens set to {self.max_new_tokens}")
        if config_request.system_prompt is not None:
            self.podcast_prompt = config_request.system_prompt
            self.sys_prompt = self._build_sys_prompt()
            changes.append("System prompt updated")
        if config_request.voice_path is not None:
            try:
                # Assignment only happens if the load succeeds, so a bad path
                # leaves the previous reference audio intact.
                self.ref_audio, _ = librosa.load(config_request.voice_path, sr=self.model_in_sr, mono=True)
                self.sys_prompt = self._build_sys_prompt()
                changes.append(f"Voice updated from {config_request.voice_path}")
            except Exception as e:
                return {
                    "success": False,
                    "message": f"Failed to load voice: {str(e)}",
                    "current_config": self.get_current_config()
                }
        return {
            "success": True,
            "message": "Configuration updated: " + "; ".join(changes) if changes else "No changes made",
            "current_config": self.get_current_config()
        }

    def get_current_config(self) -> dict:
        """Get current model configuration"""
        return {
            "temperature": self.temperature,
            "max_new_tokens": self.max_new_tokens,
            "top_p": self.top_p,
            "repetition_penalty": self.repetition_penalty,
            "system_prompt": self.podcast_prompt
        }

    def inference(self, audio_np, input_audio_sr):
        """Run one round of speech-to-speech chat.

        audio_np: 1-D audio samples; input_audio_sr: their sample rate in Hz.
        Returns (audio, text), with audio resampled back to input_audio_sr.
        """
        if input_audio_sr != self.model_in_sr:
            audio_np = librosa.resample(audio_np, orig_sr=input_audio_sr, target_sr=self.model_in_sr)
        user_question = {'role': 'user', 'content': [audio_np]}
        # round one
        msgs = [self.sys_prompt, user_question]
        res = self.model.chat(
            msgs=msgs,
            tokenizer=self.tokenizer,
            sampling=True,
            max_new_tokens=self.max_new_tokens,
            use_tts_template=True,
            generate_audio=True,
            temperature=self.temperature,
            top_p=self.top_p,
            repetition_penalty=self.repetition_penalty,
        )
        audio = res["audio_wav"].cpu().numpy()
        if self.model_out_sr != input_audio_sr:
            audio = librosa.resample(audio, orig_sr=self.model_out_sr, target_sr=input_audio_sr)
        return audio, res["text"]
def initialize_model():
    """Construct the global Model instance and record the outcome in
    INITIALIZATION_STATUS. Returns True on success, False on failure."""
    global model, INITIALIZATION_STATUS
    try:
        logger.info("Initializing model...")
        model = Model()
    except Exception as exc:
        # Keep the error text so /health can surface it to clients.
        INITIALIZATION_STATUS["error"] = str(exc)
        logger.error(f"Failed to initialize model: {exc}")
        return False
    INITIALIZATION_STATUS["model_loaded"] = True
    logger.info("MiniCPM model initialized successfully")
    return True
# NOTE(review): @app.on_event is deprecated in recent FastAPI versions in
# favor of lifespan handlers — left as-is to preserve behavior; verify the
# installed FastAPI version before migrating.
@app.on_event("startup")
async def startup_event():
    """Initialize model on startup"""
    # Synchronous call: the server only starts serving after this returns
    # (or after it fails, in which case /health reports the error).
    initialize_model()
@app.get("/api/v1/health")
def health_check():
    """Report whether the model has finished loading, plus any init error."""
    loaded = INITIALIZATION_STATUS["model_loaded"]
    return {
        "status": "healthy" if loaded else "initializing",
        "model_loaded": loaded,
        "error": INITIALIZATION_STATUS["error"],
    }
@app.post("/api/v1/inference")
async def inference(request: AudioRequest) -> AudioResponse:
    """Run inference with MiniCPM model.

    Expects base64-encoded bytes of an np.save()-serialized audio array plus
    its sample rate; returns the generated audio in the same encoding along
    with the model's text output. Raises 503 while the model is loading and
    500 on any inference failure.
    """
    if not INITIALIZATION_STATUS["model_loaded"]:
        raise HTTPException(
            status_code=503,
            detail=f"Model not ready. Status: {INITIALIZATION_STATUS}"
        )
    try:
        # Decode audio data: base64 -> raw bytes -> flat numpy array.
        audio_bytes = base64.b64decode(request.audio_data)
        audio_np = np.load(io.BytesIO(audio_bytes)).flatten()
        # Generate response, timing it for observability (lazy %-args keep
        # formatting off the hot path).
        start = time.time()
        logger.info("starting inference with audio length %s", audio_np.shape)
        audio_response, text_response = model.inference(audio_np, request.sample_rate)
        logger.info("inference took %s seconds", time.time() - start)
        # Serialize the response audio back to base64 (np.save format).
        buffer = io.BytesIO()
        np.save(buffer, audio_response)
        audio_b64 = base64.b64encode(buffer.getvalue()).decode()
        return AudioResponse(
            audio_data=audio_b64,
            text=text_response
        )
    except Exception as e:
        logger.error(f"Inference failed: {str(e)}")
        raise HTTPException(
            status_code=500,
            detail=str(e)
        )
@app.post("/api/v1/config")
async def update_config(request: ConfigRequest) -> ConfigResponse:
    """Update model configuration for podcast-style conversations"""
    if not INITIALIZATION_STATUS["model_loaded"]:
        raise HTTPException(
            status_code=503,
            detail=f"Model not ready. Status: {INITIALIZATION_STATUS}"
        )
    try:
        outcome = model.update_config(request)
        return ConfigResponse(
            success=outcome["success"],
            message=outcome["message"],
            current_config=outcome["current_config"],
        )
    except Exception as e:
        # Report the failure in-band rather than as an HTTP error.
        logger.error(f"Configuration update failed: {str(e)}")
        return ConfigResponse(
            success=False,
            message=f"Configuration update failed: {str(e)}",
            current_config=model.get_current_config(),
        )
@app.get("/api/v1/config")
async def get_config() -> ConfigResponse:
    """Get current model configuration"""
    if not INITIALIZATION_STATUS["model_loaded"]:
        raise HTTPException(
            status_code=503,
            detail=f"Model not ready. Status: {INITIALIZATION_STATUS}"
        )
    snapshot = model.get_current_config()
    return ConfigResponse(
        success=True,
        message="Current configuration",
        current_config=snapshot,
    )
# Development entry point: run a standalone uvicorn server when executed
# directly (uvicorn is imported lazily so importing this module has no
# server-side effects).
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)