"""
Model management for STT, TTS, and LLM
Optimized for Hugging Face Zero GPU (H200)
"""
import os
import torch
import spaces
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    pipeline,
    AutoModelForCausalLM,
    AutoTokenizer
)
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer as ParlerTokenizer
import tempfile
from typing import List, Dict
import numpy as np
from scipy.io import wavfile


class ModelManager:
    """Lazily loads and runs the Whisper STT, Parler-TTS, and Llama chat models."""

    def __init__(self):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        # Models will be loaded lazily on first use
        self.whisper_pipe = None
        self.tts_model = None
        self.tts_tokenizer = None
        self.llm_model = None
        self.llm_tokenizer = None

    def load_whisper(self):
        """Load Whisper model for STT"""
        if self.whisper_pipe is None:
            print("Loading Whisper model...")
            # Using medium model for better speed/accuracy balance
            model_id = "openai/whisper-medium"
            model = AutoModelForSpeechSeq2Seq.from_pretrained(
                model_id,
                torch_dtype=self.torch_dtype,
                low_cpu_mem_usage=True,
                use_safetensors=True
            )
            model.to(self.device)
            processor = AutoProcessor.from_pretrained(model_id)
            self.whisper_pipe = pipeline(
                "automatic-speech-recognition",
                model=model,
                tokenizer=processor.tokenizer,
                feature_extractor=processor.feature_extractor,
                torch_dtype=self.torch_dtype,
                device=self.device,
                chunk_length_s=30,
                batch_size=16,
            )
            print("Whisper model loaded successfully!")

    def load_tts(self):
        """Load TTS model for text-to-speech"""
        if self.tts_model is None:
            print("Loading TTS model...")
            # Using smaller, faster TTS model
            model_id = "parler-tts/parler-tts-tiny-v1"
            self.tts_model = ParlerTTSForConditionalGeneration.from_pretrained(
                model_id,
                torch_dtype=self.torch_dtype
            ).to(self.device)
            self.tts_tokenizer = ParlerTokenizer.from_pretrained(model_id)
            print("TTS model loaded successfully!")

    def load_llm(self):
        """Load LLM for conversation generation"""
        if self.llm_model is None:
            print("Loading LLM...")
            # Using Llama 3.2 3B - smaller and faster than 7B models
            model_id = "meta-llama/Llama-3.2-3B-Instruct"
            self.llm_tokenizer = AutoTokenizer.from_pretrained(model_id)
            self.llm_model = AutoModelForCausalLM.from_pretrained(
                model_id,
                torch_dtype=self.torch_dtype,
                device_map="auto",
                low_cpu_mem_usage=True
            )
            print("LLM loaded successfully!")

    @spaces.GPU
    def speech_to_text(self, audio_path: str) -> str:
        """Convert speech to text using Whisper - optimized for speed"""
        try:
            self.load_whisper()
            # Validate that the audio file exists
            if not audio_path or not os.path.exists(audio_path):
                print(f"Audio file not found: {audio_path}")
                return ""
            # Check file extension
            if not audio_path.lower().endswith(('.wav', '.mp3', '.flac', '.m4a', '.ogg')):
                print(f"Invalid audio format: {audio_path}")
                return ""
            result = self.whisper_pipe(
                audio_path,
                return_timestamps=False,
                generate_kwargs={
                    "language": "english",
                    "task": "transcribe",
                    "num_beams": 1,  # Faster
                    "temperature": 0.0  # More deterministic
                }
            )
            return result["text"].strip()
        except Exception as e:
            print(f"Error in STT: {e}")
            import traceback
            traceback.print_exc()
            return ""

    @spaces.GPU
    def text_to_speech(self, text: str, accent: str = "American", speaker_name: str = None) -> str:
        """Convert text to speech - optimized for speed with American accent.

        Returns the path to a temporary WAV file, or None if generation fails.
        """
        try:
            self.load_tts()
            # Simplified: just use one clear American voice for speed
            description = "A clear American male voice speaks at moderate pace with good enunciation."
            # Limit text length for faster generation
            if len(text) > 200:
                text = text[:200] + "..."
            # Generate audio with optimized settings
            input_ids = self.tts_tokenizer(description, return_tensors="pt").input_ids.to(self.device)
            prompt_input_ids = self.tts_tokenizer(text, return_tensors="pt").input_ids.to(self.device)
            generation = self.tts_model.generate(
                input_ids=input_ids,
                prompt_input_ids=prompt_input_ids,
                attention_mask=torch.ones_like(input_ids),
                do_sample=False,  # Faster, deterministic
                num_beams=1  # Faster generation
            )
            audio_arr = generation.cpu().numpy().squeeze()
            # Save to a temporary file using scipy
            temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
            temp_file.close()
            # Clip to [-1, 1] and convert to the int16 range expected by WAV
            audio_int16 = (np.clip(audio_arr, -1.0, 1.0) * 32767).astype(np.int16)
            # Save using scipy.io.wavfile
            wavfile.write(
                temp_file.name,
                self.tts_model.config.sampling_rate,
                audio_int16
            )
            return temp_file.name
        except Exception as e:
            print(f"Error in TTS: {e}")
            # No audio could be generated; callers should handle None
            return None

    @spaces.GPU
    def generate_response(
        self,
        system_prompt: str,
        conversation_history: List[Dict],
        bot_name: str
    ) -> str:
        """Generate conversational response using LLM"""
        try:
            self.load_llm()
            # Format conversation for the model
            messages = [{"role": "system", "content": system_prompt}]
            # Add conversation history (keep the last 6 messages for context)
            for msg in conversation_history[-6:]:
                messages.append({
                    "role": msg["role"],
                    "content": msg["content"]
                })
            # Apply the Llama chat template
            inputs = self.llm_tokenizer.apply_chat_template(
                messages,
                return_tensors="pt",
                add_generation_prompt=True
            ).to(self.device)
            outputs = self.llm_model.generate(
                inputs,
                max_new_tokens=200,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.llm_tokenizer.eos_token_id
            )
            # Decode only the newly generated tokens, not the prompt
            response = self.llm_tokenizer.decode(
                outputs[0][inputs.shape[1]:],
                skip_special_tokens=True
            )
            return response.strip()
        except Exception as e:
            print(f"Error in LLM generation: {e}")
            return "I understand. Could you tell me more about that?"

    @spaces.GPU
    def generate_feedback(self, prompt: str) -> str:
        """Generate detailed feedback using LLM"""
        try:
            self.load_llm()
            # Format feedback prompt for Llama
            messages = [
                {
                    "role": "system",
                    "content": "You are an expert communication coach specializing in sales and professional communication. Provide specific, actionable feedback."
                },
                {
                    "role": "user",
                    "content": prompt
                }
            ]
            inputs = self.llm_tokenizer.apply_chat_template(
                messages,
                return_tensors="pt",
                add_generation_prompt=True
            ).to(self.device)
            outputs = self.llm_model.generate(
                inputs,
                max_new_tokens=500,
                temperature=0.7,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.llm_tokenizer.eos_token_id
            )
            feedback = self.llm_tokenizer.decode(
                outputs[0][inputs.shape[1]:],
                skip_special_tokens=True
            )
            return feedback.strip()
        except Exception as e:
            print(f"Error in feedback generation: {e}")
            return "Unable to generate feedback at this time."