# Roleplay-Chat-Box: backend/models/enhanced_voice_synthesizer.py
import asyncio
import base64
import io
import logging
from typing import Optional

import numpy as np

from backend.config import settings
logger = logging.getLogger(__name__)
class SimpleVoiceSynthesizer:
"""
An improved simple voice synthesizer that creates more realistic speech-like audio
using phoneme patterns, formant synthesis, and prosody modeling.
"""
def __init__(self):
self.character_voice_configs = {}
self.initialized = False
async def initialize(self):
"""Initialize simple voice synthesis"""
if not settings.ENABLE_VOICE:
logger.info("Voice synthesis disabled in config")
return False
logger.info("Initializing improved simple voice synthesizer...")
try:
# Setup character-specific voice parameters
self._setup_character_voices()
self.initialized = True
logger.info("Improved simple voice synthesizer initialized successfully")
return True
except Exception as e:
logger.error(f"Failed to initialize simple voice synthesizer: {e}")
return False
def _setup_character_voices(self):
"""Setup character-specific voice configurations"""
self.character_voice_configs = {
"moses": {
"base_frequency": 110, # Lower, more authoritative
"speed": 0.85, # Slower, more measured
"pitch_variance": 0.15, # Less pitch variation
"formant_shift": -0.2, # Deeper formants
"voice_quality": "deep",
},
"samsung_employee": {
"base_frequency": 140, # Professional, clear
"speed": 1.0, # Normal speed
"pitch_variance": 0.2, # Moderate variation
"formant_shift": 0.0, # Neutral formants
"voice_quality": "clear",
},
"jinx": {
"base_frequency": 180, # Higher, more energetic
"speed": 1.2, # Faster speech
"pitch_variance": 0.35, # More pitch variation
"formant_shift": 0.3, # Brighter formants
"voice_quality": "bright",
}
}
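        # A new character can be registered with one more entry; e.g. a
        # hypothetical "narrator" voice (values illustrative, not tuned):
        #   self.character_voice_configs["narrator"] = {
        #       "base_frequency": 125, "speed": 0.95, "pitch_variance": 0.18,
        #       "formant_shift": -0.1, "voice_quality": "deep",
        #   }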
async def synthesize(self, text: str, character_id: str) -> Optional[str]:
"""Synthesize speech for given text and character"""
if not self.initialized or not settings.ENABLE_VOICE:
return None
try:
# Get character voice config
voice_config = self.character_voice_configs.get(
character_id,
self.character_voice_configs["samsung_employee"] # Default
)
# Generate realistic speech audio
audio_data = self._generate_realistic_speech(text, voice_config)
# Convert to base64 for web transmission
audio_base64 = self._audio_to_base64(audio_data)
logger.info(f"Generated realistic speech for {character_id}: {len(text)} chars, {len(audio_data)} samples")
return audio_base64
except Exception as e:
logger.error(f"Error in simple voice synthesis: {e}")
return None
def _generate_realistic_speech(self, text: str, voice_config: dict) -> np.ndarray:
"""Generate realistic speech using advanced phoneme and prosody modeling"""
# Calculate duration based on speaking rate
words = len(text.split())
chars = len(text)
# Realistic speaking rates: 150-180 words per minute
base_wpm = 160
speed_factor = voice_config["speed"]
actual_wpm = base_wpm * speed_factor
# Calculate duration
duration = (words / actual_wpm) * 60 # Convert to seconds
duration = max(duration, chars / 15.0) # Minimum based on character count
duration = min(duration, 30.0) # Maximum 30 seconds
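        # Worked example: 24 words at speed 0.85 -> 160 * 0.85 = 136 wpm,
        # so (24 / 136) * 60 ≈ 10.6 s; both clamps above leave that unchanged.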
sample_rate = settings.SAMPLE_RATE
num_samples = int(duration * sample_rate)
# Create time array
t = np.linspace(0, duration, num_samples)
# Generate phoneme-based speech patterns
audio = self._create_phoneme_speech(t, text, voice_config)
# Apply prosody (intonation patterns)
prosody = self._generate_prosody(t, text, voice_config)
audio *= prosody
# Apply character-specific voice quality
audio = self._apply_voice_quality(audio, t, voice_config)
# Add natural speech envelope
envelope = self._create_speech_envelope(audio, t)
audio *= envelope
# Normalize and return
if np.max(np.abs(audio)) > 0:
audio = audio / np.max(np.abs(audio)) * 0.8
return audio.astype(np.float32)
def _create_phoneme_speech(self, t: np.ndarray, text: str, voice_config: dict) -> np.ndarray:
"""Create speech-like audio using phoneme patterns"""
audio = np.zeros_like(t)
base_freq = voice_config["base_frequency"]
# Create syllable timing based on text
syllable_rate = 4.0 * voice_config["speed"] # syllables per second
syllable_duration = 1.0 / syllable_rate
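        # Example timing: at speed 1.0 each syllable cycle lasts 0.25 s, with
        # the middle 60% treated as the vowel and the edges as consonants.
        # (This per-sample Python loop is simple but slow; it could be
        # vectorized with NumPy boolean masks for longer utterances.)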
for i, sample_time in enumerate(t):
# Determine current syllable position
syllable_phase = (sample_time % syllable_duration) / syllable_duration
# Create vowel/consonant pattern
# Vowels: 0.2-0.8 of syllable, Consonants: 0.0-0.2 and 0.8-1.0
is_vowel = 0.2 < syllable_phase < 0.8
# Get fundamental frequency with natural variation
pitch_variation = voice_config["pitch_variance"]
f0 = base_freq * (1 + pitch_variation * np.sin(2 * np.pi * 2.3 * sample_time))
if is_vowel:
# Generate vowel sound using formant synthesis
vowel_sound = self._generate_vowel_formants(sample_time, f0, voice_config)
audio[i] = vowel_sound
else:
# Generate consonant sound using filtered noise
consonant_sound = self._generate_consonant(sample_time, f0, voice_config)
audio[i] = consonant_sound
return audio
def _generate_vowel_formants(self, t: float, f0: float, voice_config: dict) -> float:
"""Generate vowel sounds using formant frequencies"""
formant_shift = voice_config["formant_shift"]
# Vowel formant frequencies (approximate average)
f1 = 650 * (1 + formant_shift * 0.5) # First formant
f2 = 1400 * (1 + formant_shift * 0.3) # Second formant
f3 = 2500 * (1 + formant_shift * 0.2) # Third formant
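        # These defaults roughly approximate an open central vowel; real
        # vowels differ mainly in F1/F2 (e.g. /i/ near 300/2300 Hz, /a/ near
        # 700/1200 Hz).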
# Add slight formant movement for naturalness
f1 += 50 * np.sin(2 * np.pi * 1.7 * t)
f2 += 80 * np.sin(2 * np.pi * 2.1 * t)
# Generate harmonic series for fundamental
fundamental = 0.4 * np.sin(2 * np.pi * f0 * t)
# Generate formant resonances
formant1 = 0.3 * np.sin(2 * np.pi * f1 * t) * np.exp(-abs(f1 - f0*1) / 200)
formant2 = 0.2 * np.sin(2 * np.pi * f2 * t) * np.exp(-abs(f2 - f0*2) / 300)
formant3 = 0.1 * np.sin(2 * np.pi * f3 * t) * np.exp(-abs(f3 - f0*3) / 500)
# Add harmonics
harmonic2 = 0.2 * np.sin(2 * np.pi * f0 * 2 * t)
harmonic3 = 0.1 * np.sin(2 * np.pi * f0 * 3 * t)
return fundamental + formant1 + formant2 + formant3 + harmonic2 + harmonic3
def _generate_consonant(self, t: float, f0: float, voice_config: dict) -> float:
"""Generate consonant sounds using filtered noise and fricatives"""
        # Zero-mean Gaussian noise for the fricative component
        # (randn() is already centered, so no offset is needed)
        noise = np.random.randn() * 0.15
# Add some periodic component for voiced consonants
periodic = 0.1 * np.sin(2 * np.pi * f0 * t)
        # Crude spectral shaping: modulate the noise at ~3 kHz as a stand-in
        # for a fricative band-pass filter
        filtered_noise = noise * (1 + 0.5 * np.sin(2 * np.pi * 3000 * t))
return filtered_noise + periodic * 0.3
def _generate_prosody(self, t: np.ndarray, text: str, voice_config: dict) -> np.ndarray:
"""Generate natural prosody (intonation) patterns"""
prosody = np.ones_like(t)
duration = t[-1] if len(t) > 0 else 1.0
# Sentence-level intonation
time_norm = t / duration
if text.endswith('?'):
# Question: rising intonation
prosody *= (0.8 + 0.4 * time_norm)
elif text.endswith('!'):
# Exclamation: dramatic contour
prosody *= (0.9 + 0.3 * np.sin(np.pi * time_norm) * np.exp(-time_norm))
else:
# Statement: natural declination
prosody *= (1.0 - 0.2 * time_norm)
# Add micro-prosody for naturalness
prosody *= (1 + 0.05 * np.sin(2 * np.pi * 8 * t))
# Character-specific prosody
if voice_config.get("voice_quality") == "bright":
# More animated prosody for energetic characters
prosody *= (1 + 0.1 * np.sin(2 * np.pi * 2.5 * t))
elif voice_config.get("voice_quality") == "deep":
# More steady prosody for authoritative characters
prosody *= (1 + 0.03 * np.sin(2 * np.pi * 1.2 * t))
return prosody
def _apply_voice_quality(self, audio: np.ndarray, t: np.ndarray, voice_config: dict) -> np.ndarray:
"""Apply character-specific voice quality effects"""
quality = voice_config.get("voice_quality", "clear")
if quality == "deep":
# Add subtle sub-harmonics for deeper voice
subharmonic = 0.05 * np.sin(np.pi * t)
audio = audio + subharmonic[:len(audio)]
elif quality == "bright":
# Emphasize higher frequencies for brighter voice
high_freq = 0.03 * np.sin(2 * np.pi * 4000 * t)
audio = audio + high_freq[:len(audio)]
# Add very subtle vocal fry for naturalness
        fry_rate = 70  # Hz, within the typical vocal-fry range
        fry = 0.01 * np.sin(2 * np.pi * fry_rate * t) * np.random.rand(len(t))
audio = audio + fry[:len(audio)]
return audio
def _create_speech_envelope(self, audio: np.ndarray, t: np.ndarray) -> np.ndarray:
"""Create natural speech amplitude envelope"""
envelope = np.ones_like(audio)
# Fade in/out
fade_samples = min(int(0.05 * len(audio)), 1000)
if fade_samples > 0:
envelope[:fade_samples] *= np.linspace(0, 1, fade_samples)
envelope[-fade_samples:] *= np.linspace(1, 0, fade_samples)
# Add speech rhythm (breathing, pauses)
breath_rate = 0.3 # Subtle breathing pattern
envelope *= (0.95 + 0.05 * np.sin(2 * np.pi * breath_rate * t))
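        # At 0.3 Hz this sways loudness by +/-5% over a ~3.3 s cycle, a crude
        # stand-in for breath-group variation.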
return envelope
def _audio_to_base64(self, audio_data: np.ndarray) -> str:
"""Convert audio numpy array to base64 string"""
# Convert to 16-bit PCM
audio_int16 = (np.clip(audio_data, -1, 1) * 32767).astype(np.int16)
# Create WAV file in memory manually
buffer = io.BytesIO()
# WAV file parameters
sample_rate = settings.SAMPLE_RATE
num_channels = 1 # Mono
bits_per_sample = 16
byte_rate = sample_rate * num_channels * bits_per_sample // 8
block_align = num_channels * bits_per_sample // 8
data_size = len(audio_int16) * 2 # 2 bytes per sample
file_size = 36 + data_size
# Write WAV header (44 bytes)
buffer.write(b'RIFF') # Chunk ID (4 bytes)
buffer.write(file_size.to_bytes(4, 'little')) # File size - 8 (4 bytes)
buffer.write(b'WAVE') # Format (4 bytes)
buffer.write(b'fmt ') # Subchunk1 ID (4 bytes)
buffer.write((16).to_bytes(4, 'little')) # Subchunk1 size (4 bytes)
buffer.write((1).to_bytes(2, 'little')) # Audio format (PCM) (2 bytes)
buffer.write(num_channels.to_bytes(2, 'little')) # Num channels (2 bytes)
buffer.write(sample_rate.to_bytes(4, 'little')) # Sample rate (4 bytes)
buffer.write(byte_rate.to_bytes(4, 'little')) # Byte rate (4 bytes)
buffer.write(block_align.to_bytes(2, 'little')) # Block align (2 bytes)
buffer.write(bits_per_sample.to_bytes(2, 'little')) # Bits per sample (2 bytes)
buffer.write(b'data') # Subchunk2 ID (4 bytes)
buffer.write(data_size.to_bytes(4, 'little')) # Subchunk2 size (4 bytes)
# Write audio data
buffer.write(audio_int16.tobytes())
# Get bytes and encode to base64
buffer.seek(0)
audio_bytes = buffer.read()
audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
return f"data:audio/wav;base64,{audio_base64}"
def get_character_voice_info(self, character_id: str) -> dict:
"""Get voice configuration for character"""
return self.character_voice_configs.get(character_id, {})
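
# Minimal local smoke test: a sketch that assumes `settings.ENABLE_VOICE` is
# true and `settings.SAMPLE_RATE` is set (e.g. 22050) in backend.config.
if __name__ == "__main__":
    async def _demo():
        synth = SimpleVoiceSynthesizer()
        if await synth.initialize():
            data_uri = await synth.synthesize("Hello there!", "jinx")
            if data_uri:
                print(f"Generated data URI with {len(data_uri)} characters")

    asyncio.run(_demo())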