# refactor: Centralize logger import to src.logger_config across various modules. (commit f20025d)
import os
import random
from typing import Dict, Optional, List, Union
from google.cloud import texttospeech
from moviepy.editor import AudioFileClip
import tempfile
from src.logger_config import logger
from .gcs_utils import get_gcs_credentials
import json
from . import ai_studio_sdk
from src.config import get_config_value
from src.file_downloader import get_file_downloader
from pathlib import Path
class GoogleTTS:
    """Google Cloud Text-to-Speech wrapper with persona-aware voice selection.

    Maintains a catalogue of Chirp/Chirp3 HD voices grouped by persona
    (gender/age), supports sequential rotation through each group for variety
    across generated videos, and can ask an AI model to pick a voice that
    matches an image description.
    """

    def __init__(self, credentials=None):
        """Create the TTS client.

        Args:
            credentials: Optional Google credentials object. When omitted,
                credentials are loaded via ``get_gcs_credentials("final_data")``.
        """
        if not credentials:
            credentials = get_gcs_credentials("final_data")
        self.client = texttospeech.TextToSpeechClient(credentials=credentials)
        # Voice profiles with Chirp HD voices, grouped by persona category.
        self.voice_profiles = {
            "female_young": [
                "en-AU-Chirp3-HD-Achernar",
                "en-AU-Chirp3-HD-Gacrux",
                "en-AU-Chirp3-HD-Laomedeia",
                "en-AU-Chirp3-HD-Sulafat",
                "en-AU-Chirp3-HD-Leda",
                "en-AU-Chirp3-HD-Aoede",
                "en-AU-Chirp-HD-O",
                "en-GB-Chirp-HD-O",
                "en-GB-Chirp-HD-F",
                "en-GB-Chirp3-HD-Aoede",
                "en-GB-Chirp3-HD-Callirrhoe",
                "en-GB-Chirp3-HD-Gacrux",
                "en-GB-Chirp3-HD-Laomedeia",
                "en-US-Chirp3-HD-Aoede",
                "en-US-Chirp3-HD-Leda",
            ],
            "female_mature": ["en-GB-Chirp-HD-F", "en-US-Chirp3-HD-Leda"],
            "male_young": [
                "en-AU-Chirp-HD-D",
                "en-AU-Chirp3-HD-Algenib",
                "en-AU-Chirp3-HD-Enceladus",
                "en-AU-Chirp3-HD-Puck",
                "en-AU-Chirp3-HD-Schedar",
                "en-GB-Chirp3-HD-Achird",
                "en-GB-Chirp3-HD-Algenib",
                "en-GB-Chirp3-HD-Enceladus",
                "en-US-Chirp3-HD-Iapetus",
            ],
            "male_mature": ["en-AU-Chirp3-HD-Schedar", "en-GB-Chirp3-HD-Achird"],
        }
        # Per-category cursor for round-robin voice selection.
        self.current_voice_indices = {category: 0 for category in self.voice_profiles}

    def select_sequential_voice(self, persona: str = "female_young") -> str:
        """Select the next voice for *persona* in round-robin order.

        Sequential (not random) selection guarantees variety across
        consecutive videos. Unknown personas fall back to ``female_young``;
        an empty profile list falls back to a fixed default voice.

        Args:
            persona: Voice-profile category key (e.g. ``"male_mature"``).

        Returns:
            A Google TTS voice name.
        """
        if persona not in self.voice_profiles:
            persona = "female_young"
        voices = self.voice_profiles[persona]
        if not voices:
            return "en-AU-Chirp3-HD-Achernar"  # Fallback
        # Mod-clamp defensively so a stale cursor can never raise IndexError
        # if a profile list shrinks at runtime.
        current_index = self.current_voice_indices[persona] % len(voices)
        selected_voice = voices[current_index]
        # Advance the cursor for the next call.
        self.current_voice_indices[persona] = (current_index + 1) % len(voices)
        logger.debug(f"🎭 Selected sequential voice #{current_index + 1} for {persona}: {selected_voice}")
        return selected_voice

    async def select_voice_for_persona(self, image_prompt: str) -> str:
        """Select a voice matching the persona implied by an image description.

        Asks the AI Studio model to infer gender/age/style from *image_prompt*
        and pick a concrete voice ID. If the model's pick is not in
        :attr:`voice_profiles`, falls back to sequential selection for the
        inferred ``{gender}_{age}`` category; any error falls back to the
        default ``female_young`` rotation.

        Args:
            image_prompt: Natural-language description of the on-screen persona.

        Returns:
            A Google TTS voice name.
        """
        try:
            logger.debug(f"🎭 Analyzing persona for voice selection: {image_prompt[:100]}...")
            # Format voice list for prompt
            voice_options = json.dumps(self.voice_profiles, indent=2)
            analysis_prompt = f"""Analyze this image description and select the most suitable voice from the available options.
Image Description: {image_prompt}
Available Voices (grouped by category):
{voice_options}
Determine the persona (gender, age, style) and select the specific voice ID that best fits.
Return ONLY valid JSON:
{{
"gender": "female",
"age": "young",
"style": "casual",
"selected_voice_id": "en-US-Chirp3-HD-Leda",
"reason": "Voice description matches..."
}}"""
            response = ai_studio_sdk.generate(analysis_prompt)
            # Strip an optional markdown code fence (``` or ```json) around the JSON.
            response_text = response.strip()
            if response_text.startswith("```"):
                response_text = response_text.split("```")[1]
                if response_text.startswith("json"):
                    response_text = response_text[4:]
            response_text = response_text.strip()
            persona = json.loads(response_text)
            selected_voice = persona.get("selected_voice_id")
            # Validate that the model returned a voice we actually know about.
            voice_found = any(
                selected_voice in voices for voices in self.voice_profiles.values()
            )
            if not voice_found:
                logger.warning(f"⚠️ Selected voice '{selected_voice}' not found in profiles. Falling back to sequential selection.")
                # Fallback to sequential based on detected attributes
                gender = persona.get("gender", "female")
                age = persona.get("age", "young")
                voice_key = f"{gender}_{age}"
                selected_voice = self.select_sequential_voice(voice_key)
            logger.debug(f"βœ“ Selected voice: {selected_voice}")
            return selected_voice
        except Exception as e:
            logger.error(f"❌ Voice selection failed: {e}, using default")
            return self.select_sequential_voice("female_young")

    def _measure_audio_duration(self, audio_bytes: bytes) -> float:
        """Return the duration in seconds of MP3 *audio_bytes*, or 0.0 on failure.

        MoviePy's ``AudioFileClip`` reads only from a file path, so the bytes
        are spilled to a temporary file first. A 0.0 return signals callers to
        skip any duration-dependent logic rather than crash.
        """
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=True) as temp_file:
            temp_file.write(audio_bytes)
            temp_file.flush()
            try:
                with AudioFileClip(temp_file.name) as clip:
                    return clip.duration
            except Exception as e:
                logger.error(f"Error measuring baseline duration: {e}")
                return 0.0

    def generate_tts_audio(
        self,
        text: str,
        voice_name: Optional[str] = None,
        duration: Optional[float] = None,
        volume_gain_db: float = 6.0
    ) -> Dict:
        """
        Generate TTS audio using Google Cloud TTS.

        Args:
            text: Text to synthesize.
            voice_name: Specific voice name to use. If None, selects sequentially based on default persona.
            duration: Target duration in seconds. If provided, speaking rate is adjusted.
            volume_gain_db: Volume gain in dB.

        Returns:
            Dict containing:
                - audio_content: The binary audio data.
                - duration: Actual duration in seconds.
                - speaking_rate: Used speaking rate.
                - voice_name: The voice name used.

        Raises:
            Exception: Re-raises any synthesis/download error after logging.
        """
        try:
            if not voice_name:
                voice_name = self.select_sequential_voice()
            # Test Automation Bypass: skip the paid API and return a canned sample.
            if get_config_value("TEST_AUTOMATION"):
                logger.debug(f"πŸ§ͺ TEST_MODE: Skipping Google TTS for '{text[:30]}...'")
                downloader = get_file_downloader()
                sample_path = Path("testData/sample_tts.mp3")
                drive_url = "https://drive.google.com/file/d/1B4LsRhIXpV57ADR5sZ7IMZpMqE_JUj8Q/view?usp=sharing"
                file_id = downloader._extract_drive_file_id(drive_url)
                # Direct download (FileDownloader handles skip_existing)
                downloaded_path = downloader.download_from_drive(
                    file_id=file_id,
                    output_path=sample_path
                )
                with open(downloaded_path, "rb") as f:
                    dummy_content = f.read()
                return {
                    "audio_content": dummy_content,
                    "duration": 16.05,
                    "speaking_rate": 1.0,
                    "voice_name": "test_voice"
                }
            synthesis_input = texttospeech.SynthesisInput(text=text)
            # Determine gender for voice selection.
            # Simple heuristic based on known keywords in Chirp voice names.
            male_keywords = ["D", "Algenib", "Enceladus", "Puck", "Schedar", "Achird", "Iapetus"]
            is_male = any(keyword in voice_name for keyword in male_keywords)
            ssml_gender = texttospeech.SsmlVoiceGender.MALE if is_male else texttospeech.SsmlVoiceGender.FEMALE
            # Language code is the first two dash-separated parts, e.g. "en-AU".
            language_code = "-".join(voice_name.split("-")[:2])
            voice = texttospeech.VoiceSelectionParams(
                language_code=language_code, name=voice_name, ssml_gender=ssml_gender
            )
            speaking_rate = 1.0
            # If a target duration is requested, use a two-step process:
            # 1. Generate at the default rate to measure the baseline duration.
            # 2. Derive the rate that maps baseline onto the target, then regenerate.
            if duration:
                # Step 1: Baseline generation at neutral settings.
                temp_audio_config = texttospeech.AudioConfig(
                    audio_encoding=texttospeech.AudioEncoding.MP3,
                    speaking_rate=1.0,
                    pitch=0.0,
                    volume_gain_db=0.0,
                )
                temp_response = self.client.synthesize_speech(
                    input=synthesis_input, voice=voice, audio_config=temp_audio_config
                )
                # 0.0 means measurement failed -> skip rate adjustment entirely.
                baseline_duration = self._measure_audio_duration(temp_response.audio_content)
                if baseline_duration > 0:
                    speaking_rate = baseline_duration / duration
                    # Clamp to a safe band. NOTE(review): Google documents the
                    # supported speaking_rate range as 0.25-4.0; the 0.15 floor
                    # below that is kept as-is — confirm against the API.
                    speaking_rate = max(0.15, min(4.0, speaking_rate))
                    logger.debug(
                        f"πŸ“Š Baseline: {baseline_duration:.2f}s, Target: {duration:.2f}s, Rate: {speaking_rate:.2f}x"
                    )
            # Final generation with the (possibly adjusted) speaking rate.
            audio_config = texttospeech.AudioConfig(
                audio_encoding=texttospeech.AudioEncoding.MP3,
                speaking_rate=speaking_rate,
                pitch=0.0,
                volume_gain_db=volume_gain_db,
            )
            response = self.client.synthesize_speech(
                input=synthesis_input, voice=voice, audio_config=audio_config
            )
            # Measure the final clip so callers get the actual duration back
            # (best effort: 0.0 when the clip cannot be read).
            actual_duration = self._measure_audio_duration(response.audio_content)
            return {
                "audio_content": response.audio_content,
                "duration": actual_duration,
                "speaking_rate": speaking_rate,
                "voice_name": voice_name
            }
        except Exception as e:
            logger.error(f"❌ Error generating TTS in GoogleTTS: {e}")
            raise