import asyncio
import os
import tempfile

import edge_tts
from groq import Groq

# Create a Groq client once at the module level to reuse across function calls
client = Groq()

# Edge TTS voice per primary language subtag; anything else uses the default.
_VOICES = {
    "de": "de-DE-KatjaNeural",  # German female voice
    "en": "en-US-AriaNeural",  # English female voice
}
_DEFAULT_VOICE = "en-US-AriaNeural"


def _select_voice(language):
    """Return the Edge TTS voice name for a language code such as 'de' or 'de-DE'."""
    # Only the primary subtag matters, so 'de', 'DE', 'de-DE' and 'de_AT'
    # all select the German voice. None or an empty string selects the default.
    normalized = (language or "").strip().lower().replace("_", "-")
    primary = normalized.split("-", 1)[0]
    return _VOICES.get(primary, _DEFAULT_VOICE)


async def text_to_speech(text: str, language: str) -> str:
    """
    Synthesize `text` to an MP3 file with Edge TTS and return the file path.

    Args:
    - text (str): The text to speak.
    - language (str): Language code; 'de' (optionally with a region, e.g.
      'de-DE') selects the German voice, anything else the English voice.

    Returns:
    - str: Path to a temporary .mp3 file. The file is NOT deleted here; the
      caller owns it and is responsible for removing it when done.

    Raises:
    - Whatever edge_tts raises on failure. In that case the temporary file is
      removed before the exception propagates.
    """
    voice = _select_voice(language)
    rate = "+10%"
    pitch = "+0Hz"

    # Build the request before touching the filesystem so that a rejected
    # argument cannot leave an orphaned temp file behind.
    communicate = edge_tts.Communicate(text, voice, rate=rate, pitch=pitch)

    # Only the unique path is needed; edge_tts reopens it by name, so close
    # the handle straight away.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name

    try:
        await communicate.save(tmp_path)
    except BaseException:
        # BaseException so task cancellation is covered as well. Remove the
        # partial file, then let the original error propagate unchanged.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise

    # Do not delete the file yet; the caller (e.g. a UI) needs to access it
    return tmp_path


async def generate_compliment_and_audio(
    base64_image: str,
    compliment_prompt: str,
    model: str = "llama-3.2-90b-vision-preview",
    max_tokens: int = 300,
    temperature: float = 0.5,
    tts_language: str = 'en',
) -> tuple[str, str]:
    """
    Generate a compliment for an image and synthesize it to audio.

    The chat completion is requested first (in a worker thread, so the event
    loop is not blocked); once the full text is available it is passed to
    text_to_speech. The two steps run one after the other.

    Args:
    - base64_image (str): The base64 encoded image. It is sent as a
      `data:image/jpeg;base64,...` URL.
    - compliment_prompt (str): The prompt for generating the compliment.
    - model (str, optional): The model to use for the chat completion.
    - max_tokens (int, optional): The maximum number of tokens to generate.
    - temperature (float, optional): The sampling temperature.
    - tts_language (str, optional): The language code for TTS.

    Returns:
    - Tuple[str, str]: The generated compliment and the audio file path.
    """
    # Prepare the messages payload: one user turn with the prompt and the image
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": compliment_prompt},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}",
                    },
                },
            ],
        }
    ]

    # The module-level client is the synchronous Groq client, so the blocking
    # request is wrapped in a function and run via asyncio.to_thread.
    def fetch_compliment():
        return client.chat.completions.create(
            max_tokens=max_tokens,
            temperature=temperature,
            messages=messages,
            model=model,
        )

    # Call the synchronous function in a separate thread
    chat_completion = await asyncio.to_thread(fetch_compliment)

    # Extract the compliment from the first choice
    # NOTE(review): presumably `content` can be None/empty if the model
    # returns no text, in which case the TTS step below would fail - confirm.
    compliment = chat_completion.choices[0].message.content

    # Synthesize the finished compliment to an audio file
    audio_file_path = await text_to_speech(compliment, tts_language)

    return compliment, audio_file_path