import gradio as gr
import requests
import os
import tempfile

from groq import Groq
import torch
import soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset

# ==============================
# API KEYS
# ==============================
HF_TOKEN = os.getenv("HF_TOKEN")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")

groq_client = Groq(api_key=GROQ_API_KEY)

# ==============================
# LOAD TTS MODELS
# ==============================
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Speaker x-vectors from CMU ARCTIC; index 7306 picks one fixed voice.
embeddings_dataset = load_dataset(
    "Matthijs/cmu-arctic-xvectors", split="validation"
)
speaker_embeddings = torch.tensor(
    embeddings_dataset[7306]["xvector"]
).unsqueeze(0)

# ==============================
# TEXT → SPEECH
# ==============================
def text_to_speech(text):
    # SpeechT5 tokenizes at the character level; truncate long replies.
    inputs = processor(
        text=text, return_tensors="pt", truncation=True, max_length=500
    )
    speech = tts_model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
    )

    # Close the handle before writing so soundfile can reopen the path
    # (NamedTemporaryFile keeps the file locked open on Windows).
    temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    temp_audio.close()

    # SpeechT5 generates audio at 16 kHz.
    sf.write(temp_audio.name, speech.numpy(), samplerate=16000)
    return temp_audio.name

# ==============================
# IMAGE GENERATION
# ==============================
def generate_image(prompt):
    API_URL = (
        "https://router.huggingface.co/hf-inference/models/"
        "stabilityai/stable-diffusion-xl-base-1.0"
    )
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    payload = {"inputs": prompt}

    response = requests.post(API_URL, headers=headers, json=payload)
    print("HF STATUS:", response.status_code)

    if response.status_code != 200:
        print(response.text)
        return None

    # A successful response body is the raw PNG bytes.
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    temp_file.write(response.content)
    temp_file.close()
    return temp_file.name

# ==============================
# GROQ CHATBOT
# ==============================
def ask_llm(question):
    response = groq_client.chat.completions.create(
        model="llama-3.1-8b-instant",
        max_tokens=200,
        messages=[{"role": "user", "content": question}],
    )
    return response.choices[0].message.content

# ==============================
# MAIN ASSISTANT FUNCTION
# ==============================
def ai_assistant(user_input):
    reply = ask_llm(user_input)

    # Only hit the image endpoint when the request looks like an image prompt.
    image = None
    if "image" in user_input.lower() or "generate" in user_input.lower():
        image = generate_image(user_input)

    audio = text_to_speech(reply)
    return reply, audio, image

# ==============================
# GRADIO UI
# ==============================
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 AI Assistant (Chat + Voice + Image)")

    user_input = gr.Textbox(label="Ask something or request an image")
    text_output = gr.Textbox(label="Assistant Response")
    audio_output = gr.Audio(label="Voice Response")
    image_output = gr.Image(label="Generated Image")

    submit_btn = gr.Button("Submit")
    submit_btn.click(
        fn=ai_assistant,
        inputs=user_input,
        outputs=[text_output, audio_output, image_output],
    )

demo.launch()
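
# ==============================
# OPTIONAL: RETRYING IMAGE CALLS
# ==============================
# A minimal sketch, not wired into the app above: serverless HF inference
# endpoints can return 503 while a model cold-starts, so a caller may want
# a few retries around generate_image(). The helper name, retry count, and
# delay below are illustrative assumptions, not part of the original app;
# it is left commented out so the script's behavior is unchanged.
#
# import time
#
# def generate_image_with_retries(prompt, retries=3, delay=5):
#     for _ in range(retries):
#         path = generate_image(prompt)
#         if path is not None:
#             return path
#         time.sleep(delay)  # back off before the next attempt
#     return None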