# app.py — AI Assistant demo (chat + voice + image) for the "test5" Space
# Uploaded by Kashif12334; last commit: "Update app.py" (deca404, verified)
import gradio as gr
import requests
import os
import tempfile
from groq import Groq
import torch
import soundfile as sf
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
# ==============================
# API KEYS
# ==============================
# Both secrets are read from the environment (HF Space secrets).
# NOTE(review): no validation here — if either is unset, failures surface
# later (Groq client errors / HF 401 responses), not at startup.
HF_TOKEN = os.getenv("HF_TOKEN")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
groq_client = Groq(api_key=GROQ_API_KEY)
# ==============================
# LOAD TTS MODELS
# ==============================
# SpeechT5 text-to-speech pipeline: processor (tokenizer), acoustic model,
# and HiFi-GAN vocoder. Downloaded/cached at import time, so the first
# startup is slow.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
tts_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
# CMU ARCTIC x-vector speaker embeddings; index 7306 selects one fixed voice
# (a commonly used female US-English speaker in SpeechT5 examples — TODO confirm).
embeddings_dataset = load_dataset(
"Matthijs/cmu-arctic-xvectors",
split="validation"
)
speaker_embeddings = torch.tensor(
embeddings_dataset[7306]["xvector"]
).unsqueeze(0)
# ==============================
# TEXT β†’ SPEECH
# ==============================
def text_to_speech(text):
    """Synthesize *text* into a 16 kHz mono WAV file and return its path.

    Args:
        text: The text to speak. Tokenized input is truncated to 500 tokens.

    Returns:
        Path to a temporary ``.wav`` file (caller/Gradio is responsible for it;
        ``delete=False`` keeps the file alive after the handle is closed).
    """
    inputs = processor(
        text=text,
        return_tensors="pt",
        truncation=True,
        max_length=500
    )
    # Inference only — disable autograd bookkeeping to save memory/time.
    with torch.no_grad():
        speech = tts_model.generate_speech(
            inputs["input_ids"],
            speaker_embeddings,
            vocoder=vocoder
        )
    temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    # Close the handle before soundfile reopens the path: the original leaked
    # the descriptor and writing to an open NamedTemporaryFile fails on Windows.
    temp_audio.close()
    # SpeechT5's vocoder output is 16 kHz waveform samples.
    sf.write(temp_audio.name, speech.numpy(), samplerate=16000)
    return temp_audio.name
# ==============================
# IMAGE GENERATION
# ==============================
def generate_image(prompt):
    """Generate an image from *prompt* via the HF Inference router (SDXL).

    Args:
        prompt: Text prompt forwarded as the model's ``inputs`` payload.

    Returns:
        Path to a temporary ``.png`` file on success, or ``None`` on any
        failure (network error, non-200 status, or a non-image response body).
    """
    API_URL = "https://router.huggingface.co/hf-inference/models/stabilityai/stable-diffusion-xl-base-1.0"
    headers = {
        "Authorization": f"Bearer {HF_TOKEN}"
    }
    payload = {
        "inputs": prompt
    }
    try:
        # Image generation is slow but must not hang the UI forever:
        # the original call had no timeout, which blocks indefinitely on a stall.
        response = requests.post(API_URL, headers=headers, json=payload, timeout=120)
    except requests.RequestException as exc:
        print("HF REQUEST FAILED:", exc)
        return None
    print("HF STATUS:", response.status_code)
    if response.status_code != 200:
        print(response.text)
        return None
    # The router can return 200 with a JSON error/status body; writing that
    # to a .png produced a broken "image" in the original. Require image bytes.
    if "image" not in response.headers.get("content-type", ""):
        print(response.text)
        return None
    image_bytes = response.content
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".png")
    temp_file.write(image_bytes)
    temp_file.close()
    return temp_file.name
# ==============================
# GROQ CHATBOT
# ==============================
def ask_llm(question):
    """Send *question* to the Groq chat model and return the reply text."""
    # Single-turn exchange: one user message, reply capped at 200 tokens.
    conversation = [
        {"role": "user", "content": question}
    ]
    completion = groq_client.chat.completions.create(
        model="llama-3.1-8b-instant",
        max_tokens=200,
        messages=conversation
    )
    first_choice = completion.choices[0]
    return first_choice.message.content
# ==============================
# MAIN ASSISTANT FUNCTION
# ==============================
def ai_assistant(user_input):
    """Run one assistant turn: text reply, spoken reply, optional image.

    Returns a ``(reply, audio_path, image_path_or_None)`` tuple matching the
    Gradio outputs wired to this function.
    """
    reply = ask_llm(user_input)
    # Heuristic: only hit the image endpoint when the request looks like one.
    lowered = user_input.lower()
    wants_image = any(keyword in lowered for keyword in ("image", "generate"))
    image = generate_image(user_input) if wants_image else None
    audio = text_to_speech(reply)
    return reply, audio, image
# ==============================
# GRADIO UI
# ==============================
# Single-page Blocks layout: one text input, three outputs (text/audio/image),
# all driven by ai_assistant on button click.
with gr.Blocks() as demo:
    gr.Markdown("# 🤖 AI Assistant (Chat + Voice + Image)")
    # Free-form prompt; ai_assistant also scans it for "image"/"generate".
    user_input = gr.Textbox(
        label="Ask something or request an image"
    )
    text_output = gr.Textbox(
        label="Assistant Response"
    )
    # Plays the WAV file path returned by text_to_speech.
    audio_output = gr.Audio(
        label="Voice Response"
    )
    # Shows the generated PNG path, or stays empty when ai_assistant returns None.
    image_output = gr.Image(
        label="Generated Image"
    )
    submit_btn = gr.Button("Submit")
    # Output order must match ai_assistant's (reply, audio, image) tuple.
    submit_btn.click(
        fn=ai_assistant,
        inputs=user_input,
        outputs=[text_output, audio_output, image_output]
    )
# Blocking call: starts the web server for the Space.
demo.launch()