import gradio as gr
import requests
import tempfile
import os
import io
import re
import subprocess
import textwrap
from faster_whisper import WhisperModel
from gtts import gTTS
from PIL import Image, ImageDraw, ImageFont
# === Config ===
# API keys are read from the environment; never hard-code secrets in source.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
GROQ_MODEL = "llama3-70b-8192"
GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
IMAGE_GENERATION_API_URL = "https://api.openai.com/v1/images/generations"  # DALL-E 3 endpoint
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
# === Init Whisper ===
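# "base" with int8 quantization keeps CPU inference reasonably fast at some
# cost in accuracy; larger checkpoints such as "small" or "medium" are
# drop-in replacements when transcription quality matters more.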
whisper = WhisperModel("base", device="cpu", compute_type="int8")
# === Animation Functions ===
def get_audio_duration(audio_file):
"""Gets the duration of the audio using ffprobe."""
try:
command = [
"ffprobe",
"-v",
"error",
"-show_entries",
"format=duration",
"-of",
"default=noprint_wrappers=1:nokey=1",
audio_file,
]
        result = subprocess.run(command, capture_output=True, text=True, check=True)
        return float(result.stdout.strip())
    except Exception as e:
        print(f"Error getting audio duration: {e}")
        return 5.0  # Fall back to a default duration
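# Note: this helper assumes the ffprobe binary (shipped with FFmpeg) is on
# PATH; on a Debian-based image, `apt-get install ffmpeg` provides both
# ffprobe and the ffmpeg encoder used further below.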
def generate_images_with_openai(prompt, num_images=1):
"""
Generates images using OpenAI's API.
Args:
prompt (str): The prompt to use for image generation.
num_images (int, optional): The number of images to generate. Defaults to 1.
Returns:
list: A list of image URLs, or None on error.
"""
headers = {
"Authorization": f"Bearer {OPENAI_API_KEY}",
"Content-Type": "application/json",
}
    payload = {
        "model": "dall-e-3",  # DALL-E 3 accepts only n=1 per request
        "prompt": prompt,
        "n": num_images,
        "size": "1024x1024",  # 1792x1024 and 1024x1792 are also supported
        "response_format": "url",  # Return hosted URLs rather than base64 data
    }
try:
response = requests.post(IMAGE_GENERATION_API_URL, headers=headers, json=payload)
response.raise_for_status() # Raise an exception for bad status codes
data = response.json()
image_urls = [item["url"] for item in data["data"]]
return image_urls
except requests.exceptions.RequestException as e:
print(f"Error generating images with OpenAI: {e}")
return None
except KeyError:
print(f"Error: Unexpected response format from OpenAI: {data}")
return None
except Exception as e:
print(f"Error during image generation: {e}")
return None
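# Example usage (hypothetical prompt; requires a valid OPENAI_API_KEY):
#   urls = generate_images_with_openai("A watercolor fox reading a book")
#   if urls:
#       print(urls[0])  # Hosted image URL, valid only for a limited time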
def create_animated_explanation(text, audio_file):
"""
Generates a more professional animation with images and synchronized text.
Args:
text (str): The text to display and explain.
audio_file (str): The path to the audio file.
Returns:
str: The path to the generated video file (.mp4), or None on error.
"""
try:
# 1. Split text into meaningful chunks (sentences or phrases)
sentences = split_text_into_chunks(text)
        audio_duration = get_audio_duration(audio_file)
        fps = 10
        # Match the frame count to the audio length so video and narration stay in sync
        total_frames = max(1, int(audio_duration * fps))
        # 2. Generate one illustrative image per sentence and download each
        # once up front, so the frame loop below does not re-fetch per frame
        sentence_images = []
        for sentence in sentences:
            image_prompt = f"Illustrate the concept: {sentence}"
            urls = generate_images_with_openai(image_prompt)  # One image per sentence
            pasted = None
            if urls:
                try:
                    image_bytes = requests.get(urls[0], timeout=30).content
                    pasted = Image.open(io.BytesIO(image_bytes)).resize((200, 200))
                except Exception as e:
                    print(f"Error downloading generated image: {e}")
            sentence_images.append(pasted)  # None if generation or download failed
# 3. Create frames for the animation
frames = []
for i in range(total_frames):
frame_progress = i / total_frames
sentence_index = int(frame_progress * len(sentences))
sentence_index = min(sentence_index, len(sentences) - 1) #clamp
            color = (220, 220, 220)  # Light gray background
            img = Image.new("RGB", (640, 480), color=color)
            d = ImageDraw.Draw(img)
            try:
                font = ImageFont.truetype("DejaVuSans.ttf", 20)
            except OSError:
                font = ImageFont.load_default()  # Fall back if the font file is missing
current_sentence = sentences[sentence_index]
lines = textwrap.wrap(current_sentence, width=40)
y_start = (480 - len(lines) * 24) // 2
# Display sentence
for j, line in enumerate(lines):
try:
bbox = d.textbbox((0,0), line, font=font)
text_width = bbox[2] - bbox[0]
text_x = (640 - text_width) // 2
except AttributeError as e:
print(f"Pillow version error: {e}")
text_x = 10
d.text((text_x, y_start + j * 24), line, fill=(0, 0, 0), font=font)
            # Paste the pre-downloaded image if one is available
            if sentence_images[sentence_index] is not None:
                img.paste(sentence_images[sentence_index], (440, 280))
frames.append(img)
# 4. Save frames and create video
image_files = []
for i, frame in enumerate(frames):
image_file = f"frame_{i:04d}.png"
frame.save(image_file)
image_files.append(image_file)
video_file = "animated_explanation.mp4"
command = [
"ffmpeg",
"-framerate", str(fps),
"-i", "frame_%04d.png",
"-i", audio_file,
"-c:v", "libx264",
"-pix_fmt", "yuv420p",
"-y",
video_file,
]
subprocess.run(command, check=True, capture_output=True)
for image_file in image_files:
os.remove(image_file)
return video_file
except Exception as e:
print(f"Error creating animated explanation: {e}")
return None # Return None on error
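# Note: frames are written to the working directory under fixed names
# (frame_0000.png, ...), so concurrent requests would overwrite each other;
# a per-request tempfile.mkdtemp() directory would avoid the collision.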
def split_text_into_chunks(text):
    """Splits text into sentences or phrases, handling punctuation."""
    # Split on sentence-ending punctuation while avoiding breaks after
    # abbreviations (e.g. "e.g.") or initials (e.g. "J. Smith")
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s+', text)
    return sentences
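# Example:
#   split_text_into_chunks("Plants make sugar. They need light!")
#   -> ['Plants make sugar.', 'They need light!']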
def create_animation(text, audio_file):
"""
Selects and runs an animation function.
"""
return create_animated_explanation(text, audio_file)
def process_audio(audio_file):
    """Full pipeline: transcribe speech, ask Groq, speak the reply, animate it."""
    # 1. Speech to Text
    segments, _ = whisper.transcribe(audio_file)
    user_text = " ".join(segment.text for segment in segments)
# 2. Groq API Call
headers = {
"Authorization": f"Bearer {GROQ_API_KEY}",
"Content-Type": "application/json",
}
payload = {
"model": GROQ_MODEL,
"messages": [{"role": "user", "content": user_text}],
"temperature": 0.5,
}
response = requests.post(GROQ_API_URL, headers=headers, json=payload)
if response.status_code != 200:
return f"Groq API Error: {response.text}", None, None
reply = response.json()["choices"][0]["message"]["content"]
# 3. TTS using gTTS
tts = gTTS(reply)
    audio_output = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    audio_output.close()  # Release the handle so gTTS can write to the path
    tts.save(audio_output.name)
# 4. Create animation
video_file = create_animation(reply, audio_output.name)
return reply, audio_output.name, video_file
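# Example (hypothetical file; any audio format Whisper can decode works):
#   reply, audio_path, video_path = process_audio("question.wav")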
iface = gr.Interface(
fn=process_audio,
inputs=gr.Audio(type="filepath", label="🎤 Speak your question"),
outputs=[
gr.Textbox(label="🧠 Groq Response"),
gr.Audio(label="🔊 AI Voice Reply"),
gr.Video(label="🎬 Animation"),
],
title="🗣️ Voice AI Assistant with Professional Animation (Groq + Whisper + gTTS)",
description="🎙️ Whisper for STT, Groq for response, gTTS for voice output, and enhanced animation.",
live=True,
)
iface.launch()
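# Pass share=True to launch() for a temporary public URL when running outside
# Hugging Face Spaces.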