Spaces:
Running
Running
File size: 8,443 Bytes
86d1c54 0085ebf fdb8b10 37e26bf fdb8b10 8054f81 e79bd55 18d6507 86d1c54 37e26bf 5f5d8ad 86d1c54 fdb8b10 0f49314 18d6507 fdb8b10 18d6507 fdb8b10 18d6507 fdb8b10 18d6507 fdb8b10 18d6507 37e26bf 18d6507 37e26bf 18d6507 fdb8b10 18d6507 fdb8b10 18d6507 fdb8b10 18d6507 fdb8b10 8054f81 18d6507 0f49314 18d6507 37e26bf 18d6507 e79bd55 18d6507 fdb8b10 18d6507 fdb8b10 18d6507 fdb8b10 86d1c54 18d6507 86d1c54 18d6507 86d1c54 18d6507 86d1c54 18d6507 0085ebf 86d1c54 fdb8b10 18d6507 86d1c54 88b8709 86d1c54 fdb8b10 18d6507 86d1c54 18d6507 86d1c54 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 |
import gradio as gr
import requests
import tempfile
from faster_whisper import WhisperModel
from gtts import gTTS
import os
import subprocess
from PIL import Image, ImageDraw, ImageFont
import random
import textwrap
import pkg_resources
import sys
import io
import base64
# === Config ===
# SECURITY: API keys must never be hard-coded in source (the previous
# revision embedded live secrets; those keys must be rotated). Keys are
# read from the environment: export GROQ_API_KEY / OPENAI_API_KEY first.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
GROQ_MODEL = "llama3-70b-8192"
GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
IMAGE_GENERATION_API_URL = "https://api.openai.com/v1/images/generations"  # DALL-E 3 endpoint
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
# === Init Whisper ===
# CPU + int8 keeps the "base" model light enough for modest Spaces hardware.
whisper = WhisperModel("base", device="cpu", compute_type="int8")
# === Animation Functions ===
def get_audio_duration(audio_file):
    """Return the duration of *audio_file* in seconds using ffprobe.

    Args:
        audio_file (str): Path to the audio file to probe.

    Returns:
        float: Duration in seconds, or 5.0 as a safe default when ffprobe
        is missing, exits non-zero, or its output cannot be parsed.
    """
    try:
        command = [
            "ffprobe",
            "-v", "error",
            "-show_entries", "format=duration",
            "-of", "default=noprint_wrappers=1:nokey=1",
            audio_file,
        ]
        # check=True turns a non-zero ffprobe exit into an exception here,
        # instead of letting an empty stdout fail obscurely in float().
        result = subprocess.run(command, capture_output=True, text=True, check=True)
        return float(result.stdout)
    except Exception as e:
        print(f"Error getting audio duration: {e}")
        return 5.0  # Default duration so callers can keep going
def generate_images_with_openai(prompt, num_images=1):
    """Generate images via OpenAI's DALL-E 3 image API.

    Args:
        prompt (str): The prompt to use for image generation.
        num_images (int, optional): The number of images to generate. Defaults to 1.

    Returns:
        list: A list of image URLs, or None on error.
    """
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": "dall-e-3",
        "prompt": prompt,
        "n": num_images,
        "size": "1024x1024",  # Adjustable if other resolutions are needed
        "response_format": "url",  # Ask for URLs rather than base64 payloads
    }
    data = None  # Pre-bind so the KeyError handler can safely reference it
    try:
        # A timeout prevents the whole app from hanging on a stalled request.
        response = requests.post(
            IMAGE_GENERATION_API_URL, headers=headers, json=payload, timeout=60
        )
        response.raise_for_status()  # Raise an exception for bad status codes
        data = response.json()
        image_urls = [item["url"] for item in data["data"]]
        return image_urls
    except requests.exceptions.RequestException as e:
        print(f"Error generating images with OpenAI: {e}")
        return None
    except KeyError:
        print(f"Error: Unexpected response format from OpenAI: {data}")
        return None
    except Exception as e:
        print(f"Error during image generation: {e}")
        return None
def create_animated_explanation(text, audio_file):
    """
    Render text (plus AI-generated illustrations) into an MP4 synced to audio.

    Args:
        text (str): The text to display and explain.
        audio_file (str): The path to the audio file.

    Returns:
        str: The path to the generated video file (.mp4), or None on error.
    """
    try:
        # 1. Split text into meaningful chunks (sentences or phrases)
        sentences = split_text_into_chunks(text)
        # NOTE(review): audio_duration is computed but not yet used to size
        # the animation — total_frames is fixed. TODO: derive frames from it.
        audio_duration = get_audio_duration(audio_file)
        if audio_duration is None:
            audio_duration = 5  # set default
        total_frames = 100
        fps = 10

        # 2. Generate and download one illustration per sentence, up front.
        #    Downloading here ONCE per sentence (instead of once per frame,
        #    as before) avoids up to `total_frames` redundant HTTP fetches.
        sentence_images = []
        for sentence in sentences:
            image_prompt = f"Illustrate the concept: {sentence}"
            urls = generate_images_with_openai(image_prompt)  # 1 image per sentence
            illustration = None
            if urls:
                try:
                    image_data = requests.get(urls[0], stream=True, timeout=30).raw
                    illustration = Image.open(image_data).resize((200, 200))
                except Exception as e:
                    print(f"Error loading or pasting image: {e}")
            sentence_images.append(illustration)  # None when generation/download fails

        # Font loading is loop-invariant; fall back to Pillow's built-in
        # bitmap font so a missing DejaVuSans.ttf no longer aborts the render.
        try:
            font = ImageFont.truetype("DejaVuSans.ttf", 20)
        except OSError:
            font = ImageFont.load_default()

        # 3. Create frames for the animation
        frames = []
        for i in range(total_frames):
            frame_progress = i / total_frames
            # Map frame position onto a sentence index, clamped to range.
            sentence_index = min(int(frame_progress * len(sentences)), len(sentences) - 1)
            img = Image.new("RGB", (640, 480), color=(220, 220, 220))  # Light gray
            d = ImageDraw.Draw(img)
            current_sentence = sentences[sentence_index]
            lines = textwrap.wrap(current_sentence, width=40)
            y_start = (480 - len(lines) * 24) // 2  # Vertically center the text block
            for j, line in enumerate(lines):
                try:
                    bbox = d.textbbox((0, 0), line, font=font)
                    text_x = (640 - (bbox[2] - bbox[0])) // 2  # Center horizontally
                except AttributeError as e:
                    # textbbox requires Pillow >= 8.0; left-align as a fallback.
                    print(f"Pillow version error: {e}")
                    text_x = 10
                d.text((text_x, y_start + j * 24), line, fill=(0, 0, 0), font=font)
            # Paste the pre-downloaded illustration, if one exists.
            if sentence_images[sentence_index] is not None:
                img.paste(sentence_images[sentence_index], (440, 280))
            frames.append(img)

        # 4. Save frames and mux them with the audio track via ffmpeg
        image_files = []
        for i, frame in enumerate(frames):
            image_file = f"frame_{i:04d}.png"
            frame.save(image_file)
            image_files.append(image_file)
        video_file = "animated_explanation.mp4"
        command = [
            "ffmpeg",
            "-framerate", str(fps),
            "-i", "frame_%04d.png",
            "-i", audio_file,
            "-c:v", "libx264",
            "-pix_fmt", "yuv420p",
            "-y",
            video_file,
        ]
        subprocess.run(command, check=True, capture_output=True)
        for image_file in image_files:
            os.remove(image_file)  # Clean up intermediate frames
        return video_file
    except Exception as e:
        print(f"Error creating animated explanation: {e}")
        return None  # Return None on error
def split_text_into_chunks(text):
    """Break *text* into sentence-like chunks at terminal punctuation.

    Splits on whitespace that follows '.', '?' or '!', with lookbehinds
    that avoid splitting after common abbreviation shapes (e.g. "e.g." or
    "Mr."). Returns a list with at least one element (the whole input when
    no sentence boundary is found).
    """
    import re
    boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s+'
    return re.split(boundary, text)
def create_animation(text, audio_file):
    """Select an animation backend and run it.

    Currently only one backend exists (the illustrated explanation),
    so the dispatch is trivial.
    """
    animation_backend = create_animated_explanation
    return animation_backend(text, audio_file)
def process_audio(audio_file):
    """Full pipeline: transcribe speech, query Groq, synthesize voice, animate.

    Args:
        audio_file (str): Path to the recorded question (from Gradio).

    Returns:
        tuple: (reply_text, reply_audio_path, video_path); on a Groq API
        error the first element is an error message and the rest are None.
    """
    # 1. Speech to Text
    segments, _ = whisper.transcribe(audio_file)
    user_text = " ".join(segment.text for segment in segments)
    # 2. Groq API Call
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": GROQ_MODEL,
        "messages": [{"role": "user", "content": user_text}],
        "temperature": 0.5,
    }
    # Timeout so a stalled API call cannot hang the Gradio worker forever.
    response = requests.post(GROQ_API_URL, headers=headers, json=payload, timeout=60)
    if response.status_code != 200:
        return f"Groq API Error: {response.text}", None, None
    reply = response.json()["choices"][0]["message"]["content"]
    # 3. TTS using gTTS
    tts = gTTS(reply)
    audio_output = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    audio_output.close()  # Release the handle; gTTS writes by path (fd leak fix)
    tts.save(audio_output.name)
    # 4. Create animation
    video_file = create_animation(reply, audio_output.name)
    return reply, audio_output.name, video_file
# Wire the pipeline into a Gradio UI: one microphone input, three outputs
# (text reply, synthesized voice, generated animation).
voice_input = gr.Audio(type="filepath", label="🎤 Speak your question")
result_components = [
    gr.Textbox(label="🧠 Groq Response"),
    gr.Audio(label="🔊 AI Voice Reply"),
    gr.Video(label="🎬 Animation"),
]
iface = gr.Interface(
    fn=process_audio,
    inputs=voice_input,
    outputs=result_components,
    title="🗣️ Voice AI Assistant with Professional Animation (Groq + Whisper + gTTS)",
    description="🎙️ Whisper for STT, Groq for response, gTTS for voice output, and enhanced animation.",
    live=True,
)
iface.launch()
|