Update app.py
app.py CHANGED
@@ -5,177 +5,160 @@ from faster_whisper import WhisperModel
 from gtts import gTTS
 import os
 import subprocess
-from PIL import Image, ImageDraw, ImageFont
+from PIL import Image, ImageDraw, ImageFont, ImageSequence
 import random
 import textwrap
 import pkg_resources
 import sys
+import io
+import base64
 
 # === Config ===
 GROQ_API_KEY = "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"
 GROQ_MODEL = "llama3-70b-8192"
 GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
+IMAGE_GENERATION_API_URL = "https://api.openai.com/v1/images/generate" # Corrected URL
+OPENAI_API_KEY = "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" # Replace with your OpenAI key
 
 # === Init Whisper ===
 whisper = WhisperModel("base", device="cpu", compute_type="int8")
 
 # === Animation Functions ===
-def …
-    """
-    Generates …
-
-    Args:
-        …
-
-    Returns:
-        …
-    """
-    …
-        "-t", str(audio_duration),  # Duration of the video
-        "-pix_fmt", "yuv420p",
-        video_file
-    ]
-    subprocess.run(command, check=True, capture_output=True)
-
-    # 4. Add the audio to the video
-    output_video = "output_video.mp4"
-    command = [
-        "ffmpeg",
-        "-i", video_file,
-        "-i", audio_file,
-        "-c:v", "copy",
-        "-c:a", "aac",
-        "-strict", "experimental",
-        output_video
-    ]
-    subprocess.run(command, check=True, capture_output=True)
-    os.remove(image_file)  # remove the image and video
-    os.remove(video_file)
-    return output_video
-
-def get_audio_duration(audio_file):
-    """Gets the duration of the audio using ffprobe."""
-    command = [
-        "ffprobe",
-        "-v", "error",
-        "-show_entries", "format=duration",
-        "-of", "default=noprint_wrappers=1:nokey=1",
-        audio_file
-    ]
-    result = subprocess.run(command, capture_output=True, text=True)
-    return float(result.stdout)
-
-def create_basic_animation(text, audio_file):
-    """
-    Generates a …
-
-    Args:
-        text (str): The text to display.
-        audio_file (str): The path to the audio file.
-
-    Returns:
-        str: The path to the generated video file (.mp4).
-    """
-    # 1. Parameters for the video
-    width, height = 640, 480
-    frame_rate = 10
-    duration = get_audio_duration(audio_file)
-    num_frames = int(duration * frame_rate)
-    colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (0, 255, 255), (255, 0, 255)]  # basic colors
-    font_size = 24
-    font = ImageFont.truetype("DejaVuSans.ttf", font_size)  # Use a default font
-
-    # 2. Check Pillow version
-    try:
-        …
-    except Exception as e:
-        print(f"Error …
-        …
-        # Display the text, centered and broken into lines
-        lines = textwrap.wrap(text, width=30)  # Adjust max line length as needed
-        y_start = (height - len(lines) * font_size) // 2
-        for j, line in enumerate(lines):
-            try:
-                bbox = d.textbbox((0, 0), line, font=font)
-                text_width = bbox[2] - bbox[0]
-                text_x = (width - text_width) // 2
-
-            except AttributeError as e:
-                print(f"AttributeError: {e}")
-                print(f"Pillow version: {pillow_version}")
-                raise  # Raise the error
-            d.text((text_x, y_start + j * font_size), line, fill=(0, 0, 0), font=font)  # Black text
-        frames.append(img)
-
-    # 3. Save frames as images
-    image_files = []
-    for i, frame in enumerate(frames):
-        image_file = f"frame_{i:04d}.png"
-        frame.save(image_file)
-        image_files.append(image_file)
-
-    # 4. Create video from images and add audio
-    video_file = "basic_animation.mp4"
-    command = [
-        "ffmpeg",
-        "-framerate", str(frame_rate),
-        "-i", "frame_%04d.png",  # Input image sequence
-        "-i", audio_file,
-        "-c:v", "libx264",
-        "-pix_fmt", "yuv420p",
-        "-y",  # Overwrite if exists
-        video_file
-    ]
-    subprocess.run(command, check=True, capture_output=True)
-
-    # 5. Clean up image files
-    for image_file in image_files:
-        os.remove(image_file)
-    return video_file
-
-def create_animation(text, audio_file):
-    """
-    Selects and runs an animation function.
-    """
-
-    return create_basic_animation(text, audio_file)
+def generate_images_with_openai(prompt, num_images=1):
+    """
+    Generates images using OpenAI's API.
+
+    Args:
+        prompt (str): The prompt to use for image generation.
+        num_images (int, optional): The number of images to generate. Defaults to 1.
+
+    Returns:
+        list: A list of image URLs, or None on error.
+    """
+    headers = {
+        "Authorization": f"Bearer {OPENAI_API_KEY}",
+        "Content-Type": "application/json",
+    }
+    payload = {
+        "model": "dall-e-3",  # Use the DALL-E 3 model
+        "prompt": prompt,
+        "n": num_images,
+        "size": "1024x1024",  # You can adjust the size as needed
+    }
+
+    try:
+        response = requests.post(IMAGE_GENERATION_API_URL, headers=headers, json=payload)
+        response.raise_for_status()  # Raise an exception for bad status codes
+        data = response.json()
+        image_urls = [item["url"] for item in data["data"]]
+        return image_urls
+    except requests.exceptions.RequestException as e:
+        print(f"Error generating images with OpenAI: {e}")
+        return None
+    except KeyError:
+        print(f"Error: Unexpected response format from OpenAI: {data}")
+        return None
+
+def create_animated_explanation(text, audio_file):
+    """
+    Generates a more professional animation with images and synchronized text.
+
+    Args:
+        text (str): The text to display and explain.
+        audio_file (str): The path to the audio file.
+
+    Returns:
+        str: The path to the generated video file (.mp4), or None on error.
+    """
+    try:
+        # 1. Split text into meaningful chunks (sentences or phrases)
+        sentences = split_text_into_chunks(text)
+        audio_duration = get_audio_duration(audio_file)
+        total_frames = 100  # Example number of frames
+        fps = 10
+        frame_duration = 1 / fps
+        image_urls = []
+
+        # 2. Generate images for key sentences
+        for sentence in sentences:
+            image_prompt = f"Illustrate the concept: {sentence}"
+            urls = generate_images_with_openai(image_prompt)  # Generate 1 image per sentence
+            if urls:
+                image_urls.append(urls[0])  # Use the first URL
+            else:
+                image_urls.append(None)  # Append None if image generation fails
+
+        # 3. Create frames for the animation
+        frames = []
+        for i in range(total_frames):
+            frame_progress = i / total_frames
+            sentence_index = int(frame_progress * len(sentences))
+            sentence_index = min(sentence_index, len(sentences) - 1)  # clamp
+
+            color = (220, 220, 220)  # Light gray
+            img = Image.new("RGB", (640, 480), color=color)
+            d = ImageDraw.Draw(img)
+            font = ImageFont.truetype("DejaVuSans.ttf", 20)
+            current_sentence = sentences[sentence_index]
+            lines = textwrap.wrap(current_sentence, width=40)
+            y_start = (480 - len(lines) * 24) // 2
+
+            # Display sentence
+            for j, line in enumerate(lines):
+                text_x = (640 - d.textsize(line, font=font)[0]) // 2
+                d.text((text_x, y_start + j * 24), line, fill=(0, 0, 0), font=font)
+
+            # Add image if available
+            if image_urls[sentence_index]:
+                try:
+                    image_data = requests.get(image_urls[sentence_index], stream=True).raw
+                    img_to_paste = Image.open(image_data).resize((200, 200))  # Resize as needed
+                    img.paste(img_to_paste, (440, 280))  # Position the image
+                except Exception as e:
+                    print(f"Error loading or pasting image: {e}")
+
+            frames.append(img)
+
+        # 4. Save frames and create video
+        image_files = []
+        for i, frame in enumerate(frames):
+            image_file = f"frame_{i:04d}.png"
+            frame.save(image_file)
+            image_files.append(image_file)
+
+        video_file = "animated_explanation.mp4"
+        command = [
+            "ffmpeg",
+            "-framerate", str(fps),
+            "-i", "frame_%04d.png",
+            "-i", audio_file,
+            "-c:v", "libx264",
+            "-pix_fmt", "yuv420p",
+            "-y",
+            video_file,
+        ]
+        subprocess.run(command, check=True, capture_output=True)
+
+        for image_file in image_files:
+            os.remove(image_file)
+        return video_file
+
+    except Exception as e:
+        print(f"Error creating animated explanation: {e}")
+        return None  # Return None on error
+
+def split_text_into_chunks(text):
+    """Splits text into sentences or phrases, handling punctuation."""
+    import re
+    # Split by common sentence-ending punctuation, but handle abbreviations
+    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s+', text)
+    return sentences
+
+def create_animation(text, audio_file):
+    """
+    Selects and runs an animation function.
+    """
+    return create_animated_explanation(text, audio_file)
 
 def process_audio(audio_file):
     # 1. Speech to Text
@@ -185,21 +168,21 @@ def process_audio(audio_file)
     # 2. Groq API Call
     headers = {
         "Authorization": f"Bearer {GROQ_API_KEY}",
-        "Content-Type": "application/json"
+        "Content-Type": "application/json",
     }
     payload = {
         "model": GROQ_MODEL,
         "messages": [{"role": "user", "content": user_text}],
-        "temperature": 0.5
+        "temperature": 0.5,
    }
 
     response = requests.post(GROQ_API_URL, headers=headers, json=payload)
     if response.status_code != 200:
-        return f"Groq API Error: {response.text}", None, None
+        return f"Groq API Error: {response.text}", None, None
 
     reply = response.json()["choices"][0]["message"]["content"]
 
-    # 3. TTS using gTTS
+    # 3. TTS using gTTS
     tts = gTTS(reply)
     audio_output = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
     tts.save(audio_output.name)
@@ -207,7 +190,7 @@ def process_audio(audio_file)
     # 4. Create animation
     video_file = create_animation(reply, audio_output.name)
 
-    return reply, audio_output.name, video_file
+    return reply, audio_output.name, video_file
 
 iface = gr.Interface(
     fn=process_audio,
@@ -215,11 +198,11 @@ iface = gr.Interface(
     outputs=[
         gr.Textbox(label="🧠 Groq Response"),
         gr.Audio(label="🔊 AI Voice Reply"),
-        gr.Video(label="🎬 Animation")
+        gr.Video(label="🎬 Animation"),
     ],
-    title="🗣️ Voice AI Assistant with Animation (Groq + Whisper + gTTS)",
-    description="🎙️ Whisper for STT, Groq for response, gTTS for voice output, and …
-    live=True
+    title="🗣️ Voice AI Assistant with Professional Animation (Groq + Whisper + gTTS)",
+    description="🎙️ Whisper for STT, Groq for response, gTTS for voice output, and enhanced animation.",
+    live=True,
 )
 
 iface.launch()
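A few review notes on the new code. First, the config: despite the "Corrected URL" comment, OpenAI's documented image endpoint is `/v1/images/generations`, not `/v1/images/generate`, so every call from `generate_images_with_openai` should fail before it returns a URL. The `dall-e-3` model also accepts only `n=1` per request, so `num_images > 1` would need a loop. Separately, `io`, `base64`, and `ImageSequence` are imported but never used, and both API keys are committed in plain text. A minimal config sketch, keeping the commit's constant names (reading the keys from the environment is a suggestion, not part of the commit):

```python
import os

# Documented image-generation endpoint: the path ends in "generations"
IMAGE_GENERATION_API_URL = "https://api.openai.com/v1/images/generations"

# Keep secrets out of the repo; set these in the Space's settings instead
GROQ_API_KEY = os.environ["GROQ_API_KEY"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
```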
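Second: this commit deletes `get_audio_duration` (the ffprobe helper) together with the old animation code, yet `create_animated_explanation` still calls it. Unless the helper is defined elsewhere in the file, the first run raises `NameError`, which the broad `except Exception` swallows before returning `None`. A sketch restoring it from the deleted version, with `check=True` and a `strip()` added as small hardening:

```python
import subprocess

def get_audio_duration(audio_file):
    """Returns the duration of an audio file in seconds, using ffprobe."""
    command = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        audio_file,
    ]
    # check=True surfaces ffprobe failures instead of parsing empty output
    result = subprocess.run(command, capture_output=True, text=True, check=True)
    return float(result.stdout.strip())
```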
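Third, the text-centering line is a regression: `ImageDraw.textsize()` was deprecated in Pillow 9.2 and removed in Pillow 10, which is exactly why the deleted `create_basic_animation` measured text with `textbbox` and caught `AttributeError`. On a current Pillow this line raises inside the broad `try`, so the whole function silently returns `None`. A sketch of the measurement done with `textbbox`; the `centered_x` helper is my own name, not from the commit:

```python
from PIL import Image, ImageDraw, ImageFont

def centered_x(draw, line, font, canvas_width=640):
    """Horizontal offset that centers `line`, measured with textbbox (Pillow >= 8.0)."""
    bbox = draw.textbbox((0, 0), line, font=font)
    return (canvas_width - (bbox[2] - bbox[0])) // 2

# Same drawing setup as in the frame loop
img = Image.new("RGB", (640, 480), (220, 220, 220))
d = ImageDraw.Draw(img)
font = ImageFont.truetype("DejaVuSans.ttf", 20)
d.text((centered_x(d, "Hello, world", font), 228), "Hello, world", fill=(0, 0, 0), font=font)
```

Inside the loop, `text_x = centered_x(d, line, font)` replaces the `d.textsize(...)` expression.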
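Fourth, timing: `audio_duration` and `frame_duration` are computed but never used. The loop always renders `total_frames = 100` frames at `fps = 10`, so the video track is 10 seconds long no matter how long the gTTS narration runs, and each sentence gets an equal slice of frames rather than tracking the audio. A drop-in sketch for the function body, using the commit's own variable names and assuming `get_audio_duration` is restored as above:

```python
# Derive the frame count from the narration instead of a fixed 100 frames
fps = 10
audio_duration = get_audio_duration(audio_file)   # ffprobe helper, restored above
total_frames = max(1, int(audio_duration * fps))  # one frame per 1/fps of audio

# ...render total_frames PNGs as before, then mux with -shortest so the
# output stops with whichever stream ends first:
command = [
    "ffmpeg",
    "-framerate", str(fps),
    "-i", "frame_%04d.png",
    "-i", audio_file,
    "-c:v", "libx264",
    "-pix_fmt", "yuv420p",
    "-shortest",
    "-y",
    "animated_explanation.mp4",
]
```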
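The sentence splitter itself is a common lookbehind recipe: it splits on whitespace that follows `.`, `?`, or `!`, while the two negative lookbehinds suppress splits after dotted abbreviations like `U.S.` and titles like `Mr.`. A quick check of its behavior (the sample string is mine, not from the commit):

```python
import re

def split_text_into_chunks(text):
    """Splits text into sentences, avoiding breaks after abbreviations."""
    return re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s+', text)

print(split_text_into_chunks("Dr. Smith works in the U.S. office. He likes it! Does he?"))
# -> ['Dr. Smith works in the U.S. office.', 'He likes it!', 'Does he?']
```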
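Finally, a consequence of rewiring `create_animation`: the deleted `create_basic_animation` was a dependable text-only path, while `create_animated_explanation` returns `None` whenever image generation or frame rendering fails, leaving `gr.Video` empty. If the old function is kept in the module, the dispatcher could try the rich path first and fall back; a sketch, assuming both functions are defined:

```python
def create_animation(text, audio_file):
    """
    Prefers the image-backed animation, falling back to plain text frames.
    """
    video = create_animated_explanation(text, audio_file)
    if video is not None:
        return video
    # create_basic_animation was removed in this commit; restoring it gives
    # a fallback that does not depend on the OpenAI image API
    return create_basic_animation(text, audio_file)
```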