import gradio as gr
import requests
import tempfile
from faster_whisper import WhisperModel
from gtts import gTTS
import os
import re
import subprocess
from PIL import Image, ImageDraw, ImageFont
import textwrap
import io
# === Config ===
# API keys are read from the environment rather than hard-coded, so they never
# end up committed in source. Set GROQ_API_KEY and OPENAI_API_KEY in your
# Space secrets or shell before launching.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
GROQ_MODEL = "llama3-70b-8192"
GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
IMAGE_GENERATION_API_URL = "https://api.openai.com/v1/images/generations"  # DALL-E 3 endpoint
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
# === Init Whisper ===
whisper = WhisperModel("base", device="cpu", compute_type="int8")
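# Hedged sanity check (not part of the app flow): assuming a local recording
# "sample.wav" exists, this verifies the model loads and transcribes before
# it is wired into Gradio:
#
#   segments, info = whisper.transcribe("sample.wav")
#   print(info.language, info.language_probability)
#   for segment in segments:
#       print(f"[{segment.start:.2f}s -> {segment.end:.2f}s] {segment.text}")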
# === Animation Functions ===
def get_audio_duration(audio_file):
    """Gets the duration of the audio in seconds using ffprobe."""
    try:
        command = [
            "ffprobe",
            "-v",
            "error",
            "-show_entries",
            "format=duration",
            "-of",
            "default=noprint_wrappers=1:nokey=1",
            audio_file,
        ]
        result = subprocess.run(command, capture_output=True, text=True, check=True)
        return float(result.stdout.strip())
    except Exception as e:
        print(f"Error getting audio duration: {e}")
        return 5  # Fall back to a default duration in seconds
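# Hedged example: for a roughly 7-second MP3, get_audio_duration("reply.mp3")
# returns something close to 7.0; for a missing or unreadable file it logs the
# ffprobe error and falls back to 5 seconds.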
def generate_images_with_openai(prompt, num_images=1):
    """
    Generates images using OpenAI's API.
    Args:
        prompt (str): The prompt to use for image generation.
        num_images (int, optional): The number of images to generate. Defaults to 1.
            Note: the dall-e-3 model only accepts n=1 per request, so values
            above 1 would need to be issued as separate requests.
    Returns:
        list: A list of image URLs, or None on error.
    """
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": "dall-e-3",
        "prompt": prompt,
        "n": num_images,
        "size": "1024x1024",  # Adjust the size as needed
        "response_format": "url",  # Ensure we get URLs back, not base64 data
    }
    try:
        response = requests.post(
            IMAGE_GENERATION_API_URL, headers=headers, json=payload, timeout=60
        )
        response.raise_for_status()  # Raise an exception for bad status codes
        data = response.json()
        image_urls = [item["url"] for item in data["data"]]
        return image_urls
    except requests.exceptions.RequestException as e:
        print(f"Error generating images with OpenAI: {e}")
        return None
    except KeyError:
        print(f"Error: Unexpected response format from OpenAI: {data}")
        return None
    except Exception as e:
        print(f"Error during image generation: {e}")
        return None
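# Hedged usage example: generate_images_with_openai("a labeled diagram of the
# water cycle") should return a list like ["https://..."], or None when the
# request fails (missing key, rate limit, network error).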
def create_animated_explanation(text, audio_file):
    """
    Generates a more professional animation with images and synchronized text.
    Args:
        text (str): The text to display and explain.
        audio_file (str): The path to the audio file.
    Returns:
        str: The path to the generated video file (.mp4), or None on error.
    """
    try:
        # 1. Split text into meaningful chunks (sentences or phrases)
        sentences = split_text_into_chunks(text)
        audio_duration = get_audio_duration(audio_file)
        fps = 10
        # Match the frame count to the audio length so text and narration stay
        # in sync (a fixed frame count would always produce the same video length).
        total_frames = max(1, int(audio_duration * fps))

        # 2. Generate one illustration per sentence and download each image
        # once up front (re-fetching inside the frame loop would hit the
        # network on every frame).
        sentence_images = []
        for sentence in sentences:
            image_prompt = f"Illustrate the concept: {sentence}"
            urls = generate_images_with_openai(image_prompt)  # 1 image per sentence
            img_to_paste = None
            if urls:
                try:
                    resp = requests.get(urls[0], timeout=30)
                    resp.raise_for_status()
                    img_to_paste = Image.open(io.BytesIO(resp.content)).resize((200, 200))
                except Exception as e:
                    print(f"Error downloading image: {e}")
            sentence_images.append(img_to_paste)  # None if generation or download failed

        # 3. Create frames for the animation
        try:
            font = ImageFont.truetype("DejaVuSans.ttf", 20)
        except OSError:
            font = ImageFont.load_default()  # Fallback if the font file is missing
        frames = []
        for i in range(total_frames):
            frame_progress = i / total_frames
            sentence_index = int(frame_progress * len(sentences))
            sentence_index = min(sentence_index, len(sentences) - 1)  # Clamp to last sentence
            color = (220, 220, 220)  # Light gray background
            img = Image.new("RGB", (640, 480), color=color)
            d = ImageDraw.Draw(img)
            current_sentence = sentences[sentence_index]
            lines = textwrap.wrap(current_sentence, width=40)
            y_start = (480 - len(lines) * 24) // 2
            # Display the sentence, centered line by line
            for j, line in enumerate(lines):
                try:
                    bbox = d.textbbox((0, 0), line, font=font)
                    text_width = bbox[2] - bbox[0]
                    text_x = (640 - text_width) // 2
                except AttributeError as e:
                    # textbbox requires Pillow >= 8.0; fall back to left alignment
                    print(f"Pillow version error: {e}")
                    text_x = 10
                d.text((text_x, y_start + j * 24), line, fill=(0, 0, 0), font=font)
            # Paste the sentence's illustration if one was generated
            if sentence_images[sentence_index] is not None:
                img.paste(sentence_images[sentence_index], (440, 280))
            frames.append(img)

        # 4. Save frames and mux them with the audio into a video
        image_files = []
        for i, frame in enumerate(frames):
            image_file = f"frame_{i:04d}.png"
            frame.save(image_file)
            image_files.append(image_file)
        video_file = "animated_explanation.mp4"
        command = [
            "ffmpeg",
            "-framerate", str(fps),
            "-i", "frame_%04d.png",
            "-i", audio_file,
            "-c:v", "libx264",
            "-c:a", "aac",
            "-pix_fmt", "yuv420p",
            "-shortest",  # End the video with the shorter of the two streams
            "-y",
            video_file,
        ]
        subprocess.run(command, check=True, capture_output=True)
        for image_file in image_files:
            os.remove(image_file)
        return video_file
    except Exception as e:
        print(f"Error creating animated explanation: {e}")
        return None  # Return None on error
def split_text_into_chunks(text):
    """Splits text into sentences or phrases, handling punctuation."""
    # Split on sentence-ending punctuation, avoiding common abbreviations,
    # and drop any empty chunks left over by the split
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s+', text)
    return [s.strip() for s in sentences if s.strip()]
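# Hedged example: split_text_into_chunks("Gravity pulls. Light bends! Why?")
# -> ["Gravity pulls.", "Light bends!", "Why?"]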
def create_animation(text, audio_file):
    """
    Selects and runs an animation function.
    """
    return create_animated_explanation(text, audio_file)
def process_audio(audio_file):
    # 1. Speech to Text
    segments, _ = whisper.transcribe(audio_file)
    user_text = " ".join([segment.text for segment in segments])
    # 2. Groq API Call
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": GROQ_MODEL,
        "messages": [{"role": "user", "content": user_text}],
        "temperature": 0.5,
    }
    response = requests.post(GROQ_API_URL, headers=headers, json=payload, timeout=60)
    if response.status_code != 200:
        return f"Groq API Error: {response.text}", None, None
    reply = response.json()["choices"][0]["message"]["content"]
    # 3. TTS using gTTS
    tts = gTTS(reply)
    audio_output = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    audio_output.close()  # Release the handle so gTTS can write to the path
    tts.save(audio_output.name)
    # 4. Create animation
    video_file = create_animation(reply, audio_output.name)
    return reply, audio_output.name, video_file
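# Hedged usage sketch: the pipeline can be exercised without the UI, assuming
# "question.wav" is a local recording of a spoken question:
#
#   reply_text, reply_audio_path, reply_video_path = process_audio("question.wav")
#   print(reply_text)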
iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath", label="🎤 Speak your question"),
    outputs=[
        gr.Textbox(label="🧠 Groq Response"),
        gr.Audio(label="🔊 AI Voice Reply"),
        gr.Video(label="🎬 Animation"),
    ],
    title="🗣️ Voice AI Assistant with Professional Animation (Groq + Whisper + gTTS)",
    description="🎙️ Whisper for STT, Groq for response, gTTS for voice output, and enhanced animation.",
    live=True,
)
iface.launch()
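# Hedged note: on Hugging Face Spaces, launch() picks up the expected host and
# port on its own; for local debugging, iface.launch(debug=True) keeps the
# process attached and prints tracebacks to the console.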