Update app.py
app.py CHANGED
@@ -5,177 +5,160 @@ from faster_whisper import WhisperModel
 from gtts import gTTS
 import os
 import subprocess
-from PIL import Image, ImageDraw, ImageFont
+from PIL import Image, ImageDraw, ImageFont, ImageSequence
 import random
 import textwrap
 import pkg_resources
 import sys
+import io
+import base64
 
 # === Config ===
 GROQ_API_KEY = "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"
 GROQ_MODEL = "llama3-70b-8192"
 GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
+IMAGE_GENERATION_API_URL = "https://api.openai.com/v1/images/generate" # Corrected URL
+OPENAI_API_KEY = "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" # Replace with your OpenAI key
 
 # === Init Whisper ===
 whisper = WhisperModel("base", device="cpu", compute_type="int8")
 
 # === Animation Functions ===
-def …
-    """
-    Generates …
-
-    Args:
-        …
-
-    Returns:
-        …
-    """
-    …
-        "-t", str(audio_duration),  # Duration of the video
-        "-pix_fmt", "yuv420p",
-        video_file
-    ]
-    subprocess.run(command, check=True, capture_output=True)
-
-    # 4. Add the audio to the video
-    output_video = "output_video.mp4"
-    command = [
-        "ffmpeg",
-        "-i", video_file,
-        "-i", audio_file,
-        "-c:v", "copy",
-        "-c:a", "aac",
-        "-strict", "experimental",
-        output_video
-    ]
-    subprocess.run(command, check=True, capture_output=True)
-    os.remove(image_file)  # remove the image and video
-    os.remove(video_file)
-    return output_video
-
-def get_audio_duration(audio_file):
-    """Gets the duration of the audio using ffprobe."""
-    command = [
-        "ffprobe",
-        "-v", "error",
-        "-show_entries", "format=duration",
-        "-of", "default=noprint_wrappers=1:nokey=1",
-        audio_file
-    ]
-    result = subprocess.run(command, capture_output=True, text=True)
-    return float(result.stdout)
-
-def create_basic_animation(text, audio_file):
-    """
-    Generates a …
-
-    Args:
-        text (str): The text to display.
-        audio_file (str): The path to the audio file.
-
-    Returns:
-        str: The path to the generated video file (.mp4).
-    """
-    # 1. Parameters for the video
-    width, height = 640, 480
-    frame_rate = 10
-    duration = get_audio_duration(audio_file)
-    num_frames = int(duration * frame_rate)
-    colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (0, 255, 255), (255, 0, 255)]  # basic colors
-    font_size = 24
-    font = ImageFont.truetype("DejaVuSans.ttf", font_size)  # Use a default font
-
-    # 2. Check Pillow version
-    try:
-        …
-    except Exception as e:
-        print(f"Error …
-        …
-        # Display the text, centered and broken into lines
-        lines = textwrap.wrap(text, width=30)  # Adjust max line length as needed
-        y_start = (height - len(lines) * font_size) // 2
-        for j, line in enumerate(lines):
-            try:
-                bbox = d.textbbox((0, 0), line, font=font)
-                text_width = bbox[2] - bbox[0]
-                text_x = (width - text_width) // 2
-
-            except AttributeError as e:
-                print(f"AttributeError: {e}")
-                print(f"Pillow version: {pillow_version}")
-                raise  # Raise the error
-            d.text((text_x, y_start + j * font_size), line, fill=(0, 0, 0), font=font)  # Black text
-        frames.append(img)
-
-    # 3. Save frames as images
-    image_files = []
-    for i, frame in enumerate(frames):
-        image_file = f"frame_{i:04d}.png"
-        frame.save(image_file)
-        image_files.append(image_file)
-
-    # 4. Create video from images and add audio
-    video_file = "basic_animation.mp4"
-    command = [
-        "ffmpeg",
-        "-framerate", str(frame_rate),
-        "-i", "frame_%04d.png",  # Input image sequence
-        "-i", audio_file,
-        "-c:v", "libx264",
-        "-pix_fmt", "yuv420p",
-        "-y",  # Overwrite if exists
-        video_file
-    ]
-    subprocess.run(command, check=True, capture_output=True)
-
-    # 5. Clean up image files
-    for image_file in image_files:
-        os.remove(image_file)
-    return video_file
-
-def create_animation(text, audio_file):
-    """
-    Selects and runs an animation function.
-    """
-
-    return create_basic_animation(text, audio_file)
+def generate_images_with_openai(prompt, num_images=1):
+    """
+    Generates images using OpenAI's API.
+
+    Args:
+        prompt (str): The prompt to use for image generation.
+        num_images (int, optional): The number of images to generate. Defaults to 1.
+
+    Returns:
+        list: A list of image URLs, or None on error.
+    """
+    headers = {
+        "Authorization": f"Bearer {OPENAI_API_KEY}",
+        "Content-Type": "application/json",
+    }
+    payload = {
+        "model": "dall-e-3",  # Use the DALL-E 3 model
+        "prompt": prompt,
+        "n": num_images,
+        "size": "1024x1024",  # You can adjust the size as needed
+    }
+
+    try:
+        response = requests.post(IMAGE_GENERATION_API_URL, headers=headers, json=payload)
+        response.raise_for_status()  # Raise an exception for bad status codes
+        data = response.json()
+        image_urls = [item["url"] for item in data["data"]]
+        return image_urls
+    except requests.exceptions.RequestException as e:
+        print(f"Error generating images with OpenAI: {e}")
+        return None
+    except KeyError:
+        print(f"Error: Unexpected response format from OpenAI: {data}")
+        return None
+
+def create_animated_explanation(text, audio_file):
+    """
+    Generates a more professional animation with images and synchronized text.
+
+    Args:
+        text (str): The text to display and explain.
+        audio_file (str): The path to the audio file.
+
+    Returns:
+        str: The path to the generated video file (.mp4), or None on error.
+    """
+    try:
+        # 1. Split text into meaningful chunks (sentences or phrases)
+        sentences = split_text_into_chunks(text)
+        audio_duration = get_audio_duration(audio_file)
+        total_frames = 100  # Example number of frames
+        fps = 10
+        frame_duration = 1 / fps
+        image_urls = []
+
+        # 2. Generate images for key sentences
+        for sentence in sentences:
+            image_prompt = f"Illustrate the concept: {sentence}"
+            urls = generate_images_with_openai(image_prompt)  # Generate 1 image per sentence
+            if urls:
+                image_urls.append(urls[0])  # Use the first URL
+            else:
+                image_urls.append(None)  # Append None if image generation fails
+
+        # 3. Create frames for the animation
+        frames = []
+        for i in range(total_frames):
+            frame_progress = i / total_frames
+            sentence_index = int(frame_progress * len(sentences))
+            sentence_index = min(sentence_index, len(sentences) - 1)  # clamp
+
+            color = (220, 220, 220)  # Light gray
+            img = Image.new("RGB", (640, 480), color=color)
+            d = ImageDraw.Draw(img)
+            font = ImageFont.truetype("DejaVuSans.ttf", 20)
+            current_sentence = sentences[sentence_index]
+            lines = textwrap.wrap(current_sentence, width=40)
+            y_start = (480 - len(lines) * 24) // 2
+
+            # Display sentence
+            for j, line in enumerate(lines):
+                text_x = (640 - d.textsize(line, font=font)[0]) // 2
+                d.text((text_x, y_start + j * 24), line, fill=(0, 0, 0), font=font)
+
+            # Add image if available
+            if image_urls[sentence_index]:
+                try:
+                    image_data = requests.get(image_urls[sentence_index], stream=True).raw
+                    img_to_paste = Image.open(image_data).resize((200, 200))  # Resize as needed
+                    img.paste(img_to_paste, (440, 280))  # Position the image
+                except Exception as e:
+                    print(f"Error loading or pasting image: {e}")
+
+            frames.append(img)
+
+        # 4. Save frames and create video
+        image_files = []
+        for i, frame in enumerate(frames):
+            image_file = f"frame_{i:04d}.png"
+            frame.save(image_file)
+            image_files.append(image_file)
+
+        video_file = "animated_explanation.mp4"
+        command = [
+            "ffmpeg",
+            "-framerate", str(fps),
+            "-i", "frame_%04d.png",
+            "-i", audio_file,
+            "-c:v", "libx264",
+            "-pix_fmt", "yuv420p",
+            "-y",
+            video_file,
+        ]
+        subprocess.run(command, check=True, capture_output=True)
+
+        for image_file in image_files:
+            os.remove(image_file)
+        return video_file
+
+    except Exception as e:
+        print(f"Error creating animated explanation: {e}")
+        return None  # Return None on error
+
+def split_text_into_chunks(text):
+    """Splits text into sentences or phrases, handling punctuation."""
+    import re
+    # Split by common sentence-ending punctuation, but handle abbreviations
+    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s+', text)
+    return sentences
+
+def create_animation(text, audio_file):
+    """
+    Selects and runs an animation function.
+    """
+    return create_animated_explanation(text, audio_file)
 
 def process_audio(audio_file):
     # 1. Speech to Text
@@ -185,21 +168,21 @@ def process_audio(audio_file)
     # 2. Groq API Call
     headers = {
         "Authorization": f"Bearer {GROQ_API_KEY}",
-        "Content-Type": "application/json"
+        "Content-Type": "application/json",
     }
     payload = {
         "model": GROQ_MODEL,
         "messages": [{"role": "user", "content": user_text}],
-        "temperature": 0.5
+        "temperature": 0.5,
    }
 
     response = requests.post(GROQ_API_URL, headers=headers, json=payload)
     if response.status_code != 200:
-        return f"Groq API Error: {response.text}", None, None
+        return f"Groq API Error: {response.text}", None, None
 
     reply = response.json()["choices"][0]["message"]["content"]
 
-    # 3. TTS using gTTS
+    # 3. TTS using gTTS
     tts = gTTS(reply)
     audio_output = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
     tts.save(audio_output.name)
@@ -207,7 +190,7 @@ def process_audio(audio_file)
     # 4. Create animation
     video_file = create_animation(reply, audio_output.name)
 
-    return reply, audio_output.name, video_file
+    return reply, audio_output.name, video_file
 
 iface = gr.Interface(
     fn=process_audio,
@@ -215,11 +198,11 @@ iface = gr.Interface(
     outputs=[
         gr.Textbox(label="🧠 Groq Response"),
         gr.Audio(label="🔊 AI Voice Reply"),
-        gr.Video(label="🎬 Animation")
+        gr.Video(label="🎬 Animation"),
     ],
-    title="🗣️ Voice AI Assistant with Animation (Groq + Whisper + gTTS)",
-    description="🎙️ Whisper for STT, Groq for response, gTTS for voice output, and …
-    live=True
+    title="🗣️ Voice AI Assistant with Professional Animation (Groq + Whisper + gTTS)",
+    description="🎙️ Whisper for STT, Groq for response, gTTS for voice output, and enhanced animation.",
+    live=True,
 )
 
 iface.launch()
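A few review notes on the new code. First, the config: despite the "Corrected URL" comment, OpenAI's documented image endpoint is `/v1/images/generations`, not `/v1/images/generate`, so every call from `generate_images_with_openai` should fail before it returns a URL. The `dall-e-3` model also accepts only `n=1` per request, so `num_images > 1` would need a loop. Separately, `io`, `base64`, and `ImageSequence` are imported but never used, and both API keys are committed in plain text. A minimal config sketch, keeping the commit's constant names (reading the keys from the environment is a suggestion, not part of the commit):

```python
import os

# Documented image-generation endpoint: the path ends in "generations"
IMAGE_GENERATION_API_URL = "https://api.openai.com/v1/images/generations"

# Keep secrets out of the repo; set these in the Space's settings instead
GROQ_API_KEY = os.environ["GROQ_API_KEY"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
```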
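Second: this commit deletes `get_audio_duration` (the ffprobe helper) together with the old animation code, yet `create_animated_explanation` still calls it. Unless the helper is defined elsewhere in the file, the first run raises `NameError`, which the broad `except Exception` swallows before returning `None`. A sketch restoring it from the deleted version, with `check=True` and a `strip()` added as small hardening:

```python
import subprocess

def get_audio_duration(audio_file):
    """Returns the duration of an audio file in seconds, using ffprobe."""
    command = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        audio_file,
    ]
    # check=True surfaces ffprobe failures instead of parsing empty output
    result = subprocess.run(command, capture_output=True, text=True, check=True)
    return float(result.stdout.strip())
```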
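Third, the text-centering line is a regression: `ImageDraw.textsize()` was deprecated in Pillow 9.2 and removed in Pillow 10, which is exactly why the deleted `create_basic_animation` measured text with `textbbox` and caught `AttributeError`. On a current Pillow this line raises inside the broad `try`, so the whole function silently returns `None`. A sketch of the measurement done with `textbbox`; the `centered_x` helper is my own name, not from the commit:

```python
from PIL import Image, ImageDraw, ImageFont

def centered_x(draw, line, font, canvas_width=640):
    """Horizontal offset that centers `line`, measured with textbbox (Pillow >= 8.0)."""
    bbox = draw.textbbox((0, 0), line, font=font)
    return (canvas_width - (bbox[2] - bbox[0])) // 2

# Same drawing setup as in the frame loop
img = Image.new("RGB", (640, 480), (220, 220, 220))
d = ImageDraw.Draw(img)
font = ImageFont.truetype("DejaVuSans.ttf", 20)
d.text((centered_x(d, "Hello, world", font), 228), "Hello, world", fill=(0, 0, 0), font=font)
```

Inside the loop, `text_x = centered_x(d, line, font)` replaces the `d.textsize(...)` expression.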
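Fourth, timing: `audio_duration` and `frame_duration` are computed but never used. The loop always renders `total_frames = 100` frames at `fps = 10`, so the video track is 10 seconds long no matter how long the gTTS narration runs, and each sentence gets an equal slice of frames rather than tracking the audio. A drop-in sketch for the function body, using the commit's own variable names and assuming `get_audio_duration` is restored as above:

```python
# Derive the frame count from the narration instead of a fixed 100 frames
fps = 10
audio_duration = get_audio_duration(audio_file)   # ffprobe helper, restored above
total_frames = max(1, int(audio_duration * fps))  # one frame per 1/fps of audio

# ...render total_frames PNGs as before, then mux with -shortest so the
# output stops with whichever stream ends first:
command = [
    "ffmpeg",
    "-framerate", str(fps),
    "-i", "frame_%04d.png",
    "-i", audio_file,
    "-c:v", "libx264",
    "-pix_fmt", "yuv420p",
    "-shortest",
    "-y",
    "animated_explanation.mp4",
]
```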
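The sentence splitter itself is a common lookbehind recipe: it splits on whitespace that follows `.`, `?`, or `!`, while the two negative lookbehinds suppress splits after dotted abbreviations like `U.S.` and titles like `Mr.`. A quick check of its behavior (the sample string is mine, not from the commit):

```python
import re

def split_text_into_chunks(text):
    """Splits text into sentences, avoiding breaks after abbreviations."""
    return re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s+', text)

print(split_text_into_chunks("Dr. Smith works in the U.S. office. He likes it! Does he?"))
# -> ['Dr. Smith works in the U.S. office.', 'He likes it!', 'Does he?']
```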
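Finally, a consequence of rewiring `create_animation`: the deleted `create_basic_animation` was a dependable text-only path, while `create_animated_explanation` returns `None` whenever image generation or frame rendering fails, leaving `gr.Video` empty. If the old function is kept in the module, the dispatcher could try the rich path first and fall back; a sketch, assuming both functions are defined:

```python
def create_animation(text, audio_file):
    """
    Prefers the image-backed animation, falling back to plain text frames.
    """
    video = create_animated_explanation(text, audio_file)
    if video is not None:
        return video
    # create_basic_animation was removed in this commit; restoring it gives
    # a fallback that does not depend on the OpenAI image API
    return create_basic_animation(text, audio_file)
```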