import gradio as gr
import requests
import tempfile
from faster_whisper import WhisperModel
from gtts import gTTS
import os
import re
import subprocess
from PIL import Image, ImageDraw, ImageFont
import textwrap
import io

# === Config ===
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")  # read from the environment; never hard-code secrets
GROQ_MODEL = "llama3-70b-8192"
GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
IMAGE_GENERATION_API_URL = "https://api.openai.com/v1/images/generations"  # DALL-E 3 endpoint
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")  # read from the environment as well

# === Init Whisper ===
whisper = WhisperModel("base", device="cpu", compute_type="int8")
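# "base" trades accuracy for speed on CPU; faster-whisper also ships
# "tiny", "small", "medium", and "large-v3" checkpoints if accuracy matters more.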

# === Animation Functions ===
def get_audio_duration(audio_file):
    """Returns the duration of the audio in seconds via ffprobe, or None on failure."""
    try:
        command = [
            "ffprobe",
            "-v", "error",
            "-show_entries", "format=duration",
            "-of", "default=noprint_wrappers=1:nokey=1",
            audio_file,
        ]
        result = subprocess.run(command, capture_output=True, text=True, check=True)
        return float(result.stdout)
    except Exception as e:
        print(f"Error getting audio duration: {e}")
        return None  # let the caller decide on a fallback duration

def generate_images_with_openai(prompt, num_images=1):
    """
    Generates images using OpenAI's API.

    Args:
        prompt (str): The prompt to use for image generation.
        num_images (int, optional): The number of images to generate. Defaults to 1.

    Returns:
        list: A list of image URLs, or None on error.
    """
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": "dall-e-3",
        "prompt": prompt,
        "n": num_images,  # note: the dall-e-3 model accepts only n=1 per request
        "size": "1024x1024",  # adjust as needed
        "response_format": "url",  # request URLs rather than base64 payloads
    }

    try:
        response = requests.post(IMAGE_GENERATION_API_URL, headers=headers, json=payload)
        response.raise_for_status()  # Raise an exception for bad status codes
        data = response.json()
        image_urls = [item["url"] for item in data["data"]]
        return image_urls
    except requests.exceptions.RequestException as e:
        print(f"Error generating images with OpenAI: {e}")
        return None
    except KeyError:
        print(f"Error: Unexpected response format from OpenAI: {data}")
        return None
    except Exception as e:
        print(f"Error during image generation: {e}")
        return None
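# Example: generate_images_with_openai("a watercolor fox reading a book")
# -> a list of image URLs on success, or None on failure.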

def create_animated_explanation(text, audio_file):
    """
    Generates a more professional animation with images and synchronized text.

    Args:
        text (str): The text to display and explain.
        audio_file (str): The path to the audio file.

    Returns:
        str: The path to the generated video file (.mp4), or None on error.
    """
    try:
        # 1. Split text into meaningful chunks (sentences or phrases)
        sentences = split_text_into_chunks(text)
        audio_duration = get_audio_duration(audio_file)
        if audio_duration is None:
            audio_duration = 5  # fall back to a short default
        fps = 10
        # Derive the frame count from the audio length so the video matches the narration.
        total_frames = max(1, int(audio_duration * fps))

        # 2. Generate and download one illustration per key sentence.
        # Downloading here, once per sentence, avoids re-fetching the same URL on every frame.
        sentence_images = []
        for sentence in sentences:
            image_prompt = f"Illustrate the concept: {sentence}"
            urls = generate_images_with_openai(image_prompt)  # one image per sentence
            img_for_sentence = None
            if urls:
                try:
                    resp = requests.get(urls[0], timeout=30)
                    resp.raise_for_status()
                    img_for_sentence = Image.open(io.BytesIO(resp.content)).resize((200, 200))
                except Exception as e:
                    print(f"Error downloading generated image: {e}")
            sentence_images.append(img_for_sentence)  # None keeps indices aligned with sentences

        # 3. Create frames for the animation
        frames = []
        for i in range(total_frames):
            frame_progress = i / total_frames
            sentence_index = int(frame_progress * len(sentences))
            sentence_index = min(sentence_index, len(sentences) - 1) #clamp

            color = (220, 220, 220)  # Light gray
            img = Image.new("RGB", (640, 480), color=color)
            d = ImageDraw.Draw(img)
            try:
                font = ImageFont.truetype("DejaVuSans.ttf", 20)
            except OSError:
                font = ImageFont.load_default()  # fall back when DejaVuSans.ttf is not installed
            current_sentence = sentences[sentence_index]
            lines = textwrap.wrap(current_sentence, width=40)
            y_start = (480 - len(lines) * 24) // 2

            # Display sentence
            for j, line in enumerate(lines):
                try:
                    bbox = d.textbbox((0, 0), line, font=font)
                    text_width = bbox[2] - bbox[0]
                    text_x = (640 - text_width) // 2
                except AttributeError as e:
                    # textbbox requires Pillow >= 8.0; fall back to a fixed left margin
                    print(f"Pillow version error: {e}")
                    text_x = 10
                d.text((text_x, y_start + j * 24), line, fill=(0, 0, 0), font=font)

            # Paste the pre-downloaded illustration if one exists for this sentence
            if sentence_images[sentence_index] is not None:
                try:
                    img.paste(sentence_images[sentence_index], (440, 280))  # bottom-right area
                except Exception as e:
                    print(f"Error pasting image: {e}")

            frames.append(img)

        # 4. Save frames and create video
        image_files = []
        for i, frame in enumerate(frames):
            image_file = f"frame_{i:04d}.png"
            frame.save(image_file)
            image_files.append(image_file)

        video_file = "animated_explanation.mp4"
        command = [
            "ffmpeg",
            "-framerate", str(fps),
            "-i", "frame_%04d.png",
            "-i", audio_file,
            "-c:v", "libx264",
            "-pix_fmt", "yuv420p",
            "-y",
            video_file,
        ]
        subprocess.run(command, check=True, capture_output=True)

        for image_file in image_files:
            os.remove(image_file)
        return video_file

    except Exception as e:
        print(f"Error creating animated explanation: {e}")
        return None  # Return None on error

def split_text_into_chunks(text):
    """Splits text into sentences or phrases, handling common abbreviations."""
    # Split on sentence-ending punctuation (., ?, !) followed by whitespace,
    # avoiding splits after patterns like "e.g." or "Dr.".
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s+', text)
    return [s for s in sentences if s.strip()]  # drop empty fragments
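# Example: split_text_into_chunks("Mix the flour. Then add water! Done?")
# -> ["Mix the flour.", "Then add water!", "Done?"]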

def create_animation(text, audio_file):
    """
    Selects and runs an animation function.
    """
    return create_animated_explanation(text, audio_file)

def process_audio(audio_file):
    """Full pipeline: transcribe speech, query Groq, synthesize the reply, render an animation."""
    # 1. Speech to Text
    segments, _ = whisper.transcribe(audio_file)
    user_text = " ".join(segment.text.strip() for segment in segments)

    # 2. Groq API Call
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": GROQ_MODEL,
        "messages": [{"role": "user", "content": user_text}],
        "temperature": 0.5,
    }

    response = requests.post(GROQ_API_URL, headers=headers, json=payload)
    if response.status_code != 200:
        return f"Groq API Error: {response.text}", None, None

    reply = response.json()["choices"][0]["message"]["content"]

    # 3. TTS using gTTS
    tts = gTTS(reply)
    audio_output = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    audio_output.close()  # release the handle so gTTS can write to the path on all platforms
    tts.save(audio_output.name)

    # 4. Create animation
    video_file = create_animation(reply, audio_output.name)

    return reply, audio_output.name, video_file

iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath", label="🎤 Speak your question"),
    outputs=[
        gr.Textbox(label="🧠 Groq Response"),
        gr.Audio(label="🔊 AI Voice Reply"),
        gr.Video(label="🎬 Animation"),
    ],
    title="🗣️ Voice AI Assistant with Professional Animation (Groq + Whisper + gTTS)",
    description="🎙️ Whisper for STT, Groq for response, gTTS for voice output, and enhanced animation.",
)

if __name__ == "__main__":
    iface.launch()