import gradio as gr
import requests
import tempfile
from faster_whisper import WhisperModel
from gtts import gTTS
import os
import re
import subprocess
from PIL import Image, ImageDraw, ImageFont
import textwrap
import io

# === Config ===
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")  # read from the environment; never hard-code secrets
GROQ_MODEL = "llama3-70b-8192"
GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
IMAGE_GENERATION_API_URL = "https://api.openai.com/v1/images/generations"  # DALL-E 3 endpoint
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")  # read from the environment as well

# === Init Whisper ===
whisper = WhisperModel("base", device="cpu", compute_type="int8")
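# "base" trades accuracy for speed on CPU; faster-whisper also ships
# "tiny", "small", "medium", and "large-v3" checkpoints if accuracy matters more.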

# === Animation Functions ===
def get_audio_duration(audio_file):
    """Returns the duration of the audio in seconds via ffprobe, or None on failure."""
    try:
        command = [
            "ffprobe",
            "-v", "error",
            "-show_entries", "format=duration",
            "-of", "default=noprint_wrappers=1:nokey=1",
            audio_file,
        ]
        result = subprocess.run(command, capture_output=True, text=True, check=True)
        return float(result.stdout)
    except Exception as e:
        print(f"Error getting audio duration: {e}")
        return None  # let the caller decide on a fallback duration

def generate_images_with_openai(prompt, num_images=1):
    """
    Generates images using OpenAI's API.

    Args:
        prompt (str): The prompt to use for image generation.
        num_images (int, optional): The number of images to generate. Defaults to 1.

    Returns:
        list: A list of image URLs, or None on error.
    """
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": "dall-e-3",
        "prompt": prompt,
        "n": num_images,  # note: the dall-e-3 model accepts only n=1 per request
        "size": "1024x1024",  # adjust as needed
        "response_format": "url",  # request URLs rather than base64 payloads
    }

    try:
        response = requests.post(IMAGE_GENERATION_API_URL, headers=headers, json=payload)
        response.raise_for_status()  # Raise an exception for bad status codes
        data = response.json()
        image_urls = [item["url"] for item in data["data"]]
        return image_urls
    except requests.exceptions.RequestException as e:
        print(f"Error generating images with OpenAI: {e}")
        return None
    except KeyError:
        print(f"Error: Unexpected response format from OpenAI: {data}")
        return None
    except Exception as e:
        print(f"Error during image generation: {e}")
        return None
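# Example: generate_images_with_openai("a watercolor fox reading a book")
# -> a list of image URLs on success, or None on failure.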

def create_animated_explanation(text, audio_file):
    """
    Generates a more professional animation with images and synchronized text.

    Args:
        text (str): The text to display and explain.
        audio_file (str): The path to the audio file.

    Returns:
        str: The path to the generated video file (.mp4), or None on error.
    """
    try:
        # 1. Split text into meaningful chunks (sentences or phrases)
        sentences = split_text_into_chunks(text)
        audio_duration = get_audio_duration(audio_file)
        if audio_duration is None:
            audio_duration = 5  # fall back to a short default
        fps = 10
        # Derive the frame count from the audio length so the video matches the narration.
        total_frames = max(1, int(audio_duration * fps))

        # 2. Generate and download one illustration per key sentence.
        # Downloading here, once per sentence, avoids re-fetching the same URL on every frame.
        sentence_images = []
        for sentence in sentences:
            image_prompt = f"Illustrate the concept: {sentence}"
            urls = generate_images_with_openai(image_prompt)  # one image per sentence
            img_for_sentence = None
            if urls:
                try:
                    resp = requests.get(urls[0], timeout=30)
                    resp.raise_for_status()
                    img_for_sentence = Image.open(io.BytesIO(resp.content)).resize((200, 200))
                except Exception as e:
                    print(f"Error downloading generated image: {e}")
            sentence_images.append(img_for_sentence)  # None keeps indices aligned with sentences

        # 3. Create frames for the animation
        frames = []
        for i in range(total_frames):
            frame_progress = i / total_frames
            sentence_index = int(frame_progress * len(sentences))
            sentence_index = min(sentence_index, len(sentences) - 1) #clamp

            color = (220, 220, 220)  # Light gray
            img = Image.new("RGB", (640, 480), color=color)
            d = ImageDraw.Draw(img)
            try:
                font = ImageFont.truetype("DejaVuSans.ttf", 20)
            except OSError:
                font = ImageFont.load_default()  # fall back when DejaVuSans.ttf is not installed
            current_sentence = sentences[sentence_index]
            lines = textwrap.wrap(current_sentence, width=40)
            y_start = (480 - len(lines) * 24) // 2

            # Display sentence
            for j, line in enumerate(lines):
                try:
                    bbox = d.textbbox((0, 0), line, font=font)
                    text_width = bbox[2] - bbox[0]
                    text_x = (640 - text_width) // 2
                except AttributeError as e:
                    # textbbox requires Pillow >= 8.0; fall back to a fixed left margin
                    print(f"Pillow version error: {e}")
                    text_x = 10
                d.text((text_x, y_start + j * 24), line, fill=(0, 0, 0), font=font)

            # Paste the pre-downloaded illustration if one exists for this sentence
            if sentence_images[sentence_index] is not None:
                try:
                    img.paste(sentence_images[sentence_index], (440, 280))  # bottom-right area
                except Exception as e:
                    print(f"Error pasting image: {e}")

            frames.append(img)

        # 4. Save frames and create video
        image_files = []
        for i, frame in enumerate(frames):
            image_file = f"frame_{i:04d}.png"
            frame.save(image_file)
            image_files.append(image_file)

        video_file = "animated_explanation.mp4"
        command = [
            "ffmpeg",
            "-framerate", str(fps),
            "-i", "frame_%04d.png",
            "-i", audio_file,
            "-c:v", "libx264",
            "-pix_fmt", "yuv420p",
            "-y",
            video_file,
        ]
        subprocess.run(command, check=True, capture_output=True)

        for image_file in image_files:
            os.remove(image_file)
        return video_file

    except Exception as e:
        print(f"Error creating animated explanation: {e}")
        return None  # Return None on error

def split_text_into_chunks(text):
    """Splits text into sentences or phrases, handling common abbreviations."""
    # Split on sentence-ending punctuation (., ?, !) followed by whitespace,
    # avoiding splits after patterns like "e.g." or "Dr.".
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s+', text)
    return [s for s in sentences if s.strip()]  # drop empty fragments
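# Example: split_text_into_chunks("Mix the flour. Then add water! Done?")
# -> ["Mix the flour.", "Then add water!", "Done?"]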

def create_animation(text, audio_file):
    """
    Selects and runs an animation function.
    """
    return create_animated_explanation(text, audio_file)

def process_audio(audio_file):
    """Full pipeline: transcribe speech, query Groq, synthesize the reply, render an animation."""
    # 1. Speech to Text
    segments, _ = whisper.transcribe(audio_file)
    user_text = " ".join(segment.text.strip() for segment in segments)

    # 2. Groq API Call
    headers = {
        "Authorization": f"Bearer {GROQ_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": GROQ_MODEL,
        "messages": [{"role": "user", "content": user_text}],
        "temperature": 0.5,
    }

    response = requests.post(GROQ_API_URL, headers=headers, json=payload)
    if response.status_code != 200:
        return f"Groq API Error: {response.text}", None, None

    reply = response.json()["choices"][0]["message"]["content"]

    # 3. TTS using gTTS
    tts = gTTS(reply)
    audio_output = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    audio_output.close()  # release the handle so gTTS can write to the path on all platforms
    tts.save(audio_output.name)

    # 4. Create animation
    video_file = create_animation(reply, audio_output.name)

    return reply, audio_output.name, video_file

iface = gr.Interface(
    fn=process_audio,
    inputs=gr.Audio(type="filepath", label="🎤 Speak your question"),
    outputs=[
        gr.Textbox(label="🧠 Groq Response"),
        gr.Audio(label="🔊 AI Voice Reply"),
        gr.Video(label="🎬 Animation"),
    ],
    title="🗣️ Voice AI Assistant with Professional Animation (Groq + Whisper + gTTS)",
    description="🎙️ Whisper for STT, Groq for response, gTTS for voice output, and enhanced animation.",
)

if __name__ == "__main__":
    iface.launch()