MuhammadSajid commited on
Commit
18d6507
·
verified ·
1 Parent(s): aa8af41

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +130 -147
app.py CHANGED
@@ -5,177 +5,160 @@ from faster_whisper import WhisperModel
5
  from gtts import gTTS
6
  import os
7
  import subprocess
8
- from PIL import Image, ImageDraw, ImageFont
9
  import random
10
  import textwrap
11
  import pkg_resources
12
  import sys
 
 
13
 
14
  # === Config ===
15
  GROQ_API_KEY = "<REDACTED>"  # NOTE(review): the secret previously committed on this line is leaked and must be rotated; load it from the environment instead
16
  GROQ_MODEL = "llama3-70b-8192"
17
  GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
 
 
18
 
19
  # === Init Whisper ===
20
  whisper = WhisperModel("base", device="cpu", compute_type="int8")
21
 
22
  # === Animation Functions ===
23
- def create_static_image_animation(text, audio_file):
24
  """
25
- Generates a simple video with scrolling text over a static image.
26
 
27
  Args:
28
- text (str): The text to display in the video.
29
- audio_file (str): The path to the audio file.
30
 
31
  Returns:
32
- str: The path to the generated video file (.mp4).
33
  """
34
- # 1. Create a static image (you can replace this with a more interesting one)
35
- width, height = 800, 600
36
- image = Image.new("RGB", (width, height), color=(220, 220, 220)) # Light gray background
37
- draw = ImageDraw.Draw(image)
38
- font = ImageFont.truetype("DejaVuSans.ttf", 40) # Use a default font, or specify a path
39
- text_color = (0, 0, 0) # Black text
40
-
41
- # 2. Split the text into lines
42
- lines = textwrap.wrap(text, width=40) # Adjust width as needed
43
- y_start = (height - len(lines) * 40) // 2 # Center vertically
44
-
45
- for i, line in enumerate(lines):
46
- draw.text((50, y_start + i * 40), line, fill=text_color, font=font)
47
- image_file = "static_image.png"
48
- image.save(image_file)
49
-
50
- # 3. Create a silent video with the static image
51
- video_file = "static_video.mp4"
52
- audio_duration = get_audio_duration(audio_file) #get the duration
53
-
54
- command = [
55
- "ffmpeg",
56
- "-loop", "1", # Loop the image
57
- "-i", image_file,
58
- "-c:v", "libx264",
59
- "-t", str(audio_duration), # Duration of the video
60
- "-pix_fmt", "yuv420p",
61
- video_file
62
- ]
63
- subprocess.run(command, check=True, capture_output=True)
64
-
65
- # 4. Add the audio to the video
66
- output_video = "output_video.mp4"
67
- command = [
68
- "ffmpeg",
69
- "-i", video_file,
70
- "-i", audio_file,
71
- "-c:v", "copy",
72
- "-c:a", "aac",
73
- "-strict", "experimental",
74
- output_video
75
- ]
76
- subprocess.run(command, check=True, capture_output=True)
77
- os.remove(image_file) #remove the image and video
78
- os.remove(video_file)
79
- return output_video
80
-
81
- def get_audio_duration(audio_file):
82
- """Gets the duration of the audio using ffprobe."""
83
- command = [
84
- "ffprobe",
85
- "-v", "error",
86
- "-show_entries", "format=duration",
87
- "-of", "default=noprint_wrappers=1:nokey=1",
88
- audio_file
89
- ]
90
- result = subprocess.run(command, capture_output=True, text=True)
91
- return float(result.stdout)
92
-
93
- def create_basic_animation(text, audio_file):
94
  """
95
- Generates a very basic animation video with colored frames and text.
96
 
97
  Args:
98
- text (str): The text to display.
99
  audio_file (str): The path to the audio file.
100
 
101
  Returns:
102
- str: The path to the generated video file (.mp4).
103
  """
104
- # 1. Parameters for the video
105
- width, height = 640, 480
106
- frame_rate = 10
107
- duration = get_audio_duration(audio_file)
108
- num_frames = int(duration * frame_rate)
109
- colors = [(255, 0, 0), (0, 255, 0), (0, 0, 255), (255, 255, 0), (0, 255, 255), (255,0,255)] #basic colors
110
- font_size = 24
111
- font = ImageFont.truetype("DejaVuSans.ttf", font_size) # Use a default font
112
-
113
- # 2. Check Pillow version
114
  try:
115
- pillow_version = pkg_resources.get_distribution("Pillow").version
116
- print(f"Pillow version: {pillow_version}") # Print Pillow version
117
- if tuple(map(int, pillow_version.split("."))) < (8, 0, 0):
118
- raise ImportError(f"Pillow version >= 8.0.0 is required, but found {pillow_version}")
119
- except pkg_resources.DistributionNotFound:
120
- raise ImportError("Pillow is not installed")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
  except Exception as e:
122
- print(f"Error checking Pillow version: {e}")
123
- raise # Re-raise the exception to see the full traceback
124
-
125
- # 2. Create frames
126
- frames = []
127
- for i in range(num_frames):
128
- color = colors[i % len(colors)] #cycle through colors
129
- img = Image.new("RGB", (width, height), color=color)
130
- d = ImageDraw.Draw(img)
131
- # Display the text, centered and broken into lines
132
- lines = textwrap.wrap(text, width=30) # Adjust max line length as needed
133
- y_start = (height - len(lines) * font_size) // 2
134
- for j, line in enumerate(lines):
135
- try:
136
- bbox = d.textbbox((0, 0), line, font=font)
137
- text_width = bbox[2] - bbox[0]
138
- text_x = (width - text_width) // 2
139
-
140
- except AttributeError as e:
141
- print(f"AttributeError: {e}")
142
- print(f"Pillow version: {pillow_version}")
143
- raise # Raise the error
144
- d.text((text_x, y_start + j * font_size), line, fill=(0, 0, 0), font=font) # Black text
145
- frames.append(img)
146
-
147
- # 3. Save frames as images
148
- image_files = []
149
- for i, frame in enumerate(frames):
150
- image_file = f"frame_{i:04d}.png"
151
- frame.save(image_file)
152
- image_files.append(image_file)
153
-
154
- # 4. Create video from images and add audio
155
- video_file = "basic_animation.mp4"
156
- command = [
157
- "ffmpeg",
158
- "-framerate", str(frame_rate),
159
- "-i", "frame_%04d.png", # Input image sequence
160
- "-i", audio_file,
161
- "-c:v", "libx264",
162
- "-pix_fmt", "yuv420p",
163
- "-y", # Overwrite if exists
164
- video_file
165
- ]
166
- subprocess.run(command, check=True, capture_output=True)
167
-
168
- # 5. Clean up image files
169
- for image_file in image_files:
170
- os.remove(image_file)
171
- return video_file
172
 
173
  def create_animation(text, audio_file):
174
  """
175
- Selects and runs an animation function. This could be expanded to select from multiple animation styles.
176
  """
177
- # For now, just use the basic animation. You can add logic here to choose different animations.
178
- return create_basic_animation(text, audio_file)
179
 
180
  def process_audio(audio_file):
181
  # 1. Speech to Text
@@ -185,21 +168,21 @@ def process_audio(audio_file):
185
  # 2. Groq API Call
186
  headers = {
187
  "Authorization": f"Bearer {GROQ_API_KEY}",
188
- "Content-Type": "application/json"
189
  }
190
  payload = {
191
  "model": GROQ_MODEL,
192
  "messages": [{"role": "user", "content": user_text}],
193
- "temperature": 0.5
194
  }
195
 
196
  response = requests.post(GROQ_API_URL, headers=headers, json=payload)
197
  if response.status_code != 200:
198
- return f"Groq API Error: {response.text}", None, None # Return None for video_file
199
 
200
  reply = response.json()["choices"][0]["message"]["content"]
201
 
202
- # 3. TTS using gTTS (generates .mp3)
203
  tts = gTTS(reply)
204
  audio_output = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
205
  tts.save(audio_output.name)
@@ -207,7 +190,7 @@ def process_audio(audio_file):
207
  # 4. Create animation
208
  video_file = create_animation(reply, audio_output.name)
209
 
210
- return reply, audio_output.name, video_file # Return the video file path
211
 
212
  iface = gr.Interface(
213
  fn=process_audio,
@@ -215,11 +198,11 @@ iface = gr.Interface(
215
  outputs=[
216
  gr.Textbox(label="🧠 Groq Response"),
217
  gr.Audio(label="🔊 AI Voice Reply"),
218
- gr.Video(label="🎬 Animation") # Add the video output
219
  ],
220
- title="🗣️ Voice AI Assistant with Animation (Groq + Whisper + gTTS)",
221
- description="🎙️ Whisper for STT, Groq for response, gTTS for voice output, and simple animation.",
222
- live=True
223
  )
224
 
225
  iface.launch()
 
5
  from gtts import gTTS
6
  import os
7
  import subprocess
8
+ from PIL import Image, ImageDraw, ImageFont, ImageSequence
9
  import random
10
  import textwrap
11
  import pkg_resources
12
  import sys
13
+ import io
14
+ import base64
15
 
16
# === Config ===
# Secrets are read from the environment so credentials are never committed to
# source control. NOTE(review): the keys previously hard-coded here were
# published in a commit and must be rotated regardless of this change.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY", "")
GROQ_MODEL = "llama3-70b-8192"
GROQ_API_URL = "https://api.groq.com/openai/v1/chat/completions"
# The OpenAI image endpoint is /v1/images/generations (not /generate).
IMAGE_GENERATION_API_URL = "https://api.openai.com/v1/images/generations"
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "")
22
 
23
  # === Init Whisper ===
24
  whisper = WhisperModel("base", device="cpu", compute_type="int8")
25
 
26
  # === Animation Functions ===
27
def generate_images_with_openai(prompt, num_images=1):
    """
    Generate images with the OpenAI Images API (DALL-E 3).

    Args:
        prompt (str): Text prompt describing the desired image.
        num_images (int, optional): Number of images to request. Defaults to 1.
            NOTE(review): DALL-E 3 only accepts n=1 per request — confirm
            before passing larger values.

    Returns:
        list | None: A list of image URLs on success, or None on any error.
    """
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": "dall-e-3",
        "prompt": prompt,
        "n": num_images,
        "size": "1024x1024",  # adjust as needed
    }

    data = None  # kept in scope for the KeyError diagnostic below
    try:
        # A timeout prevents the whole app from hanging on a stuck connection.
        response = requests.post(
            IMAGE_GENERATION_API_URL, headers=headers, json=payload, timeout=60
        )
        response.raise_for_status()  # surface 4xx/5xx as exceptions
        data = response.json()
        return [item["url"] for item in data["data"]]
    except requests.exceptions.RequestException as e:
        print(f"Error generating images with OpenAI: {e}")
        return None
    except KeyError:
        print(f"Error: Unexpected response format from OpenAI: {data}")
        return None
61
+
62
def _probe_audio_duration(audio_file):
    """Return the duration of *audio_file* in seconds using ffprobe.

    Re-implements the helper this commit removed (the old get_audio_duration)
    because create_animated_explanation still needs it; returns 0.0 when the
    ffprobe output cannot be parsed.
    """
    command = [
        "ffprobe",
        "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=noprint_wrappers=1:nokey=1",
        audio_file,
    ]
    result = subprocess.run(command, capture_output=True, text=True)
    try:
        return float(result.stdout)
    except (TypeError, ValueError):
        return 0.0


def create_animated_explanation(text, audio_file):
    """
    Generate an animation video with per-sentence text and optional AI images.

    The text is split into sentence-level chunks; one illustration is requested
    per sentence via generate_images_with_openai, and frames are rendered so
    the video length matches the narration audio.

    Args:
        text (str): The text to display and explain.
        audio_file (str): Path to the narration audio file.

    Returns:
        str | None: Path to the generated video file (.mp4), or None on error.
    """
    try:
        # 1. Split text into meaningful chunks (sentences or phrases).
        sentences = split_text_into_chunks(text)
        audio_duration = _probe_audio_duration(audio_file)
        fps = 10
        # Derive the frame count from the audio length so the video and audio
        # stay in sync (the previous fixed count of 100 frames produced a
        # 10-second video regardless of narration length).
        total_frames = max(1, int(audio_duration * fps))

        # 2. Generate one image per sentence; None marks a failed generation.
        image_urls = []
        for sentence in sentences:
            image_prompt = f"Illustrate the concept: {sentence}"
            urls = generate_images_with_openai(image_prompt)
            image_urls.append(urls[0] if urls else None)

        # 3. Render frames: each frame shows the sentence that is "active"
        # at that point in the timeline.
        font = ImageFont.truetype("DejaVuSans.ttf", 20)  # hoisted out of the loop
        frames = []
        for i in range(total_frames):
            frame_progress = i / total_frames
            sentence_index = min(int(frame_progress * len(sentences)),
                                 len(sentences) - 1)  # clamp to last sentence

            img = Image.new("RGB", (640, 480), color=(220, 220, 220))  # light gray
            d = ImageDraw.Draw(img)
            lines = textwrap.wrap(sentences[sentence_index], width=40)
            y_start = (480 - len(lines) * 24) // 2

            # Center each line horizontally. textbbox replaces
            # ImageDraw.textsize, which was removed in Pillow 10.
            for j, line in enumerate(lines):
                bbox = d.textbbox((0, 0), line, font=font)
                text_x = (640 - (bbox[2] - bbox[0])) // 2
                d.text((text_x, y_start + j * 24), line, fill=(0, 0, 0), font=font)

            # Paste the sentence's illustration, if one was generated.
            if image_urls[sentence_index]:
                try:
                    resp = requests.get(image_urls[sentence_index], timeout=30)
                    resp.raise_for_status()
                    # Decode from the fully downloaded bytes; Image.open on a
                    # streaming .raw object can see partial data.
                    img_to_paste = Image.open(io.BytesIO(resp.content)).resize((200, 200))
                    img.paste(img_to_paste, (440, 280))
                except Exception as e:
                    print(f"Error loading or pasting image: {e}")

            frames.append(img)

        # 4. Save frames to disk and mux them with the audio track via ffmpeg.
        image_files = []
        for i, frame in enumerate(frames):
            image_file = f"frame_{i:04d}.png"
            frame.save(image_file)
            image_files.append(image_file)

        video_file = "animated_explanation.mp4"
        command = [
            "ffmpeg",
            "-framerate", str(fps),
            "-i", "frame_%04d.png",
            "-i", audio_file,
            "-c:v", "libx264",
            "-pix_fmt", "yuv420p",
            "-y",
            video_file,
        ]
        subprocess.run(command, check=True, capture_output=True)

        # Clean up the intermediate frame images.
        for image_file in image_files:
            os.remove(image_file)
        return video_file

    except Exception as e:
        print(f"Error creating animated explanation: {e}")
        return None  # Return None on error
149
+
150
def split_text_into_chunks(text):
    """
    Split *text* into sentence-level chunks on '.', '?' or '!' boundaries.

    The lookbehinds keep abbreviation-like patterns (e.g. "e.g.", "Dr.") from
    being treated as sentence ends. Empty fragments produced by trailing
    punctuation or whitespace are dropped so callers never render (or request
    an image for) a blank sentence; for blank input the original single-element
    result is preserved so the caller's index-clamping logic still works.

    Args:
        text (str): Input text.

    Returns:
        list: Non-empty sentence chunks, or [text] when nothing remains.
    """
    import re
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s+', text)
    chunks = [s.strip() for s in sentences if s.strip()]
    return chunks or [text]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
156
 
157
def create_animation(text, audio_file):
    """
    Entry point for animation generation.

    Delegates to the animated-explanation backend; swap the callable below to
    plug in a different animation style.
    """
    backend = create_animated_explanation
    return backend(text, audio_file)
 
162
 
163
  def process_audio(audio_file):
164
  # 1. Speech to Text
 
168
  # 2. Groq API Call
169
  headers = {
170
  "Authorization": f"Bearer {GROQ_API_KEY}",
171
+ "Content-Type": "application/json",
172
  }
173
  payload = {
174
  "model": GROQ_MODEL,
175
  "messages": [{"role": "user", "content": user_text}],
176
+ "temperature": 0.5,
177
  }
178
 
179
  response = requests.post(GROQ_API_URL, headers=headers, json=payload)
180
  if response.status_code != 200:
181
+ return f"Groq API Error: {response.text}", None, None
182
 
183
  reply = response.json()["choices"][0]["message"]["content"]
184
 
185
+ # 3. TTS using gTTS
186
  tts = gTTS(reply)
187
  audio_output = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
188
  tts.save(audio_output.name)
 
190
  # 4. Create animation
191
  video_file = create_animation(reply, audio_output.name)
192
 
193
+ return reply, audio_output.name, video_file
194
 
195
  iface = gr.Interface(
196
  fn=process_audio,
 
198
  outputs=[
199
  gr.Textbox(label="🧠 Groq Response"),
200
  gr.Audio(label="🔊 AI Voice Reply"),
201
+ gr.Video(label="🎬 Animation"),
202
  ],
203
+ title="🗣️ Voice AI Assistant with Professional Animation (Groq + Whisper + gTTS)",
204
+ description="🎙️ Whisper for STT, Groq for response, gTTS for voice output, and enhanced animation.",
205
+ live=True,
206
  )
207
 
208
  iface.launch()