akazmi committed on
Commit
9ea8dc2
·
verified ·
1 Parent(s): 16b4a8e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +17 -13
app.py CHANGED
@@ -1,4 +1,3 @@
1
- # Import necessary libraries
2
  import gradio as gr
3
  from gtts import gTTS
4
  import os
@@ -16,13 +15,12 @@ def text_to_speech(text):
16
  return filename
17
 
18
  # Speech-to-Text function
19
- def speech_to_text():
20
  recognizer = sr.Recognizer()
21
- with sr.Microphone() as source:
22
- print("Please say something:")
23
- audio = recognizer.listen(source)
24
  try:
25
- text = recognizer.recognize_google(audio)
26
  return text
27
  except sr.UnknownValueError:
28
  return "Sorry, I could not understand the audio."
@@ -41,17 +39,22 @@ def generate_image_description(image):
41
 
42
  # Video Description function
43
  def generate_video_description(video):
44
- cap = cv2.VideoCapture(video.name)
45
  descriptions = []
46
 
47
- for _ in range(5): # Limit to first 5 frames for description
 
 
 
 
48
  ret, frame = cap.read()
49
  if not ret:
50
  break
51
- frame_path = f"frame.jpg"
52
- cv2.imwrite(frame_path, frame) # Save frame as image
53
- description = generate_image_description(Image.open(frame_path))
54
  descriptions.append(description)
 
55
 
56
  cap.release()
57
  return descriptions
@@ -69,9 +72,10 @@ def main():
69
  tts_button.click(fn=text_to_speech, inputs=text_input, outputs=tts_output)
70
 
71
  # Speech-to-Text
72
- stt_button = gr.Button("Record Audio")
 
73
  stt_output = gr.Textbox(label="Speech-to-Text Output")
74
- stt_button.click(fn=speech_to_text, outputs=stt_output)
75
 
76
  # Image Description
77
  image_input = gr.Image(label="Upload an Image")
 
 
1
  import gradio as gr
2
  from gtts import gTTS
3
  import os
 
15
  return filename
16
 
17
  # Speech-to-Text function
18
+ def speech_to_text(audio):
19
  recognizer = sr.Recognizer()
20
+ with sr.AudioFile(audio) as source: # Use the audio file directly
21
+ audio_data = recognizer.record(source)
 
22
  try:
23
+ text = recognizer.recognize_google(audio_data)
24
  return text
25
  except sr.UnknownValueError:
26
  return "Sorry, I could not understand the audio."
 
39
 
40
# Video Description function
def generate_video_description(video, max_frames=5):
    """Generate textual descriptions for the leading frames of a video.

    Args:
        video: Uploaded file object exposing a ``.name`` filesystem path
            (as provided by the Gradio video component).
        max_frames: Maximum number of frames (from the start of the video)
            to describe. Defaults to 5, preserving the original hard-coded
            limit.

    Returns:
        A list of description strings, one per processed frame; or a plain
        error string when the video file cannot be opened (callers display
        either value directly).
    """
    cap = cv2.VideoCapture(video.name)  # Access the video via its file path
    if not cap.isOpened():
        # Keep the original best-effort contract: report, don't raise.
        return "Error opening video file."

    descriptions = []
    frame_count = 0
    while frame_count < max_frames:
        ret, frame = cap.read()
        if not ret:  # Video ended before reaching the frame limit.
            break
        # OpenCV decodes frames as BGR; convert to RGB before handing the
        # array to PIL, which expects RGB channel order.
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        descriptions.append(generate_image_description(image))
        frame_count += 1

    cap.release()  # Release the capture handle even on early break.
    return descriptions
 
72
  tts_button.click(fn=text_to_speech, inputs=text_input, outputs=tts_output)
73
 
74
  # Speech-to-Text
75
+ stt_input = gr.Audio(source="microphone", type="filepath", label="Record Audio")
76
+ stt_button = gr.Button("Convert Speech to Text")
77
  stt_output = gr.Textbox(label="Speech-to-Text Output")
78
+ stt_button.click(fn=speech_to_text, inputs=stt_input, outputs=stt_output)
79
 
80
  # Image Description
81
  image_input = gr.Image(label="Upload an Image")