import gradio as gr
from gtts import gTTS
import os
import speech_recognition as sr
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from PIL import Image
import cv2
from pydub import AudioSegment
# Text-to-Speech function
def text_to_speech(text, lang='en'):
    """Synthesize speech from text with Google Text-to-Speech (gTTS).

    Args:
        text: The text to read aloud.
        lang: ISO language code for the voice (default 'en'; added as a
            backward-compatible generalization of the hard-coded English).

    Returns:
        Path to the generated MP3 file ("output.mp3", overwritten each call).
    """
    tts = gTTS(text=text, lang=lang, slow=False)
    filename = "output.mp3"
    tts.save(filename)
    return filename
# Speech-to-Text function
def speech_to_text(audio):
    """Transcribe an audio file to text using Google's recognizer.

    MP3 uploads are converted to WAV first, because speech_recognition's
    AudioFile reader only accepts WAV/AIFF/FLAC. The temporary WAV is
    removed afterwards (the original version leaked "temp.wav" on disk).

    Args:
        audio: Filesystem path of the recorded/uploaded audio file.

    Returns:
        The transcribed text, or a human-readable error message string.
    """
    recognizer = sr.Recognizer()
    temp_wav = None
    if audio.endswith('.mp3'):
        # Convert MP3 to WAV so sr.AudioFile can read it.
        audio_segment = AudioSegment.from_mp3(audio)
        temp_wav = "temp.wav"
        audio_segment.export(temp_wav, format="wav")
        audio = temp_wav  # Transcribe the converted file instead.
    try:
        with sr.AudioFile(audio) as source:
            audio_data = recognizer.record(source)
        text = recognizer.recognize_google(audio_data)
        return text
    except sr.UnknownValueError:
        return "Sorry, I could not understand the audio."
    except sr.RequestError as e:
        return f"Could not request results; {e}"
    except Exception as e:
        return f"An error occurred: {e}"
    finally:
        # Clean up the converted WAV so repeated calls don't accumulate files.
        if temp_wav is not None and os.path.exists(temp_wav):
            os.remove(temp_wav)
# Image Description function
def generate_image_description(image):
    """Generate a natural-language caption for an image with BLIP.

    The BLIP processor and model are loaded once on first call and cached
    as function attributes — the original reloaded both from the hub/disk
    on every invocation, which dominates runtime (and is called per frame
    by generate_video_description).

    Args:
        image: A PIL image (anything BlipProcessor accepts as `images=`).

    Returns:
        The generated caption string.
    """
    if not hasattr(generate_image_description, "_model"):
        generate_image_description._processor = BlipProcessor.from_pretrained(
            "Salesforce/blip-image-captioning-base")
        generate_image_description._model = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-base")
    processor = generate_image_description._processor
    model = generate_image_description._model
    inputs = processor(images=image, return_tensors="pt")
    out = model.generate(**inputs)
    description = processor.decode(out[0], skip_special_tokens=True)
    return description
# Video Description function
def generate_video_description(video):
    """Caption the first few frames of a video with BLIP.

    Args:
        video: The uploaded file — either a wrapper object exposing a
            `.name` path (older gr.File behavior) or a plain path string
            (newer Gradio versions); the original crashed on the latter.

    Returns:
        A newline-joined string of per-frame captions, or an error/empty
        message. (Joined to a string because the output component is a
        gr.Textbox, which would render a raw list as its Python repr.)
    """
    # Accept both a tempfile-style wrapper and a plain filesystem path.
    path = video if isinstance(video, str) else video.name
    cap = cv2.VideoCapture(path)
    if not cap.isOpened():
        return "Error opening video file."
    descriptions = []
    try:
        for _ in range(5):  # Limit to the first 5 frames for this example.
            ret, frame = cap.read()
            if not ret:
                break
            # OpenCV decodes BGR; BLIP/PIL expect RGB.
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            descriptions.append(generate_image_description(image))
    finally:
        # Release the capture even if captioning a frame raises.
        cap.release()
    return "\n".join(descriptions) if descriptions else "No frames to describe."
# Gradio Interface
def main():
    """Build and launch the Gradio UI for the four accessibility tools.

    Layout: two rows of two columns — Text-to-Speech / Speech-to-Text on
    top, Image Description / Video Description below. The markdown string
    literals in the original were garbled (broken multi-line quoting);
    they are reconstructed here as well-formed markdown preserving the
    original text content.
    """
    with gr.Blocks() as app:
        gr.Markdown("# AI-Powered Accessibility Tools")

        # Text-to-Speech and Speech-to-Text side by side.
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown(
                    "## Text-to-Speech\n"
                    "- Core Idea: Create natural-sounding speech from text input.\n"
                    "- Functionality: Converts written text into spoken words, "
                    "helping individuals with reading difficulties or visual impairments.\n"
                    "- Target Audience: People with visual impairments, reading "
                    "disabilities, and those who prefer audio content.\n\n"
                    "Supported Input: Plain text.\n\n"
                    "Output: MP3 audio file."
                )
                text_input = gr.Textbox(label="Enter text for Text-to-Speech",
                                        placeholder="Type your text here...")
                tts_button = gr.Button("Convert to Speech")
                tts_output = gr.Audio(label="TTS Output")
                tts_button.click(fn=text_to_speech, inputs=text_input, outputs=tts_output)

            with gr.Column(scale=1):
                gr.Markdown(
                    "## Speech-to-Text\n"
                    "- Core Idea: Convert spoken language into written text.\n"
                    "- Functionality: Allows users to dictate speech and have it "
                    "transcribed into text, facilitating communication and documentation.\n"
                    "- Target Audience: Individuals with hearing impairments, those who "
                    "prefer speaking over typing, and people with mobility challenges.\n\n"
                    "Supported Input: WAV, FLAC, AIFF, MP3 (converted to WAV).\n\n"
                    "Output: Transcribed text."
                )
                stt_input = gr.Audio(label="Record or Upload Audio", type="filepath")
                stt_button = gr.Button("Convert Speech to Text")
                stt_output = gr.Textbox(label="Speech-to-Text Output")
                stt_button.click(fn=speech_to_text, inputs=stt_input, outputs=stt_output)

        # Image Description and Video Description side by side.
        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown(
                    "## Image Description\n"
                    "- Core Idea: Generate descriptive text for images.\n"
                    "- Functionality: Analyzes and describes the content of images, "
                    "making visual information accessible to those who are visually impaired.\n"
                    "- Target Audience: Individuals with visual impairments and those "
                    "needing assistance in understanding visual content.\n\n"
                    "Supported Input: JPEG, PNG, BMP, GIF.\n\n"
                    "Output: Text description."
                )
                image_input = gr.Image(label="Upload an Image")
                image_desc_output = gr.Textbox(label="Image Description")
                image_desc_button = gr.Button("Describe Image")
                image_desc_button.click(fn=generate_image_description,
                                        inputs=image_input, outputs=image_desc_output)

            with gr.Column(scale=1):
                gr.Markdown(
                    "## Video Description\n"
                    "- Core Idea: Describe video content through generated text.\n"
                    "- Functionality: Provides textual descriptions of video frames, "
                    "aiding understanding for those who cannot see the video.\n"
                    "- Target Audience: Individuals with visual impairments and those "
                    "needing assistance in interpreting video content.\n\n"
                    "Supported Input: MP4, AVI, MOV.\n\n"
                    "Output: List of text descriptions."
                )
                video_input = gr.File(label="Upload a Video",
                                      file_types=[".mp4", ".avi", ".mov"])
                video_desc_output = gr.Textbox(label="Video Descriptions")
                video_desc_button = gr.Button("Describe Video")
                video_desc_button.click(fn=generate_video_description,
                                        inputs=video_input, outputs=video_desc_output)

        app.launch()


if __name__ == "__main__":
    main()