import gradio as gr from gtts import gTTS import os import speech_recognition as sr from transformers import BlipProcessor, BlipForConditionalGeneration import torch from PIL import Image import cv2 from pydub import AudioSegment # Text-to-Speech function def text_to_speech(text): tts = gTTS(text=text, lang='en', slow=False) filename = "output.mp3" tts.save(filename) return filename # Speech-to-Text function def speech_to_text(audio): recognizer = sr.Recognizer() # Check if the uploaded file is an MP3 if audio.endswith('.mp3'): # Convert MP3 to WAV audio_segment = AudioSegment.from_mp3(audio) wav_file = "temp.wav" audio_segment.export(wav_file, format="wav") audio = wav_file # Update audio to the converted file try: with sr.AudioFile(audio) as source: audio_data = recognizer.record(source) text = recognizer.recognize_google(audio_data) return text except sr.UnknownValueError: return "Sorry, I could not understand the audio." except sr.RequestError as e: return f"Could not request results; {e}" except Exception as e: return f"An error occurred: {e}" # Image Description function def generate_image_description(image): processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base") model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") inputs = processor(images=image, return_tensors="pt") out = model.generate(**inputs) description = processor.decode(out[0], skip_special_tokens=True) return description # Video Description function def generate_video_description(video): cap = cv2.VideoCapture(video.name) descriptions = [] if not cap.isOpened(): return "Error opening video file." frame_count = 0 while frame_count < 5: # Limit to first 5 frames for this example ret, frame = cap.read() if not ret: break image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)) description = generate_image_description(image) descriptions.append(description) frame_count += 1 cap.release() return descriptions if descriptions else ["No frames to describe."] # Gradio Interface def main(): with gr.Blocks() as app: gr.Markdown("

AI-Powered Accessibility Tools

") # Text-to-Speech Section with gr.Row(): with gr.Column(scale=1): gr.Markdown("
" "

Text-to-Speech

" "" "Supported Input: Plain text.
" "Output: MP3 audio file." "
") text_input = gr.Textbox(label="Enter text for Text-to-Speech", placeholder="Type your text here...") tts_button = gr.Button("Convert to Speech") tts_output = gr.Audio(label="TTS Output") tts_button.click(fn=text_to_speech, inputs=text_input, outputs=tts_output) # Speech-to-Text Section with gr.Column(scale=1): gr.Markdown("
" "

Speech-to-Text

" "" "Supported Input: WAV, FLAC, AIFF, MP3 (converted to WAV).
" "Output: Transcribed text." "
") stt_input = gr.Audio(label="Record or Upload Audio", type="filepath") stt_button = gr.Button("Convert Speech to Text") stt_output = gr.Textbox(label="Speech-to-Text Output") stt_button.click(fn=speech_to_text, inputs=stt_input, outputs=stt_output) # Image Description Section with gr.Row(): with gr.Column(scale=1): gr.Markdown("
" "

Image Description

" "" "Supported Input: JPEG, PNG, BMP, GIF.
" "Output: Text description." "
") image_input = gr.Image(label="Upload an Image") image_desc_output = gr.Textbox(label="Image Description") image_desc_button = gr.Button("Describe Image") image_desc_button.click(fn=generate_image_description, inputs=image_input, outputs=image_desc_output) # Video Description Section with gr.Column(scale=1): gr.Markdown("
" "

Video Description

" "" "Supported Input: MP4, AVI, MOV.
" "Output: List of text descriptions." "
") video_input = gr.File(label="Upload a Video", file_types=[".mp4", ".avi", ".mov"]) video_desc_output = gr.Textbox(label="Video Descriptions") video_desc_button = gr.Button("Describe Video") video_desc_button.click(fn=generate_video_description, inputs=video_input, outputs=video_desc_output) app.launch() if __name__ == "__main__": main()