File size: 7,818 Bytes
3553b09 17ee1ec 3553b09 9ea8dc2 3553b09 17ee1ec 6537e41 9ea8dc2 3553b09 6537e41 3553b09 5af1eca 3553b09 9ea8dc2 3553b09 9ea8dc2 3553b09 9ea8dc2 3553b09 6537e41 3553b09 d2f3f83 3553b09 6537e41 3553b09 d2f3f83 3553b09 d2f3f83 3553b09 6537e41 d2f3f83 3553b09 d2f3f83 3553b09 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
import gradio as gr
from gtts import gTTS
import os
import speech_recognition as sr
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from PIL import Image
import cv2
from pydub import AudioSegment
# Text-to-Speech function
def text_to_speech(text):
    """Convert *text* to spoken English and return the path of an MP3 file.

    Uses gTTS (Google Text-to-Speech), so network access is required.

    The original implementation always wrote to a shared "output.mp3",
    so two concurrent requests would clobber each other's audio; a
    unique temporary file avoids that.
    """
    import tempfile  # local import: only this handler needs it

    tts = gTTS(text=text, lang='en', slow=False)
    # delete=False: Gradio must be able to read the file after we return.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        filename = f.name
    tts.save(filename)
    return filename
# Speech-to-Text function
def speech_to_text(audio):
    """Transcribe an audio file to text with Google's speech recognizer.

    Parameters
    ----------
    audio : str or None
        Path to the recording (WAV/FLAC/AIFF read directly; MP3 is first
        converted to WAV via pydub). Gradio passes None when the user
        submitted nothing.

    Returns the transcript, or a human-readable error message — this is
    a UI handler, so it deliberately never raises.
    """
    if audio is None:
        # Original code crashed with AttributeError on None.endswith(...)
        return "No audio provided."
    recognizer = sr.Recognizer()
    temp_wav = None
    # Check if the uploaded file is an MP3
    if audio.endswith('.mp3'):
        import tempfile

        # sr.AudioFile cannot read MP3 -- convert to WAV first.
        # A unique temp path replaces the original fixed "temp.wav",
        # which collided between concurrent users and was never deleted.
        audio_segment = AudioSegment.from_mp3(audio)
        fd, temp_wav = tempfile.mkstemp(suffix=".wav")
        os.close(fd)  # pydub reopens the path itself
        audio_segment.export(temp_wav, format="wav")
        audio = temp_wav  # transcribe the converted file
    try:
        with sr.AudioFile(audio) as source:
            audio_data = recognizer.record(source)
            text = recognizer.recognize_google(audio_data)
            return text
    except sr.UnknownValueError:
        return "Sorry, I could not understand the audio."
    except sr.RequestError as e:
        return f"Could not request results; {e}"
    except Exception as e:
        # Broad catch is deliberate: the result feeds a UI textbox.
        return f"An error occurred: {e}"
    finally:
        # Clean up the intermediate WAV regardless of outcome.
        if temp_wav is not None and os.path.exists(temp_wav):
            os.remove(temp_wav)
# Image Description function
# Module-level cache so the BLIP weights are loaded exactly once; the
# original re-instantiated processor AND model on every call (including
# once per frame from the video path), which is extremely slow.
_BLIP_CACHE = {}


def _get_blip():
    """Return (processor, model) for BLIP captioning, loading them on first use."""
    if not _BLIP_CACHE:
        _BLIP_CACHE["processor"] = BlipProcessor.from_pretrained(
            "Salesforce/blip-image-captioning-base")
        _BLIP_CACHE["model"] = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-base")
    return _BLIP_CACHE["processor"], _BLIP_CACHE["model"]


def generate_image_description(image):
    """Return a short English caption for *image* (PIL image or numpy array)."""
    processor, model = _get_blip()
    inputs = processor(images=image, return_tensors="pt")
    # Inference only: no_grad avoids building autograd state.
    with torch.no_grad():
        out = model.generate(**inputs)
    description = processor.decode(out[0], skip_special_tokens=True)
    return description
# Video Description function
def generate_video_description(video, max_frames=5):
    """Caption the first *max_frames* frames of an uploaded video.

    Parameters
    ----------
    video : gradio file object (has ``.name``) or plain path string, or None.
    max_frames : int
        Number of leading frames to describe. Defaults to 5, matching the
        previously hard-coded limit, so existing callers are unaffected.

    Returns a list of per-frame descriptions, or an error string if the
    video cannot be opened.
    """
    if video is None:
        return "Error opening video file."
    # gr.File hands back an object exposing .name; accept bare paths too.
    path = getattr(video, "name", video)
    cap = cv2.VideoCapture(path)
    if not cap.isOpened():
        return "Error opening video file."
    descriptions = []
    try:
        for _ in range(max_frames):
            ret, frame = cap.read()
            if not ret:
                break  # video ended before the frame limit
            # OpenCV decodes frames as BGR; PIL/BLIP expect RGB.
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            descriptions.append(generate_image_description(image))
    finally:
        # Always free the capture, even if captioning raises mid-loop
        # (the original leaked it on any exception).
        cap.release()
    return descriptions if descriptions else ["No frames to describe."]
# Gradio Interface
def main():
    """Build and launch the Gradio UI.

    Lays out four panels in a 2x2 grid -- text-to-speech, speech-to-text,
    image description and video description -- each pairing an HTML info
    card with the input/output widgets wired to its handler above.
    ``app.launch()`` blocks until the server is stopped.
    """
    with gr.Blocks() as app:
        # Page title banner.
        gr.Markdown("<h1 style='text-align: center; color: #1e90ff;'>AI-Powered Accessibility Tools</h1>")
        # Text-to-Speech Section
        with gr.Row():
            with gr.Column(scale=1):
                # Static HTML info card describing the TTS feature.
                gr.Markdown("<div style='background-color: #f0f8ff; padding: 20px; border-radius: 8px;'>"
                            "<h2>Text-to-Speech</h2>"
                            "<ul>"
                            "<li><strong>Core Idea:</strong> Create natural-sounding speech from text input.</li>"
                            "<li><strong>Functionality:</strong> Converts written text into spoken words, helping individuals with reading difficulties or visual impairments.</li>"
                            "<li><strong>Target Audience:</strong> People with visual impairments, reading disabilities, and those who prefer audio content.</li>"
                            "</ul>"
                            "<strong>Supported Input:</strong> Plain text. <br>"
                            "<strong>Output:</strong> MP3 audio file."
                            "</div>")
                text_input = gr.Textbox(label="Enter text for Text-to-Speech", placeholder="Type your text here...")
                tts_button = gr.Button("Convert to Speech")
                tts_output = gr.Audio(label="TTS Output")
                # Button -> text_to_speech(text) -> playable audio widget.
                tts_button.click(fn=text_to_speech, inputs=text_input, outputs=tts_output)
            # Speech-to-Text Section
            with gr.Column(scale=1):
                # Static HTML info card describing the STT feature.
                gr.Markdown("<div style='background-color: #e6ffe6; padding: 20px; border-radius: 8px;'>"
                            "<h2>Speech-to-Text</h2>"
                            "<ul>"
                            "<li><strong>Core Idea:</strong> Convert spoken language into written text.</li>"
                            "<li><strong>Functionality:</strong> Allows users to dictate speech and have it transcribed into text, facilitating communication and documentation.</li>"
                            "<li><strong>Target Audience:</strong> Individuals with hearing impairments, those who prefer speaking over typing, and people with mobility challenges.</li>"
                            "</ul>"
                            "<strong>Supported Input:</strong> WAV, FLAC, AIFF, MP3 (converted to WAV). <br>"
                            "<strong>Output:</strong> Transcribed text."
                            "</div>")
                # type="filepath" so the handler receives a path string.
                stt_input = gr.Audio(label="Record or Upload Audio", type="filepath")
                stt_button = gr.Button("Convert Speech to Text")
                stt_output = gr.Textbox(label="Speech-to-Text Output")
                stt_button.click(fn=speech_to_text, inputs=stt_input, outputs=stt_output)
        # Image Description Section
        with gr.Row():
            with gr.Column(scale=1):
                # Static HTML info card describing image captioning.
                gr.Markdown("<div style='background-color: #ffe6e6; padding: 20px; border-radius: 8px;'>"
                            "<h2>Image Description</h2>"
                            "<ul>"
                            "<li><strong>Core Idea:</strong> Generate descriptive text for images.</li>"
                            "<li><strong>Functionality:</strong> Analyzes and describes the content of images, making visual information accessible to those who are visually impaired.</li>"
                            "<li><strong>Target Audience:</strong> Individuals with visual impairments and those needing assistance in understanding visual content.</li>"
                            "</ul>"
                            "<strong>Supported Input:</strong> JPEG, PNG, BMP, GIF. <br>"
                            "<strong>Output:</strong> Text description."
                            "</div>")
                image_input = gr.Image(label="Upload an Image")
                image_desc_output = gr.Textbox(label="Image Description")
                image_desc_button = gr.Button("Describe Image")
                image_desc_button.click(fn=generate_image_description, inputs=image_input, outputs=image_desc_output)
            # Video Description Section
            with gr.Column(scale=1):
                # Static HTML info card describing video captioning.
                gr.Markdown("<div style='background-color: #fff3e6; padding: 20px; border-radius: 8px;'>"
                            "<h2>Video Description</h2>"
                            "<ul>"
                            "<li><strong>Core Idea:</strong> Describe video content through generated text.</li>"
                            "<li><strong>Functionality:</strong> Provides textual descriptions of video frames, aiding understanding for those who cannot see the video.</li>"
                            "<li><strong>Target Audience:</strong> Individuals with visual impairments and those needing assistance in interpreting video content.</li>"
                            "</ul>"
                            "<strong>Supported Input:</strong> MP4, AVI, MOV. <br>"
                            "<strong>Output:</strong> List of text descriptions."
                            "</div>")
                video_input = gr.File(label="Upload a Video", file_types=[".mp4", ".avi", ".mov"])
                video_desc_output = gr.Textbox(label="Video Descriptions")
                video_desc_button = gr.Button("Describe Video")
                video_desc_button.click(fn=generate_video_description, inputs=video_input, outputs=video_desc_output)
    # Blocking: serves the UI until the process is interrupted.
    app.launch()
# Script entry point: build and serve the UI only when run directly.
if __name__ == "__main__":
    main()
|