File size: 7,818 Bytes
3553b09
 
 
 
 
 
 
 
17ee1ec
3553b09
 
 
 
 
 
 
 
 
9ea8dc2
3553b09
17ee1ec
 
 
 
 
 
 
 
 
6537e41
 
 
9ea8dc2
3553b09
6537e41
 
 
 
 
 
3553b09
 
 
 
 
 
 
 
 
 
 
 
 
5af1eca
3553b09
 
9ea8dc2
 
 
 
 
3553b09
 
 
9ea8dc2
 
3553b09
9ea8dc2
3553b09
 
6537e41
3553b09
 
 
 
d2f3f83
3553b09
6537e41
3553b09
d2f3f83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3553b09
d2f3f83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3553b09
6537e41
d2f3f83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3553b09
d2f3f83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3553b09
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import gradio as gr
from gtts import gTTS
import os
import speech_recognition as sr
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from PIL import Image
import cv2
from pydub import AudioSegment

# Text-to-Speech function
def text_to_speech(text):
    """Synthesize English speech for ``text`` and return the MP3 file path.

    Args:
        text: The text to speak.

    Returns:
        Path to a newly written MP3 file, or ``None`` when ``text`` is empty
        or whitespace-only (nothing to speak, so no audio is produced).

    Raises:
        Exception: Whatever gTTS raises while synthesizing or saving; the
            partially written temp file is removed before re-raising.
    """
    # Nothing to synthesize: skip the TTS call instead of letting it fail on
    # empty input.
    if not text or not text.strip():
        return None

    import tempfile

    # Use a unique file per call; a fixed "output.mp3" would be overwritten
    # when two requests are served at the same time.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
        filename = tmp.name

    try:
        tts = gTTS(text=text, lang='en', slow=False)
        tts.save(filename)
    except Exception:
        # Don't leave an empty/partial temp file behind on failure.
        if os.path.exists(filename):
            os.remove(filename)
        raise
    return filename

# Speech-to-Text function
def speech_to_text(audio):
    """Transcribe an audio file to text with Google's speech recognizer.

    MP3 input (matched case-insensitively by extension) is first converted
    to a temporary WAV file, which is deleted once transcription finishes.

    Args:
        audio: Path to the audio file, or ``None``/empty when nothing was
            recorded or uploaded.

    Returns:
        The transcribed text, or a human-readable message describing why
        transcription was not possible. Errors are reported as strings
        rather than raised.
    """
    # Without this guard, a missing upload crashes on `.endswith` below.
    if not audio:
        return "No audio provided."

    import tempfile

    recognizer = sr.Recognizer()
    converted_path = None

    try:
        # Conversion lives inside the try so decoder failures are reported
        # as a message like every other error in this function.
        if audio.lower().endswith('.mp3'):
            # Unique temp file: a shared "temp.wav" would collide between
            # concurrent requests.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                converted_path = tmp.name
            AudioSegment.from_mp3(audio).export(converted_path, format="wav")
            audio = converted_path  # Transcribe the converted file instead

        with sr.AudioFile(audio) as source:
            audio_data = recognizer.record(source)
            text = recognizer.recognize_google(audio_data)
            return text
    except sr.UnknownValueError:
        return "Sorry, I could not understand the audio."
    except sr.RequestError as e:
        return f"Could not request results; {e}"
    except Exception as e:
        return f"An error occurred: {e}"
    finally:
        # Always clean up the intermediate WAV, whether or not we succeeded.
        if converted_path is not None and os.path.exists(converted_path):
            try:
                os.remove(converted_path)
            except OSError:
                pass  # Best-effort cleanup; the result has already been decided

# Image Description function
def generate_image_description(image):
    """Generate a short English caption for an image using BLIP.

    Args:
        image: The image to caption, in any form the BLIP processor accepts
            (this file passes a PIL image from the video path and whatever
            the Gradio image component supplies), or ``None`` when no image
            was provided.

    Returns:
        The generated caption, or ``"No image provided."`` when ``image``
        is ``None``.
    """
    if image is None:
        return "No image provided."

    processor, model = _load_blip()
    inputs = processor(images=image, return_tensors="pt")
    out = model.generate(**inputs)
    return processor.decode(out[0], skip_special_tokens=True)


# Checkpoint used for captioning, and the lazily created (processor, model) pair.
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
_blip_components = None


def _load_blip():
    """Return the BLIP (processor, model) pair, loading it on first use only.

    Loading is expensive, and the captioner is called once per frame by the
    video path, so the pair is kept at module level and reused.
    """
    global _blip_components
    if _blip_components is None:
        _blip_components = (
            BlipProcessor.from_pretrained(_BLIP_CHECKPOINT),
            BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT),
        )
    return _blip_components

# Video Description function
def generate_video_description(video, max_frames=5):
    """Caption the first few frames of a video.

    Args:
        video: The uploaded video, either a path string or an object whose
            ``name`` attribute is the path, or ``None`` when nothing was
            uploaded.
        max_frames: Maximum number of leading frames to caption. Defaults
            to 5, the limit that was previously hard-coded.

    Returns:
        A list with one caption per frame read (``["No frames to
        describe."]`` if no frame could be read), or an error message
        string when no video was given or it could not be opened.
    """
    if video is None:
        return "No video provided."

    # NOTE(review): depending on the Gradio version/config the file input may
    # arrive as a plain path or as a file wrapper exposing `.name`; accept both.
    path = video if isinstance(video, str) else video.name

    cap = cv2.VideoCapture(path)
    try:
        if not cap.isOpened():
            return "Error opening video file."

        descriptions = []
        for _ in range(max_frames):
            ret, frame = cap.read()
            if not ret:
                break  # End of stream or unreadable frame
            # OpenCV yields BGR; convert to RGB before handing to PIL.
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            descriptions.append(generate_image_description(image))
    finally:
        # Release the capture even if captioning raises part-way through.
        cap.release()

    return descriptions if descriptions else ["No frames to describe."]

# Gradio Interface
def main():
    """Assemble the accessibility-tools Gradio page and launch it.

    The page is a heading followed by two rows of two columns each:
    Text-to-Speech and Speech-to-Text on top, Image Description and Video
    Description below. Each column shows an explanatory HTML blurb, its
    input and output widgets, and a button wired to the matching handler.
    """
    # Explanatory HTML blurbs, one per tool, rendered at the top of each column.
    tts_blurb = (
        "<div style='background-color: #f0f8ff; padding: 20px; border-radius: 8px;'>"
        "<h2>Text-to-Speech</h2>"
        "<ul>"
        "<li><strong>Core Idea:</strong> Create natural-sounding speech from text input.</li>"
        "<li><strong>Functionality:</strong> Converts written text into spoken words, helping individuals with reading difficulties or visual impairments.</li>"
        "<li><strong>Target Audience:</strong> People with visual impairments, reading disabilities, and those who prefer audio content.</li>"
        "</ul>"
        "<strong>Supported Input:</strong> Plain text. <br>"
        "<strong>Output:</strong> MP3 audio file."
        "</div>"
    )
    stt_blurb = (
        "<div style='background-color: #e6ffe6; padding: 20px; border-radius: 8px;'>"
        "<h2>Speech-to-Text</h2>"
        "<ul>"
        "<li><strong>Core Idea:</strong> Convert spoken language into written text.</li>"
        "<li><strong>Functionality:</strong> Allows users to dictate speech and have it transcribed into text, facilitating communication and documentation.</li>"
        "<li><strong>Target Audience:</strong> Individuals with hearing impairments, those who prefer speaking over typing, and people with mobility challenges.</li>"
        "</ul>"
        "<strong>Supported Input:</strong> WAV, FLAC, AIFF, MP3 (converted to WAV). <br>"
        "<strong>Output:</strong> Transcribed text."
        "</div>"
    )
    image_blurb = (
        "<div style='background-color: #ffe6e6; padding: 20px; border-radius: 8px;'>"
        "<h2>Image Description</h2>"
        "<ul>"
        "<li><strong>Core Idea:</strong> Generate descriptive text for images.</li>"
        "<li><strong>Functionality:</strong> Analyzes and describes the content of images, making visual information accessible to those who are visually impaired.</li>"
        "<li><strong>Target Audience:</strong> Individuals with visual impairments and those needing assistance in understanding visual content.</li>"
        "</ul>"
        "<strong>Supported Input:</strong> JPEG, PNG, BMP, GIF. <br>"
        "<strong>Output:</strong> Text description."
        "</div>"
    )
    video_blurb = (
        "<div style='background-color: #fff3e6; padding: 20px; border-radius: 8px;'>"
        "<h2>Video Description</h2>"
        "<ul>"
        "<li><strong>Core Idea:</strong> Describe video content through generated text.</li>"
        "<li><strong>Functionality:</strong> Provides textual descriptions of video frames, aiding understanding for those who cannot see the video.</li>"
        "<li><strong>Target Audience:</strong> Individuals with visual impairments and those needing assistance in interpreting video content.</li>"
        "</ul>"
        "<strong>Supported Input:</strong> MP4, AVI, MOV. <br>"
        "<strong>Output:</strong> List of text descriptions."
        "</div>"
    )

    with gr.Blocks() as demo:
        gr.Markdown("<h1 style='text-align: center; color: #1e90ff;'>AI-Powered Accessibility Tools</h1>")

        # Top row: the two audio tools side by side.
        with gr.Row():
            # Text-to-Speech column
            with gr.Column(scale=1):
                gr.Markdown(tts_blurb)
                tts_text = gr.Textbox(label="Enter text for Text-to-Speech", placeholder="Type your text here...")
                tts_trigger = gr.Button("Convert to Speech")
                tts_audio = gr.Audio(label="TTS Output")
                tts_trigger.click(fn=text_to_speech, inputs=tts_text, outputs=tts_audio)

            # Speech-to-Text column
            with gr.Column(scale=1):
                gr.Markdown(stt_blurb)
                stt_audio = gr.Audio(label="Record or Upload Audio", type="filepath")
                stt_trigger = gr.Button("Convert Speech to Text")
                stt_text = gr.Textbox(label="Speech-to-Text Output")
                stt_trigger.click(fn=speech_to_text, inputs=stt_audio, outputs=stt_text)

        # Bottom row: the two visual tools side by side.
        with gr.Row():
            # Image Description column
            with gr.Column(scale=1):
                gr.Markdown(image_blurb)
                image_upload = gr.Image(label="Upload an Image")
                image_caption = gr.Textbox(label="Image Description")
                image_trigger = gr.Button("Describe Image")
                image_trigger.click(fn=generate_image_description, inputs=image_upload, outputs=image_caption)

            # Video Description column
            with gr.Column(scale=1):
                gr.Markdown(video_blurb)
                video_upload = gr.File(label="Upload a Video", file_types=[".mp4", ".avi", ".mov"])
                video_captions = gr.Textbox(label="Video Descriptions")
                video_trigger = gr.Button("Describe Video")
                video_trigger.click(fn=generate_video_description, inputs=video_upload, outputs=video_captions)

    demo.launch()

# Build and launch the UI only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()