File size: 7,818 Bytes
3553b09 17ee1ec 3553b09 9ea8dc2 3553b09 17ee1ec 6537e41 9ea8dc2 3553b09 6537e41 3553b09 5af1eca 3553b09 9ea8dc2 3553b09 9ea8dc2 3553b09 9ea8dc2 3553b09 6537e41 3553b09 d2f3f83 3553b09 6537e41 3553b09 d2f3f83 3553b09 d2f3f83 3553b09 6537e41 d2f3f83 3553b09 d2f3f83 3553b09 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 |
import gradio as gr
from gtts import gTTS
import os
import speech_recognition as sr
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
from PIL import Image
import cv2
from pydub import AudioSegment
# Text-to-Speech function
def text_to_speech(text):
    """Convert *text* to spoken English and return the path of an MP3 file.

    Uses gTTS (Google Text-to-Speech), so network access is required.

    The original implementation always wrote to a shared "output.mp3",
    so two concurrent requests would clobber each other's audio; a
    unique temporary file avoids that.
    """
    import tempfile  # local import: only this handler needs it

    tts = gTTS(text=text, lang='en', slow=False)
    # delete=False: Gradio must be able to read the file after we return.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        filename = f.name
    tts.save(filename)
    return filename
# Speech-to-Text function
def speech_to_text(audio):
    """Transcribe an audio file to text with Google's speech recognizer.

    Parameters
    ----------
    audio : str or None
        Path to the recording (WAV/FLAC/AIFF read directly; MP3 is first
        converted to WAV via pydub). Gradio passes None when the user
        submitted nothing.

    Returns the transcript, or a human-readable error message — this is
    a UI handler, so it deliberately never raises.
    """
    if audio is None:
        # Original code crashed with AttributeError on None.endswith(...)
        return "No audio provided."
    recognizer = sr.Recognizer()
    temp_wav = None
    # Check if the uploaded file is an MP3
    if audio.endswith('.mp3'):
        import tempfile

        # sr.AudioFile cannot read MP3 -- convert to WAV first.
        # A unique temp path replaces the original fixed "temp.wav",
        # which collided between concurrent users and was never deleted.
        audio_segment = AudioSegment.from_mp3(audio)
        fd, temp_wav = tempfile.mkstemp(suffix=".wav")
        os.close(fd)  # pydub reopens the path itself
        audio_segment.export(temp_wav, format="wav")
        audio = temp_wav  # transcribe the converted file
    try:
        with sr.AudioFile(audio) as source:
            audio_data = recognizer.record(source)
            text = recognizer.recognize_google(audio_data)
            return text
    except sr.UnknownValueError:
        return "Sorry, I could not understand the audio."
    except sr.RequestError as e:
        return f"Could not request results; {e}"
    except Exception as e:
        # Broad catch is deliberate: the result feeds a UI textbox.
        return f"An error occurred: {e}"
    finally:
        # Clean up the intermediate WAV regardless of outcome.
        if temp_wav is not None and os.path.exists(temp_wav):
            os.remove(temp_wav)
# Image Description function
# Module-level cache so the BLIP weights are loaded exactly once; the
# original re-instantiated processor AND model on every call (including
# once per frame from the video path), which is extremely slow.
_BLIP_CACHE = {}


def _get_blip():
    """Return (processor, model) for BLIP captioning, loading them on first use."""
    if not _BLIP_CACHE:
        _BLIP_CACHE["processor"] = BlipProcessor.from_pretrained(
            "Salesforce/blip-image-captioning-base")
        _BLIP_CACHE["model"] = BlipForConditionalGeneration.from_pretrained(
            "Salesforce/blip-image-captioning-base")
    return _BLIP_CACHE["processor"], _BLIP_CACHE["model"]


def generate_image_description(image):
    """Return a short English caption for *image* (PIL image or numpy array)."""
    processor, model = _get_blip()
    inputs = processor(images=image, return_tensors="pt")
    # Inference only: no_grad avoids building autograd state.
    with torch.no_grad():
        out = model.generate(**inputs)
    description = processor.decode(out[0], skip_special_tokens=True)
    return description
# Video Description function
def generate_video_description(video, max_frames=5):
    """Caption the first *max_frames* frames of an uploaded video.

    Parameters
    ----------
    video : gradio file object (has ``.name``) or plain path string, or None.
    max_frames : int
        Number of leading frames to describe. Defaults to 5, matching the
        previously hard-coded limit, so existing callers are unaffected.

    Returns a list of per-frame descriptions, or an error string if the
    video cannot be opened.
    """
    if video is None:
        return "Error opening video file."
    # gr.File hands back an object exposing .name; accept bare paths too.
    path = getattr(video, "name", video)
    cap = cv2.VideoCapture(path)
    if not cap.isOpened():
        return "Error opening video file."
    descriptions = []
    try:
        for _ in range(max_frames):
            ret, frame = cap.read()
            if not ret:
                break  # video ended before the frame limit
            # OpenCV decodes frames as BGR; PIL/BLIP expect RGB.
            image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
            descriptions.append(generate_image_description(image))
    finally:
        # Always free the capture, even if captioning raises mid-loop
        # (the original leaked it on any exception).
        cap.release()
    return descriptions if descriptions else ["No frames to describe."]
# Gradio Interface
def main():
    """Build and launch the Gradio UI.

    Lays out four panels in a 2x2 grid -- text-to-speech, speech-to-text,
    image description and video description -- each pairing an HTML info
    card with the input/output widgets wired to its handler above.
    ``app.launch()`` blocks until the server is stopped.
    """
    with gr.Blocks() as app:
        # Page title banner.
        gr.Markdown("<h1 style='text-align: center; color: #1e90ff;'>AI-Powered Accessibility Tools</h1>")
        # Text-to-Speech Section
        with gr.Row():
            with gr.Column(scale=1):
                # Static HTML info card describing the TTS feature.
                gr.Markdown("<div style='background-color: #f0f8ff; padding: 20px; border-radius: 8px;'>"
                            "<h2>Text-to-Speech</h2>"
                            "<ul>"
                            "<li><strong>Core Idea:</strong> Create natural-sounding speech from text input.</li>"
                            "<li><strong>Functionality:</strong> Converts written text into spoken words, helping individuals with reading difficulties or visual impairments.</li>"
                            "<li><strong>Target Audience:</strong> People with visual impairments, reading disabilities, and those who prefer audio content.</li>"
                            "</ul>"
                            "<strong>Supported Input:</strong> Plain text. <br>"
                            "<strong>Output:</strong> MP3 audio file."
                            "</div>")
                text_input = gr.Textbox(label="Enter text for Text-to-Speech", placeholder="Type your text here...")
                tts_button = gr.Button("Convert to Speech")
                tts_output = gr.Audio(label="TTS Output")
                # Button -> text_to_speech(text) -> playable audio widget.
                tts_button.click(fn=text_to_speech, inputs=text_input, outputs=tts_output)
            # Speech-to-Text Section
            with gr.Column(scale=1):
                # Static HTML info card describing the STT feature.
                gr.Markdown("<div style='background-color: #e6ffe6; padding: 20px; border-radius: 8px;'>"
                            "<h2>Speech-to-Text</h2>"
                            "<ul>"
                            "<li><strong>Core Idea:</strong> Convert spoken language into written text.</li>"
                            "<li><strong>Functionality:</strong> Allows users to dictate speech and have it transcribed into text, facilitating communication and documentation.</li>"
                            "<li><strong>Target Audience:</strong> Individuals with hearing impairments, those who prefer speaking over typing, and people with mobility challenges.</li>"
                            "</ul>"
                            "<strong>Supported Input:</strong> WAV, FLAC, AIFF, MP3 (converted to WAV). <br>"
                            "<strong>Output:</strong> Transcribed text."
                            "</div>")
                # type="filepath" so the handler receives a path string.
                stt_input = gr.Audio(label="Record or Upload Audio", type="filepath")
                stt_button = gr.Button("Convert Speech to Text")
                stt_output = gr.Textbox(label="Speech-to-Text Output")
                stt_button.click(fn=speech_to_text, inputs=stt_input, outputs=stt_output)
        # Image Description Section
        with gr.Row():
            with gr.Column(scale=1):
                # Static HTML info card describing image captioning.
                gr.Markdown("<div style='background-color: #ffe6e6; padding: 20px; border-radius: 8px;'>"
                            "<h2>Image Description</h2>"
                            "<ul>"
                            "<li><strong>Core Idea:</strong> Generate descriptive text for images.</li>"
                            "<li><strong>Functionality:</strong> Analyzes and describes the content of images, making visual information accessible to those who are visually impaired.</li>"
                            "<li><strong>Target Audience:</strong> Individuals with visual impairments and those needing assistance in understanding visual content.</li>"
                            "</ul>"
                            "<strong>Supported Input:</strong> JPEG, PNG, BMP, GIF. <br>"
                            "<strong>Output:</strong> Text description."
                            "</div>")
                image_input = gr.Image(label="Upload an Image")
                image_desc_output = gr.Textbox(label="Image Description")
                image_desc_button = gr.Button("Describe Image")
                image_desc_button.click(fn=generate_image_description, inputs=image_input, outputs=image_desc_output)
            # Video Description Section
            with gr.Column(scale=1):
                # Static HTML info card describing video captioning.
                gr.Markdown("<div style='background-color: #fff3e6; padding: 20px; border-radius: 8px;'>"
                            "<h2>Video Description</h2>"
                            "<ul>"
                            "<li><strong>Core Idea:</strong> Describe video content through generated text.</li>"
                            "<li><strong>Functionality:</strong> Provides textual descriptions of video frames, aiding understanding for those who cannot see the video.</li>"
                            "<li><strong>Target Audience:</strong> Individuals with visual impairments and those needing assistance in interpreting video content.</li>"
                            "</ul>"
                            "<strong>Supported Input:</strong> MP4, AVI, MOV. <br>"
                            "<strong>Output:</strong> List of text descriptions."
                            "</div>")
                video_input = gr.File(label="Upload a Video", file_types=[".mp4", ".avi", ".mov"])
                video_desc_output = gr.Textbox(label="Video Descriptions")
                video_desc_button = gr.Button("Describe Video")
                video_desc_button.click(fn=generate_video_description, inputs=video_input, outputs=video_desc_output)
    # Blocking: serves the UI until the process is interrupted.
    app.launch()
# Script entry point: build and serve the UI only when run directly.
if __name__ == "__main__":
    main()
|