File size: 2,019 Bytes
1b65d81
 
 
6516b6c
026149f
1b65d81
2c32f32
 
1b65d81
fd054be
 
352fca9
fd054be
b1b53ca
 
 
46d8531
fd054be
 
 
 
46d8531
352fca9
d9fea6d
026149f
 
 
352fca9
026149f
 
352fca9
 
 
2c32f32
 
 
 
352fca9
 
 
 
026149f
 
 
 
 
 
 
 
 
352fca9
026149f
 
 
 
 
 
 
d9fea6d
026149f
 
 
 
1655eaa
1b65d81
8efeaf2
026149f
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import gradio as gr
import cv2
import numpy as np
from PIL import Image
import pytesseract
from gtts import gTTS
import tempfile
import os

def preprocess(image):
    """Prepare a webcam frame for OCR.

    Mirrors the frame horizontally (webcam feeds arrive mirrored), converts
    to grayscale, sharpens, denoises, and adaptively thresholds it.

    Args:
        image: PIL.Image from Gradio (``type="pil"``). Any mode is accepted;
            it is normalized to RGB before processing.

    Returns:
        tuple: ``(thresh, img)`` where ``thresh`` is the binarized uint8
        image handed to the OCR engine and ``img`` is the mirror-corrected
        RGB frame used for the preview.
    """
    # Normalize to 3-channel RGB so cvtColor never sees RGBA/grayscale input.
    img = np.array(image.convert("RGB"))
    img = cv2.flip(img, 1)  # undo the webcam's horizontal mirroring
    # BUG FIX: the array comes from PIL and is RGB, not BGR. COLOR_BGR2GRAY
    # would apply the red/blue luminance weights to the wrong channels.
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    # 3x3 sharpening kernel to crisp up glyph edges before thresholding.
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    gray = cv2.filter2D(gray, -1, kernel)
    # Strong non-local-means denoising (h=30) to suppress sensor noise.
    gray = cv2.fastNlMeansDenoising(gray, None, 30, 7, 21)
    # Adaptive (per-neighborhood) threshold copes with uneven page lighting.
    thresh = cv2.adaptiveThreshold(
        gray, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY,
        31, 10
    )
    return thresh, img

def extract_text_and_speak(image):
    """Run OCR on a webcam frame and synthesize speech from the result.

    Args:
        image: PIL.Image from the webcam, or ``None`` when the feed is
            cleared (Gradio fires ``.change`` with ``None`` in that case).

    Returns:
        tuple: ``(preview_pil, processed_pil, text, audio_file)`` — the
        mirror-corrected preview, the binarized OCR input, the extracted
        text (or a placeholder message), and the path to an MP3 file or
        ``None`` when no speech was generated.
    """
    # BUG FIX: .change fires with None when the image is cleared; the
    # original crashed with AttributeError inside preprocess().
    if image is None:
        return None, None, "No readable text found.", None

    processed, flipped_preview = preprocess(image)
    processed_pil = Image.fromarray(processed)
    preview_pil = Image.fromarray(flipped_preview)

    text = pytesseract.image_to_string(processed, lang="eng").strip()

    audio_file = None
    if text:
        try:
            # delete=False: Gradio must read the file after this handler
            # returns, so it cannot be removed on close.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmpfile:
                gTTS(text).save(tmpfile.name)
                audio_file = tmpfile.name
        except Exception:
            # gTTS requires network access; degrade to text-only output
            # instead of crashing the whole app when synthesis fails.
            audio_file = None
    else:
        text = "No readable text found."

    return preview_pil, processed_pil, text, audio_file

# --- UI layout -------------------------------------------------------------
with gr.Blocks() as demo:
    gr.Markdown("## GabAI - AI Assistive Reading System")

    with gr.Row():
        # Raw webcam feed (Gradio displays webcam input mirrored by default).
        webcam = gr.Image(
            type="pil",
            sources=["webcam"],
            label="Live Webcam (mirrored)"
        )
        # Frame after the horizontal flip applied in preprocess().
        corrected_preview = gr.Image(type="pil", label="Mirror-Corrected Preview")

    processed_preview = gr.Image(type="pil", label="Processed Preview for OCR")
    ocr_text = gr.Textbox(label="Extracted Text")
    audio_output = gr.Audio(label="Text-to-Speech Output")

    # Re-run OCR + TTS whenever the webcam image changes.
    # NOTE(review): .change also fires with value None when the image is
    # cleared — confirm extract_text_and_speak tolerates None input.
    webcam.change(
        fn=extract_text_and_speak,
        inputs=webcam,
        outputs=[corrected_preview, processed_preview, ocr_text, audio_output]
    )

if __name__ == "__main__":
    demo.launch()