import gradio as gr
import cv2
import numpy as np
from PIL import Image
import pytesseract
from gtts import gTTS
import tempfile
import os


def preprocess(image):
    """Prepare a webcam frame for OCR.

    Args:
        image: PIL.Image frame from the (mirrored) webcam, RGB order.

    Returns:
        (thresh, img): the binarized OCR-ready uint8 array and the
        mirror-corrected color frame (RGB numpy array) for preview.
    """
    img = np.array(image)
    img = cv2.flip(img, 1)  # Undo the webcam's horizontal mirroring
    # PIL delivers RGB, not BGR — use COLOR_RGB2GRAY so the red/blue
    # luma weights are applied to the correct channels.
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    # Sharpen so character strokes stand out before thresholding.
    kernel = np.array([[0, -1, 0],
                       [-1, 5, -1],
                       [0, -1, 0]])
    gray = cv2.filter2D(gray, -1, kernel)
    # Non-local-means denoising removes sensor noise that would
    # otherwise survive into the binarized image.
    gray = cv2.fastNlMeansDenoising(gray, None, 30, 7, 21)
    # Adaptive threshold copes with uneven lighting across the page.
    thresh = cv2.adaptiveThreshold(
        gray, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 31, 10
    )
    return thresh, img


def extract_text_and_speak(image):
    """Run OCR on a webcam frame and synthesize speech for the result.

    Args:
        image: PIL.Image from the webcam, or None (Gradio change events
            can fire with no frame, e.g. when the stream stops).

    Returns:
        Tuple of (mirror-corrected preview PIL image, processed preview
        PIL image, extracted text, path to an mp3 file or None).
    """
    # Guard: without this, np.array(None) reaches cv2.flip and crashes.
    if image is None:
        return None, None, "No readable text found.", None

    processed, flipped_preview = preprocess(image)
    processed_pil = Image.fromarray(processed)
    preview_pil = Image.fromarray(flipped_preview)

    text = pytesseract.image_to_string(processed, lang="eng").strip()

    audio_file = None
    if text:
        # Synthesize speech only when OCR actually found something.
        # delete=False: Gradio must read the file after we return.
        # NOTE(review): these mp3s are never cleaned up and accumulate
        # in the temp dir for the life of the session — confirm whether
        # periodic cleanup is needed.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmpfile:
            gTTS(text).save(tmpfile.name)
            audio_file = tmpfile.name
    else:
        text = "No readable text found."

    return preview_pil, processed_pil, text, audio_file


with gr.Blocks() as demo:
    gr.Markdown("## GabAI - AI Assistive Reading System")
    with gr.Row():
        webcam = gr.Image(
            type="pil",
            sources=["webcam"],
            label="Live Webcam (mirrored)"
        )
        corrected_preview = gr.Image(type="pil", label="Mirror-Corrected Preview")
    processed_preview = gr.Image(type="pil", label="Processed Preview for OCR")
    ocr_text = gr.Textbox(label="Extracted Text")
    audio_output = gr.Audio(label="Text-to-Speech Output")

    # Re-run OCR + TTS every time the webcam frame changes.
    webcam.change(
        fn=extract_text_and_speak,
        inputs=webcam,
        outputs=[corrected_preview, processed_preview, ocr_text, audio_output]
    )

if __name__ == "__main__":
    demo.launch()