import gradio as gr
import cv2
import numpy as np
from PIL import Image
import pytesseract
from gtts import gTTS
import tempfile
import os


def preprocess(image):
    """Prepare a webcam frame for OCR.

    Args:
        image: PIL.Image frame from the (mirrored) webcam, RGB order.

    Returns:
        (thresh, img): the binarized OCR-ready uint8 array and the
        mirror-corrected color frame (RGB numpy array) for preview.
    """
    img = np.array(image)
    img = cv2.flip(img, 1)  # Undo the webcam's horizontal mirroring
    # PIL delivers RGB, not BGR — use COLOR_RGB2GRAY so the red/blue
    # luma weights are applied to the correct channels.
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    # Sharpen so character strokes stand out before thresholding.
    kernel = np.array([[0, -1, 0],
                       [-1, 5, -1],
                       [0, -1, 0]])
    gray = cv2.filter2D(gray, -1, kernel)
    # Non-local-means denoising removes sensor noise that would
    # otherwise survive into the binarized image.
    gray = cv2.fastNlMeansDenoising(gray, None, 30, 7, 21)
    # Adaptive threshold copes with uneven lighting across the page.
    thresh = cv2.adaptiveThreshold(
        gray, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 31, 10
    )
    return thresh, img


def extract_text_and_speak(image):
    """Run OCR on a webcam frame and synthesize speech for the result.

    Args:
        image: PIL.Image from the webcam, or None (Gradio change events
            can fire with no frame, e.g. when the stream stops).

    Returns:
        Tuple of (mirror-corrected preview PIL image, processed preview
        PIL image, extracted text, path to an mp3 file or None).
    """
    # Guard: without this, np.array(None) reaches cv2.flip and crashes.
    if image is None:
        return None, None, "No readable text found.", None

    processed, flipped_preview = preprocess(image)
    processed_pil = Image.fromarray(processed)
    preview_pil = Image.fromarray(flipped_preview)

    text = pytesseract.image_to_string(processed, lang="eng").strip()

    audio_file = None
    if text:
        # Synthesize speech only when OCR actually found something.
        # delete=False: Gradio must read the file after we return.
        # NOTE(review): these mp3s are never cleaned up and accumulate
        # in the temp dir for the life of the session — confirm whether
        # periodic cleanup is needed.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmpfile:
            gTTS(text).save(tmpfile.name)
            audio_file = tmpfile.name
    else:
        text = "No readable text found."

    return preview_pil, processed_pil, text, audio_file


with gr.Blocks() as demo:
    gr.Markdown("## GabAI - AI Assistive Reading System")
    with gr.Row():
        webcam = gr.Image(
            type="pil",
            sources=["webcam"],
            label="Live Webcam (mirrored)"
        )
        corrected_preview = gr.Image(type="pil", label="Mirror-Corrected Preview")
    processed_preview = gr.Image(type="pil", label="Processed Preview for OCR")
    ocr_text = gr.Textbox(label="Extracted Text")
    audio_output = gr.Audio(label="Text-to-Speech Output")

    # Re-run OCR + TTS every time the webcam frame changes.
    webcam.change(
        fn=extract_text_and_speak,
        inputs=webcam,
        outputs=[corrected_preview, processed_preview, ocr_text, audio_output]
    )

if __name__ == "__main__":
    demo.launch()