Arjooohn committed on
Commit
1655eaa
·
verified ·
1 Parent(s): d4a888f
Files changed (1) hide show
  1. app.py +23 -24
app.py CHANGED
@@ -1,29 +1,32 @@
1
  import gradio as gr
2
- import pytesseract
3
  import cv2
4
  import numpy as np
5
  from PIL import Image
 
6
  from gtts import gTTS
7
  import io
8
 
9
- def preprocess(image):
10
- img = np.array(image)
11
- img = cv2.flip(img, 1) # mirror
12
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
13
  kernel = np.array([[0,-1,0], [-1,5,-1], [0,-1,0]])
14
  gray = cv2.filter2D(gray, -1, kernel)
15
  gray = cv2.fastNlMeansDenoising(gray, None, 30, 7, 21)
16
  thresh = cv2.adaptiveThreshold(
17
- gray, 255,
18
- cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
19
- cv2.THRESH_BINARY,
20
- 31, 10
21
  )
22
  return thresh
23
 
24
- def extract_text_and_speak(image):
25
- processed = preprocess(image)
 
 
 
 
 
26
  processed_pil = Image.fromarray(processed)
 
 
27
  text = pytesseract.image_to_string(processed, lang="eng").strip()
28
  if text == "":
29
  text = "No readable text found."
@@ -34,29 +37,25 @@ def extract_text_and_speak(image):
34
  tts.write_to_fp(buffer)
35
  buffer.seek(0)
36
  audio_file = buffer
 
37
  return processed_pil, text, audio_file
38
 
39
  with gr.Blocks() as demo:
40
- gr.Markdown("## GabAI - AI Assistive Reading System (Live OCR)")
41
 
42
  with gr.Row():
43
- webcam = gr.Image(
44
- type="pil",
45
- sources=["webcam"], # user captures frames manually
46
- label="Webcam Input"
47
- )
48
  processed_preview = gr.Image(type="pil", label="Processed Preview")
49
 
50
  ocr_text = gr.Textbox(label="Extracted Text")
51
  audio_output = gr.Audio(label="Text-to-Speech Output")
52
 
53
- # Timer triggers the function every 0.5 seconds
54
- def timer_fn():
55
- if webcam.value is None:
56
- return None, None, None
57
- return extract_text_and_speak(webcam.value)
58
-
59
- gr.Timer(interval=0.5, fn=timer_fn, outputs=[processed_preview, ocr_text, audio_output])
60
 
61
  if __name__ == "__main__":
62
  demo.launch()
 
1
  import gradio as gr
 
2
  import cv2
3
  import numpy as np
4
  from PIL import Image
5
+ import pytesseract
6
  from gtts import gTTS
7
  import io
8
 
9
def preprocess(frame):
    """Prepare a webcam frame for OCR: grayscale, sharpen, denoise, binarize.

    Args:
        frame: numpy uint8 image as delivered by Gradio (RGB channel order
            — TODO confirm against the webcam component's `type` setting).

    Returns:
        A single-channel uint8 image binarized to {0, 255}, suitable for
        feeding to Tesseract.
    """
    # Gradio hands frames to callbacks in RGB order, so convert with
    # RGB2GRAY; the original BGR2GRAY swapped the red/blue luma weights,
    # subtly degrading contrast on colored text.
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    # Sharpening kernel: boosts edges so glyph strokes survive thresholding.
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    gray = cv2.filter2D(gray, -1, kernel)
    # Non-local-means denoising removes sensor noise amplified by sharpening.
    gray = cv2.fastNlMeansDenoising(gray, None, 30, 7, 21)
    # Adaptive Gaussian threshold copes with uneven lighting across the frame;
    # 31x31 neighborhood, constant 10 subtracted from the local mean.
    thresh = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 31, 10
    )
    return thresh
19
 
20
+ def process_video(frame):
21
+ # frame is a numpy array from the webcam feed
22
+ if frame is None:
23
+ return None, "No frame captured", None
24
+
25
+ frame = cv2.flip(frame, 1) # mirror
26
+ processed = preprocess(frame)
27
  processed_pil = Image.fromarray(processed)
28
+
29
+ # OCR
30
  text = pytesseract.image_to_string(processed, lang="eng").strip()
31
  if text == "":
32
  text = "No readable text found."
 
37
  tts.write_to_fp(buffer)
38
  buffer.seek(0)
39
  audio_file = buffer
40
+
41
  return processed_pil, text, audio_file
42
 
43
with gr.Blocks() as demo:
    gr.Markdown("## GabAI - Real-Time OCR with Webcam")

    with gr.Row():
        # `source=` is the legacy Gradio 3.x keyword; current Gradio uses
        # `sources=[...]` (the form this app already used elsewhere).
        # More importantly, gr.Video delivers a recorded-video file path to
        # its event handlers — never per-frame numpy arrays — so
        # process_video would receive the wrong type. gr.Image with a
        # webcam source hands each captured frame to the callback as a
        # numpy array, which is what process_video expects.
        webcam = gr.Image(sources=["webcam"], type="numpy", label="Webcam Feed")
        processed_preview = gr.Image(type="pil", label="Processed Preview")

    ocr_text = gr.Textbox(label="Extracted Text")
    audio_output = gr.Audio(label="Text-to-Speech Output")

    # Re-run preprocessing + OCR + TTS whenever a new frame is captured.
    webcam.change(
        fn=process_video,
        inputs=webcam,
        outputs=[processed_preview, ocr_text, audio_output]
    )
 
59
 
60
  if __name__ == "__main__":
61
  demo.launch()