Arjooohn committed on
Commit
1655eaa
·
verified ·
1 Parent(s): d4a888f
Files changed (1) hide show
  1. app.py +23 -24
app.py CHANGED
@@ -1,29 +1,32 @@
1
  import gradio as gr
2
- import pytesseract
3
  import cv2
4
  import numpy as np
5
  from PIL import Image
 
6
  from gtts import gTTS
7
  import io
8
 
9
- def preprocess(image):
10
- img = np.array(image)
11
- img = cv2.flip(img, 1) # mirror
12
- gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
13
  kernel = np.array([[0,-1,0], [-1,5,-1], [0,-1,0]])
14
  gray = cv2.filter2D(gray, -1, kernel)
15
  gray = cv2.fastNlMeansDenoising(gray, None, 30, 7, 21)
16
  thresh = cv2.adaptiveThreshold(
17
- gray, 255,
18
- cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
19
- cv2.THRESH_BINARY,
20
- 31, 10
21
  )
22
  return thresh
23
 
24
- def extract_text_and_speak(image):
25
- processed = preprocess(image)
 
 
 
 
 
26
  processed_pil = Image.fromarray(processed)
 
 
27
  text = pytesseract.image_to_string(processed, lang="eng").strip()
28
  if text == "":
29
  text = "No readable text found."
@@ -34,29 +37,25 @@ def extract_text_and_speak(image):
34
  tts.write_to_fp(buffer)
35
  buffer.seek(0)
36
  audio_file = buffer
 
37
  return processed_pil, text, audio_file
38
 
39
  with gr.Blocks() as demo:
40
- gr.Markdown("## GabAI - AI Assistive Reading System (Live OCR)")
41
 
42
  with gr.Row():
43
- webcam = gr.Image(
44
- type="pil",
45
- sources=["webcam"], # user captures frames manually
46
- label="Webcam Input"
47
- )
48
  processed_preview = gr.Image(type="pil", label="Processed Preview")
49
 
50
  ocr_text = gr.Textbox(label="Extracted Text")
51
  audio_output = gr.Audio(label="Text-to-Speech Output")
52
 
53
- # Timer triggers the function every 0.5 seconds
54
- def timer_fn():
55
- if webcam.value is None:
56
- return None, None, None
57
- return extract_text_and_speak(webcam.value)
58
-
59
- gr.Timer(interval=0.5, fn=timer_fn, outputs=[processed_preview, ocr_text, audio_output])
60
 
61
  if __name__ == "__main__":
62
  demo.launch()
 
1
  import gradio as gr
 
2
  import cv2
3
  import numpy as np
4
  from PIL import Image
5
+ import pytesseract
6
  from gtts import gTTS
7
  import io
8
 
9
def preprocess(frame):
    """Prepare a webcam frame for OCR: grayscale, sharpen, denoise, binarize.

    Args:
        frame: numpy uint8 image as delivered by Gradio (RGB channel order
            — TODO confirm against the webcam component's `type` setting).

    Returns:
        A single-channel uint8 image binarized to {0, 255}, suitable for
        feeding to Tesseract.
    """
    # Gradio hands frames to callbacks in RGB order, so convert with
    # RGB2GRAY; the original BGR2GRAY swapped the red/blue luma weights,
    # subtly degrading contrast on colored text.
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    # Sharpening kernel: boosts edges so glyph strokes survive thresholding.
    kernel = np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]])
    gray = cv2.filter2D(gray, -1, kernel)
    # Non-local-means denoising removes sensor noise amplified by sharpening.
    gray = cv2.fastNlMeansDenoising(gray, None, 30, 7, 21)
    # Adaptive Gaussian threshold copes with uneven lighting across the frame;
    # 31x31 neighborhood, constant 10 subtracted from the local mean.
    thresh = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 31, 10
    )
    return thresh
19
 
20
+ def process_video(frame):
21
+ # frame is a numpy array from the webcam feed
22
+ if frame is None:
23
+ return None, "No frame captured", None
24
+
25
+ frame = cv2.flip(frame, 1) # mirror
26
+ processed = preprocess(frame)
27
  processed_pil = Image.fromarray(processed)
28
+
29
+ # OCR
30
  text = pytesseract.image_to_string(processed, lang="eng").strip()
31
  if text == "":
32
  text = "No readable text found."
 
37
  tts.write_to_fp(buffer)
38
  buffer.seek(0)
39
  audio_file = buffer
40
+
41
  return processed_pil, text, audio_file
42
 
43
with gr.Blocks() as demo:
    gr.Markdown("## GabAI - Real-Time OCR with Webcam")

    with gr.Row():
        # `source=` is the legacy Gradio 3.x keyword; current Gradio uses
        # `sources=[...]` (the form this app already used elsewhere).
        # More importantly, gr.Video delivers a recorded-video file path to
        # its event handlers — never per-frame numpy arrays — so
        # process_video would receive the wrong type. gr.Image with a
        # webcam source hands each captured frame to the callback as a
        # numpy array, which is what process_video expects.
        webcam = gr.Image(sources=["webcam"], type="numpy", label="Webcam Feed")
        processed_preview = gr.Image(type="pil", label="Processed Preview")

    ocr_text = gr.Textbox(label="Extracted Text")
    audio_output = gr.Audio(label="Text-to-Speech Output")

    # Re-run preprocessing + OCR + TTS whenever a new frame is captured.
    webcam.change(
        fn=process_video,
        inputs=webcam,
        outputs=[processed_preview, ocr_text, audio_output]
    )
 
59
 
60
  if __name__ == "__main__":
61
  demo.launch()