rahulrana0001 committed on
Commit
ac3541e
·
1 Parent(s): 209c5b0

Upgrade: Hybrid Digital/OCR scanner for 100% accurate ebook reading

Browse files
Files changed (2) hide show
  1. app.py +14 -7
  2. pipeline/document_parser.py +14 -0
app.py CHANGED
@@ -8,7 +8,8 @@ from pipeline.tts import generate_tamil_speech
8
  from pipeline.document_parser import (
9
  extract_text_from_document,
10
  get_pdf_page_as_image,
11
- get_pdf_page_count
 
12
  )
13
 
14
  # Setup logging
@@ -64,12 +65,18 @@ def load_comic_page(pdf_path, page_num):
64
  status = f"Page {page_num + 1} of {total_pages}"
65
  return img_path, status, page_num
66
 
67
- def process_comic_page(img_path, emotion_choice):
68
  if not img_path:
69
  return "No page loaded", "", None
70
 
71
- # OCR on the rendered comic page
72
- extracted_text = extract_text_from_image(img_path)
 
 
 
 
 
 
73
  if not extracted_text.strip():
74
  return "No text found on this page", "", None
75
 
@@ -137,7 +144,7 @@ with gr.Blocks(title="Tamil Comic & Manga Reader AI") as demo:
137
 
138
  read_page_btn.click(
139
  process_comic_page,
140
- inputs=[comic_display, voice_style_comic],
141
  outputs=[comic_text, comic_tamil, comic_audio]
142
  )
143
 
@@ -172,8 +179,8 @@ with gr.Blocks(title="Tamil Comic & Manga Reader AI") as demo:
172
  if not img: # End of book
173
  return gr.update(), status, p_num, gr.update(), gr.update(), gr.update()
174
 
175
- # 2. Process the new page
176
- txt, tam, aud = process_comic_page(img, voice)
177
  return img, status, p_num, txt, tam, aud
178
 
179
  # The hidden button triggers the actual logic
 
8
  from pipeline.document_parser import (
9
  extract_text_from_document,
10
  get_pdf_page_as_image,
11
+ get_pdf_page_count,
12
+ get_text_from_page
13
  )
14
 
15
  # Setup logging
 
65
  status = f"Page {page_num + 1} of {total_pages}"
66
  return img_path, status, page_num
67
 
68
+ def process_comic_page(img_path, pdf_path, page_num, emotion_choice):
69
  if not img_path:
70
  return "No page loaded", "", None
71
 
72
+ # 1. Try Direct Digital Extraction (100% Accuracy for ebooks/EPUBs)
73
+ extracted_text = get_text_from_page(pdf_path, page_num)
74
+
75
+ # 2. Fallback to AI OCR (For image-based comics)
76
+ if not extracted_text or len(extracted_text.strip()) < 5:
77
+ print(f"DEBUG: No digital text found. Falling back to AI OCR...")
78
+ extracted_text = extract_text_from_image(img_path)
79
+
80
  if not extracted_text.strip():
81
  return "No text found on this page", "", None
82
 
 
144
 
145
  read_page_btn.click(
146
  process_comic_page,
147
+ inputs=[comic_display, comic_pdf_path, current_page, voice_style_comic],
148
  outputs=[comic_text, comic_tamil, comic_audio]
149
  )
150
 
 
179
  if not img: # End of book
180
  return gr.update(), status, p_num, gr.update(), gr.update(), gr.update()
181
 
182
+ # 2. Process the new page (Using Hybrid Mode)
183
+ txt, tam, aud = process_comic_page(img, pdf, p_num, voice)
184
  return img, status, p_num, txt, tam, aud
185
 
186
  # The hidden button triggers the actual logic
pipeline/document_parser.py CHANGED
@@ -62,6 +62,20 @@ def get_pdf_page_as_image(file_path: str, page_num: int) -> str:
62
  print(f"ERROR: PDF rendering failed: {e}")
63
  return None
64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
65
  def extract_text_from_document(file_path: str) -> str:
66
  """
67
  Dispatcher to extract text based on file extension.
 
62
  print(f"ERROR: PDF rendering failed: {e}")
63
  return None
64
 
65
def get_text_from_page(file_path: str, page_num: int) -> str:
    """
    Tries to extract digital text directly from a specific page.

    Args:
        file_path: Path of the document to open with PyMuPDF (fitz).
        page_num: Zero-based index of the page to read.

    Returns:
        The page's embedded text with surrounding whitespace stripped, or
        an empty string when the page index is past the end of the document
        or when opening/reading the document fails.
    """
    try:
        doc = fitz.open(file_path)
        try:
            if page_num >= len(doc):
                return ""
            return doc[page_num].get_text().strip()
        finally:
            # Release the document handle on every path, including the
            # out-of-range early return and a failing get_text() call.
            doc.close()
    except Exception as e:
        # Best-effort by design: report the failure and return an empty
        # string instead of raising. Only Exception is caught so that
        # KeyboardInterrupt/SystemExit still propagate.
        print(f"ERROR: Digital text extraction failed: {e}")
        return ""
78
+
79
  def extract_text_from_document(file_path: str) -> str:
80
  """
81
  Dispatcher to extract text based on file extension.