Commit ·
ac3541e
1
Parent(s): 209c5b0
Upgrade: Hybrid Digital/OCR scanner for 100% accurate ebook reading
Browse files- app.py +14 -7
- pipeline/document_parser.py +14 -0
app.py
CHANGED
|
@@ -8,7 +8,8 @@ from pipeline.tts import generate_tamil_speech
|
|
| 8 |
from pipeline.document_parser import (
|
| 9 |
extract_text_from_document,
|
| 10 |
get_pdf_page_as_image,
|
| 11 |
-
get_pdf_page_count
|
|
|
|
| 12 |
)
|
| 13 |
|
| 14 |
# Setup logging
|
|
@@ -64,12 +65,18 @@ def load_comic_page(pdf_path, page_num):
|
|
| 64 |
status = f"Page {page_num + 1} of {total_pages}"
|
| 65 |
return img_path, status, page_num
|
| 66 |
|
| 67 |
-
def process_comic_page(img_path, emotion_choice):
|
| 68 |
if not img_path:
|
| 69 |
return "No page loaded", "", None
|
| 70 |
|
| 71 |
-
#
|
| 72 |
-
extracted_text =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
if not extracted_text.strip():
|
| 74 |
return "No text found on this page", "", None
|
| 75 |
|
|
@@ -137,7 +144,7 @@ with gr.Blocks(title="Tamil Comic & Manga Reader AI") as demo:
|
|
| 137 |
|
| 138 |
read_page_btn.click(
|
| 139 |
process_comic_page,
|
| 140 |
-
inputs=[comic_display, voice_style_comic],
|
| 141 |
outputs=[comic_text, comic_tamil, comic_audio]
|
| 142 |
)
|
| 143 |
|
|
@@ -172,8 +179,8 @@ with gr.Blocks(title="Tamil Comic & Manga Reader AI") as demo:
|
|
| 172 |
if not img: # End of book
|
| 173 |
return gr.update(), status, p_num, gr.update(), gr.update(), gr.update()
|
| 174 |
|
| 175 |
-
# 2. Process the new page
|
| 176 |
-
txt, tam, aud = process_comic_page(img, voice)
|
| 177 |
return img, status, p_num, txt, tam, aud
|
| 178 |
|
| 179 |
# The hidden button triggers the actual logic
|
|
|
|
| 8 |
from pipeline.document_parser import (
|
| 9 |
extract_text_from_document,
|
| 10 |
get_pdf_page_as_image,
|
| 11 |
+
get_pdf_page_count,
|
| 12 |
+
get_text_from_page
|
| 13 |
)
|
| 14 |
|
| 15 |
# Setup logging
|
|
|
|
| 65 |
status = f"Page {page_num + 1} of {total_pages}"
|
| 66 |
return img_path, status, page_num
|
| 67 |
|
| 68 |
+
def process_comic_page(img_path, pdf_path, page_num, emotion_choice):
|
| 69 |
if not img_path:
|
| 70 |
return "No page loaded", "", None
|
| 71 |
|
| 72 |
+
# 1. Try Direct Digital Extraction (100% Accuracy for ebooks/EPUBs)
|
| 73 |
+
extracted_text = get_text_from_page(pdf_path, page_num)
|
| 74 |
+
|
| 75 |
+
# 2. Fallback to AI OCR (For image-based comics)
|
| 76 |
+
if not extracted_text or len(extracted_text.strip()) < 5:
|
| 77 |
+
print(f"DEBUG: No digital text found. Falling back to AI OCR...")
|
| 78 |
+
extracted_text = extract_text_from_image(img_path)
|
| 79 |
+
|
| 80 |
if not extracted_text.strip():
|
| 81 |
return "No text found on this page", "", None
|
| 82 |
|
|
|
|
| 144 |
|
| 145 |
read_page_btn.click(
|
| 146 |
process_comic_page,
|
| 147 |
+
inputs=[comic_display, comic_pdf_path, current_page, voice_style_comic],
|
| 148 |
outputs=[comic_text, comic_tamil, comic_audio]
|
| 149 |
)
|
| 150 |
|
|
|
|
| 179 |
if not img: # End of book
|
| 180 |
return gr.update(), status, p_num, gr.update(), gr.update(), gr.update()
|
| 181 |
|
| 182 |
+
# 2. Process the new page (Using Hybrid Mode)
|
| 183 |
+
txt, tam, aud = process_comic_page(img, pdf, p_num, voice)
|
| 184 |
return img, status, p_num, txt, tam, aud
|
| 185 |
|
| 186 |
# The hidden button triggers the actual logic
|
pipeline/document_parser.py
CHANGED
|
@@ -62,6 +62,20 @@ def get_pdf_page_as_image(file_path: str, page_num: int) -> str:
|
|
| 62 |
print(f"ERROR: PDF rendering failed: {e}")
|
| 63 |
return None
|
| 64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
def extract_text_from_document(file_path: str) -> str:
|
| 66 |
"""
|
| 67 |
Dispatcher to extract text based on file extension.
|
|
|
|
| 62 |
print(f"ERROR: PDF rendering failed: {e}")
|
| 63 |
return None
|
| 64 |
|
| 65 |
+
def get_text_from_page(file_path: str, page_num: int) -> str:
    """
    Tries to extract digital text directly from a specific page.

    Args:
        file_path: Path of the document to open with ``fitz.open``.
        page_num: Zero-based index of the page to read.

    Returns:
        The page's text with surrounding whitespace stripped, or an empty
        string when ``page_num`` is past the last page or the document
        could not be opened or read. This function never raises for
        ordinary errors; an empty result signals "no digital text".
    """
    try:
        # The context manager closes the document on every exit path,
        # including the out-of-range early return and a failing get_text().
        with fitz.open(file_path) as doc:
            if page_num >= len(doc):
                return ""
            return doc[page_num].get_text().strip()
    except Exception as e:
        # Best-effort extraction: report the failure and return an empty
        # string. Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate.
        print(f"ERROR: Page text extraction failed: {e}")
        return ""
|
| 78 |
+
|
| 79 |
def extract_text_from_document(file_path: str) -> str:
|
| 80 |
"""
|
| 81 |
Dispatcher to extract text based on file extension.
|