Spaces:
Sleeping
Sleeping
Update ocr_utils.py
Browse files- ocr_utils.py +17 -7
ocr_utils.py
CHANGED
|
@@ -5,11 +5,11 @@ import tempfile
|
|
| 5 |
|
| 6 |
def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
|
| 7 |
"""
|
| 8 |
-
Extract text and bounding boxes from a scanned PDF using PyMuPDF and EasyOCR.
|
| 9 |
Args:
|
| 10 |
pdf_path (str): Path to the PDF file.
|
| 11 |
Returns:
|
| 12 |
-
list: List of dictionaries, each containing 'text' (str) and 'bbox' (list of [x0, y0, x1, y1]) for each page.
|
| 13 |
Returns empty list if failed.
|
| 14 |
"""
|
| 15 |
try:
|
|
@@ -32,14 +32,24 @@ def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
|
|
| 32 |
|
| 33 |
# Perform OCR using EasyOCR
|
| 34 |
results = reader.readtext(img_path)
|
| 35 |
-
text = " ".join([res[1] for res in results]) #
|
| 36 |
-
|
| 37 |
-
bboxes = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
if text.strip():
|
| 40 |
-
all_pages.append({"text": text, "bbox": bboxes})
|
| 41 |
else:
|
| 42 |
-
all_pages.append({"text": f"Page {page_num + 1}: No text detected", "bbox": []})
|
| 43 |
|
| 44 |
# Clean up temporary image
|
| 45 |
if os.path.exists(img_path):
|
|
|
|
| 5 |
|
| 6 |
def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
|
| 7 |
"""
|
| 8 |
+
Extract text, words, and bounding boxes from a scanned PDF using PyMuPDF and EasyOCR.
|
| 9 |
Args:
|
| 10 |
pdf_path (str): Path to the PDF file.
|
| 11 |
Returns:
|
| 12 |
+
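list: List of dictionaries, one for each page, each containing 'text' (str), 'words' (list of str), and 'bbox' (list of [x0, y0, x1, y1] boxes parallel to 'words'; every word split from the same OCR segment shares that segment's box).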
|
| 13 |
Returns empty list if failed.
|
| 14 |
"""
|
| 15 |
try:
|
|
|
|
| 32 |
|
| 33 |
# Perform OCR using EasyOCR
|
| 34 |
results = reader.readtext(img_path)
|
| 35 |
+
text = " ".join([res[1] for res in results]) # Concatenated text for compatibility
|
| 36 |
+
words = []
|
| 37 |
+
bboxes = []
|
| 38 |
+
|
| 39 |
+
# Split text segments into words and assign bounding boxes
|
| 40 |
+
for res in results:
|
| 41 |
+
segment_text = res[1]
|
| 42 |
+
segment_bbox = [res[0][0][0], res[0][0][1], res[0][2][0], res[0][2][1]] # [x0, y0, x1, y1]
|
| 43 |
+
segment_words = segment_text.split()
|
| 44 |
+
# Assign the same bounding box to each word in the segment
|
| 45 |
+
for word in segment_words:
|
| 46 |
+
words.append(word)
|
| 47 |
+
bboxes.append(segment_bbox)
|
| 48 |
|
| 49 |
if text.strip():
|
| 50 |
+
all_pages.append({"text": text, "words": words, "bbox": bboxes})
|
| 51 |
else:
|
| 52 |
+
all_pages.append({"text": f"Page {page_num + 1}: No text detected", "words": [], "bbox": []})
|
| 53 |
|
| 54 |
# Clean up temporary image
|
| 55 |
if os.path.exists(img_path):
|