Spaces:
Sleeping
Sleeping
Update ocr_utils.py
Browse files
- ocr_utils.py +30 -7
ocr_utils.py
CHANGED
|
@@ -5,11 +5,12 @@ import tempfile
|
|
| 5 |
|
| 6 |
def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
|
| 7 |
"""
|
| 8 |
-
Extract text, words, and bounding boxes from a scanned PDF using PyMuPDF and EasyOCR.
|
| 9 |
Args:
|
| 10 |
pdf_path (str): Path to the PDF file.
|
| 11 |
Returns:
|
| 12 |
-
list: List of dictionaries, each containing 'text' (str), 'words' (list of str),
|
|
|
|
| 13 |
Returns empty list if failed.
|
| 14 |
"""
|
| 15 |
try:
|
|
@@ -30,26 +31,48 @@ def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
|
|
| 30 |
img_path = f"{temp_path}_page_{page_num}.png"
|
| 31 |
pix.save(img_path)
|
| 32 |
|
|
|
|
|
|
|
|
|
|
| 33 |
# Perform OCR using EasyOCR
|
| 34 |
results = reader.readtext(img_path)
|
| 35 |
text = " ".join([res[1] for res in results]) # Concatenated text for compatibility
|
| 36 |
words = []
|
| 37 |
bboxes = []
|
| 38 |
|
| 39 |
-
# Split text segments into words and assign bounding boxes
|
| 40 |
for res in results:
|
| 41 |
segment_text = res[1]
|
| 42 |
segment_bbox = [res[0][0][0], res[0][0][1], res[0][2][0], res[0][2][1]] # [x0, y0, x1, y1]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
segment_words = segment_text.split()
|
| 44 |
-
# Assign the same bounding box to each word in the segment
|
| 45 |
for word in segment_words:
|
| 46 |
words.append(word)
|
| 47 |
-
bboxes.append(
|
| 48 |
|
| 49 |
if text.strip():
|
| 50 |
-
all_pages.append({
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
else:
|
| 52 |
-
all_pages.append({
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
# Clean up temporary image
|
| 55 |
if os.path.exists(img_path):
|
|
|
|
| 5 |
|
| 6 |
def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
|
| 7 |
"""
|
| 8 |
+
Extract text, words, and normalized bounding boxes from a scanned PDF using PyMuPDF and EasyOCR.
|
| 9 |
Args:
|
| 10 |
pdf_path (str): Path to the PDF file.
|
| 11 |
Returns:
|
| 12 |
+
list: List of dictionaries, each containing 'text' (str), 'words' (list of str),
|
| 13 |
+
'bbox' (list of [x0, y0, x1, y1] normalized to 0-1000), and 'image_dims' ([width, height]) for each page.
|
| 14 |
Returns empty list if failed.
|
| 15 |
"""
|
| 16 |
try:
|
|
|
|
| 31 |
img_path = f"{temp_path}_page_{page_num}.png"
|
| 32 |
pix.save(img_path)
|
| 33 |
|
| 34 |
+
# Get image dimensions
|
| 35 |
+
image_width, image_height = pix.width, pix.height
|
| 36 |
+
|
| 37 |
# Perform OCR using EasyOCR
|
| 38 |
results = reader.readtext(img_path)
|
| 39 |
text = " ".join([res[1] for res in results]) # Concatenated text for compatibility
|
| 40 |
words = []
|
| 41 |
bboxes = []
|
| 42 |
|
| 43 |
+
# Split text segments into words and assign normalized bounding boxes
|
| 44 |
for res in results:
|
| 45 |
segment_text = res[1]
|
| 46 |
segment_bbox = [res[0][0][0], res[0][0][1], res[0][2][0], res[0][2][1]] # [x0, y0, x1, y1]
|
| 47 |
+
# Normalize bounding box to 0-1000 range
|
| 48 |
+
normalized_bbox = [
|
| 49 |
+
int((segment_bbox[0] / image_width) * 1000),
|
| 50 |
+
int((segment_bbox[1] / image_height) * 1000),
|
| 51 |
+
int((segment_bbox[2] / image_width) * 1000),
|
| 52 |
+
int((segment_bbox[3] / image_height) * 1000)
|
| 53 |
+
]
|
| 54 |
+
# Ensure coordinates are within 0-1000
|
| 55 |
+
normalized_bbox = [max(0, min(1000, coord)) for coord in normalized_bbox]
|
| 56 |
segment_words = segment_text.split()
|
| 57 |
+
# Assign the same normalized bounding box to each word in the segment
|
| 58 |
for word in segment_words:
|
| 59 |
words.append(word)
|
| 60 |
+
bboxes.append(normalized_bbox)
|
| 61 |
|
| 62 |
if text.strip():
|
| 63 |
+
all_pages.append({
|
| 64 |
+
"text": text,
|
| 65 |
+
"words": words,
|
| 66 |
+
"bbox": bboxes,
|
| 67 |
+
"image_dims": [image_width, image_height]
|
| 68 |
+
})
|
| 69 |
else:
|
| 70 |
+
all_pages.append({
|
| 71 |
+
"text": f"Page {page_num + 1}: No text detected",
|
| 72 |
+
"words": [],
|
| 73 |
+
"bbox": [],
|
| 74 |
+
"image_dims": [image_width, image_height]
|
| 75 |
+
})
|
| 76 |
|
| 77 |
# Clean up temporary image
|
| 78 |
if os.path.exists(img_path):
|