pavansuresh committed on
Commit
0a21337
·
verified ·
1 Parent(s): 3982363

Update ocr_utils.py

Browse files
Files changed (1) hide show
  1. ocr_utils.py +17 -7
ocr_utils.py CHANGED
@@ -5,11 +5,11 @@ import tempfile
5
 
6
  def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
7
  """
8
- Extract text and bounding boxes from a scanned PDF using PyMuPDF and EasyOCR.
9
  Args:
10
  pdf_path (str): Path to the PDF file.
11
  Returns:
12
- list: List of dictionaries, each containing 'text' (str) and 'bbox' (list of [x0, y0, x1, y1]) for each page.
13
  Returns empty list if failed.
14
  """
15
  try:
@@ -32,14 +32,24 @@ def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
32
 
33
  # Perform OCR using EasyOCR
34
  results = reader.readtext(img_path)
35
- text = " ".join([res[1] for res in results]) # Extract text
36
- # Extract bounding boxes in [x0, y0, x1, y1] format
37
- bboxes = [[res[0][0][0], res[0][0][1], res[0][2][0], res[0][2][1]] for res in results]
 
 
 
 
 
 
 
 
 
 
38
 
39
  if text.strip():
40
- all_pages.append({"text": text, "bbox": bboxes})
41
  else:
42
- all_pages.append({"text": f"Page {page_num + 1}: No text detected", "bbox": []})
43
 
44
  # Clean up temporary image
45
  if os.path.exists(img_path):
 
5
 
6
  def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
7
  """
8
+ Extract text, words, and bounding boxes from a scanned PDF using PyMuPDF and EasyOCR.
9
  Args:
10
  pdf_path (str): Path to the PDF file.
11
  Returns:
12
+ list: List of dictionaries, each containing 'text' (str), 'words' (list of str), and 'bbox' (list of [x0, y0, x1, y1]) for each page.
13
  Returns empty list if failed.
14
  """
15
  try:
 
32
 
33
  # Perform OCR using EasyOCR
34
  results = reader.readtext(img_path)
35
+ text = " ".join([res[1] for res in results]) # Concatenated text for compatibility
36
+ words = []
37
+ bboxes = []
38
+
39
+ # Split text segments into words and assign bounding boxes
40
+ for res in results:
41
+ segment_text = res[1]
42
+ segment_bbox = [res[0][0][0], res[0][0][1], res[0][2][0], res[0][2][1]] # [x0, y0, x1, y1]
43
+ segment_words = segment_text.split()
44
+ # Assign the same bounding box to each word in the segment
45
+ for word in segment_words:
46
+ words.append(word)
47
+ bboxes.append(segment_bbox)
48
 
49
  if text.strip():
50
+ all_pages.append({"text": text, "words": words, "bbox": bboxes})
51
  else:
52
+ all_pages.append({"text": f"Page {page_num + 1}: No text detected", "words": [], "bbox": []})
53
 
54
  # Clean up temporary image
55
  if os.path.exists(img_path):