pavansuresh committed on
Commit
27937fa
·
verified ·
1 Parent(s): 120db3b

Update ocr_utils.py

Browse files
Files changed (1) hide show
  1. ocr_utils.py +13 -9
ocr_utils.py CHANGED
@@ -3,13 +3,14 @@ import easyocr
3
  import os
4
  import tempfile
5
 
6
- def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> str:
7
  """
8
- Extract text from a scanned PDF using PyMuPDF and EasyOCR.
9
  Args:
10
  pdf_path (str): Path to the PDF file.
11
  Returns:
12
- str: Extracted text from all pages, or empty string if failed.
 
13
  """
14
  try:
15
  # Save PDF to a temporary file
@@ -20,7 +21,7 @@ def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> str:
20
 
21
  # Convert PDF to images using PyMuPDF
22
  doc = fitz.open(temp_path)
23
- all_text = []
24
  reader = easyocr.Reader(['en'], gpu=False) # Initialize EasyOCR, adjust languages as needed
25
 
26
  for page_num in range(len(doc)):
@@ -31,21 +32,24 @@ def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> str:
31
 
32
  # Perform OCR using EasyOCR
33
  results = reader.readtext(img_path)
34
- text = " ".join([res[1] for res in results]) # Extract text from results
 
 
 
35
  if text.strip():
36
- all_text.append(f"Page {page_num + 1}:\n{text}")
37
  else:
38
- all_text.append(f"Page {page_num + 1}: No text detected")
39
 
40
  # Clean up temporary image
41
  if os.path.exists(img_path):
42
  os.unlink(img_path)
43
 
44
  doc.close()
45
- return "\n".join(all_text) if all_text else ""
46
  except Exception as e:
47
  print(f"OCR failed: {str(e)}")
48
- return ""
49
  finally:
50
  if os.path.exists(temp_path):
51
  os.unlink(temp_path)
 
3
  import os
4
  import tempfile
5
 
6
+ def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
7
  """
8
+ Extract text and bounding boxes from a scanned PDF using PyMuPDF and EasyOCR.
9
  Args:
10
  pdf_path (str): Path to the PDF file.
11
  Returns:
12
+ list: List of dictionaries, each containing 'text' (str) and 'bbox' (list of [x0, y0, x1, y1]) for each page.
13
+ Returns empty list if failed.
14
  """
15
  try:
16
  # Save PDF to a temporary file
 
21
 
22
  # Convert PDF to images using PyMuPDF
23
  doc = fitz.open(temp_path)
24
+ all_pages = []
25
  reader = easyocr.Reader(['en'], gpu=False) # Initialize EasyOCR, adjust languages as needed
26
 
27
  for page_num in range(len(doc)):
 
32
 
33
  # Perform OCR using EasyOCR
34
  results = reader.readtext(img_path)
35
+ text = " ".join([res[1] for res in results]) # Extract text
36
+ # Extract bounding boxes in [x0, y0, x1, y1] format
37
+ bboxes = [[res[0][0][0], res[0][0][1], res[0][2][0], res[0][2][1]] for res in results]
38
+
39
  if text.strip():
40
+ all_pages.append({"text": text, "bbox": bboxes})
41
  else:
42
+ all_pages.append({"text": f"Page {page_num + 1}: No text detected", "bbox": []})
43
 
44
  # Clean up temporary image
45
  if os.path.exists(img_path):
46
  os.unlink(img_path)
47
 
48
  doc.close()
49
+ return all_pages
50
  except Exception as e:
51
  print(f"OCR failed: {str(e)}")
52
+ return []
53
  finally:
54
  if os.path.exists(temp_path):
55
  os.unlink(temp_path)