pavansuresh committed on
Commit
4968aaa
·
verified ·
1 Parent(s): afc39b5

Update ocr_utils.py

Browse files
Files changed (1) hide show
  1. ocr_utils.py +15 -2
ocr_utils.py CHANGED
@@ -2,6 +2,11 @@ import fitz # PyMuPDF
2
  import easyocr
3
  import os
4
  import tempfile
 
 
 
 
 
5
 
6
  def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
7
  """
@@ -19,23 +24,30 @@ def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
19
  with open(pdf_path, 'rb') as f:
20
  tmp.write(f.read())
21
  temp_path = tmp.name
 
22
 
23
  # Convert PDF to images using PyMuPDF
24
  doc = fitz.open(temp_path)
 
 
 
25
  all_pages = []
26
  reader = easyocr.Reader(['en'], gpu=False) # Initialize EasyOCR, adjust languages as needed
27
 
28
  for page_num in range(len(doc)):
29
  page = doc[page_num]
30
- pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72)) # 300 DPI
31
  img_path = f"{temp_path}_page_{page_num}.png"
32
  pix.save(img_path)
 
33
 
34
  # Get image dimensions
35
  image_width, image_height = pix.width, pix.height
36
 
37
  # Perform OCR using EasyOCR
38
  results = reader.readtext(img_path)
 
 
39
  text = " ".join([res[1] for res in results]) # Concatenated text for compatibility
40
  words = []
41
  bboxes = []
@@ -79,9 +91,10 @@ def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
79
  os.unlink(img_path)
80
 
81
  doc.close()
 
82
  return all_pages
83
  except Exception as e:
84
- print(f"OCR failed: {str(e)}")
85
  return []
86
  finally:
87
  if os.path.exists(temp_path):
 
2
  import easyocr
3
  import os
4
  import tempfile
5
+ import logging
6
+
7
+ # Set up logging
8
+ logging.basicConfig(level=logging.INFO)
9
+ logger = logging.getLogger(__name__)
10
 
11
  def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> list:
12
  """
 
24
  with open(pdf_path, 'rb') as f:
25
  tmp.write(f.read())
26
  temp_path = tmp.name
27
+ logger.info(f"Temporary PDF created at: {temp_path}")
28
 
29
  # Convert PDF to images using PyMuPDF
30
  doc = fitz.open(temp_path)
31
+ if not doc.page_count:
32
+ logger.error(f"PDF is empty or unreadable: {pdf_path}")
33
+ return []
34
  all_pages = []
35
  reader = easyocr.Reader(['en'], gpu=False) # Initialize EasyOCR, adjust languages as needed
36
 
37
  for page_num in range(len(doc)):
38
  page = doc[page_num]
39
+ pix = page.get_pixmap(matrix=fitz.Matrix(400/72, 400/72)) # Increased DPI to 400 for better detection
40
  img_path = f"{temp_path}_page_{page_num}.png"
41
  pix.save(img_path)
42
+ logger.info(f"Image generated at: {img_path}, Dimensions: {pix.width}x{pix.height}")
43
 
44
  # Get image dimensions
45
  image_width, image_height = pix.width, pix.height
46
 
47
  # Perform OCR using EasyOCR
48
  results = reader.readtext(img_path)
49
+ if not results:
50
+ logger.warning(f"No text detected on page {page_num + 1}")
51
  text = " ".join([res[1] for res in results]) # Concatenated text for compatibility
52
  words = []
53
  bboxes = []
 
91
  os.unlink(img_path)
92
 
93
  doc.close()
94
+ logger.info(f"Extracted data from {len(all_pages)} pages")
95
  return all_pages
96
  except Exception as e:
97
+ logger.error(f"OCR failed: {str(e)}")
98
  return []
99
  finally:
100
  if os.path.exists(temp_path):