pavansuresh committed on
Commit
ee4c8aa
·
verified ·
1 Parent(s): 2970e4e

Update ocr_utils.py

Browse files
Files changed (1) hide show
  1. ocr_utils.py +24 -8
ocr_utils.py CHANGED
@@ -1,31 +1,47 @@
1
- from pdf2image import convert_from_path
2
- import pytesseract
3
  import os
4
  import tempfile
5
 
6
  def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> str:
7
  """
8
- Extract text from a scanned PDF using Tesseract.
9
  Args:
10
  pdf_path (str): Path to the PDF file.
11
  Returns:
12
  str: Extracted text from all pages, or empty string if failed.
13
  """
14
  try:
 
15
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
16
  with open(pdf_path, 'rb') as f:
17
  tmp.write(f.read())
18
  temp_path = tmp.name
19
- images = convert_from_path(temp_path)
 
 
20
  all_text = []
 
 
 
 
 
 
 
21
 
22
- for i, image in enumerate(images):
23
- text = pytesseract.image_to_string(image)
 
24
  if text.strip():
25
- all_text.append(f"Page {i+1}:\n{text}")
26
  else:
27
- all_text.append(f"Page {i+1}: No text detected")
 
 
 
 
28
 
 
29
  return "\n".join(all_text) if all_text else ""
30
  except Exception as e:
31
  print(f"OCR failed: {str(e)}")
 
1
+ import fitz # PyMuPDF
2
+ import easyocr
3
  import os
4
  import tempfile
5
 
6
  def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> str:
7
  """
8
+ Extract text from a scanned PDF using PyMuPDF and EasyOCR.
9
  Args:
10
  pdf_path (str): Path to the PDF file.
11
  Returns:
12
  str: Extracted text from all pages, or empty string if failed.
13
  """
14
  try:
15
+ # Save PDF to a temporary file
16
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
17
  with open(pdf_path, 'rb') as f:
18
  tmp.write(f.read())
19
  temp_path = tmp.name
20
+
21
+ # Convert PDF to images using PyMuPDF
22
+ doc = fitz.open(temp_path)
23
  all_text = []
24
+ reader = easyocr.Reader(['en'], gpu=False) # Initialize EasyOCR, adjust languages as needed
25
+
26
+ for page_num in range(len(doc)):
27
+ page = doc[page_num]
28
+ pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72)) # 300 DPI
29
+ img_path = f"{temp_path}_page_{page_num}.png"
30
+ pix.save(img_path)
31
 
32
+ # Perform OCR using EasyOCR
33
+ results = reader.readtext(img_path)
34
+ text = " ".join([res[1] for res in results]) # Extract text from results
35
  if text.strip():
36
+ all_text.append(f"Page {page_num + 1}:\n{text}")
37
  else:
38
+ all_text.append(f"Page {page_num + 1}: No text detected")
39
+
40
+ # Clean up temporary image
41
+ if os.path.exists(img_path):
42
+ os.unlink(img_path)
43
 
44
+ doc.close()
45
  return "\n".join(all_text) if all_text else ""
46
  except Exception as e:
47
  print(f"OCR failed: {str(e)}")