pavansuresh committed on
Commit
608a057
·
verified ·
1 Parent(s): 061b9a2

Update ocr_utils.py

Browse files
Files changed (1) hide show
  1. ocr_utils.py +67 -8
ocr_utils.py CHANGED
@@ -1,12 +1,71 @@
1
- from pdf2image import convert_from_path
2
  import pytesseract
3
  import tempfile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
 
5
  def extract_text_from_pdf(pdf_path):
6
- with tempfile.TemporaryDirectory() as tempdir:
7
- images = convert_from_path(pdf_path, dpi=300, output_folder=tempdir)
8
- all_text = []
9
- for img in images:
10
- text = pytesseract.image_to_string(img)
11
- all_text.append(text)
12
- return "\n".join(all_text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import pytesseract
2
  import tempfile
3
+ import os
4
+ from pdf2image import convert_from_path
5
+ from pdf2image.exceptions import PDFInfoNotInstalledError
6
+ try:
7
+ import pdfplumber
8
+ PDFPLUMBER_AVAILABLE = True
9
+ except ImportError:
10
+ PDFPLUMBER_AVAILABLE = False
11
+
12
def check_poppler_installed():
    """Check if Poppler's pdfinfo is installed and in PATH.

    Returns:
        True when an executable named ``pdfinfo`` can be resolved on PATH,
        otherwise False.
    """
    import shutil

    # Resolve the executable instead of running `pdfinfo --version`:
    # Poppler's pdfinfo documents `-v` (not `--version`) as its version flag,
    # so the unrecognised argument makes it exit non-zero and the old probe
    # reported Poppler as missing even when it was installed.
    return shutil.which("pdfinfo") is not None
20
 
21
  def extract_text_from_pdf(pdf_path):
22
+ """
23
+ Extract text from a PDF file using pdf2image and pytesseract for OCR.
24
+ Fallback to pdfplumber if Poppler is not installed or pdf2image fails.
25
+ """
26
+ # Validate PDF path
27
+ if not os.path.exists(pdf_path):
28
+ raise FileNotFoundError(f"❌ PDF file not found: {pdf_path}")
29
+
30
+ # Try pdf2image with Poppler for OCR (best for scanned PDFs)
31
+ if check_poppler_installed():
32
+ try:
33
+ with tempfile.TemporaryDirectory() as tempdir:
34
+ images = convert_from_path(pdf_path, dpi=300, output_folder=tempdir)
35
+ all_text = []
36
+ for img in images:
37
+ text = pytesseract.image_to_string(img)
38
+ all_text.append(text)
39
+ extracted_text = "\n".join(all_text).strip()
40
+ if not extracted_text:
41
+ print("⚠️ No text extracted with pdf2image. The PDF may be empty or OCR failed.")
42
+ return extracted_text
43
+ except PDFInfoNotInstalledError:
44
+ print("❌ Poppler not installed or not in PATH. Falling back to pdfplumber.")
45
+ except Exception as e:
46
+ print(f"❌ Error with pdf2image: {str(e)}. Falling back to pdfplumber.")
47
+ else:
48
+ print("❌ Poppler (pdfinfo) not found. Install it with: sudo apt-get install poppler-utils")
49
+ print("Falling back to pdfplumber for text extraction.")
50
+
51
+ # Fallback to pdfplumber if pdf2image fails or Poppler is not installed
52
+ if not PDFPLUMBER_AVAILABLE:
53
+ raise ImportError(
54
+ "❌ pdfplumber not installed and Poppler is unavailable. "
55
+ "Install pdfplumber with: pip install pdfplumber\n"
56
+ "Or install Poppler with: sudo apt-get install poppler-utils"
57
+ )
58
+
59
+ try:
60
+ with pdfplumber.open(pdf_path) as pdf:
61
+ all_text = []
62
+ for page in pdf.pages:
63
+ page_text = page.extract_text()
64
+ if page_text:
65
+ all_text.append(page_text)
66
+ extracted_text = "\n".join(all_text).strip()
67
+ if not extracted_text:
68
+ print("⚠️ No text extracted with pdfplumber. The PDF may be scanned or empty.")
69
+ return extracted_text
70
+ except Exception as e:
71
+ raise Exception(f"❌ Failed to extract text with pdfplumber: {str(e)}")