pavansuresh committed on
Commit
ab6b7bb
·
verified ·
1 Parent(s): 5f8a311

Update ocr_utils.py

Browse files
Files changed (1) hide show
  1. ocr_utils.py +26 -29
ocr_utils.py CHANGED
@@ -13,59 +13,56 @@ def check_poppler_installed():
13
  """Check if Poppler's pdfinfo is installed and in PATH."""
14
  import subprocess
15
  try:
16
- subprocess.run(['pdfinfo', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
 
17
  return True
18
  except (subprocess.CalledProcessError, FileNotFoundError):
 
19
  return False
20
 
21
  def extract_text_from_pdf(pdf_path):
22
  """
23
  Extract text from a PDF file using pdf2image and pytesseract for OCR.
24
- Fallback to pdfplumber if Poppler is not installed or pdf2image fails.
25
  """
 
26
  # Validate PDF path
27
  if not os.path.exists(pdf_path):
28
  raise FileNotFoundError(f"❌ PDF file not found: {pdf_path}")
29
 
 
 
 
 
 
 
 
30
  # Try pdf2image with Poppler for OCR (best for scanned PDFs)
 
31
  if check_poppler_installed():
32
  try:
33
  with tempfile.TemporaryDirectory() as tempdir:
 
34
  images = convert_from_path(pdf_path, dpi=300, output_folder=tempdir)
35
- all_text = []
36
- for img in images:
37
- text = pytesseract.image_to_string(img)
38
- all_text.append(text)
39
- extracted_text = "\n".join(all_text).strip()
40
- if not extracted_text:
41
- print("⚠️ No text extracted with pdf2image. The PDF may be empty or OCR failed.")
42
- return extracted_text
 
43
  except PDFInfoNotInstalledError:
44
  print("❌ Poppler not installed or not in PATH. Falling back to pdfplumber.")
45
  except Exception as e:
46
  print(f"❌ Error with pdf2image: {str(e)}. Falling back to pdfplumber.")
47
- else:
48
- print(" Poppler (pdfinfo) not found. Install it with: sudo apt-get install poppler-utils")
49
- print("Falling back to pdfplumber for text extraction.")
50
 
51
  # Fallback to pdfplumber if pdf2image fails or Poppler is not installed
52
  if not PDFPLUMBER_AVAILABLE:
53
  raise ImportError(
54
  "❌ pdfplumber not installed and Poppler is unavailable. "
55
  "Install pdfplumber with: pip install pdfplumber\n"
56
- "Or install Poppler with: sudo apt-get install poppler-utils"
57
- )
58
-
59
- try:
60
- with pdfplumber.open(pdf_path) as pdf:
61
- all_text = []
62
- for page in pdf.pages:
63
- page_text = page.extract_text()
64
- if page_text:
65
- all_text.append(page_text)
66
- extracted_text = "\n".join(all_text).strip()
67
- if not extracted_text:
68
- print("⚠️ No text extracted with pdfplumber. The PDF may be scanned or empty.")
69
- return extracted_text
70
- except Exception as e:
71
- raise Exception(f"❌ Failed to extract text with pdfplumber: {str(e)}")
 
13
  """Check if Poppler's pdfinfo is installed and in PATH."""
14
  import subprocess
15
  try:
16
+ result = subprocess.run(['pdfinfo', '--version'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
17
+ print(f"✅ Poppler found: {result.stdout.decode().strip()}")
18
  return True
19
  except (subprocess.CalledProcessError, FileNotFoundError):
20
+ print("❌ Poppler (pdfinfo) not found. Install it with: sudo apt-get install poppler-utils")
21
  return False
22
 
23
  def extract_text_from_pdf(pdf_path):
24
  """
25
  Extract text from a PDF file using pdf2image and pytesseract for OCR.
26
+ Fallback to pdfplumber for text-based PDFs. Returns structured JSON per page.
27
  """
28
+ print(f"Processing PDF: {pdf_path}")
29
  # Validate PDF path
30
  if not os.path.exists(pdf_path):
31
  raise FileNotFoundError(f"❌ PDF file not found: {pdf_path}")
32
 
33
+ # Check for tesseract-ocr
34
+ try:
35
+ pytesseract.get_tesseract_version()
36
+ print("✅ Tesseract OCR found")
37
+ except pytesseract.TesseractNotFoundError:
38
+ raise Exception("❌ Tesseract OCR not found. Install it with: sudo apt-get install tesseract-ocr")
39
+
40
  # Try pdf2image with Poppler for OCR (best for scanned PDFs)
41
+ result = {"pages": [], "status": "success", "error": None}
42
  if check_poppler_installed():
43
  try:
44
  with tempfile.TemporaryDirectory() as tempdir:
45
+ print(f"Converting PDF to images in temp directory: {tempdir}")
46
  images = convert_from_path(pdf_path, dpi=300, output_folder=tempdir)
47
+ print(f"Converted {len(images)} pages to images")
48
+ for i, img in enumerate(images):
49
+ text = pytesseract.image_to_string(img).strip()
50
+ print(f"Extracted text from page {i+1}: {text[:50]}...")
51
+ result["pages"].append({"page_number": i+1, "text": text})
52
+ if not result["pages"]:
53
+ result["status"] = "failed"
54
+ result["error"] = "No text extracted with pdf2image. The PDF may be empty or OCR failed."
55
+ return result
56
  except PDFInfoNotInstalledError:
57
  print("❌ Poppler not installed or not in PATH. Falling back to pdfplumber.")
58
  except Exception as e:
59
  print(f"❌ Error with pdf2image: {str(e)}. Falling back to pdfplumber.")
60
+ result["status"] = "failed"
61
+ result["error"] = str(e)
 
62
 
63
  # Fallback to pdfplumber if pdf2image fails or Poppler is not installed
64
  if not PDFPLUMBER_AVAILABLE:
65
  raise ImportError(
66
  "❌ pdfplumber not installed and Poppler is unavailable. "
67
  "Install pdfplumber with: pip install pdfplumber\n"
68
+ Long context detected, continuing in next response...