pavansuresh committed on
Commit
2ae2f88
·
verified ·
1 Parent(s): b9ae2ff

Update ocr_utils.py

Browse files
Files changed (1) hide show
  1. ocr_utils.py +12 -24
ocr_utils.py CHANGED
@@ -1,47 +1,35 @@
1
  from pdf2image import convert_from_path
2
  import pytesseract
3
- from transformers import LayoutLMv3ImageProcessor, LayoutLMv3ForTokenClassification
4
- from PIL import Image
5
- import torch
6
  import os
7
-
8
- # Load LayoutLMv3 components for OCR (optional, use if fine-tuned)
9
- processor = LayoutLMv3ImageProcessor(apply_ocr=True)
10
- model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base") # Fine-tune for OCR if needed
11
 
12
  def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> str:
13
  """
14
- Extract text from a scanned PDF using Tesseract or LayoutLMv3.
15
  Args:
16
  pdf_path (str): Path to the PDF file.
17
  Returns:
18
  str: Extracted text from all pages, or empty string if failed.
19
  """
20
  try:
21
- # Convert PDF to images (one per page)
22
- images = convert_from_path(pdf_path)
 
 
 
23
  all_text = []
24
 
25
  for i, image in enumerate(images):
26
- # Try Tesseract first
27
  text = pytesseract.image_to_string(image)
28
  if text.strip():
29
  all_text.append(f"Page {i+1}:\n{text}")
30
  else:
31
- # Fall back to LayoutLMv3 if Tesseract fails (simplified)
32
- encoding = processor(images=[image], return_tensors="pt")
33
- input_ids = encoding["input_ids"]
34
- attention_mask = encoding["attention_mask"]
35
-
36
- with torch.no_grad():
37
- outputs = model(input_ids=input_ids, attention_mask=attention_mask)
38
- predictions = torch.argmax(outputs.logits, dim=2)
39
- tokens = processor.tokenizer.convert_ids_to_tokens(input_ids[0])
40
- labels = predictions[0].tolist()
41
- page_text = " ".join([tokens[i] for i, label in enumerate(labels) if label > 0]) # Adjust label logic
42
- all_text.append(f"Page {i+1} (LayoutLMv3):\n{page_text}")
43
 
44
  return "\n".join(all_text) if all_text else ""
45
  except Exception as e:
46
  print(f"OCR failed: {str(e)}")
47
- return ""
 
 
 
 
1
  from pdf2image import convert_from_path
2
  import pytesseract
 
 
 
3
  import os
4
+ import tempfile
 
 
 
5
 
6
def extract_text_from_pdf_with_tesseract_or_layoutlm(pdf_path: str) -> str:
    """
    Extract text from a scanned PDF using Tesseract.

    The PDF is copied to a temporary ``.pdf`` file, rendered to one image per
    page with ``convert_from_path``, and each page image is passed to
    ``pytesseract.image_to_string``. The temporary copy is removed before the
    function returns, whether or not OCR succeeded.

    The ``_or_layoutlm`` suffix in the name is kept only so existing callers
    keep working; this implementation uses Tesseract exclusively.

    Args:
        pdf_path (str): Path to the PDF file.
    Returns:
        str: Extracted text from all pages, each prefixed with
            ``Page <n>:`` (pages with no recognised text are reported as
            ``Page <n>: No text detected``), or empty string if failed.
    """
    # Bound before the try block so the cleanup in ``finally`` can always
    # inspect it, even when the failure happens before the copy completes.
    temp_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            # Record the name immediately: the file already exists on disk,
            # so it must be cleaned up even if reading the source PDF fails.
            temp_path = tmp.name
            with open(pdf_path, "rb") as src:
                tmp.write(src.read())

        # Render only after the ``with`` block has flushed and closed the
        # temporary copy, so the converter sees the complete file.
        images = convert_from_path(temp_path)

        all_text = []
        for page_number, image in enumerate(images, start=1):
            text = pytesseract.image_to_string(image)
            if text.strip():
                all_text.append(f"Page {page_number}:\n{text}")
            else:
                all_text.append(f"Page {page_number}: No text detected")

        # Joining an empty list already yields "", matching the documented
        # empty-string result for a PDF with no pages.
        return "\n".join(all_text)
    except Exception as e:
        # Deliberate best-effort boundary: any failure is reported and
        # turned into an empty result rather than propagated to the caller.
        print(f"OCR failed: {str(e)}")
        return ""
    finally:
        if temp_path is not None and os.path.exists(temp_path):
            try:
                os.unlink(temp_path)
            except OSError:
                # Cleanup is best-effort; a leftover temp file must not
                # replace the function's normal return value with an error.
                pass