pavansuresh committed on
Commit
83973ae
·
verified ·
1 Parent(s): 4c19d6c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -22
app.py CHANGED
@@ -5,7 +5,6 @@ from PIL import Image
5
  import os
6
  import tempfile
7
  from tqdm import tqdm
8
- import subprocess
9
  import re
10
  from ai_mapping import extract_key_values_with_layoutlm, run_ai_mapping_with_layoutlm
11
  from ocr_utils import extract_text_from_pdf_with_tesseract_or_layoutlm
@@ -20,22 +19,6 @@ total_files = 0
20
  tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
21
  model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")
22
 
23
- def check_poppler():
24
- """Check if poppler-utils is installed."""
25
- try:
26
- subprocess.run(['pdftoppm', '-v'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
27
- return True
28
- except FileNotFoundError:
29
- return False
30
-
31
- def check_tesseract():
32
- """Check if tesseract-ocr is installed."""
33
- try:
34
- subprocess.run(['tesseract', '-v'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
35
- return True
36
- except FileNotFoundError:
37
- return False
38
-
39
  def save_temp_file(pdf_bytes):
40
  """Save PDF bytes to a temporary file and return the path."""
41
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
@@ -58,11 +41,6 @@ def process_contract(pdf_bytes, object_type):
58
  processed_files = 0
59
 
60
  print("Received file - Starting processing")
61
- if not check_poppler() or not check_tesseract():
62
- error_msg = "Error: Required dependencies missing. Install poppler-utils (e.g., 'sudo apt-get install poppler-utils') and tesseract-ocr (e.g., 'sudo apt-get install tesseract-ocr')."
63
- print(error_msg)
64
- return error_msg, {}, [], "0/1"
65
-
66
  temp_path = save_temp_file(pdf_bytes)
67
  print(f"Temporary file created at: {temp_path}")
68
  text = extract_text_from_pdf_with_tesseract_or_layoutlm(temp_path)
 
5
  import os
6
  import tempfile
7
  from tqdm import tqdm
 
8
  import re
9
  from ai_mapping import extract_key_values_with_layoutlm, run_ai_mapping_with_layoutlm
10
  from ocr_utils import extract_text_from_pdf_with_tesseract_or_layoutlm
 
19
  tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
20
  model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  def save_temp_file(pdf_bytes):
23
  """Save PDF bytes to a temporary file and return the path."""
24
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
 
41
  processed_files = 0
42
 
43
  print("Received file - Starting processing")
 
 
 
 
 
44
  temp_path = save_temp_file(pdf_bytes)
45
  print(f"Temporary file created at: {temp_path}")
46
  text = extract_text_from_pdf_with_tesseract_or_layoutlm(temp_path)