Spaces:
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,7 +5,6 @@ from PIL import Image
|
|
| 5 |
import os
|
| 6 |
import tempfile
|
| 7 |
from tqdm import tqdm
|
| 8 |
-
import subprocess
|
| 9 |
import re
|
| 10 |
from ai_mapping import extract_key_values_with_layoutlm, run_ai_mapping_with_layoutlm
|
| 11 |
from ocr_utils import extract_text_from_pdf_with_tesseract_or_layoutlm
|
|
@@ -20,22 +19,6 @@ total_files = 0
|
|
| 20 |
tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
|
| 21 |
model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")
|
| 22 |
|
| 23 |
-
def check_poppler():
|
| 24 |
-
"""Check if poppler-utils is installed."""
|
| 25 |
-
try:
|
| 26 |
-
subprocess.run(['pdftoppm', '-v'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
| 27 |
-
return True
|
| 28 |
-
except FileNotFoundError:
|
| 29 |
-
return False
|
| 30 |
-
|
| 31 |
-
def check_tesseract():
|
| 32 |
-
"""Check if tesseract-ocr is installed."""
|
| 33 |
-
try:
|
| 34 |
-
subprocess.run(['tesseract', '-v'], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
| 35 |
-
return True
|
| 36 |
-
except FileNotFoundError:
|
| 37 |
-
return False
|
| 38 |
-
|
| 39 |
def save_temp_file(pdf_bytes):
|
| 40 |
"""Save PDF bytes to a temporary file and return the path."""
|
| 41 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
|
|
@@ -58,11 +41,6 @@ def process_contract(pdf_bytes, object_type):
|
|
| 58 |
processed_files = 0
|
| 59 |
|
| 60 |
print("Received file - Starting processing")
|
| 61 |
-
if not check_poppler() or not check_tesseract():
|
| 62 |
-
error_msg = "Error: Required dependencies missing. Install poppler-utils (e.g., 'sudo apt-get install poppler-utils') and tesseract-ocr (e.g., 'sudo apt-get install tesseract-ocr')."
|
| 63 |
-
print(error_msg)
|
| 64 |
-
return error_msg, {}, [], "0/1"
|
| 65 |
-
|
| 66 |
temp_path = save_temp_file(pdf_bytes)
|
| 67 |
print(f"Temporary file created at: {temp_path}")
|
| 68 |
text = extract_text_from_pdf_with_tesseract_or_layoutlm(temp_path)
|
|
|
|
| 5 |
import os
|
| 6 |
import tempfile
|
| 7 |
from tqdm import tqdm
|
|
|
|
| 8 |
import re
|
| 9 |
from ai_mapping import extract_key_values_with_layoutlm, run_ai_mapping_with_layoutlm
|
| 10 |
from ocr_utils import extract_text_from_pdf_with_tesseract_or_layoutlm
|
|
|
|
| 19 |
tokenizer = LayoutLMv3Tokenizer.from_pretrained("microsoft/layoutlmv3-base")
|
| 20 |
model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
def save_temp_file(pdf_bytes):
|
| 23 |
"""Save PDF bytes to a temporary file and return the path."""
|
| 24 |
with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
|
|
|
|
| 41 |
processed_files = 0
|
| 42 |
|
| 43 |
print("Received file - Starting processing")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
temp_path = save_temp_file(pdf_bytes)
|
| 45 |
print(f"Temporary file created at: {temp_path}")
|
| 46 |
text = extract_text_from_pdf_with_tesseract_or_layoutlm(temp_path)
|