pavansuresh committed on
Commit
c8b6167
·
verified ·
1 Parent(s): 8f3b77b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -0
app.py CHANGED
@@ -28,10 +28,20 @@ def check_poppler():
28
  except FileNotFoundError:
29
  return False
30
 
 
 
 
 
 
 
 
 
31
  def extract_text_from_pdf(pdf_bytes):
32
  """Convert PDF to images and extract text using OCR."""
33
  if not check_poppler():
34
  return "Error: poppler-utils not installed. Install it (e.g., 'sudo apt-get install poppler-utils')."
 
 
35
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
36
  tmp.write(pdf_bytes)
37
  temp_path = tmp.name
@@ -40,8 +50,10 @@ def extract_text_from_pdf(pdf_bytes):
40
  text = ""
41
  for img in images:
42
  text += pytesseract.image_to_string(img) + "\n"
 
43
  return text
44
  except Exception as e:
 
45
  return f"Error extracting text: {str(e)}"
46
  finally:
47
  if os.path.exists(temp_path):
@@ -74,7 +86,9 @@ def process_contract(pdf_bytes, object_type):
74
  if isinstance(text, str) and text.startswith("Error"):
75
  return text, {}, [], "0/1"
76
 
 
77
  key_data = extract_key_data(text)
 
78
  risks = detect_risks(key_data)
79
  status = "✅ Processed" if not risks else "⚠️ Processed with risks"
80
 
 
28
  except FileNotFoundError:
29
  return False
30
 
31
def check_tesseract():
    """Check if tesseract-ocr is installed and runnable.

    Returns:
        bool: True when ``tesseract -v`` can be launched and exits with
        status 0; False when the binary is missing from PATH, cannot be
        executed, or exits with a non-zero status.
    """
    try:
        # The output is never inspected, so discard it instead of buffering it.
        result = subprocess.run(
            ['tesseract', '-v'],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
        )
    except OSError:
        # Covers FileNotFoundError (not on PATH) as well as PermissionError
        # (file present but not executable).
        return False
    # A broken install can still launch yet report failure via its exit code.
    return result.returncode == 0
38
+
39
  def extract_text_from_pdf(pdf_bytes):
40
  """Convert PDF to images and extract text using OCR."""
41
  if not check_poppler():
42
  return "Error: poppler-utils not installed. Install it (e.g., 'sudo apt-get install poppler-utils')."
43
+ if not check_tesseract():
44
+ return "Error: tesseract-ocr not installed. Install it (e.g., 'sudo apt-get install tesseract-ocr')."
45
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
46
  tmp.write(pdf_bytes)
47
  temp_path = tmp.name
 
50
  text = ""
51
  for img in images:
52
  text += pytesseract.image_to_string(img) + "\n"
53
+ print(f"OCR completed - Extracted text length: {len(text)}")
54
  return text
55
  except Exception as e:
56
+ print(f"OCR failed: {str(e)}")
57
  return f"Error extracting text: {str(e)}"
58
  finally:
59
  if os.path.exists(temp_path):
 
86
  if isinstance(text, str) and text.startswith("Error"):
87
  return text, {}, [], "0/1"
88
 
89
+ print(f"Extracting key data")
90
  key_data = extract_key_data(text)
91
+ print(f"Detecting risks")
92
  risks = detect_risks(key_data)
93
  status = "✅ Processed" if not risks else "⚠️ Processed with risks"
94