Ash2749 committed on
Commit
0302b73
·
verified ·
1 Parent(s): dc406c6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -8
app.py CHANGED
@@ -3,6 +3,7 @@ import gradio as gr
3
  import os
4
  import json
5
  import shutil
 
6
  from datetime import datetime
7
  from pathlib import Path
8
  from typing import Tuple
@@ -11,6 +12,47 @@ from typing import Tuple
11
  from main6_pix2text import extract_all_text_advanced_pix2text, initialize_pix2text
12
  from eval import evaluate_ocr_accuracy, clean_control_characters
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  # Initialize directories
16
  def create_directories():
@@ -90,7 +132,44 @@ def process_pdf_ocr(pdf_file) -> Tuple[str, str, str]:
90
  return extracted_text, json_display, analysis_display
91
 
92
  except Exception as e:
93
- error_msg = f"❌ Error processing PDF: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  return error_msg, "", ""
95
 
96
 
@@ -397,11 +476,6 @@ except Exception as e:
397
  # Create and launch the interface
398
  if __name__ == "__main__":
399
  app = create_interface()
400
-
401
  # Launch with proper configuration for Hugging Face Spaces
402
- app.launch(
403
- server_name="0.0.0.0",
404
- server_port=7860,
405
- share=False,
406
- show_error=True
407
- )
 
3
  import os
4
  import json
5
  import shutil
6
+ import subprocess
7
  from datetime import datetime
8
  from pathlib import Path
9
  from typing import Tuple
 
12
  from main6_pix2text import extract_all_text_advanced_pix2text, initialize_pix2text
13
  from eval import evaluate_ocr_accuracy, clean_control_characters
14
 
15
def check_system_dependencies():
    """Print a startup report on the external tools the OCR pipeline needs.

    Checks, in order:
      * Tesseract, by running ``tesseract --version``;
      * the Poppler command-line tools ``pdftoppm`` and ``pdfinfo``, by
        looking them up on ``PATH``;
      * the ``pdf2image`` Python package, by locating its import spec
        (the package itself is not imported).

    Finally the current ``PATH`` is printed to help debug lookup failures.

    The function is purely diagnostic: a missing or broken dependency is
    reported on stdout, never raised. Always returns ``None``.
    """
    print("🔍 Checking system dependencies...")

    # Check Tesseract
    try:
        result = subprocess.run(
            ["tesseract", "--version"],
            capture_output=True,
            text=True,
            timeout=15,  # a hung binary must not block application startup
        )
    except FileNotFoundError:
        print("❌ Tesseract not found in PATH")
    except (OSError, subprocess.TimeoutExpired) as exc:
        # e.g. PermissionError on a non-executable file, or the timeout above.
        print(f"❌ Tesseract check failed: {exc}")
    else:
        if result.returncode == 0:
            print("✅ Tesseract is available")
        else:
            print("❌ Tesseract check failed")

    # Check Poppler
    # shutil.which does the PATH lookup in-process, so the result does not
    # depend on an external `which` binary being installed.
    for tool in ("pdftoppm", "pdfinfo"):
        if shutil.which(tool) is not None:
            print(f"✅ {tool} is available")
        else:
            print(f"❌ {tool} not found")

    # Check pdf2image
    try:
        import importlib.util

        if importlib.util.find_spec("pdf2image") is not None:
            print("✅ pdf2image is available")
        else:
            print("❌ pdf2image module not found")
    except (ImportError, ValueError) as e:
        print(f"❌ pdf2image check failed: {e}")

    print(f"📍 PATH: {os.environ.get('PATH', 'NOT SET')}")
52
+
53
+ # Run dependency check on startup
54
+ check_system_dependencies()
55
+
56
 
57
  # Initialize directories
58
  def create_directories():
 
132
  return extracted_text, json_display, analysis_display
133
 
134
  except Exception as e:
135
+ error_msg = str(e)
136
+
137
+ # Provide specific guidance for common errors
138
+ if (
139
+ "poppler" in error_msg.lower()
140
+ or "unable to get page count" in error_msg.lower()
141
+ ):
142
+ error_msg = """❌ PDF Processing Error: Poppler not found
143
+
144
+ πŸ”§ This error occurs because Poppler (PDF utilities) is not properly installed.
145
+
146
+ πŸ“‹ For Hugging Face Spaces:
147
+ 1. Ensure your setup.sh script runs during deployment
148
+ 2. Check that poppler-utils is installed in the container
149
+ 3. Verify the setup logs show successful poppler installation
150
+
151
+ πŸ’‘ The setup.sh script should install these packages:
152
+ - poppler-utils
153
+ - libpoppler-cpp-dev
154
+ - pkg-config
155
+
156
+ 🚨 Original error: {error_msg}
157
+
158
+ πŸ”„ Try restarting the space if this persists."""
159
+ elif "tesseract" in error_msg.lower():
160
+ error_msg = f"""❌ OCR Engine Error: Tesseract issue
161
+
162
+ πŸ”§ This error is related to Tesseract OCR engine.
163
+
164
+ πŸ“‹ Possible solutions:
165
+ 1. Check Tesseract installation in setup.sh
166
+ 2. Verify language data files are available
167
+ 3. Ensure proper permissions on tessdata directory
168
+
169
+ 🚨 Original error: {error_msg}"""
170
+ else:
171
+ error_msg = f"❌ Error processing PDF: {error_msg}"
172
+
173
  return error_msg, "", ""
174
 
175
 
 
476
  # Create and launch the interface
477
  if __name__ == "__main__":
478
  app = create_interface()
479
+
480
  # Launch with proper configuration for Hugging Face Spaces
481
+ app.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)