Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,6 +3,7 @@ import gradio as gr
|
|
| 3 |
import os
|
| 4 |
import json
|
| 5 |
import shutil
|
|
|
|
| 6 |
from datetime import datetime
|
| 7 |
from pathlib import Path
|
| 8 |
from typing import Tuple
|
|
@@ -11,6 +12,47 @@ from typing import Tuple
|
|
| 11 |
from main6_pix2text import extract_all_text_advanced_pix2text, initialize_pix2text
|
| 12 |
from eval import evaluate_ocr_accuracy, clean_control_characters
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
# Initialize directories
|
| 16 |
def create_directories():
|
|
@@ -90,7 +132,44 @@ def process_pdf_ocr(pdf_file) -> Tuple[str, str, str]:
|
|
| 90 |
return extracted_text, json_display, analysis_display
|
| 91 |
|
| 92 |
except Exception as e:
|
| 93 |
-
error_msg =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
return error_msg, "", ""
|
| 95 |
|
| 96 |
|
|
@@ -397,11 +476,6 @@ except Exception as e:
|
|
| 397 |
# Create and launch the interface
|
| 398 |
if __name__ == "__main__":
|
| 399 |
app = create_interface()
|
| 400 |
-
|
| 401 |
# Launch with proper configuration for Hugging Face Spaces
|
| 402 |
-
app.launch(
|
| 403 |
-
server_name="0.0.0.0",
|
| 404 |
-
server_port=7860,
|
| 405 |
-
share=False,
|
| 406 |
-
show_error=True
|
| 407 |
-
)
|
|
|
|
| 3 |
import os
|
| 4 |
import json
|
| 5 |
import shutil
|
| 6 |
+
import subprocess
|
| 7 |
from datetime import datetime
|
| 8 |
from pathlib import Path
|
| 9 |
from typing import Tuple
|
|
|
|
| 12 |
from main6_pix2text import extract_all_text_advanced_pix2text, initialize_pix2text
|
| 13 |
from eval import evaluate_ocr_accuracy, clean_control_characters
|
| 14 |
|
| 15 |
+
def check_system_dependencies():
    """Check and report system dependencies status.

    Prints one status line per dependency to stdout:

    * Tesseract: runs ``tesseract --version`` and inspects the exit code.
    * Poppler: looks up ``pdftoppm`` and ``pdfinfo`` on ``PATH``.
    * pdf2image: checks that the module can be located, without importing it.

    The current ``PATH`` is printed last to help diagnose lookup failures.

    This is a best-effort diagnostic: a missing or broken dependency is
    reported, never raised. Returns ``None``.
    """
    print("🔍 Checking system dependencies...")

    # Check Tesseract
    try:
        result = subprocess.run(
            ["tesseract", "--version"], capture_output=True, text=True
        )
    except FileNotFoundError:
        print("❌ Tesseract not found in PATH")
    except OSError as e:
        # The binary exists but cannot be launched (e.g. PermissionError);
        # report it instead of aborting the startup check.
        print(f"❌ Tesseract check failed: {e}")
    else:
        if result.returncode == 0:
            print("✅ Tesseract is available")
        else:
            print("❌ Tesseract check failed")

    # Check Poppler
    # shutil.which performs the PATH lookup in-process, so the check does not
    # depend on an external `which` executable being installed.
    for tool in ("pdftoppm", "pdfinfo"):
        if shutil.which(tool) is not None:
            print(f"✅ {tool} is available")
        else:
            print(f"❌ {tool} not found")

    # Check pdf2image
    try:
        import importlib.util

        if importlib.util.find_spec("pdf2image") is not None:
            print("✅ pdf2image is available")
        else:
            print("❌ pdf2image module not found")
    except Exception as e:
        # Deliberately broad: this is a diagnostic and must not stop startup.
        print(f"❌ pdf2image check failed: {e}")

    print(f"📍 PATH: {os.environ.get('PATH', 'NOT SET')}")
|
| 52 |
+
|
| 53 |
+
# Run dependency check on startup
|
| 54 |
+
check_system_dependencies()
|
| 55 |
+
|
| 56 |
|
| 57 |
# Initialize directories
|
| 58 |
def create_directories():
|
|
|
|
| 132 |
return extracted_text, json_display, analysis_display
|
| 133 |
|
| 134 |
except Exception as e:
|
| 135 |
+
error_msg = str(e)
|
| 136 |
+
|
| 137 |
+
# Provide specific guidance for common errors
|
| 138 |
+
if (
|
| 139 |
+
"poppler" in error_msg.lower()
|
| 140 |
+
or "unable to get page count" in error_msg.lower()
|
| 141 |
+
):
|
| 142 |
+
error_msg = """❌ PDF Processing Error: Poppler not found
|
| 143 |
+
|
| 144 |
+
🔧 This error occurs because Poppler (PDF utilities) is not properly installed.
|
| 145 |
+
|
| 146 |
+
📋 For Hugging Face Spaces:
|
| 147 |
+
1. Ensure your setup.sh script runs during deployment
|
| 148 |
+
2. Check that poppler-utils is installed in the container
|
| 149 |
+
3. Verify the setup logs show successful poppler installation
|
| 150 |
+
|
| 151 |
+
💡 The setup.sh script should install these packages:
|
| 152 |
+
- poppler-utils
|
| 153 |
+
- libpoppler-cpp-dev
|
| 154 |
+
- pkg-config
|
| 155 |
+
|
| 156 |
+
🚨 Original error: {error_msg}
|
| 157 |
+
|
| 158 |
+
🔄 Try restarting the space if this persists."""
|
| 159 |
+
elif "tesseract" in error_msg.lower():
|
| 160 |
+
error_msg = f"""❌ OCR Engine Error: Tesseract issue
|
| 161 |
+
|
| 162 |
+
🔧 This error is related to Tesseract OCR engine.
|
| 163 |
+
|
| 164 |
+
📋 Possible solutions:
|
| 165 |
+
1. Check Tesseract installation in setup.sh
|
| 166 |
+
2. Verify language data files are available
|
| 167 |
+
3. Ensure proper permissions on tessdata directory
|
| 168 |
+
|
| 169 |
+
🚨 Original error: {error_msg}"""
|
| 170 |
+
else:
|
| 171 |
+
error_msg = f"❌ Error processing PDF: {error_msg}"
|
| 172 |
+
|
| 173 |
return error_msg, "", ""
|
| 174 |
|
| 175 |
|
|
|
|
| 476 |
# Create and launch the interface
|
| 477 |
if __name__ == "__main__":
|
| 478 |
app = create_interface()
|
| 479 |
+
|
| 480 |
# Launch with proper configuration for Hugging Face Spaces
|
| 481 |
+
app.launch(server_name="0.0.0.0", server_port=7860, share=False, show_error=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|