mbuckle committed on
Commit
9b66525
·
1 Parent(s): 8c10e64

Page Count fix

Browse files
Files changed (1) hide show
  1. app.py +140 -13
app.py CHANGED
@@ -1,17 +1,103 @@
1
- # app.py - Hugging Face Spaces version
2
- import gradio as gr
3
- import tempfile
4
  import os
 
 
 
5
  import time
6
  import base64
7
  import json
8
- from paddleocr import PaddleOCR
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  import fitz # PyMuPDF
10
 
11
- # Initialize PaddleOCR
12
- print("Loading PaddleOCR models...")
13
- ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
14
- print("PaddleOCR models loaded!")
 
 
 
 
 
 
 
15
 
16
  def process_document(file):
17
  """Process uploaded document with PaddleOCR"""
@@ -21,7 +107,6 @@ def process_document(file):
21
  start_time = time.time()
22
 
23
  try:
24
- # Get file info
25
  filename = os.path.basename(file.name)
26
  print(f"Processing: {filename}")
27
 
@@ -30,7 +115,11 @@ def process_document(file):
30
  if filename.lower().endswith('.pdf'):
31
  try:
32
  doc = fitz.open(file.name)
33
- total_pages = len(doc)
 
 
 
 
34
  doc.close()
35
  except Exception as e:
36
  print(f"Could not count PDF pages: {e}")
@@ -57,6 +146,7 @@ def process_document(file):
57
  📊 **Pages Processed**: {pages_processed}/{total_pages}
58
  ⏱️ **Processing Time**: {processing_time:.2f} seconds
59
  πŸ“ **Text Length**: {len(extracted_text)} characters
 
60
  """
61
 
62
  # For API compatibility, also return JSON format
@@ -66,13 +156,15 @@ def process_document(file):
66
  "filename": filename,
67
  "pages_processed": pages_processed,
68
  "total_pages": total_pages,
69
- "processing_time": processing_time
 
70
  }, indent=2)
71
 
72
  return summary, extracted_text, api_response
73
 
74
  except Exception as e:
75
  error_msg = f"Error processing file: {str(e)}"
 
76
  return error_msg, "", json.dumps({"success": False, "error": str(e)})
77
 
78
  def process_api_request(api_data):
@@ -107,7 +199,8 @@ def process_api_request(api_data):
107
  return json.dumps({
108
  "success": True,
109
  "text": text,
110
- "filename": filename
 
111
  })
112
 
113
  finally:
@@ -148,10 +241,40 @@ with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
148
 
149
  with gr.Tab("🔌 API Integration"):
150
  gr.Markdown("### For integration with your Vercel app:")
151
- gr.Markdown("**Endpoint**: `https://your-space-name-your-username.hf.space/api/predict`")
152
  gr.Markdown("**Method**: POST")
153
  gr.Markdown("**Headers**: `Content-Type: application/json`")
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  api_input = gr.Textbox(
156
  label="API Request (JSON)",
157
  placeholder='{"file": "base64_encoded_file_data", "filename": "document.pdf"}',
@@ -188,6 +311,10 @@ with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
188
  - Multi-page PDF support
189
  - RESTful API integration
190
  - Free hosting on Hugging Face
 
 
 
 
191
  """)
192
 
193
  # Launch the app
 
1
+ # app.py - Complete Hugging Face Spaces app with SSL fix
 
 
2
  import os
3
+ import subprocess
4
+ import sys
5
+ import tempfile
6
  import time
7
  import base64
8
  import json
9
+
10
+ # Try to fix SSL library issue before importing PaddleOCR
11
+ def fix_ssl_library():
12
+ """Download and install libssl1.1 if not present"""
13
+ try:
14
+ # Check if libssl1.1 already exists
15
+ if os.path.exists('/usr/lib/x86_64-linux-gnu/libssl.so.1.1'):
16
+ print("libssl.so.1.1 already exists")
17
+ return True
18
+
19
+ print("Attempting to install libssl1.1...")
20
+
21
+ # Download libssl1.1 from Ubuntu repos
22
+ subprocess.run([
23
+ 'wget', '-q',
24
+ 'http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb',
25
+ '-O', '/tmp/libssl1.1.deb'
26
+ ], check=True)
27
+
28
+ # Try to install the package
29
+ result = subprocess.run([
30
+ 'dpkg', '-i', '/tmp/libssl1.1.deb'
31
+ ], capture_output=True, text=True)
32
+
33
+ # If dpkg install failed, try extracting manually
34
+ if result.returncode != 0:
35
+ print("dpkg install failed, trying manual extraction...")
36
+ subprocess.run([
37
+ 'dpkg', '-x', '/tmp/libssl1.1.deb', '/tmp/ssl_extract'
38
+ ], check=True)
39
+
40
+ # Set LD_LIBRARY_PATH to include the extracted libraries
41
+ lib_path = '/tmp/ssl_extract/usr/lib/x86_64-linux-gnu'
42
+ current_ld_path = os.environ.get('LD_LIBRARY_PATH', '')
43
+ if current_ld_path:
44
+ os.environ['LD_LIBRARY_PATH'] = f"{lib_path}:{current_ld_path}"
45
+ else:
46
+ os.environ['LD_LIBRARY_PATH'] = lib_path
47
+ print(f"Set LD_LIBRARY_PATH to: {os.environ['LD_LIBRARY_PATH']}")
48
+
49
+ return True
50
+
51
+ except Exception as e:
52
+ print(f"Failed to install libssl1.1: {e}")
53
+ return False
54
+
55
+ # Try alternative PaddlePaddle versions
56
+ def try_paddle_import():
57
+ """Try different approaches to import PaddleOCR"""
58
+
59
+ # First try the SSL fix
60
+ fix_ssl_library()
61
+
62
+ # Try importing with different environment variables
63
+ os.environ['PADDLE_GIT_DISABLE'] = '1'
64
+
65
+ try:
66
+ from paddleocr import PaddleOCR
67
+ return PaddleOCR
68
+ except ImportError as e:
69
+ if 'libssl.so.1.1' in str(e):
70
+ print("Still having SSL issues, trying alternative PaddlePaddle version...")
71
+
72
+ # Try installing older version
73
+ try:
74
+ subprocess.run([sys.executable, '-m', 'pip', 'uninstall', 'paddlepaddle', '-y'],
75
+ capture_output=True)
76
+ subprocess.run([sys.executable, '-m', 'pip', 'install', 'paddlepaddle==2.4.2'],
77
+ check=True)
78
+ from paddleocr import PaddleOCR
79
+ return PaddleOCR
80
+ except Exception as inner_e:
81
+ print(f"Failed to install alternative version: {inner_e}")
82
+
83
+ print(f"PaddleOCR import failed: {e}")
84
+ raise e
85
+
86
+ # Import other required libraries
87
+ import gradio as gr
88
  import fitz # PyMuPDF
89
 
90
+ # Try to import PaddleOCR with fixes
91
+ print("Attempting to import PaddleOCR...")
92
+ try:
93
+ PaddleOCR = try_paddle_import()
94
+ print("Loading PaddleOCR models...")
95
+ ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
96
+ print("PaddleOCR models loaded successfully!")
97
+ except Exception as e:
98
+ print(f"Failed to load PaddleOCR: {e}")
99
+ print("Application will exit - SSL library issue not resolved")
100
+ sys.exit(1)
101
 
102
  def process_document(file):
103
  """Process uploaded document with PaddleOCR"""
 
107
  start_time = time.time()
108
 
109
  try:
 
110
  filename = os.path.basename(file.name)
111
  print(f"Processing: {filename}")
112
 
 
115
  if filename.lower().endswith('.pdf'):
116
  try:
117
  doc = fitz.open(file.name)
118
+ # Handle different PyMuPDF versions
119
+ try:
120
+ total_pages = doc.page_count # Newer versions
121
+ except AttributeError:
122
+ total_pages = len(doc) # Older versions or alternative
123
  doc.close()
124
  except Exception as e:
125
  print(f"Could not count PDF pages: {e}")
 
146
  📊 **Pages Processed**: {pages_processed}/{total_pages}
147
  ⏱️ **Processing Time**: {processing_time:.2f} seconds
148
  πŸ“ **Text Length**: {len(extracted_text)} characters
149
+ 🔧 **OCR Engine**: PaddleOCR
150
  """
151
 
152
  # For API compatibility, also return JSON format
 
156
  "filename": filename,
157
  "pages_processed": pages_processed,
158
  "total_pages": total_pages,
159
+ "processing_time": processing_time,
160
+ "ocr_engine": "PaddleOCR"
161
  }, indent=2)
162
 
163
  return summary, extracted_text, api_response
164
 
165
  except Exception as e:
166
  error_msg = f"Error processing file: {str(e)}"
167
+ print(f"Processing error: {e}")
168
  return error_msg, "", json.dumps({"success": False, "error": str(e)})
169
 
170
  def process_api_request(api_data):
 
199
  return json.dumps({
200
  "success": True,
201
  "text": text,
202
+ "filename": filename,
203
+ "ocr_engine": "PaddleOCR"
204
  })
205
 
206
  finally:
 
241
 
242
  with gr.Tab("🔌 API Integration"):
243
  gr.Markdown("### For integration with your Vercel app:")
244
+ gr.Markdown("**Endpoint**: `https://mbuck17-paddleocr-processor.hf.space/api/predict`")
245
  gr.Markdown("**Method**: POST")
246
  gr.Markdown("**Headers**: `Content-Type: application/json`")
247
 
248
+ with gr.Row():
249
+ with gr.Column():
250
+ gr.Markdown("**Sample Request:**")
251
+ gr.Code('''
252
+ {
253
+ "data": [
254
+ {
255
+ "file": "base64_encoded_file_data_here",
256
+ "filename": "lab_report.pdf"
257
+ }
258
+ ]
259
+ }
260
+ ''', language="json")
261
+
262
+ with gr.Column():
263
+ gr.Markdown("**Sample Response:**")
264
+ gr.Code('''
265
+ {
266
+ "data": [
267
+ {
268
+ "success": true,
269
+ "text": "Extracted text content...",
270
+ "filename": "lab_report.pdf",
271
+ "ocr_engine": "PaddleOCR"
272
+ }
273
+ ]
274
+ }
275
+ ''', language="json")
276
+
277
+ gr.Markdown("### Test API Request:")
278
  api_input = gr.Textbox(
279
  label="API Request (JSON)",
280
  placeholder='{"file": "base64_encoded_file_data", "filename": "document.pdf"}',
 
311
  - Multi-page PDF support
312
  - RESTful API integration
313
  - Free hosting on Hugging Face
314
+ - SSL compatibility fixes included
315
+
316
+ ### 🔗 Integration URL
317
+ `https://mbuck17-paddleocr-processor.hf.space/api/predict`
318
  """)
319
 
320
  # Launch the app