mbuck17 committed on
Commit
c0f45ab
·
verified ·
1 Parent(s): 23f8a04

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +135 -12
app.py CHANGED
@@ -1,17 +1,103 @@
1
- # app.py - Hugging Face Spaces version
2
- import gradio as gr
3
- import tempfile
4
  import os
 
 
 
5
  import time
6
  import base64
7
  import json
8
- from paddleocr import PaddleOCR
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  import fitz # PyMuPDF
10
 
11
- # Initialize PaddleOCR
12
- print("Loading PaddleOCR models...")
13
- ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
14
- print("PaddleOCR models loaded!")
 
 
 
 
 
 
 
15
 
16
  def process_document(file):
17
  """Process uploaded document with PaddleOCR"""
@@ -21,7 +107,6 @@ def process_document(file):
21
  start_time = time.time()
22
 
23
  try:
24
- # Get file info
25
  filename = os.path.basename(file.name)
26
  print(f"Processing: {filename}")
27
 
@@ -57,6 +142,7 @@ def process_document(file):
57
  📊 **Pages Processed**: {pages_processed}/{total_pages}
58
  ⏱️ **Processing Time**: {processing_time:.2f} seconds
59
  📝 **Text Length**: {len(extracted_text)} characters
 
60
  """
61
 
62
  # For API compatibility, also return JSON format
@@ -66,13 +152,15 @@ def process_document(file):
66
  "filename": filename,
67
  "pages_processed": pages_processed,
68
  "total_pages": total_pages,
69
- "processing_time": processing_time
 
70
  }, indent=2)
71
 
72
  return summary, extracted_text, api_response
73
 
74
  except Exception as e:
75
  error_msg = f"Error processing file: {str(e)}"
 
76
  return error_msg, "", json.dumps({"success": False, "error": str(e)})
77
 
78
  def process_api_request(api_data):
@@ -107,7 +195,8 @@ def process_api_request(api_data):
107
  return json.dumps({
108
  "success": True,
109
  "text": text,
110
- "filename": filename
 
111
  })
112
 
113
  finally:
@@ -148,10 +237,40 @@ with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
148
 
149
  with gr.Tab("🔌 API Integration"):
150
  gr.Markdown("### For integration with your Vercel app:")
151
- gr.Markdown("**Endpoint**: `https://your-space-name-your-username.hf.space/api/predict`")
152
  gr.Markdown("**Method**: POST")
153
  gr.Markdown("**Headers**: `Content-Type: application/json`")
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  api_input = gr.Textbox(
156
  label="API Request (JSON)",
157
  placeholder='{"file": "base64_encoded_file_data", "filename": "document.pdf"}',
@@ -188,6 +307,10 @@ with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
188
  - Multi-page PDF support
189
  - RESTful API integration
190
  - Free hosting on Hugging Face
 
 
 
 
191
  """)
192
 
193
  # Launch the app
 
1
+ # app.py - Complete Hugging Face Spaces app with SSL fix
 
 
2
  import os
3
+ import subprocess
4
+ import sys
5
+ import tempfile
6
  import time
7
  import base64
8
  import json
9
+
10
+ # Try to fix SSL library issue before importing PaddleOCR
11
def fix_ssl_library():
    """Make libssl.so.1.1 available before PaddleOCR is imported.

    Steps, in order:
      1. If the system copy of libssl.so.1.1 already exists, do nothing.
      2. Download the Ubuntu ``libssl1.1`` .deb with ``wget``.
      3. Try a regular ``dpkg -i`` install.
      4. If that fails, unpack the .deb under /tmp with ``dpkg -x``, prepend
         the unpacked library directory to ``LD_LIBRARY_PATH`` and preload
         the shared objects into this process.

    Returns:
        bool: True when the library was already present or every step
        completed; False when any step raised (the error is printed, not
        propagated, so callers can still attempt the import).
    """
    # Local import: only this fallback path needs ctypes.
    import ctypes

    system_lib = '/usr/lib/x86_64-linux-gnu/libssl.so.1.1'
    # NOTE(review): fetched over plain HTTP with no checksum/signature check;
    # consider pinning a hash of the package before trusting its contents.
    deb_url = 'http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb'
    deb_path = '/tmp/libssl1.1.deb'
    extract_dir = '/tmp/ssl_extract'

    try:
        # Check if libssl1.1 already exists
        if os.path.exists(system_lib):
            print("libssl.so.1.1 already exists")
            return True

        print("Attempting to install libssl1.1...")

        # Download libssl1.1 from Ubuntu repos
        subprocess.run(['wget', '-q', deb_url, '-O', deb_path], check=True)

        # Try to install the package; a non-zero exit is handled below
        # instead of raising, so check=True is deliberately not used here.
        result = subprocess.run(
            ['dpkg', '-i', deb_path],
            capture_output=True,
            text=True,
        )

        # If dpkg install failed, try extracting manually
        if result.returncode != 0:
            print("dpkg install failed, trying manual extraction...")
            subprocess.run(['dpkg', '-x', deb_path, extract_dir], check=True)

            # Set LD_LIBRARY_PATH to include the extracted libraries. This
            # only influences processes started from now on (e.g. pip).
            lib_path = os.path.join(extract_dir, 'usr/lib/x86_64-linux-gnu')
            current_ld_path = os.environ.get('LD_LIBRARY_PATH', '')
            if current_ld_path:
                os.environ['LD_LIBRARY_PATH'] = f"{lib_path}:{current_ld_path}"
            else:
                os.environ['LD_LIBRARY_PATH'] = lib_path
            print(f"Set LD_LIBRARY_PATH to: {os.environ['LD_LIBRARY_PATH']}")

            # The dynamic loader reads LD_LIBRARY_PATH when the process
            # starts, so changing it above does not help the import that
            # follows in this same interpreter. Preload the extracted
            # libraries by absolute path so later lookups by soname resolve
            # to the already-loaded objects. libcrypto goes first because
            # libssl links against it.
            for soname in ('libcrypto.so.1.1', 'libssl.so.1.1'):
                ctypes.CDLL(os.path.join(lib_path, soname), mode=ctypes.RTLD_GLOBAL)

        return True

    except Exception as e:
        # Best-effort: report and let the caller decide what to do next.
        print(f"Failed to install libssl1.1: {e}")
        return False
54
+
55
+ # Try alternative PaddlePaddle versions
56
def try_paddle_import():
    """Import PaddleOCR, retrying with an older PaddlePaddle on libssl errors.

    The function first runs ``fix_ssl_library()``, sets
    ``PADDLE_GIT_DISABLE=1`` in the environment, then attempts the import.
    If the import fails with an error mentioning ``libssl.so.1.1``, it
    reinstalls ``paddlepaddle==2.4.2`` via pip and tries once more.

    Returns:
        The ``PaddleOCR`` class object.

    Raises:
        ImportError: the original import error, when neither the first
        attempt nor the fallback reinstall produced a working import.
    """
    # Local import: only the retry path needs importlib.
    import importlib

    # First try the SSL fix (best-effort; its return value is not needed).
    fix_ssl_library()

    # Try importing with different environment variables
    os.environ['PADDLE_GIT_DISABLE'] = '1'

    try:
        from paddleocr import PaddleOCR
        return PaddleOCR
    except ImportError as e:
        if 'libssl.so.1.1' in str(e):
            print("Still having SSL issues, trying alternative PaddlePaddle version...")

            # Try installing older version
            try:
                # Uninstall is best-effort: its exit status is ignored.
                subprocess.run([sys.executable, '-m', 'pip', 'uninstall', 'paddlepaddle', '-y'],
                               capture_output=True)
                subprocess.run([sys.executable, '-m', 'pip', 'install', 'paddlepaddle==2.4.2'],
                               check=True)

                # Packages installed while the interpreter is running are
                # not guaranteed to be visible until finder caches are
                # invalidated, and modules left half-imported by the failed
                # attempt would otherwise be reused on the retry.
                importlib.invalidate_caches()
                stale = [
                    name for name in sys.modules
                    if name.split('.', 1)[0] in ('paddle', 'paddleocr')
                ]
                for name in stale:
                    del sys.modules[name]

                from paddleocr import PaddleOCR
                return PaddleOCR
            except Exception as inner_e:
                print(f"Failed to install alternative version: {inner_e}")

        print(f"PaddleOCR import failed: {e}")
        # Bare raise re-raises the active ImportError with its traceback.
        raise
85
+
86
+ # Import other required libraries
87
+ import gradio as gr
88
  import fitz # PyMuPDF
89
 
90
+ # Try to import PaddleOCR with fixes
91
# Module start-up: resolve the PaddleOCR class, then build the shared `ocr`
# instance used by the request handlers. Any failure here is fatal.
print("Attempting to import PaddleOCR...")
try:
    PaddleOCR = try_paddle_import()
    print("Loading PaddleOCR models...")
    ocr = PaddleOCR(
        use_angle_cls=True,
        lang='en',
        show_log=False,
    )
    print("PaddleOCR models loaded successfully!")
except Exception as startup_error:
    # Covers both import failures and model construction failures.
    print(f"Failed to load PaddleOCR: {startup_error}")
    print("Application will exit - SSL library issue not resolved")
    raise SystemExit(1)
101
 
102
  def process_document(file):
103
  """Process uploaded document with PaddleOCR"""
 
107
  start_time = time.time()
108
 
109
  try:
 
110
  filename = os.path.basename(file.name)
111
  print(f"Processing: {filename}")
112
 
 
142
  📊 **Pages Processed**: {pages_processed}/{total_pages}
143
  ⏱️ **Processing Time**: {processing_time:.2f} seconds
144
  📝 **Text Length**: {len(extracted_text)} characters
145
+ 🔧 **OCR Engine**: PaddleOCR
146
  """
147
 
148
  # For API compatibility, also return JSON format
 
152
  "filename": filename,
153
  "pages_processed": pages_processed,
154
  "total_pages": total_pages,
155
+ "processing_time": processing_time,
156
+ "ocr_engine": "PaddleOCR"
157
  }, indent=2)
158
 
159
  return summary, extracted_text, api_response
160
 
161
  except Exception as e:
162
  error_msg = f"Error processing file: {str(e)}"
163
+ print(f"Processing error: {e}")
164
  return error_msg, "", json.dumps({"success": False, "error": str(e)})
165
 
166
  def process_api_request(api_data):
 
195
  return json.dumps({
196
  "success": True,
197
  "text": text,
198
+ "filename": filename,
199
+ "ocr_engine": "PaddleOCR"
200
  })
201
 
202
  finally:
 
237
 
238
  with gr.Tab("🔌 API Integration"):
239
  gr.Markdown("### For integration with your Vercel app:")
240
+ gr.Markdown("**Endpoint**: `https://mbuck17-paddleocr-processor.hf.space/api/predict`")
241
  gr.Markdown("**Method**: POST")
242
  gr.Markdown("**Headers**: `Content-Type: application/json`")
243
 
244
+ with gr.Row():
245
+ with gr.Column():
246
+ gr.Markdown("**Sample Request:**")
247
+ gr.Code('''
248
+ {
249
+ "data": [
250
+ {
251
+ "file": "base64_encoded_file_data_here",
252
+ "filename": "lab_report.pdf"
253
+ }
254
+ ]
255
+ }
256
+ ''', language="json")
257
+
258
+ with gr.Column():
259
+ gr.Markdown("**Sample Response:**")
260
+ gr.Code('''
261
+ {
262
+ "data": [
263
+ {
264
+ "success": true,
265
+ "text": "Extracted text content...",
266
+ "filename": "lab_report.pdf",
267
+ "ocr_engine": "PaddleOCR"
268
+ }
269
+ ]
270
+ }
271
+ ''', language="json")
272
+
273
+ gr.Markdown("### Test API Request:")
274
  api_input = gr.Textbox(
275
  label="API Request (JSON)",
276
  placeholder='{"file": "base64_encoded_file_data", "filename": "document.pdf"}',
 
307
  - Multi-page PDF support
308
  - RESTful API integration
309
  - Free hosting on Hugging Face
310
+ - SSL compatibility fixes included
311
+
312
+ ### 🔗 Integration URL
313
+ `https://mbuck17-paddleocr-processor.hf.space/api/predict`
314
  """)
315
 
316
  # Launch the app