mbuckle committed on
Commit
56630b3
·
1 Parent(s): e02f05d

Update monkey patch

Browse files
Files changed (1) hide show
  1. app.py +59 -53
app.py CHANGED
@@ -1,4 +1,5 @@
1
- # app.py - Complete Hugging Face Spaces app with SSL fix
 
2
  import os
3
  import subprocess
4
  import sys
@@ -7,37 +8,32 @@ import time
7
  import base64
8
  import json
9
 
10
- # Try to fix SSL library issue before importing PaddleOCR
11
  def fix_ssl_library():
12
  """Download and install libssl1.1 if not present"""
13
  try:
14
- # Check if libssl1.1 already exists
15
  if os.path.exists('/usr/lib/x86_64-linux-gnu/libssl.so.1.1'):
16
  print("libssl.so.1.1 already exists")
17
  return True
18
 
19
  print("Attempting to install libssl1.1...")
20
 
21
- # Download libssl1.1 from Ubuntu repos
22
  subprocess.run([
23
  'wget', '-q',
24
  'http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb',
25
  '-O', '/tmp/libssl1.1.deb'
26
  ], check=True)
27
 
28
- # Try to install the package
29
  result = subprocess.run([
30
  'dpkg', '-i', '/tmp/libssl1.1.deb'
31
  ], capture_output=True, text=True)
32
 
33
- # If dpkg install failed, try extracting manually
34
  if result.returncode != 0:
35
  print("dpkg install failed, trying manual extraction...")
36
  subprocess.run([
37
  'dpkg', '-x', '/tmp/libssl1.1.deb', '/tmp/ssl_extract'
38
  ], check=True)
39
 
40
- # Set LD_LIBRARY_PATH to include the extracted libraries
41
  lib_path = '/tmp/ssl_extract/usr/lib/x86_64-linux-gnu'
42
  current_ld_path = os.environ.get('LD_LIBRARY_PATH', '')
43
  if current_ld_path:
@@ -52,8 +48,12 @@ def fix_ssl_library():
52
  print(f"Failed to install libssl1.1: {e}")
53
  return False
54
 
 
55
  def monkey_patch_pymupdf():
56
  """Fix PaddleOCR compatibility with newer PyMuPDF versions"""
 
 
 
57
  import fitz
58
 
59
  # Add pageCount property to Document class if it doesn't exist
@@ -62,7 +62,9 @@ def monkey_patch_pymupdf():
62
  return self.page_count
63
 
64
  fitz.Document.pageCount = property(pageCount_property)
65
- print("Added pageCount compatibility property to PyMuPDF Document class")
 
 
66
 
67
  # Add getPixmap method to Page class if it doesn't exist
68
  if not hasattr(fitz.Page, 'getPixmap'):
@@ -70,25 +72,31 @@ def monkey_patch_pymupdf():
70
  return self.get_pixmap(matrix=matrix, alpha=alpha)
71
 
72
  fitz.Page.getPixmap = getPixmap
73
- print("Added getPixmap compatibility method to PyMuPDF Page class")
 
 
74
 
75
- # Add other common compatibility methods if needed
76
  if not hasattr(fitz.Page, 'getText'):
77
  def getText(self, option="text"):
78
  return self.get_text(option)
79
 
80
  fitz.Page.getText = getText
81
- print("Added getText compatibility method to PyMuPDF Page class")
 
 
82
 
83
- print("PyMuPDF compatibility patches applied successfully")
84
 
85
- # Try alternative PaddlePaddle versions
86
  def try_paddle_import():
87
  """Try different approaches to import PaddleOCR"""
88
 
89
  # First try the SSL fix
90
  fix_ssl_library()
91
 
 
 
 
92
  # Try importing with different environment variables
93
  os.environ['PADDLE_GIT_DISABLE'] = '1'
94
 
@@ -99,7 +107,6 @@ def try_paddle_import():
99
  if 'libssl.so.1.1' in str(e):
100
  print("Still having SSL issues, trying alternative PaddlePaddle version...")
101
 
102
- # Try installing older version
103
  try:
104
  subprocess.run([sys.executable, '-m', 'pip', 'uninstall', 'paddlepaddle', '-y'],
105
  capture_output=True)
@@ -113,9 +120,11 @@ def try_paddle_import():
113
  print(f"PaddleOCR import failed: {e}")
114
  raise e
115
 
116
- # Import other required libraries
117
  import gradio as gr
118
- import fitz # PyMuPDF
 
 
119
 
120
  # Try to import PaddleOCR with fixes
121
  print("Attempting to import PaddleOCR...")
@@ -126,44 +135,43 @@ try:
126
  print("PaddleOCR models loaded successfully!")
127
  except Exception as e:
128
  print(f"Failed to load PaddleOCR: {e}")
129
- print("Application will exit - SSL library issue not resolved")
130
  sys.exit(1)
131
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  def process_document(file):
133
- """Process uploaded document with PaddleOCR - Debug Version"""
134
  if file is None:
135
  return "No file uploaded", "", ""
136
 
137
  start_time = time.time()
138
 
139
  try:
140
- # Debug file object
141
- print(f"File object type: {type(file)}")
142
- print(f"File object attributes: {dir(file)}")
143
-
144
- # Try different ways to get filename
145
- try:
146
- filename = os.path.basename(file.name)
147
- except AttributeError:
148
- try:
149
- filename = file.orig_name if hasattr(file, 'orig_name') else 'unknown.pdf'
150
- except:
151
- filename = 'unknown.pdf'
152
-
153
  print(f"Processing: {filename}")
154
 
155
- # Try different ways to access file path
156
- file_path = None
157
- if hasattr(file, 'name'):
158
- file_path = file.name
159
- elif hasattr(file, 'path'):
160
- file_path = file.path
161
- elif hasattr(file, 'file'):
162
- file_path = file.file.name if hasattr(file.file, 'name') else None
163
-
164
- if not file_path:
165
- return "Error: Could not access file path", "", json.dumps({"success": False, "error": "File path not accessible"})
166
-
167
  print(f"File path: {file_path}")
168
 
169
  # Count pages if PDF
@@ -173,17 +181,16 @@ def process_document(file):
173
  print(f"Opening PDF: {file_path}")
174
  doc = fitz.open(file_path)
175
 
176
- # Debug document object
177
- print(f"Document object type: {type(doc)}")
178
- print(f"Document attributes: {[attr for attr in dir(doc) if not attr.startswith('_')]}")
179
 
180
- # Try all possible ways to get page count
181
- if hasattr(doc, 'page_count'):
182
- total_pages = doc.page_count
183
- print(f"Used page_count: {total_pages}")
184
- elif hasattr(doc, 'pageCount'):
185
  total_pages = doc.pageCount
186
  print(f"Used pageCount: {total_pages}")
 
 
 
187
  else:
188
  total_pages = len(doc)
189
  print(f"Used len(): {total_pages}")
@@ -196,7 +203,6 @@ def process_document(file):
196
  # Run OCR
197
  print(f"Running OCR on: {file_path}")
198
  result = ocr.ocr(file_path, cls=True)
199
- print(f"OCR result type: {type(result)}")
200
 
201
  # Extract text
202
  extracted_text = ""
@@ -238,7 +244,7 @@ def process_document(file):
238
  import traceback
239
  traceback.print_exc()
240
  return error_msg, "", json.dumps({"success": False, "error": str(e)})
241
-
242
  def process_api_request(api_data):
243
  """Process API-style requests (for integration with your Vercel app)"""
244
  try:
 
1
+ # app.py - Correct structure with monkey patch BEFORE any fitz imports
2
+
3
  import os
4
  import subprocess
5
  import sys
 
8
  import base64
9
  import json
10
 
11
+ # SSL fix function (keep as is)
12
  def fix_ssl_library():
13
  """Download and install libssl1.1 if not present"""
14
  try:
 
15
  if os.path.exists('/usr/lib/x86_64-linux-gnu/libssl.so.1.1'):
16
  print("libssl.so.1.1 already exists")
17
  return True
18
 
19
  print("Attempting to install libssl1.1...")
20
 
 
21
  subprocess.run([
22
  'wget', '-q',
23
  'http://archive.ubuntu.com/ubuntu/pool/main/o/openssl/libssl1.1_1.1.1f-1ubuntu2_amd64.deb',
24
  '-O', '/tmp/libssl1.1.deb'
25
  ], check=True)
26
 
 
27
  result = subprocess.run([
28
  'dpkg', '-i', '/tmp/libssl1.1.deb'
29
  ], capture_output=True, text=True)
30
 
 
31
  if result.returncode != 0:
32
  print("dpkg install failed, trying manual extraction...")
33
  subprocess.run([
34
  'dpkg', '-x', '/tmp/libssl1.1.deb', '/tmp/ssl_extract'
35
  ], check=True)
36
 
 
37
  lib_path = '/tmp/ssl_extract/usr/lib/x86_64-linux-gnu'
38
  current_ld_path = os.environ.get('LD_LIBRARY_PATH', '')
39
  if current_ld_path:
 
48
  print(f"Failed to install libssl1.1: {e}")
49
  return False
50
 
51
+ # CRITICAL: Apply monkey patch BEFORE importing fitz/PyMuPDF
52
  def monkey_patch_pymupdf():
53
  """Fix PaddleOCR compatibility with newer PyMuPDF versions"""
54
+ print("Applying PyMuPDF compatibility patches...")
55
+
56
+ # Import fitz here to apply patches
57
  import fitz
58
 
59
  # Add pageCount property to Document class if it doesn't exist
 
62
  return self.page_count
63
 
64
  fitz.Document.pageCount = property(pageCount_property)
65
+ print("✓ Added pageCount compatibility property to PyMuPDF Document class")
66
+ else:
67
+ print("✓ pageCount already exists")
68
 
69
  # Add getPixmap method to Page class if it doesn't exist
70
  if not hasattr(fitz.Page, 'getPixmap'):
 
72
  return self.get_pixmap(matrix=matrix, alpha=alpha)
73
 
74
  fitz.Page.getPixmap = getPixmap
75
+ print("✓ Added getPixmap compatibility method to PyMuPDF Page class")
76
+ else:
77
+ print("✓ getPixmap already exists")
78
 
79
+ # Add getText method if it doesn't exist
80
  if not hasattr(fitz.Page, 'getText'):
81
  def getText(self, option="text"):
82
  return self.get_text(option)
83
 
84
  fitz.Page.getText = getText
85
+ print("✓ Added getText compatibility method to PyMuPDF Page class")
86
+ else:
87
+ print("✓ getText already exists")
88
 
89
+ print("✓ PyMuPDF compatibility patches applied successfully")
90
 
 
91
  def try_paddle_import():
92
  """Try different approaches to import PaddleOCR"""
93
 
94
  # First try the SSL fix
95
  fix_ssl_library()
96
 
97
+ # CRITICAL: Apply PyMuPDF compatibility patches BEFORE importing PaddleOCR
98
+ monkey_patch_pymupdf()
99
+
100
  # Try importing with different environment variables
101
  os.environ['PADDLE_GIT_DISABLE'] = '1'
102
 
 
107
  if 'libssl.so.1.1' in str(e):
108
  print("Still having SSL issues, trying alternative PaddlePaddle version...")
109
 
 
110
  try:
111
  subprocess.run([sys.executable, '-m', 'pip', 'uninstall', 'paddlepaddle', '-y'],
112
  capture_output=True)
 
120
  print(f"PaddleOCR import failed: {e}")
121
  raise e
122
 
123
+ # Import Gradio
124
  import gradio as gr
125
+
126
+ # Import PyMuPDF AFTER monkey patch is defined but BEFORE PaddleOCR
127
+ import fitz # This import will use the patched version
128
 
129
  # Try to import PaddleOCR with fixes
130
  print("Attempting to import PaddleOCR...")
 
135
  print("PaddleOCR models loaded successfully!")
136
  except Exception as e:
137
  print(f"Failed to load PaddleOCR: {e}")
138
+ print("Application will exit - compatibility issue not resolved")
139
  sys.exit(1)
140
 
141
+ # Test the monkey patch
142
+ print("Testing monkey patch...")
143
+ test_doc = None
144
+ try:
145
+ # Create a simple test to verify pageCount exists
146
+ import io
147
+ pdf_content = b"%PDF-1.4\n1 0 obj\n<< /Type /Catalog /Pages 2 0 R >>\nendobj\n2 0 obj\n<< /Type /Pages /Kids [3 0 R] /Count 1 >>\nendobj\n3 0 obj\n<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] >>\nendobj\nxref\n0 4\n0000000000 65535 f \n0000000010 00000 n \n0000000053 00000 n \n0000000100 00000 n \ntrailer\n<< /Size 4 /Root 1 0 R >>\nstartxref\n179\n%%EOF"
148
+ test_doc = fitz.open(stream=pdf_content, filetype="pdf")
149
+
150
+ if hasattr(test_doc, 'pageCount'):
151
+ print(f"✓ Monkey patch successful! pageCount = {test_doc.pageCount}")
152
+ else:
153
+ print("✗ Monkey patch failed - pageCount not found")
154
+ print(f"Available attributes: {[attr for attr in dir(test_doc) if 'count' in attr.lower()]}")
155
+
156
+ test_doc.close()
157
+ except Exception as e:
158
+ print(f"Monkey patch test failed: {e}")
159
+ if test_doc:
160
+ test_doc.close()
161
+
162
+ # Rest of your app code (process_document, API functions, Gradio interface, etc.)
163
  def process_document(file):
164
+ """Process uploaded document with PaddleOCR"""
165
  if file is None:
166
  return "No file uploaded", "", ""
167
 
168
  start_time = time.time()
169
 
170
  try:
171
+ filename = os.path.basename(file.name)
 
 
 
 
 
 
 
 
 
 
 
 
172
  print(f"Processing: {filename}")
173
 
174
+ file_path = file.name
 
 
 
 
 
 
 
 
 
 
 
175
  print(f"File path: {file_path}")
176
 
177
  # Count pages if PDF
 
181
  print(f"Opening PDF: {file_path}")
182
  doc = fitz.open(file_path)
183
 
184
+ # Test pageCount attribute
185
+ print(f"Document has pageCount attribute: {hasattr(doc, 'pageCount')}")
186
+ print(f"Document has page_count attribute: {hasattr(doc, 'page_count')}")
187
 
188
+ if hasattr(doc, 'pageCount'):
 
 
 
 
189
  total_pages = doc.pageCount
190
  print(f"Used pageCount: {total_pages}")
191
+ elif hasattr(doc, 'page_count'):
192
+ total_pages = doc.page_count
193
+ print(f"Used page_count: {total_pages}")
194
  else:
195
  total_pages = len(doc)
196
  print(f"Used len(): {total_pages}")
 
203
  # Run OCR
204
  print(f"Running OCR on: {file_path}")
205
  result = ocr.ocr(file_path, cls=True)
 
206
 
207
  # Extract text
208
  extracted_text = ""
 
244
  import traceback
245
  traceback.print_exc()
246
  return error_msg, "", json.dumps({"success": False, "error": str(e)})
247
+
248
  def process_api_request(api_data):
249
  """Process API-style requests (for integration with your Vercel app)"""
250
  try: