mbuckle committed on
Commit
43267ee
·
1 Parent(s): 997925e

Minimal test edition

Browse files
Files changed (2) hide show
  1. app.py +44 -278
  2. minimal_test_paddle.py +94 -0
app.py CHANGED
@@ -1,298 +1,64 @@
1
- # app.py - Using subprocess approach like your local Node.js implementation
2
-
3
- import os
4
  import subprocess
5
- import sys
6
- import tempfile
7
- import time
8
- import base64
9
  import json
10
-
11
- # Import Gradio
12
  import gradio as gr
13
 
14
def run_paddle_ocr_subprocess(file_path):
    """Run the standalone PaddleOCR script on a file and return its JSON result.

    Launches ``paddle_ocr_standalone.py`` (looked up next to this module) with
    the current interpreter. While it runs, its stderr is scanned for
    ``TOTAL_PAGES:<n>`` and ``CURRENT_PAGE:<n>`` progress lines, which are
    echoed to stdout; its stdout is expected to contain one JSON document.

    Args:
        file_path: Path of the document handed to the OCR script.

    Returns:
        The decoded JSON value printed by the script when it exits with
        status 0 and its output parses; otherwise a dict of the form
        ``{"success": False, "error": <message>}``. Exceptions are reported
        through that dict rather than raised.
    """
    try:
        script_path = os.path.join(os.path.dirname(__file__), 'paddle_ocr_standalone.py')
        command = [sys.executable, script_path, file_path]
        print(f"Running command: {' '.join(command)}")

        total_pages = 1
        stderr_lines = []

        # stdout is spooled to a temporary file instead of a pipe. Reading
        # stderr to EOF while stdout is an undrained pipe deadlocks as soon as
        # the child writes more JSON than the OS pipe buffer can hold.
        with tempfile.TemporaryFile(mode="w+") as stdout_file:
            with subprocess.Popen(
                command,
                stdout=stdout_file,
                stderr=subprocess.PIPE,
                text=True,
                bufsize=1,
            ) as process:
                for stderr_line in process.stderr:
                    stderr_lines.append(stderr_line)

                    if not stderr_line.startswith(('TOTAL_PAGES:', 'CURRENT_PAGE:')):
                        continue

                    try:
                        page_number = int(stderr_line.split(':')[1].strip())
                    except ValueError:
                        # A malformed progress line must not abort the run and
                        # leave the child process behind; just skip it.
                        continue

                    if stderr_line.startswith('TOTAL_PAGES:'):
                        total_pages = page_number
                        print(f"Processing document with {total_pages} pages")
                    else:
                        print(f"Processing page {page_number} of {total_pages}")

            # Leaving the Popen block has waited for the child, so the spooled
            # output is complete and the return code is set.
            stdout_file.seek(0)
            stdout = stdout_file.read()

        stderr_output = "".join(stderr_lines)

        if process.returncode != 0:
            print(f"OCR process failed with return code {process.returncode}")
            print(f"stderr: {stderr_output}")
            return {
                "success": False,
                "error": f"OCR process failed: {stderr_output}"
            }

        # Parse the JSON result from stdout
        try:
            result = json.loads(stdout.strip())
        except json.JSONDecodeError as e:
            print(f"Failed to parse OCR result: {e}")
            print(f"stdout: {stdout}")
            return {
                "success": False,
                "error": f"Failed to parse OCR result: {str(e)}"
            }

        print(f"OCR completed successfully: {result.get('pages_processed', 0)}/{result.get('total_pages', 0)} pages")
        return result

    except Exception as e:
        # Boundary handler: callers expect an error dict, never an exception.
        print(f"Error running OCR subprocess: {e}")
        return {
            "success": False,
            "error": str(e)
        }
85
 
86
def process_document(file):
    """Run OCR on an uploaded document and format the outcome for the UI.

    Args:
        file: Upload object whose ``name`` attribute is the path of the
            uploaded file, or ``None`` when nothing was uploaded.

    Returns:
        A ``(summary, extracted_text, api_response)`` tuple of strings. When
        no file is given the last two are empty; on an OCR failure or an
        exception the first is an error message, the second is empty and the
        third is a JSON payload describing the failure.
    """
    if file is None:
        return "No file uploaded", "", ""

    started_at = time.time()

    try:
        filename = os.path.basename(file.name)
        print(f"Processing: {filename}")

        file_path = file.name
        print(f"File path: {file_path}")

        # Delegate the actual OCR work to the subprocess runner.
        ocr_result = run_paddle_ocr_subprocess(file_path)

        succeeded = ocr_result.get("success", False)
        if not succeeded:
            failure_reason = ocr_result.get('error', 'Unknown error')
            return f"❌ OCR failed: {failure_reason}", "", json.dumps(ocr_result)

        extracted_text = ocr_result.get("text", "")
        pages_processed = ocr_result.get("pages_processed", 0)
        total_pages = ocr_result.get("total_pages", 1)

        processing_time = time.time() - started_at

        # Markdown summary; the empty first and last entries reproduce the
        # leading and trailing newlines of the rendered block.
        summary_lines = [
            "",
            f"📄 **File**: {filename}",
            f"📊 **Pages Processed**: {pages_processed}/{total_pages}",
            f"⏱️ **Processing Time**: {processing_time:.2f} seconds",
            f"📝 **Text Length**: {len(extracted_text)} characters",
            "🔧 **OCR Engine**: PaddleOCR (Subprocess)",
            "✅ **Method**: Subprocess execution (like your local Node.js implementation)",
            "",
        ]
        summary = "\n".join(summary_lines)

        response_payload = {
            "success": True,
            "text": extracted_text,
            "filename": filename,
            "pages_processed": pages_processed,
            "total_pages": total_pages,
            "processing_time": processing_time,
            "ocr_engine": "PaddleOCR",
            "method": "subprocess"
        }
        api_response = json.dumps(response_payload, indent=2)

        return summary, extracted_text, api_response

    except Exception as e:
        error_msg = f"❌ Error processing file: {str(e)}"
        print(f"Full error: {e}")
        import traceback
        traceback.print_exc()
        failure_payload = {"success": False, "error": str(e)}
        return error_msg, "", json.dumps(failure_payload)
142
-
143
def process_api_request(api_data):
    """Handle a JSON API request carrying a base64-encoded document.

    Args:
        api_data: JSON text with a ``file`` entry (base64 document bytes) and
            an optional ``filename`` entry (defaults to ``unknown.pdf``).

    Returns:
        A JSON string. On success it holds the extracted text and page
        counts; if OCR reports failure it is the OCR result as-is; on any
        other problem it is ``{"success": false, "error": <message>}``.
    """
    try:
        request = json.loads(api_data)

        if 'file' not in request:
            return json.dumps({"success": False, "error": "No file data provided"})

        # Decode the payload and keep the original extension for the temp file.
        raw_bytes = base64.b64decode(request['file'])
        filename = request.get('filename', 'unknown.pdf')
        extension = os.path.splitext(filename)[1]

        with tempfile.NamedTemporaryFile(delete=False, suffix=extension) as handle:
            handle.write(raw_bytes)
            temp_path = handle.name

        try:
            ocr_result = run_paddle_ocr_subprocess(temp_path)

            if not ocr_result.get("success", False):
                return json.dumps(ocr_result)

            payload = {
                "success": True,
                "text": ocr_result.get("text", ""),
                "filename": filename,
                "pages_processed": ocr_result.get("pages_processed", 0),
                "total_pages": ocr_result.get("total_pages", 1),
                "ocr_engine": "PaddleOCR",
                "method": "subprocess"
            }
            return json.dumps(payload)
        finally:
            # The temp file was created with delete=False, so remove it here.
            os.unlink(temp_path)

    except Exception as e:
        return json.dumps({"success": False, "error": str(e)})
182
-
183
- # Create Gradio interface
184
- with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
185
- gr.Markdown("# 🏥 PaddleOCR Medical Document Processor")
186
- gr.Markdown("Upload medical documents (PDF/images) to extract text using PaddleOCR")
187
-
188
- with gr.Tab("📄 File Upload"):
189
- with gr.Row():
190
- with gr.Column():
191
- file_input = gr.File(
192
- label="Upload Document (PDF, JPG, PNG)",
193
- file_types=[".pdf", ".jpg", ".jpeg", ".png"]
194
- )
195
- process_btn = gr.Button("🔍 Process Document", variant="primary")
196
-
197
- with gr.Column():
198
- summary_output = gr.Markdown(label="📊 Processing Summary")
199
-
200
- with gr.Row():
201
- text_output = gr.Textbox(
202
- label="📝 Extracted Text",
203
- lines=15,
204
- max_lines=20
205
- )
206
-
207
- process_btn.click(
208
- fn=process_document,
209
- inputs=[file_input],
210
- outputs=[summary_output, text_output, gr.Textbox(visible=False)]
211
- )
212
 
213
- with gr.Tab("🔌 API Integration"):
214
- gr.Markdown("### For integration with your Vercel app:")
215
- gr.Markdown("**Endpoint**: `https://mbuck17-paddleocr-processor.hf.space/api/predict`")
216
- gr.Markdown("**Method**: POST")
217
- gr.Markdown("**Headers**: `Content-Type: application/json`")
218
-
219
- with gr.Row():
220
- with gr.Column():
221
- gr.Markdown("**Sample Request:**")
222
- gr.Code('''
223
- {
224
- "data": [
225
- {
226
- "file": "base64_encoded_file_data_here",
227
- "filename": "lab_report.pdf"
228
- }
229
- ]
230
- }
231
- ''', language="json")
232
-
233
- with gr.Column():
234
- gr.Markdown("**Sample Response:**")
235
- gr.Code('''
236
- {
237
- "data": [
238
- {
239
- "success": true,
240
- "text": "Extracted text content...",
241
- "filename": "lab_report.pdf",
242
- "ocr_engine": "PaddleOCR",
243
- "method": "subprocess"
244
- }
245
- ]
246
- }
247
- ''', language="json")
248
-
249
- gr.Markdown("### Test API Request:")
250
- api_input = gr.Textbox(
251
- label="API Request (JSON)",
252
- placeholder='{"file": "base64_encoded_file_data", "filename": "document.pdf"}',
253
- lines=5
254
- )
255
- api_btn = gr.Button("🧪 Test API Request")
256
- api_output = gr.Textbox(
257
- label="API Response (JSON)",
258
- lines=10
259
- )
260
-
261
- api_btn.click(
262
- fn=process_api_request,
263
- inputs=[api_input],
264
- outputs=[api_output]
265
- )
266
-
267
- with gr.Tab("ℹ️ About"):
268
- gr.Markdown("""
269
- ### 🎯 Purpose
270
- This service extracts text from medical documents using PaddleOCR, specifically designed for lab reports and medical forms.
271
-
272
- ### 🔧 Integration
273
- This Hugging Face Space can be integrated with your Vercel app as an external OCR service.
274
-
275
- ### 📚 Supported Formats
276
- - PDF documents (multi-page)
277
- - JPEG/JPG images
278
- - PNG images
279
-
280
- ### 🚀 Features
281
- - High accuracy OCR with PaddleOCR
282
- - Subprocess execution (mirrors your local Node.js implementation)
283
- - Medical document optimization
284
- - Multi-page PDF support
285
- - RESTful API integration
286
- - Free hosting on Hugging Face
287
-
288
- ### 🔗 Integration URL
289
- `https://mbuck17-paddleocr-processor.hf.space/api/predict`
290
-
291
- ### ⚙️ Architecture
292
- This implementation uses subprocess execution just like your local Node.js version,
293
- ensuring maximum compatibility with PaddleOCR's PDF processing capabilities.
294
- """)
295
 
296
- # Launch the app
297
  if __name__ == "__main__":
298
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
 
 
1
  import subprocess
 
 
 
 
2
  import json
3
+ import sys
 
4
  import gradio as gr
5
 
6
def test_ocr_minimal(file, script_path="/home/user/app/minimal_test_paddle.py"):
    """Run the minimal OCR test script on an uploaded file and summarize it.

    The script is started with the current interpreter and given the upload's
    path as its only argument. Its stdout is expected to be a JSON object with
    ``success``, ``text`` and ``detections`` (or ``error``) entries.

    Args:
        file: Upload object whose ``name`` attribute is the file's path, or
            ``None`` when nothing was uploaded.
        script_path: Location of the OCR test script to execute.

    Returns:
        A ``(summary, text)`` tuple of strings. ``text`` is empty whenever
        the run did not succeed; ``summary`` then carries the reason.
    """
    if file is None:
        return "No file uploaded", ""

    try:
        # Run the minimal test script
        command = [sys.executable, script_path, file.name]

        print(f"Running: {' '.join(command)}")

        process = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=120
        )

        print(f"Return code: {process.returncode}")
        print(f"Stderr: {process.stderr}")
        print(f"Stdout: {process.stdout}")

        if process.returncode != 0:
            return f"Process failed with code {process.returncode}\nStderr: {process.stderr}", ""

        try:
            result = json.loads(process.stdout.strip())
        except json.JSONDecodeError:
            return f"JSON parse error. Stdout: {process.stdout}", ""

        if not isinstance(result, dict):
            # Valid JSON, but not the object the script is expected to print.
            return f"JSON parse error. Stdout: {process.stdout}", ""

        # The script can exit with status 0 and still print a failure payload,
        # so the exit code alone does not prove success. A payload without a
        # "success" entry counts as a failure only if it carries an "error".
        reported_ok = result.get("success", "error" not in result)
        if not reported_ok:
            return f"OCR failed: {result.get('error', 'Unknown error')}", ""

        text = result.get('text', '')
        summary = "\n".join([
            "",
            "**Success!**",
            f"- Detections: {result.get('detections', 0)}",
            f"- Text length: {len(text)}",
            "",
        ])
        return summary, text

    except Exception as e:
        # UI boundary: includes subprocess.TimeoutExpired after 120 seconds.
        return f"Error: {e}", ""


# Not a pytest test despite its name; keep test collectors from picking it up.
test_ocr_minimal.__test__ = False
 
 
 
 
44
 
45
# Minimal Gradio front end for exercising the OCR test script by hand.
with gr.Blocks(title="OCR Test") as demo:
    gr.Markdown("# Simple OCR Test")

    # Input row: the PDF to test and the button that starts the run.
    with gr.Row():
        file_input = gr.File(file_types=[".pdf"], label="Upload PDF")
        test_btn = gr.Button("Test OCR")

    # Output row: run summary next to the extracted text.
    with gr.Row():
        summary_output = gr.Markdown(label="Summary")
        text_output = gr.Textbox(lines=10, label="Extracted Text")

    # Clicking the button feeds the upload to test_ocr_minimal and shows
    # its (summary, text) result in the two output components.
    test_btn.click(test_ocr_minimal, [file_input], [summary_output, text_output])


if __name__ == "__main__":
    demo.launch(server_port=7860, server_name="0.0.0.0")
minimal_test_paddle.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # minimal_test_paddle.py - Minimal test to isolate the OCR issue
3
+
4
+ import sys
5
+ import os
6
+ import json
7
+ import fitz
8
+ from paddleocr import PaddleOCR
9
+
10
+ def test_ocr():
11
+ if len(sys.argv) < 2:
12
+ print(json.dumps({"error": "No file path provided"}))
13
+ return
14
+
15
+ file_path = sys.argv[1]
16
+
17
+ try:
18
+ print(f"Testing OCR on: {file_path}", file=sys.stderr)
19
+
20
+ # Test 1: Can we open the PDF?
21
+ print("Opening PDF...", file=sys.stderr)
22
+ doc = fitz.open(file_path)
23
+ print(f"PDF has {len(doc)} pages", file=sys.stderr)
24
+
25
+ # Test 2: Convert first page to image
26
+ print("Converting first page to image...", file=sys.stderr)
27
+ page = doc[0]
28
+ mat = fitz.Matrix(150/72, 150/72)
29
+ pix = page.get_pixmap(matrix=mat)
30
+
31
+ temp_img = "/tmp/test_page.png"
32
+ pix.save(temp_img)
33
+
34
+ if os.path.exists(temp_img):
35
+ img_size = os.path.getsize(temp_img)
36
+ print(f"Image created: {temp_img} (size: {img_size} bytes, {pix.width}x{pix.height})", file=sys.stderr)
37
+ else:
38
+ print("Failed to create image", file=sys.stderr)
39
+ doc.close()
40
+ return
41
+
42
+ doc.close()
43
+
44
+ # Test 3: Initialize OCR
45
+ print("Initializing OCR...", file=sys.stderr)
46
+ ocr = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
47
+ print("OCR initialized", file=sys.stderr)
48
+
49
+ # Test 4: Run OCR on the image
50
+ print("Running OCR...", file=sys.stderr)
51
+ result = ocr.ocr(temp_img, cls=True)
52
+
53
+ print(f"OCR result type: {type(result)}", file=sys.stderr)
54
+ if result:
55
+ print(f"Result length: {len(result)}", file=sys.stderr)
56
+ if result[0]:
57
+ print(f"First page has {len(result[0])} detections", file=sys.stderr)
58
+
59
+ # Print all detected text
60
+ for i, detection in enumerate(result[0]):
61
+ if len(detection) >= 2:
62
+ text = detection[1][0] if isinstance(detection[1], (list, tuple)) else str(detection[1])
63
+ conf = detection[1][1] if isinstance(detection[1], (list, tuple)) and len(detection[1]) > 1 else 1.0
64
+ print(f"Detection {i}: '{text}' (confidence: {conf})", file=sys.stderr)
65
+ else:
66
+ print("First page result is empty", file=sys.stderr)
67
+ else:
68
+ print("OCR returned None", file=sys.stderr)
69
+
70
+ # Clean up
71
+ if os.path.exists(temp_img):
72
+ os.unlink(temp_img)
73
+
74
+ # Return simple result
75
+ text_found = ""
76
+ if result and result[0]:
77
+ for detection in result[0]:
78
+ if len(detection) >= 2:
79
+ text_found += detection[1][0] + "\n"
80
+
81
+ print(json.dumps({
82
+ "success": True,
83
+ "text": text_found,
84
+ "detections": len(result[0]) if result and result[0] else 0
85
+ }))
86
+
87
+ except Exception as e:
88
+ print(f"Error: {e}", file=sys.stderr)
89
+ import traceback
90
+ traceback.print_exc(file=sys.stderr)
91
+ print(json.dumps({"success": False, "error": str(e)}))
92
+
93
+ if __name__ == "__main__":
94
+ test_ocr()