mbuckle committed on
Commit
8c10e64
·
1 Parent(s): b313953

Initial PaddleOCR setup

Browse files
Files changed (3) hide show
  1. README.md +22 -7
  2. app.py +195 -0
  3. requirements.txt +5 -0
README.md CHANGED
@@ -1,14 +1,29 @@
 
1
  ---
2
- title: Paddleocr Processor
3
- emoji: ⚡
4
- colorFrom: yellow
5
- colorTo: pink
6
  sdk: gradio
7
- sdk_version: 5.32.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
- short_description: OCR processor for health tracker app
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
  ---
3
+ title: PaddleOCR Medical Document Processor
4
+ emoji: 🏥
5
+ colorFrom: blue
6
+ colorTo: green
7
  sdk: gradio
8
+ sdk_version: 4.8.0
9
  app_file: app.py
10
  pinned: false
11
  license: mit
 
12
  ---
13
 
14
+ # PaddleOCR Medical Document Processor
15
+
16
+ This Hugging Face Space provides OCR processing for medical documents using PaddleOCR.
17
+
18
+ ## Features
19
+ - Extract text from PDFs and images
20
+ - Optimized for medical/lab documents
21
+ - RESTful API for integration
22
+ - Multi-page PDF support
23
+
24
+ ## API Usage
25
+ Send POST requests to the `/api/predict` endpoint with JSON data containing base64-encoded files.
26
+
27
+ ## Integration
28
+ This space can be integrated with external applications as an OCR microservice.
29
+ """
app.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py - Hugging Face Spaces version
2
+ import gradio as gr
3
+ import tempfile
4
+ import os
5
+ import time
6
+ import base64
7
+ import json
8
+ from paddleocr import PaddleOCR
9
+ import fitz # PyMuPDF
10
+
11
# Build the shared PaddleOCR engine once, at import time; the request handlers
# in this module all call this single `ocr` instance.
print("Loading PaddleOCR models...")
ocr = PaddleOCR(
    use_angle_cls=True,
    lang='en',
    show_log=False,
)
print("PaddleOCR models loaded!")
15
+
16
def process_document(file):
    """Run PaddleOCR on an uploaded document and summarise the outcome.

    Args:
        file: Value delivered by the upload widget. Either an object exposing
            the on-disk path as ``.name`` or a plain path string. ``None``
            means nothing was uploaded.

    Returns:
        A ``(summary, extracted_text, api_response)`` tuple of strings:
        a Markdown summary, the recognised text (one OCR line per text line,
        keeping only lines whose confidence is above 0.5), and a JSON document
        carrying the same information. On failure the first item is an error
        message, the second is empty and the JSON has ``"success": false``.
    """
    if file is None:
        return "No file uploaded", "", ""

    start_time = time.time()

    try:
        # Accept both a file-like wrapper (path in `.name`) and a bare path.
        path = getattr(file, "name", file)
        filename = os.path.basename(path)
        print(f"Processing: {filename}")

        # Count pages if PDF; counting is best-effort and never fails the run.
        total_pages = 1
        if filename.lower().endswith('.pdf'):
            try:
                # Context manager closes the document even if len() raises.
                with fitz.open(path) as doc:
                    total_pages = len(doc)
            except Exception as e:
                print(f"Could not count PDF pages: {e}")

        # Run OCR
        result = ocr.ocr(path, cls=True)

        # Each OCR line is indexed as line[1] == (text, confidence).
        kept_lines = []
        pages_processed = 0
        for page_result in result or []:
            if not page_result:
                # Pages with no detections do not count as processed.
                continue
            pages_processed += 1
            for line in page_result:
                if len(line) >= 2 and line[1][1] > 0.5:  # Confidence > 50%
                    kept_lines.append(line[1][0] + "\n")
        extracted_text = "".join(kept_lines)

        processing_time = time.time() - start_time

        # Create summary
        summary = (
            "\n"
            f"📄 **File**: {filename}\n"
            f"📊 **Pages Processed**: {pages_processed}/{total_pages}\n"
            f"⏱️ **Processing Time**: {processing_time:.2f} seconds\n"
            f"📝 **Text Length**: {len(extracted_text)} characters\n"
        )

        # For API compatibility, also return JSON format
        api_response = json.dumps({
            "success": True,
            "text": extracted_text,
            "filename": filename,
            "pages_processed": pages_processed,
            "total_pages": total_pages,
            "processing_time": processing_time
        }, indent=2)

        return summary, extracted_text, api_response

    except Exception as e:
        # Top-level boundary for the UI callback: report instead of raising.
        error_msg = f"Error processing file: {str(e)}"
        return error_msg, "", json.dumps({"success": False, "error": str(e)})
77
+
78
def process_api_request(api_data):
    """Process an API-style OCR request given as a JSON string.

    Args:
        api_data: JSON text of the form
            ``{"file": "<base64 data>", "filename": "<name>"}``. ``filename``
            is optional and defaults to ``unknown.pdf``; only its extension is
            used, as the suffix of the temporary file handed to the OCR engine.

    Returns:
        A JSON string. On success it holds ``success``, ``text`` and
        ``filename``; on any failure it holds ``"success": false`` and an
        ``error`` message. This function never raises.

    NOTE(review): unlike ``process_document``, no confidence threshold is
    applied to the recognised lines here.
    """
    try:
        data = json.loads(api_data)

        # Valid JSON that is not an object (e.g. a bare string or a list)
        # cannot carry a "file" field; report it the same way as a missing one.
        if not isinstance(data, dict) or 'file' not in data:
            return json.dumps({"success": False, "error": "No file data provided"})

        # Decode base64 file
        file_data = base64.b64decode(data['file'])
        filename = data.get('filename', 'unknown.pdf')
        suffix = os.path.splitext(filename)[1]

        # Save to temp file; it is kept on disk (delete=False) so the OCR
        # engine can open it by path, and removed in the `finally` below.
        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        tmp_file_path = tmp_file.name
        try:
            with tmp_file:
                tmp_file.write(file_data)

            # Run OCR
            result = ocr.ocr(tmp_file_path, cls=True)

            # Each OCR line is indexed as line[1][0] == recognised text.
            pieces = []
            for page_result in result or []:
                if not page_result:
                    continue
                for line in page_result:
                    if len(line) >= 2:
                        pieces.append(line[1][0] + "\n")
            text = "".join(pieces)

            return json.dumps({
                "success": True,
                "text": text,
                "filename": filename
            })

        finally:
            # Runs for write failures as well as OCR failures.
            os.unlink(tmp_file_path)

    except Exception as e:
        # Top-level boundary for the API callback: report instead of raising.
        return json.dumps({"success": False, "error": str(e)})
118
+
119
# Create Gradio interface with multiple tabs
with gr.Blocks(title="PaddleOCR Medical Document Processor") as demo:
    gr.Markdown("# 🏥 PaddleOCR Medical Document Processor")
    gr.Markdown("Upload medical documents (PDF/images) to extract text using PaddleOCR")

    # Tab 1: interactive upload -> summary + extracted text.
    with gr.Tab("📄 File Upload"):
        with gr.Row():
            with gr.Column():
                file_input = gr.File(
                    label="Upload Document (PDF, JPG, PNG)",
                    file_types=[".pdf", ".jpg", ".jpeg", ".png"]
                )
                process_btn = gr.Button("🔍 Process Document", variant="primary")

            with gr.Column():
                summary_output = gr.Markdown(label="📊 Processing Summary")

        with gr.Row():
            text_output = gr.Textbox(
                label="📝 Extracted Text",
                lines=15,
                max_lines=20
            )

        # process_document returns three values; the JSON form is not shown in
        # the UI, so it is routed to a hidden component.
        json_output = gr.Textbox(visible=False)

        process_btn.click(
            fn=process_document,
            inputs=[file_input],
            outputs=[summary_output, text_output, json_output]
        )

    # Tab 2: JSON-in / JSON-out handler meant for programmatic callers.
    with gr.Tab("🔌 API Integration"):
        gr.Markdown("### For integration with your Vercel app:")
        gr.Markdown("**Endpoint**: `https://your-space-name-your-username.hf.space/api/predict`")
        gr.Markdown("**Method**: POST")
        gr.Markdown("**Headers**: `Content-Type: application/json`")

        api_input = gr.Textbox(
            label="API Request (JSON)",
            placeholder='{"file": "base64_encoded_file_data", "filename": "document.pdf"}',
            lines=5
        )
        api_btn = gr.Button("🧪 Test API Request")
        api_output = gr.Textbox(
            label="API Response (JSON)",
            lines=10
        )

        # Register the handler under the endpoint name advertised above, so
        # that `/api/predict` resolves to it instead of depending on the
        # caller supplying a function index.
        # NOTE(review): whether direct POSTs are accepted may also depend on
        # the queue settings of the deployed Gradio version - confirm.
        api_btn.click(
            fn=process_api_request,
            inputs=[api_input],
            outputs=[api_output],
            api_name="predict"
        )

    # Tab 3: static description of the service.
    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
        ### 🎯 Purpose
        This service extracts text from medical documents using PaddleOCR, specifically designed for lab reports and medical forms.

        ### 🔧 Integration
        This Hugging Face Space can be integrated with your Vercel app as an external OCR service.

        ### 📚 Supported Formats
        - PDF documents (multi-page)
        - JPEG/JPG images
        - PNG images

        ### 🚀 Features
        - High accuracy OCR with PaddleOCR
        - Medical document optimization
        - Multi-page PDF support
        - RESTful API integration
        - Free hosting on Hugging Face
        """)

# Launch the app (bind on all interfaces, port 7860) only when run as a script.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio==4.8.0
2
+ paddlepaddle==2.5.1
3
+ paddleocr==2.6.1.3
4
+ PyMuPDF==1.23.0
5
+ Pillow==10.0.0