anoopreddyyeddula committed on
Commit
bef72f9
·
1 Parent(s): 9b0cc99

Add robust file validation and error handling

Browse files

- Add file type and size validation
- Improve OCR preprocessing and error handling
- Enhance Gradio interface with better descriptions
- Add detailed logging for debugging
- Update documentation with file size limits

Files changed (1) hide show
  1. app.py +123 -24
app.py CHANGED
@@ -8,6 +8,7 @@ import PyPDF2
8
  import pandas as pd
9
  import io
10
  from datetime import datetime
 
11
 
12
  # Initialize the OCR reader
13
  reader = easyocr.Reader(['en'])
@@ -21,8 +22,34 @@ doc_classifier = pipeline("image-classification", model="microsoft/resnet-50")
21
  def convert_pdf_to_images(pdf_file):
22
  """Convert PDF to list of images"""
23
  try:
24
- return pdf2image.convert_from_bytes(pdf_file.read())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  except Exception as e:
 
26
  return None
27
 
28
  def process_single_image(image):
@@ -72,6 +99,12 @@ def ocr_function(image):
72
  if isinstance(image, Image.Image):
73
  image = np.array(image)
74
 
 
 
 
 
 
 
75
  # Perform OCR
76
  results = reader.readtext(image)
77
 
@@ -83,6 +116,7 @@ def ocr_function(image):
83
 
84
  return text.strip()
85
  except Exception as e:
 
86
  return f"OCR Error: {str(e)}"
87
 
88
  def validate_text(text):
@@ -122,34 +156,86 @@ def validate_image(image):
122
  except Exception as e:
123
  return False, f"Image validation error: {str(e)}"
124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  def process_claim(file):
126
  try:
127
  if file is None:
128
  return "No file provided", "N/A", "N/A", None
 
 
 
 
 
 
 
 
129
 
130
- # Check if file is PDF
131
- if hasattr(file, 'name') and file.name.lower().endswith('.pdf'):
132
  images = convert_pdf_to_images(file)
133
  if not images:
134
- return "Failed to process PDF", "Error", "Error", None
135
  else:
136
- # Handle single image
137
- if isinstance(file, np.ndarray):
138
- images = [Image.fromarray(file)]
139
- else:
140
- images = [file]
 
141
 
142
- # Process each page
143
  results = []
144
  for idx, img in enumerate(images):
145
- result = process_single_image(img)
146
- result['page'] = idx + 1
147
- results.append(result)
 
 
 
 
 
 
 
 
 
 
148
 
 
 
 
149
  # Generate report
150
- summary, excel_file = generate_report(results)
 
 
 
151
 
152
- # Return results for display
153
  return (
154
  "\n\n=== Page Break ===\n\n".join([r['text'] for r in results]),
155
  "\n".join([f"Page {r['page']}: {r['validation']} ({r['validation_confidence']:.2%})" for r in results]),
@@ -158,22 +244,35 @@ def process_claim(file):
158
  )
159
 
160
  except Exception as e:
 
161
  return f"Processing error: {str(e)}", "Error", "Error", None
162
 
163
- # Create Gradio interface
164
  iface = gr.Interface(
165
  fn=process_claim,
166
- inputs=gr.File(label="Upload Insurance Document (Image or PDF)"),
 
 
 
 
 
 
 
167
  outputs=[
168
- gr.Textbox(label="Extracted Text"),
169
- gr.Textbox(label="Text Validation"),
170
- gr.Textbox(label="Document Classification"),
171
- gr.File(label="Download Report")
172
  ],
173
  title="Insurance Claim Validation System",
174
- description="Upload an insurance claim document (PDF or image) to validate and classify it.",
175
- # Remove the examples for now
176
- theme=gr.themes.Soft()
 
 
 
 
 
177
  )
178
 
179
  if __name__ == "__main__":
 
8
  import pandas as pd
9
  import io
10
  from datetime import datetime
11
+ import cv2
12
 
13
  # Initialize the OCR reader
14
  reader = easyocr.Reader(['en'])
 
22
def convert_pdf_to_images(pdf_file, dpi=300):
    """Convert an uploaded PDF into a list of images, one per page.

    The PDF bytes are read once, parsed with PyPDF2 to confirm the document
    is readable and has at least one page, and then rendered with pdf2image.

    Args:
        pdf_file: Binary file-like object exposing ``read()``.
        dpi: Rendering resolution handed to pdf2image. Defaults to 300, the
            value that used to be hard-coded here.

    Returns:
        A non-empty list of images, or ``None`` when the file cannot be read,
        is not a valid PDF, or yields no images. The failure reason is
        printed for debugging.
    """
    try:
        pdf_content = pdf_file.read()

        # Validate up front so a corrupt upload fails with a clear message
        # instead of an opaque rendering error.
        try:
            page_count = len(PyPDF2.PdfReader(io.BytesIO(pdf_content)).pages)
        except Exception as e:
            raise ValueError(f"Invalid PDF file: {str(e)}") from e
        if page_count == 0:
            raise ValueError("Invalid PDF file: PDF has no pages")

        # Render from the bytes already in memory; no need to round-trip
        # them through a second buffer.
        images = pdf2image.convert_from_bytes(
            pdf_content,
            dpi=dpi,
            fmt='PNG',
        )

        if not images:
            raise ValueError("No images extracted from PDF")

        return images
    except Exception as e:
        print(f"PDF conversion error: {str(e)}")  # Debug logging
        return None
54
 
55
  def process_single_image(image):
 
99
  if isinstance(image, Image.Image):
100
  image = np.array(image)
101
 
102
+ # Image preprocessing
103
+ if len(image.shape) == 2: # Convert grayscale to RGB
104
+ image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
105
+ elif len(image.shape) == 3 and image.shape[2] == 4: # Convert RGBA to RGB
106
+ image = cv2.cvtColor(image, cv2.COLOR_RGBA2RGB)
107
+
108
  # Perform OCR
109
  results = reader.readtext(image)
110
 
 
116
 
117
  return text.strip()
118
  except Exception as e:
119
+ print(f"OCR Error: {str(e)}") # Debug logging
120
  return f"OCR Error: {str(e)}"
121
 
122
  def validate_text(text):
 
156
  except Exception as e:
157
  return False, f"Image validation error: {str(e)}"
158
 
159
def validate_file_type(file):
    """Validate an uploaded file's extension and size.

    Args:
        file: Seekable file-like object with a ``name`` attribute.

    Returns:
        ``(True, None)`` when the file has an allowed extension and is at
        most 10MB; otherwise ``(False, message)`` where ``message`` explains
        the rejection. Any unexpected error is reported the same way rather
        than raised.
    """
    # Kept as a sorted tuple so the error message is identical on every run.
    allowed_extensions = ('jpeg', 'jpg', 'pdf', 'png', 'tiff')
    max_file_size = 10 * 1024 * 1024  # 10MB in bytes

    try:
        if not hasattr(file, 'name'):
            return False, "Invalid file object"

        # rpartition tells us whether a dot was present at all, so a bare
        # name such as "pdf" is not mistaken for a file with a .pdf extension.
        _, dot, file_ext = file.name.lower().rpartition('.')
        if not dot or file_ext not in allowed_extensions:
            return False, f"Unsupported file type. Allowed types: {', '.join(allowed_extensions)}"

        # Measure the size by seeking to the end, then rewind so later
        # readers start from the beginning of the file.
        file.seek(0, 2)
        file_size = file.tell()
        file.seek(0)

        if file_size > max_file_size:
            return False, "File too large. Maximum size: 10MB"

        return True, None
    except Exception as e:
        return False, f"File validation error: {str(e)}"
185
+
186
  def process_claim(file):
187
  try:
188
  if file is None:
189
  return "No file provided", "N/A", "N/A", None
190
+
191
+ # Validate file type and size
192
+ is_valid, error_message = validate_file_type(file)
193
+ if not is_valid:
194
+ return error_message, "Error", "Error", None
195
+
196
+ print(f"Processing file: {file.name}")
197
+ print(f"File type: {type(file)}")
198
 
199
+ # Process PDF
200
+ if file.name.lower().endswith('.pdf'):
201
  images = convert_pdf_to_images(file)
202
  if not images:
203
+ return "Failed to convert PDF to images. Please check if the PDF is valid.", "Error", "Error", None
204
  else:
205
+ # Process image
206
+ try:
207
+ img = Image.open(file)
208
+ images = [img]
209
+ except Exception as e:
210
+ return f"Image processing error: {str(e)}", "Error", "Error", None
211
 
212
+ # Process each page/image
213
  results = []
214
  for idx, img in enumerate(images):
215
+ try:
216
+ # Validate image
217
+ valid, validated_img = validate_image(img)
218
+ if not valid:
219
+ return f"Invalid image on page {idx + 1}: {validated_img}", "Error", "Error", None
220
+
221
+ # Process image
222
+ result = process_single_image(validated_img)
223
+ result['page'] = idx + 1
224
+ results.append(result)
225
+
226
+ except Exception as e:
227
+ return f"Error processing page {idx + 1}: {str(e)}", "Error", "Error", None
228
 
229
+ if not results:
230
+ return "No valid results obtained from processing", "Error", "Error", None
231
+
232
  # Generate report
233
+ try:
234
+ summary, excel_file = generate_report(results)
235
+ except Exception as e:
236
+ return f"Error generating report: {str(e)}", "Error", "Error", None
237
 
238
+ # Return results
239
  return (
240
  "\n\n=== Page Break ===\n\n".join([r['text'] for r in results]),
241
  "\n".join([f"Page {r['page']}: {r['validation']} ({r['validation_confidence']:.2%})" for r in results]),
 
244
  )
245
 
246
  except Exception as e:
247
+ print(f"Error in process_claim: {str(e)}")
248
  return f"Processing error: {str(e)}", "Error", "Error", None
249
 
250
+ # Update the Gradio interface with better descriptions and examples
251
# Gradio UI: one document upload in; extracted text, validation results,
# classification results and a downloadable report out.
iface = gr.Interface(
    fn=process_claim,
    inputs=[
        gr.File(
            # gr.File has no `description` argument, so the supported
            # formats and size limit are stated in the label instead.
            label="Upload Insurance Document (PDF, PNG, JPG, JPEG, TIFF; max size: 10MB)",
            type="file",
            file_types=['.pdf', '.png', '.jpg', '.jpeg', '.tiff'],
        )
    ],
    outputs=[
        gr.Textbox(label="Extracted Text", lines=10),
        gr.Textbox(label="Text Validation Results", lines=5),
        gr.Textbox(label="Document Classification Results", lines=5),
        gr.File(label="Download Report (Excel)"),
    ],
    title="Insurance Claim Validation System",
    description="""
    Upload an insurance claim document to:
    1. Extract and validate text content
    2. Classify document type
    3. Generate detailed analysis report
    """,
    theme=gr.themes.Soft(),
    allow_flagging="never",
)
277
 
278
  if __name__ == "__main__":